# Merging dataframes where the key lies between two dates

## links / sources
- https://stackoverflow.com/questions/30627968/merge-pandas-dataframes-where-one-value-is-between-two-others

In [1]:
import pandas as pd

In [50]:
# One row per vacation; `_from` / `_to` hold the start and end date of each.
vacations = pd.DataFrame({
    "vacation_name": ("Summer", "Spring"),
    "_from": pd.to_datetime(["2022-06-05", "2022-04-01"]),
    "_to": pd.to_datetime(["2022-07-03", "2022-04-08"]),
})

In [64]:
# One row per event, each with the single date on which it takes place.
events = pd.DataFrame({
    "event_name": ("Park", "Hike", "waterski", "Flower watching", "forest walk"),
    "date": pd.to_datetime(
        ["2022-06-01", "2022-06-05", "2022-07-01", "2022-03-28", "2022-04-05"]
    ),
})

In [30]:
# NOTE(review): the saved output below already shows the `interval` index,
# which is only created in the Method 1 cell further down. Run top to bottom,
# this cell would show the plain frame built above (default integer index).
vacations

Unnamed: 0_level_0,vacation_name,_from,_to
interval,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
"[2022-06-05, 2022-07-03]",Summer,2022-06-05,2022-07-03
"[2022-04-01, 2022-04-08]",Spring,2022-04-01,2022-04-08


In [33]:
# NOTE(review): the saved output below has `date` as the index, whereas the
# definition above keeps `date` as a regular column. Presumably an earlier
# version of the notebook indexed `events` by `date` - verify.
events

Unnamed: 0_level_0,event_name
date,Unnamed: 1_level_1
2022-06-01,Park
2022-06-05,Hike
2022-07-01,waterski
2022-03-28,Flower watching
2022-04-05,forest walk


## Method 1: Create an `IntervalIndex`

In [52]:
# Build one closed interval [_from, _to] per vacation and make it the index,
# so that a date can be looked up against the interval that contains it.
vacations["interval"] = pd.IntervalIndex.from_arrays(
    left=vacations["_from"],
    right=vacations["_to"],
    closed="both",
)
vacations = vacations.set_index(keys="interval")

In [62]:
# Non-mutating: `assign` returns a new frame and leaves `events` untouched.
# Only the single `vacation_name` column is brought over from `vacations`.
# NOTE(review): the saved output has `date` as the index of `events`; the
# lookup presumably relies on that - confirm.

events.assign(vacation_name=vacations["vacation_name"])

Unnamed: 0_level_0,event_name,vacation_name
date,Unnamed: 1_level_1,Unnamed: 2_level_1
2022-06-01,Park,
2022-06-05,Hike,Summer
2022-07-01,waterski,Summer
2022-03-28,Flower watching,
2022-04-05,forest walk,Spring


In [65]:
def left_join_interval(left: pd.DataFrame, right: pd.DataFrame, _from: str, _to: str, _on: str = None):
    """
    Attach the columns of `right` to every row of `left` whose date lies
    inside the closed interval [`_from`, `_to`] of a row in `right`.

    left:  pandas dataframe holding the dates to look up.
    right: pandas dataframe holding one interval per row.
    _from: String. Name of the column in `right` with the interval start.
    _to:   String. Name of the column in `right` with the interval end.
    _on:   Name of the date column in `left` to match on; it becomes the index
           of the result. If `None`, the existing index of `left` is used.

    Returns a new dataframe; `left` and `right` themselves are not modified.
    """
    # Both branches produce a fresh frame, so the caller's `left` stays intact.
    result = left.copy() if _on is None else left.set_index(_on)

    # Index `right` by closed intervals so that assigning its columns onto
    # `result` aligns each date with the interval containing it.
    spans = pd.IntervalIndex.from_arrays(right[_from], right[_to], closed='both')
    lookup = right.set_index(spans)

    incoming = lookup.columns
    result[incoming] = lookup[incoming]
    return result
    

### Method 1.2: Mutating - overwrite columns in the left dataframe

In [58]:
# Mutating: left join that brings over all columns of `vacations`.
# `events` itself is modified - every column of `vacations` is assigned onto it.
# Per the saved output below, events that fall in no interval get NaN / NaT.

cols = vacations.columns
events[cols] = vacations[cols]
events

Unnamed: 0_level_0,event_name,vacation_name,_from,_to
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2022-06-01,Park,,NaT,NaT
2022-06-05,Hike,Summer,2022-06-05,2022-07-03
2022-07-01,waterski,Summer,2022-06-05,2022-07-03
2022-03-28,Flower watching,,NaT,NaT
2022-04-05,forest walk,Spring,2022-04-01,2022-04-08


### Method 1.3: Using `join` (finds no matches)

In [60]:
# NOTE(review): in the saved output below every joined column is NaN / NaT,
# i.e. `join` did not match the dates against the intervals the way the
# column assignment in the previous cell does.
events.join(vacations)

Unnamed: 0_level_0,event_name,vacation_name,_from,_to
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2022-06-01 00:00:00,Park,,NaT,NaT
2022-06-05 00:00:00,Hike,,NaT,NaT
2022-07-01 00:00:00,waterski,,NaT,NaT
2022-03-28 00:00:00,Flower watching,,NaT,NaT
2022-04-05 00:00:00,forest walk,,NaT,NaT


Unnamed: 0_level_0,event_name,vacation_name
date,Unnamed: 1_level_1,Unnamed: 2_level_1
2022-06-01,Park,
2022-06-05,Hike,Summer
2022-07-01,waterski,Summer
2022-03-28,Flower watching,
2022-04-05,forest walk,Spring


## Method 2: Using SQL