# Join and Merge Excercises
* Author: Owen Chen
* History: - 4/11/2022 started


In [62]:
import pandas as pd
import numpy as np

## Problem 1 - join two pandas dataframes, and compare various methods

* source:
    https://stackoverflow.com/questions/55807283/pandas-interview-question-compare-pandas-joins-and-ideally-provide-the-fastest
    

### The problem:
```
# Randomly generated historical data about how many megabytes were downloaded from the Internet."HoD" is the Hour of the Day!
hist_df = pd.DataFrame(columns=['HoD', 'Volume'])
hist_df['HoD'] = np.random.randint(0, 24, 365 * 24)
hist_df['Volume'] = np.random.uniform(1, 1000, 365 * 24)

# Tariffs based on the hour of the day
tariffs_df = pd.DataFrame({
    'Time range': ['00:00 to 09:00', '09:00 to 18:00', '18:00 to 00:00'],
    'cost': [10, 14, 22]
})
```

### Task: 
Return the historical dataframe with an additional column “cost” that will show how much money was spent for every hour in the historical data. Basically tariff dataframe need to be merged to the historical data.


In [63]:
# Randomly generated historical data about how many megabytes were downloaded from the Internet."HoD" is the Hour of the Day!
hist_df = pd.DataFrame(columns=['HoD', 'Volume'])
hist_df['HoD'] = np.random.randint(0, 24, 365 * 24)
hist_df['Volume'] = np.random.uniform(1, 1000, 365 * 24)

# Tariffs based on the hour of the day
tariffs_df = pd.DataFrame({
    'Time range': ['00:00 to 09:00', '09:00 to 18:00', '18:00 to 00:00'],
    'cost': [10, 14, 22]
})

### Solution 1 - use apply() to calculate the rate based on the hour

In [64]:
# Method 1 - use apply() to 

def solution1(hist_df, tariffs_df):
    # Create a start and end hours in tariffs_df
    def get_starthour(t):
        start = t.split(" to ")[0]
        timelist = start.split(":")
        hour = int(timelist[0]) + int(timelist[1])/60
        return hour
    def get_endhour(t):
        end = t.split(" to ")[1]
        timelist = end.split(":")
        hour = int(timelist[0]) + int(timelist[1])/60
        return hour

    # Get the cost rate and multiple rate to volumne
    def get_cost(t):
        for i in range(tariffs_df.shape[0]):
            if tariffs_df['timebucket_start'][i] <= t <= tariffs_df['timebucket_end'][i]:
                return tariffs_df['cost'][i]
        # Last bucket
        if t > tariffs_df['timebucket_start'][i]:
                return tariffs_df['cost'][i]
            
    tariffs_df['timebucket_start'] = tariffs_df['Time range'].apply(get_starthour)
    tariffs_df['timebucket_end'] = tariffs_df['Time range'].apply(get_endhour)
    hist_df['cost'] = hist_df['HoD'].apply(get_cost)
    hist_df['expense'] = hist_df['cost'] * hist_df['Volume']

    return hist_df

In [65]:
res = solution1(hist_df, tariffs_df)
res

Unnamed: 0,HoD,Volume,cost,expense
0,17,492.725176,14,6898.152470
1,3,496.764510,10,4967.645104
2,17,217.757222,14,3048.601115
3,15,367.389363,14,5143.451081
4,21,870.444263,22,19149.773791
...,...,...,...,...
8755,14,218.261628,14,3055.662794
8756,14,600.913826,14,8412.793567
8757,19,846.869556,22,18631.130225
8758,5,624.529945,10,6245.299450


In [66]:
# Clean input data before the second solution
hist_df.drop(['cost', 'expense'], axis=1, inplace=True)
tariffs_df.drop(['timebucket_start','timebucket_end'], axis=1, inplace=True)

In [67]:
hist_df

Unnamed: 0,HoD,Volume
0,17,492.725176
1,3,496.764510
2,17,217.757222
3,15,367.389363
4,21,870.444263
...,...,...
8755,14,218.261628
8756,14,600.913826
8757,19,846.869556
8758,5,624.529945


In [68]:
tariffs_df

Unnamed: 0,Time range,cost
0,00:00 to 09:00,10
1,09:00 to 18:00,14
2,18:00 to 00:00,22


In [69]:
# Method 2 - create a time range column to merge 

def solution2(hist_df, tariffs_df):
    # Create a start and end hours in tariffs_df

    def get_timebucket(timelist):
        timebucket=set()
        for t in timelist:            
            s = t.split(" to ")[0].split(":")
            hour = int(s[0]) + int(s[1])/60
            timebucket.add(hour)
            s = t.split(" to ")[1].split(":")
            hour = int(s[0]) + int(s[1])/60  
            timebucket.add(hour)
        return sorted(timebucket)

    def create_timerange(t):
        for i in range(1, len(timebucket)):
            if timebucket[i-1] <= t <=timebucket[i]:
                return tariffs_df['Time range'][i-1]
        #last bucket
        if t > timebucket[i]:
          return tariffs_df['Time range'][i]

    timebucket = get_timebucket(tariffs_df['Time range'])
    hist_df['Time range'] = hist_df['HoD'].apply(create_timerange)    
    res = pd.merge(hist_df, tariffs_df, how="left", on='Time range')
    res['expense'] = res['cost'] * res['Volume']
    return res

In [70]:
res = solution2(hist_df, tariffs_df)
res

Unnamed: 0,HoD,Volume,Time range,cost,expense
0,17,492.725176,09:00 to 18:00,14,6898.152470
1,3,496.764510,00:00 to 09:00,10,4967.645104
2,17,217.757222,09:00 to 18:00,14,3048.601115
3,15,367.389363,09:00 to 18:00,14,5143.451081
4,21,870.444263,18:00 to 00:00,22,19149.773791
...,...,...,...,...,...
8755,14,218.261628,09:00 to 18:00,14,3055.662794
8756,14,600.913826,09:00 to 18:00,14,8412.793567
8757,19,846.869556,18:00 to 00:00,22,18631.130225
8758,5,624.529945,00:00 to 09:00,10,6245.299450


In [76]:
%timeit res1=solution1(hist_df, tariffs_df)

228 ms ± 4.13 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [77]:
# Clean input data before the second solution
hist_df.drop(['cost', 'expense'], axis=1, inplace=True)
tariffs_df.drop(['timebucket_start','timebucket_end'], axis=1, inplace=True)

%timeit res2=solution2(hist_df, tariffs_df)

52.7 ms ± 635 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)
