In [1]:
import pandas as pd
import numpy as np

In [2]:
# Directory (relative to this notebook) that holds the raw per-city CSV files.
RAW_DATA_PATH = '../raw_data'

# Explicit dtype for every raw column, passed to `pd.read_csv` instead of relying on type inference.
DTYPES = {
    'realSum':                    'float64',
    # Three possible categories: ['Entire home/apt', 'Private room', 'Shared room'].
    # The categories are pinned explicitly (in the same lexical order pandas would infer) rather than
    # using the bare 'category' string: with inferred categories, a file that happens to miss one room
    # type gets a different categorical dtype, and `pd.concat` of such frames silently degrades the
    # column to `object`. NOTE: with fixed categories, a value outside this list is read as NaN.
    'room_type':                  pd.CategoricalDtype(
        categories=['Entire home/apt', 'Private room', 'Shared room']
    ),
    'room_shared':                'bool',
    'room_private':               'bool',
    'person_capacity':            'int8',
    'host_is_superhost':          'bool',
    'multi':                      'bool',  # Multi seems to be a boolean variable.
    'biz':                        'bool',  # biz seems to be a boolean variable.
    'cleanliness_rating':         'int8',
    'guest_satisfaction_overall': 'int16',
    'bedrooms':                   'int8',
    'dist':                       'float64',
    'metro_dist':                 'float64',
    'attr_index':                 'float64',
    'attr_index_norm':            'float64',
    'rest_index':                 'float64',
    'rest_index_norm':            'float64',
    'lng':                        'float64',
    'lat':                        'float64'
}

In [3]:
def _read_csv(path: str) -> pd.DataFrame:
    """
    Load a single raw CSV file located at @path into a dataframe.

    The first column of the file is used as the index and the columns are parsed with the
    datatypes declared in `DTYPES` rather than the automatically detected ones. The automatic
    detection often picked Int64 for variables with a small range of values (like `person_capacity`),
    integers instead of booleans (for `multi` and `biz`), and did not use the categorical
    datatype (`room_type`).
    """
    read_options = {'index_col': 0, 'dtype': DTYPES}
    return pd.read_csv(path, **read_options)


def _read_city(city: str) -> pd.DataFrame:
    """
    Build one dataframe with all the listings of the given @city.

    The weekdays file and the weekends file of the city are read, each one is tagged with a boolean
    `is_weekend` column (False for weekdays, True for weekends), and both are stacked. A `city` column
    holding @city is added and the rows are renumbered with a fresh consecutive index.

    Only 'amsterdam' and 'paris' are accepted; any other value fails the assertion.
    """
    assert city in ['amsterdam', 'paris'], "ERROR: Invalid city. Options are 'amsterdam' or 'paris'"

    # (file path, value of `is_weekend` for the rows of that file), weekdays first.
    sources = (
        (f"{RAW_DATA_PATH}/{city}_weekdays.csv", False),
        (f"{RAW_DATA_PATH}/{city}_weekends.csv", True),
    )
    frames = [_read_csv(csv_path).assign(is_weekend=flag) for csv_path, flag in sources]

    # `ignore_index=True` discards the per-file indexes in favour of a single 0..n-1 index.
    combined = pd.concat(frames, ignore_index=True)
    combined['city'] = city
    return combined


def get_merged_df() -> pd.DataFrame:
    """
    Return a single dataframe with the listings of both cities (Amsterdam and Paris).

    Each city is loaded through `_read_city`, which already combines its weekdays and weekends files.
    The Amsterdam rows come first, followed by the Paris rows, and the result is renumbered with
    a fresh consecutive index.
    """
    per_city = [_read_city(name) for name in ('amsterdam', 'paris')]
    return pd.concat(per_city, ignore_index=True)

Unnamed: 0,realSum,room_type,room_shared,room_private,person_capacity,host_is_superhost,multi,biz,cleanliness_rating,guest_satisfaction_overall,...,dist,metro_dist,attr_index,attr_index_norm,rest_index,rest_index_norm,lng,lat,is_weekend,city
0,194.033698,Private room,False,True,2,False,True,False,10,93,...,5.022964,2.539380,78.690379,4.166708,98.253896,6.846473,4.90569,52.41772,False,amsterdam
1,344.245776,Private room,False,True,4,False,False,False,8,85,...,0.488389,0.239404,631.176378,33.421209,837.280757,58.342928,4.90005,52.37432,False,amsterdam
2,264.101422,Private room,False,True,2,False,False,True,9,87,...,5.748312,3.651621,75.275877,3.985908,95.386955,6.646700,4.97512,52.36103,False,amsterdam
3,433.529398,Private room,False,True,4,False,False,True,9,90,...,0.384862,0.439876,493.272534,26.119108,875.033098,60.973565,4.89417,52.37663,False,amsterdam
4,485.552926,Private room,False,True,2,True,False,False,10,98,...,0.544738,0.318693,552.830324,29.272733,815.305740,56.811677,4.90051,52.37508,False,amsterdam
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8763,295.460900,Entire home/apt,False,False,4,False,False,False,9,80,...,3.660389,0.168146,209.752453,10.199228,540.326583,24.926525,2.38051,48.88393,True,paris
8764,232.081275,Entire home/apt,False,False,4,False,False,False,10,98,...,3.558813,0.154703,185.486701,9.019304,474.351813,21.882955,2.40050,48.85093,True,paris
8765,223.925809,Entire home/apt,False,False,2,False,True,False,9,89,...,4.205205,0.253029,172.658919,8.395552,406.585935,18.756757,2.40100,48.87700,True,paris
8766,200.857489,Entire home/apt,False,False,2,True,False,False,9,93,...,2.891214,0.240674,235.167925,11.435057,602.451672,27.792500,2.38200,48.87400,True,paris


In [6]:
# Build the combined dataset (both cities, weekdays and weekends) and keep it in `df`.
df = get_merged_df()
# Bare last expression so the notebook renders the dataframe as this cell's output.
df

Unnamed: 0,realSum,room_type,room_shared,room_private,person_capacity,host_is_superhost,multi,biz,cleanliness_rating,guest_satisfaction_overall,...,dist,metro_dist,attr_index,attr_index_norm,rest_index,rest_index_norm,lng,lat,is_weekend,city
0,194.033698,Private room,False,True,2,False,True,False,10,93,...,5.022964,2.539380,78.690379,4.166708,98.253896,6.846473,4.90569,52.41772,False,amsterdam
1,344.245776,Private room,False,True,4,False,False,False,8,85,...,0.488389,0.239404,631.176378,33.421209,837.280757,58.342928,4.90005,52.37432,False,amsterdam
2,264.101422,Private room,False,True,2,False,False,True,9,87,...,5.748312,3.651621,75.275877,3.985908,95.386955,6.646700,4.97512,52.36103,False,amsterdam
3,433.529398,Private room,False,True,4,False,False,True,9,90,...,0.384862,0.439876,493.272534,26.119108,875.033098,60.973565,4.89417,52.37663,False,amsterdam
4,485.552926,Private room,False,True,2,True,False,False,10,98,...,0.544738,0.318693,552.830324,29.272733,815.305740,56.811677,4.90051,52.37508,False,amsterdam
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8763,295.460900,Entire home/apt,False,False,4,False,False,False,9,80,...,3.660389,0.168146,209.752453,10.199228,540.326583,24.926525,2.38051,48.88393,True,paris
8764,232.081275,Entire home/apt,False,False,4,False,False,False,10,98,...,3.558813,0.154703,185.486701,9.019304,474.351813,21.882955,2.40050,48.85093,True,paris
8765,223.925809,Entire home/apt,False,False,2,False,True,False,9,89,...,4.205205,0.253029,172.658919,8.395552,406.585935,18.756757,2.40100,48.87700,True,paris
8766,200.857489,Entire home/apt,False,False,2,True,False,False,9,93,...,2.891214,0.240674,235.167925,11.435057,602.451672,27.792500,2.38200,48.87400,True,paris
