# This notebook will serve for the merging of datasets for the TFW project

In [None]:
# Import necessary libraries
import pandas as pd
import numpy as np
import os
import seaborn as sns
import matplotlib.pyplot as plt

# Working directory of the session. PWD is exported by POSIX shells but may be
# missing elsewhere (e.g. Windows or some service launchers), so fall back to
# the process's current directory instead of leaving ROOT as None.
ROOT = os.environ.get('PWD', os.getcwd())


# the outcome of the merging will be a dataset with the name, df_master_tfw6

# we will start with the inquiries dataset as the source of the inquiry instances we will use for our model. to this file we will add data related to each property that we have collected from the listings and room features files

In [None]:

# read in inquiries dataset
# low_memory=False makes pandas process the file as a whole when inferring
# column dtypes instead of in chunks, which avoids mixed-type inference.

df_inquiries = pd.read_csv('../data/master_inquiries_20210715.csv', low_memory=False)


In [None]:

df_inquiries.head(3)


In [None]:
df_inquiries.info()

In [None]:
df_inquiries.nunique()

In [None]:
# remove the 'Unnamed: 0' column from the inquiries frame
# (presumably an index column written by an earlier export; verify)
df_inquiries.drop(columns='Unnamed: 0', inplace=True)

# read in listings dataset. note that listings has the master list of properties, which will reduce the number of observations in inquiries and room features. we need to merge appropriately



In [None]:
# read in listings dataset

df_listings = pd.read_csv('../data/master_listings_20210723.csv', low_memory=False)


In [None]:
# remove the 'Unnamed: 0' column from the listings frame
# (presumably an index column written by an earlier export; verify)
df_listings.drop(columns='Unnamed: 0', inplace=True)

In [None]:
# discard listing columns that are not carried into the merged dataset
unused_listing_columns = ['customer_id', 'state', 'subscription']
df_listings.drop(columns=unused_listing_columns, inplace=True)

In [None]:
df_listings.nunique()

In [None]:
# first merge: right join of inquiries onto listings by listing_id, so every
# row of df_listings is kept and inquiries without a matching listing are dropped

df_master_tfw1 = df_inquiries.merge(df_listings, on='listing_id', how='right')

In [None]:
# check to see what the outcome looks like in df_master_tfw1

df_master_tfw1.nunique()

In [None]:
df_master_tfw1.info()

# merge all of room features dataset into master

In [None]:
# load room features dataset
df_room_features = pd.read_csv('../data/room_features20210719.csv')

In [None]:
df_room_features.head()

In [None]:
# attach room features to the master frame; the right join keeps every row of
# df_master_tfw1 and leaves the feature columns missing where no match exists
df_master_tfw2 = df_room_features.merge(df_master_tfw1, on='listing_id', how='right')

In [None]:
df_master_tfw2.info()

In [None]:
df_master_tfw2.nunique()

In [None]:
# the 'time' column is not used further, remove it in place
df_master_tfw2.drop(columns='time', inplace=True)

In [None]:
# set global default so printed output is never truncated by row count; the
# per-column summaries below (e.g. isna().sum()) have one row per column, so
# this is what makes every column visible in them

pd.set_option('display.max_rows', None)

In [None]:
df_master_tfw2.isna().sum()

In [None]:
x = list(df_master_tfw2.columns.values)

In [None]:
y = x[1:145]

In [None]:
y

In [None]:
# filling in missing values (with 0) for all the columns in the list y
#
# The fill is done as one assignment on the frame itself. Calling
# fillna(..., inplace=True) on df[col] acts on an intermediate object: pandas
# 2.2 warns about it, and under Copy-on-Write it no longer updates the frame.
df_master_tfw2[y] = df_master_tfw2[y].fillna(0)

In [None]:
df_master_tfw2.isna().sum()

In [None]:
# replace missing inquiry prices with the column median

inquiry_price_median = df_master_tfw2['inquiry_price'].median()
df_master_tfw2['inquiry_price'] = df_master_tfw2['inquiry_price'].fillna(inquiry_price_median)

In [None]:
# replace missing adult counts with the column median

adult_count_median = df_master_tfw2['adult_count'].median()
df_master_tfw2['adult_count'] = df_master_tfw2['adult_count'].fillna(adult_count_median)

In [None]:
# replace missing children counts with the column median

children_count_median = df_master_tfw2['children_count'].median()
df_master_tfw2['children_count'] = df_master_tfw2['children_count'].fillna(children_count_median)

In [None]:
# replace missing pet counts with the column median

pets_count_median = df_master_tfw2['pets_count'].median()
df_master_tfw2['pets_count'] = df_master_tfw2['pets_count'].fillna(pets_count_median)

In [None]:
# drop, in place, every row that still has at least one missing value
# (the original note reports 655 such rows that should not be in the data)

df_master_tfw2.dropna(axis=0, how='any', inplace=True)

In [None]:
df_master_tfw2.isna().sum()

# now merge with view data per house, ka-boom!

# read in stats data



In [None]:
df_stats_extract = pd.read_csv('../data/statistics_per_id_20210719.csv')

In [None]:
df_stats_extract.nunique()

In [None]:
# remove the 'Unnamed: 0' column from the stats frame
del df_stats_extract['Unnamed: 0']

In [None]:
# add the per-listing statistics; the right join keeps every row of df_master_tfw2
df_master_tfw4 = df_stats_extract.merge(df_master_tfw2, on='listing_id', how='right')

In [None]:
df_master_tfw4.listing_id.nunique()

In [None]:
# read in relevant prices

df_new_price = pd.read_csv('../data/prices_for_master.csv')

In [None]:
df_new_price.head(10)

In [None]:
# remove the 'Unnamed: 0' column from the price frame
df_new_price.drop(columns='Unnamed: 0', inplace=True)

In [None]:
# price rows outside our data range have to be filtered out; derive a 'year'
# column from the start date of each price row to filter on

price_start = pd.to_datetime(df_new_price['date_from'])
df_new_price['year'] = price_start.dt.year

In [None]:
# keep only the price rows whose year lies strictly between 2018 and 2021

year_in_range = (df_new_price['year'] > 2018) & (df_new_price['year'] < 2021)
filter_price = df_new_price[year_in_range]

In [None]:
filter_price.head(100)

In [None]:
filter_price.nunique()

# check shows only 2019 and 2020 are in the data


In [None]:
# remove inquiry_price and length_stay from the merged set before the proper
# price data is merged in

replaced_columns = ['inquiry_price', 'length_stay']
df_master_tfw4.drop(columns=replaced_columns, inplace=True)

In [None]:
# outer join with the filtered prices on listing_id: rows from either side
# without a partner are kept, with missing values in the other side's columns
df_master_tfw5 = df_master_tfw4.merge(filter_price, how='outer', on='listing_id')

In [None]:
# save to csv
# NOTE(review): the index is written as well (no index=False), so reading this
# file back will presumably produce an extra 'Unnamed: 0' column like the ones
# removed from the input files in this notebook; confirm before changing.

df_master_tfw5.to_csv('../data/master_5_temp.csv')



In [None]:

# checking
 
df_master_tfw5.head()

In [None]:
df_master_tfw5.shape

In [None]:
# drop, in place, every row with at least one missing value; after the outer
# merge this removes the rows that have no price data (and any other row that
# is incomplete in some column)

df_master_tfw5.dropna(axis=0, how='any', inplace=True)

In [None]:
df_master_tfw5.shape

In [None]:
# extract price only on matches for arrival date in appropriate range
#
# NOTE(review): the date columns appear to come straight from read_csv without
# date parsing, i.e. as strings. Comparing them directly is a lexicographic
# comparison, which is only correct when every value uses the same zero-padded
# year-first format. Parse them first so the range test is a real date test.
arrival = pd.to_datetime(df_master_tfw5['arrival_date'])
price_from = pd.to_datetime(df_master_tfw5['date_from'])
price_to = pd.to_datetime(df_master_tfw5['date_to'])

# keep a row when the arrival falls inside the half-open window
# [date_from, date_to); the stored columns themselves are left unchanged
df_master_tfw6 = df_master_tfw5[(arrival >= price_from) & (arrival < price_to)]


In [None]:
# checking

df_master_tfw6.shape

In [None]:
df_master_tfw6.head(20)

In [None]:
# list the column names of the final merged set
# (df_filter_price_match is never defined in this notebook; the final frame is df_master_tfw6)
list(df_master_tfw6.columns.values)

In [None]:
# check for missing values

df_master_tfw6.isna().sum()

# data set has no missing values

In [None]:
# checking merged set, columns look ok
# (df_filter_price_match is never defined in this notebook; the final frame is df_master_tfw6)

df_master_tfw6.nunique()

In [None]:
# save merged set
# (df_filter_price_match is never defined in this notebook; the final frame is df_master_tfw6)

df_master_tfw6.to_csv('../data/newest_tiny_master_1.csv')

In [None]:
# build a version that keeps the missing values, for possible later analysis:
# room features right-joined onto the inquiries/listings merge, with no filling or dropping

df_master_tfw3 = df_room_features.merge(df_master_tfw1, on='listing_id', how='right')

In [None]:
# add the per-listing statistics to the unfilled version; the right join keeps every row of df_master_tfw3
df_master_tfw3 = df_stats_extract.merge(df_master_tfw3, on='listing_id', how='right')

In [None]:
df_master_tfw3.head()

In [None]:
df_master_tfw3.to_csv('../data/master_with_missing_20210719.csv')