# Master File Notebook

2/17 -- I created this notebook to apply all the basic data manipulations on the listings and calendar notebook in a separate place so a standard version of the 'Master' dataframe can be loaded from other notebooks rather than repeating the data transformation steps every time

This notebook's purpose is to output the 'master.csv' file in the main directory

In [1]:
import pandas as pd
import os
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
import numpy as np
from pandas.plotting import scatter_matrix

In [2]:
os.listdir("Datasources/inside_airbnb") 

['neighbourhoods.geo.json',
 'reviews.csv',
 '.DS_Store',
 'neighbourhoods.geojson',
 'listings_summ.csv',
 'neighbourhoods.csv',
 'stations.geojson',
 'listings.csv',
 'calendar.csv',
 'reviews_summ.csv',
 'neighbourhoods.json']

In [3]:
listings = pd.read_csv('inside_airbnb/listings.csv')
#list_summ = pd.read_csv('inside_airbnb/listings_summ.csv')
#neighborhoods = pd.read_csv('inside_airbnb/neighbourhoods.csv')
#reviews = pd.read_csv('inside_airbnb/reviews.csv')
#reviews_summ = pd.read_csv('inside_airbnb/reviews_summ.csv')
calendar = pd.read_csv('inside_airbnb/calendar.csv')

In [4]:
listings = listings.loc[listings.room_type == 'Entire home/apt']

In [5]:
cal = calendar.copy()
cal['price'] = cal['price'].astype(str).map(lambda x: x.lstrip('$'))
#Transform Price from Object to Numeric Data Type
cal['price'] = cal['price'].apply(pd.to_numeric, errors='coerce')
cal['price'].replace(regex=True,inplace=True,to_replace=r'\D',value=r'')

In [6]:
#Just using the calendar listingid, date and price to merge with listings
c = cal.loc[cal.available!='f']
c = c[['listing_id','date','price']]

In [7]:
master = listings.merge(c, how='inner', left_on='id', right_on='listing_id')

In [13]:
import pyarrow as pa
import pyarrow.parquet as pq

In [15]:
table = pa.Table.from_pandas(master)
pq.write_table(table, 'master.parquet')

In [11]:
#Trying Parquet
master.to_parquet('master.parquet', engine='fastparquet')

RuntimeError: Compression 'snappy' not available.  Options: ['GZIP', 'UNCOMPRESSED']

Modifications to master to build basic features on transit column requested by Sankarshan

In [10]:
lst = []
for i in master['transit']:
    lst.append(str(i))

In [13]:
master['transit_length'] = (pd.Series([len(i) for i in lst]))

In [15]:
def lexical_diversity(text):
    return len(text)/len(set(text))

In [17]:
master['transit_variety'] = pd.Series([lexical_diversity(i) for i in lst])

In [18]:
master['transit_vocab_size'] = pd.Series([len(set(i)) for i in lst])

In [25]:
import nltk
from nltk.corpus import stopwords

In [28]:
def content_fraction(text):
    stopwords = nltk.corpus.stopwords.words('english')
    content = [w for w in text if w.lower() not in stopwords]
    return len(content)/len(text)

In [29]:
#the percentage of non stopword words in every transit comment
master['transit_content_fraction'] = [content_fraction(i) for i in lst]

  This is separate from the ipykernel package so we can avoid doing imports until


KeyboardInterrupt: 

In [31]:
import re
import string

reviewTextLength = []
reviewTextWordsPerc = []
reviewTextPuncPerc = []
reviewTextDigitsPerc = []

for i in lst:
    tokens = re.findall(r"[\w']+|[.,!?;]", i)
    reviewTextLength.append(len(tokens))
    
    if len(tokens)==0:
        reviewTextWordsPerc.append(0)
        reviewTextPuncPerc.append(0)
        reviewTextDigitsPerc.append(0)

    else:
        reviewTextWordsPerc.append(len(i.split())/float(len(tokens)))
        reviewTextPuncPerc.append(len(''.join(c for c in i if c in string.punctuation))/float(len(tokens)))
        reviewTextDigitsPerc.append(len(''.join(c for c in i if c in string.digits))/float(len(tokens)))

master['transitTextLength'] = reviewTextLength
master['transitTextWordsPerc'] = reviewTextWordsPerc
master['transitTextPuncPerc'] = reviewTextPuncPerc
master['transitTextDigitsPerc'] = reviewTextDigitsPerc

In [32]:
master.to_parquet('master_transit.parquet', engine='fastparquet')