## ML Experiment 1

In [1]:
# Import dependencies
import pandas as pd
import numpy as np
import psycopg2
import config as creds
import matplotlib.pyplot as plt
from sklearn.preprocessing import LabelEncoder
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.inspection import permutation_importance

In [2]:
# Import data from AWS databse

def connect():
    
    # Set up a connection to the postgres server.
    conn_string = "host="+ creds.PGHOST +" port="+ "5432" +" dbname="+ creds.PGDATABASE +" user=" + creds.PGUSER \
                  +" password="+ creds.PGPASSWORD
    
    conn = psycopg2.connect(conn_string)
    print("Connected!")

    # Create a cursor object
    cursor = conn.cursor()
    
    return conn, cursor

conn, cursor=connect()

Connected!


In [3]:
# Read in the data from the database
listings_df = pd.read_sql_query("select * from nyc_listings", conn)
listings_df.head()

Unnamed: 0,id,name,host_id,host_name,borough,neighborhood,latitude,longitude,room_type,price,minimum_nights,number_of_reviews,availability_365,days_since_last_review
0,77765,Superior @ Box House,417504,The Box House Hotel,Brooklyn,Greenpoint,40.73777,-73.95366,Hotel room,308,2,42,217,51
1,2539,Clean & quiet apt home by the park,2787,John,Brooklyn,Kensington,40.64529,-73.97238,Private room,299,30,9,356,1419
2,45910,Beautiful Queens Brownstone! - 5BR,204539,Mark,Queens,Ridgewood,40.70309,-73.89963,Entire home/apt,425,30,13,365,1030
3,45936,Couldn't Be Closer To Columbia Uni,867225,Rahul,Manhattan,Morningside Heights,40.8063,-73.95985,Private room,75,31,135,219,58
4,80493,Cozy room in East Village with AC,434987,Jennifer,Manhattan,East Village,40.72322,-73.98615,Private room,55,2,207,132,25


In [4]:
# Drop unnecessary columns
listings_df=listings_df.drop(columns=['id', 'name', 'host_name', 'latitude', 'longitude'])
listings_df.head()

Unnamed: 0,host_id,borough,neighborhood,room_type,price,minimum_nights,number_of_reviews,availability_365,days_since_last_review
0,417504,Brooklyn,Greenpoint,Hotel room,308,2,42,217,51
1,2787,Brooklyn,Kensington,Private room,299,30,9,356,1419
2,204539,Queens,Ridgewood,Entire home/apt,425,30,13,365,1030
3,867225,Manhattan,Morningside Heights,Private room,75,31,135,219,58
4,434987,Manhattan,East Village,Private room,55,2,207,132,25


In [5]:
# Create a new festure by combinining borough and neighborhood
listings_df['borough_neighborhood']=listings_df['borough']+' '+listings_df['neighborhood']
listings_df.head()

Unnamed: 0,host_id,borough,neighborhood,room_type,price,minimum_nights,number_of_reviews,availability_365,days_since_last_review,borough_neighborhood
0,417504,Brooklyn,Greenpoint,Hotel room,308,2,42,217,51,Brooklyn Greenpoint
1,2787,Brooklyn,Kensington,Private room,299,30,9,356,1419,Brooklyn Kensington
2,204539,Queens,Ridgewood,Entire home/apt,425,30,13,365,1030,Queens Ridgewood
3,867225,Manhattan,Morningside Heights,Private room,75,31,135,219,58,Manhattan Morningside Heights
4,434987,Manhattan,East Village,Private room,55,2,207,132,25,Manhattan East Village


In [6]:
# See if there is a relationship between number of listings
num_listings=listings_df.groupby('host_id').size()
num_listings.name='num_listings'

In [7]:
num_neighborhood=listings_df.groupby('borough_neighborhood').size()
num_neighborhood.name='num_neighborhood'

In [8]:
# Join the columns together
listings_df=listings_df.join(num_listings, on='host_id', how='outer').join(num_neighborhood, on='borough_neighborhood', how='outer')
listings_df.head()

Unnamed: 0,host_id,borough,neighborhood,room_type,price,minimum_nights,number_of_reviews,availability_365,days_since_last_review,borough_neighborhood,num_listings,num_neighborhood
0,417504,Brooklyn,Greenpoint,Hotel room,308,2,42,217,51,Brooklyn Greenpoint,30,573
10,417504,Brooklyn,Greenpoint,Hotel room,529,2,10,100,298,Brooklyn Greenpoint,30,573
11,417504,Brooklyn,Greenpoint,Private room,372,2,9,222,2,Brooklyn Greenpoint,30,573
13,417504,Brooklyn,Greenpoint,Private room,372,2,2,222,4136,Brooklyn Greenpoint,30,573
22,417504,Brooklyn,Greenpoint,Hotel room,308,2,22,221,88,Brooklyn Greenpoint,30,573


In [9]:
# Drop unnecessary columns
listings_df=listings_df.drop(columns=['host_id', 'borough', 'neighborhood'])
listings_df.head()

Unnamed: 0,room_type,price,minimum_nights,number_of_reviews,availability_365,days_since_last_review,borough_neighborhood,num_listings,num_neighborhood
0,Hotel room,308,2,42,217,51,Brooklyn Greenpoint,30,573
10,Hotel room,529,2,10,100,298,Brooklyn Greenpoint,30,573
11,Private room,372,2,9,222,2,Brooklyn Greenpoint,30,573
13,Private room,372,2,2,222,4136,Brooklyn Greenpoint,30,573
22,Hotel room,308,2,22,221,88,Brooklyn Greenpoint,30,573


In [10]:
# Use One-hot encoding for use in machine learning
cat_df=pd.get_dummies(listings_df, columns=['room_type', 'borough_neighborhood'])
cat_df.head()

Unnamed: 0,price,minimum_nights,number_of_reviews,availability_365,days_since_last_review,num_listings,num_neighborhood,room_type_Entire home/apt,room_type_Hotel room,room_type_Private room,...,borough_neighborhood_Staten Island South Beach,borough_neighborhood_Staten Island St. George,borough_neighborhood_Staten Island Stapleton,borough_neighborhood_Staten Island Todt Hill,borough_neighborhood_Staten Island Tompkinsville,borough_neighborhood_Staten Island Tottenville,borough_neighborhood_Staten Island West Brighton,borough_neighborhood_Staten Island Westerleigh,borough_neighborhood_Staten Island Willowbrook,borough_neighborhood_Staten Island Woodrow
0,308,2,42,217,51,30,573,0,1,0,...,0,0,0,0,0,0,0,0,0,0
10,529,2,10,100,298,30,573,0,1,0,...,0,0,0,0,0,0,0,0,0,0
11,372,2,9,222,2,30,573,0,0,1,...,0,0,0,0,0,0,0,0,0,0
13,372,2,2,222,4136,30,573,0,0,1,...,0,0,0,0,0,0,0,0,0,0
22,308,2,22,221,88,30,573,0,1,0,...,0,0,0,0,0,0,0,0,0,0


In [11]:
# Format and assign variables
X=cat_df.drop(columns='price')
y=cat_df['price']

In [12]:
# Create and train the model
rf=RandomForestRegressor()
rf.fit(X, y)

RandomForestRegressor()

In [13]:
# Generate the model score
rf.score(X, y)

0.8760163058460437

In [14]:
# Sort the features and their importance
sorted(list(zip(rf.feature_importances_, X.columns)), reverse=True)

[(0.23943284617798002, 'days_since_last_review'),
 (0.23693279837388861, 'availability_365'),
 (0.12038652708516102, 'number_of_reviews'),
 (0.07692373214935036, 'minimum_nights'),
 (0.06953810407827861, 'num_listings'),
 (0.03160987146549974, 'borough_neighborhood_Manhattan East Harlem'),
 (0.02766481256085655, 'room_type_Private room'),
 (0.023163060496967734, 'borough_neighborhood_Manhattan Midtown'),
 (0.022254907540956752, 'room_type_Entire home/apt'),
 (0.01868654486161753, 'num_neighborhood'),
 (0.01373133908134987, 'room_type_Shared room'),
 (0.008133272442674741, 'borough_neighborhood_Bronx Longwood'),
 (0.00750049660410331, 'borough_neighborhood_Manhattan Lower East Side'),
 (0.007417377662151405, 'borough_neighborhood_Manhattan Financial District'),
 (0.007147015351696248, 'borough_neighborhood_Manhattan West Village'),
 (0.006585869982688554, 'borough_neighborhood_Manhattan Tribeca'),
 (0.006119927083417688, 'borough_neighborhood_Manhattan Upper East Side'),
 (0.00577381707

### Run the experiment again with some validation 

In [15]:
# Split into training and testing sets
X_train, X_test, y_train, y_test=train_test_split(X, y)

In [17]:
# Create and train the model
rf_valid=RandomForestRegressor()
rf_valid.fit(X_train, y_train)

RandomForestRegressor()

In [18]:
# Generate the scores
print(f'Train R2 Score: {rf_valid.score(X_train, y_train)}')
print(f'Test R2 Score: {rf_valid.score(X_test, y_test)}')

Train R2 Score: 0.8814627387382364
Test R2 Score: 0.22219212177459513


In [19]:
# Sort the features and their importance
sorted(list(zip(rf.feature_importances_, X.columns)), reverse=True)

[(0.23943284617798002, 'days_since_last_review'),
 (0.23693279837388861, 'availability_365'),
 (0.12038652708516102, 'number_of_reviews'),
 (0.07692373214935036, 'minimum_nights'),
 (0.06953810407827861, 'num_listings'),
 (0.03160987146549974, 'borough_neighborhood_Manhattan East Harlem'),
 (0.02766481256085655, 'room_type_Private room'),
 (0.023163060496967734, 'borough_neighborhood_Manhattan Midtown'),
 (0.022254907540956752, 'room_type_Entire home/apt'),
 (0.01868654486161753, 'num_neighborhood'),
 (0.01373133908134987, 'room_type_Shared room'),
 (0.008133272442674741, 'borough_neighborhood_Bronx Longwood'),
 (0.00750049660410331, 'borough_neighborhood_Manhattan Lower East Side'),
 (0.007417377662151405, 'borough_neighborhood_Manhattan Financial District'),
 (0.007147015351696248, 'borough_neighborhood_Manhattan West Village'),
 (0.006585869982688554, 'borough_neighborhood_Manhattan Tribeca'),
 (0.006119927083417688, 'borough_neighborhood_Manhattan Upper East Side'),
 (0.00577381707

### Run the experiment again with some scaling 

In [20]:
# Split into training and testing sets
X_train, X_test, y_train, y_test=train_test_split(X, y)

In [21]:
# Scale the data
scaler=StandardScaler()
scaler.fit(X_train)
X_train_scaled=scaler.transform(X_train)
X_test_scaled=scaler.transform(X_test)

In [22]:
# Create and train the model
rf_valid=RandomForestRegressor()
rf_valid.fit(X_train_scaled, y_train)

RandomForestRegressor()

In [23]:
# Generate the scores
print(f'Train R2 Score: {rf_valid.score(X_train_scaled, y_train)}')
print(f'Test R2 Score: {rf_valid.score(X_test_scaled, y_test)}')

Train R2 Score: 0.8878291970226384
Test R2 Score: 0.12810349754216255


In [24]:
# Sort the features and their importance
sorted(list(zip(rf.feature_importances_, X.columns)), reverse=True)

[(0.23943284617798002, 'days_since_last_review'),
 (0.23693279837388861, 'availability_365'),
 (0.12038652708516102, 'number_of_reviews'),
 (0.07692373214935036, 'minimum_nights'),
 (0.06953810407827861, 'num_listings'),
 (0.03160987146549974, 'borough_neighborhood_Manhattan East Harlem'),
 (0.02766481256085655, 'room_type_Private room'),
 (0.023163060496967734, 'borough_neighborhood_Manhattan Midtown'),
 (0.022254907540956752, 'room_type_Entire home/apt'),
 (0.01868654486161753, 'num_neighborhood'),
 (0.01373133908134987, 'room_type_Shared room'),
 (0.008133272442674741, 'borough_neighborhood_Bronx Longwood'),
 (0.00750049660410331, 'borough_neighborhood_Manhattan Lower East Side'),
 (0.007417377662151405, 'borough_neighborhood_Manhattan Financial District'),
 (0.007147015351696248, 'borough_neighborhood_Manhattan West Village'),
 (0.006585869982688554, 'borough_neighborhood_Manhattan Tribeca'),
 (0.006119927083417688, 'borough_neighborhood_Manhattan Upper East Side'),
 (0.00577381707