In [6]:
# Import dependencies
import pandas as pd
import numpy as np
import psycopg2
import config as creds
import matplotlib.pyplot as plt
from sklearn.preprocessing import LabelEncoder
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.inspection import permutation_importance

In [7]:
# Import data from AWS databse

def connect():
    
    # Set up a connection to the postgres server.
    conn_string = "host="+ creds.PGHOST +" port="+ "5432" +" dbname="+ creds.PGDATABASE +" user=" + creds.PGUSER \
                  +" password="+ creds.PGPASSWORD
    
    conn = psycopg2.connect(conn_string)
    print("Connected!")

    # Create a cursor object
    cursor = conn.cursor()
    
    return conn, cursor

conn, cursor=connect()

Connected!


In [8]:
# Read in the data from the database
listings_df = pd.read_sql_query("select * from nyc_listings", conn)
listings_df

Unnamed: 0,id,name,host_id,host_name,borough,neighborhood,latitude,longitude,room_type,price,minimum_nights,number_of_reviews,availability_365,days_since_last_review
0,77765,Superior @ Box House,417504,The Box House Hotel,Brooklyn,Greenpoint,40.737770,-73.953660,Hotel room,308,2,42,217,51
1,2539,Clean & quiet apt home by the park,2787,John,Brooklyn,Kensington,40.645290,-73.972380,Private room,299,30,9,356,1419
2,45910,Beautiful Queens Brownstone! - 5BR,204539,Mark,Queens,Ridgewood,40.703090,-73.899630,Entire home/apt,425,30,13,365,1030
3,45936,Couldn't Be Closer To Columbia Uni,867225,Rahul,Manhattan,Morningside Heights,40.806300,-73.959850,Private room,75,31,135,219,58
4,80493,Cozy room in East Village with AC,434987,Jennifer,Manhattan,East Village,40.723220,-73.986150,Private room,55,2,207,132,25
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
31500,628769808856889664,Two bedroom apartment in Hoboken New Jersey.,14468718,Burak,Manhattan,West Village,40.740227,-74.027504,Entire home/apt,410,3,2,0,40
31501,27577588,Luxury Studio ON Grove Street E0C - B1CA,37412692,Kim,Manhattan,Ellis Island,40.718220,-74.037940,Entire home/apt,135,365,2,365,1087
31502,654151117629853651,Lovely 3- bedroom apartment,117540494,Miriam,Queens,Rosedale,40.647244,-73.720088,Entire home/apt,180,1,5,0,14
31503,553754115911961053,Trendy 3-bedroom apartment near Manhattan,15048320,India,Manhattan,Upper West Side,40.787320,-74.004470,Entire home/apt,240,5,18,152,16


In [9]:
listings_df=listings_df.drop(columns=['id', 'name', 'host_name', 'latitude', 'longitude'])

In [10]:
listings_df['borough_neighborhood']=listings_df['borough']+' '+listings_df['neighborhood']
listings_df.head()

Unnamed: 0,host_id,borough,neighborhood,room_type,price,minimum_nights,number_of_reviews,availability_365,days_since_last_review,borough_neighborhood
0,417504,Brooklyn,Greenpoint,Hotel room,308,2,42,217,51,Brooklyn Greenpoint
1,2787,Brooklyn,Kensington,Private room,299,30,9,356,1419,Brooklyn Kensington
2,204539,Queens,Ridgewood,Entire home/apt,425,30,13,365,1030,Queens Ridgewood
3,867225,Manhattan,Morningside Heights,Private room,75,31,135,219,58,Manhattan Morningside Heights
4,434987,Manhattan,East Village,Private room,55,2,207,132,25,Manhattan East Village


In [18]:
num_listings=listings_df.groupby('host_id').size()
num_listings.name='num_listings'

In [20]:
num_neighborhood=listings_df.groupby('borough_neighborhood').size()
num_neighborhood.name='num_neighborhood'

In [22]:
listings_df=listings_df.join(num_listings, on='host_id', how='outer').join(num_neighborhood, on='borough_neighborhood', how='outer')
listings_df.head()

Unnamed: 0,host_id,borough,neighborhood,room_type,price,minimum_nights,number_of_reviews,availability_365,days_since_last_review,borough_neighborhood,num_listings,num_neighborhood
0,417504,Brooklyn,Greenpoint,Hotel room,308,2,42,217,51,Brooklyn Greenpoint,30,573
10,417504,Brooklyn,Greenpoint,Hotel room,529,2,10,100,298,Brooklyn Greenpoint,30,573
11,417504,Brooklyn,Greenpoint,Private room,372,2,9,222,2,Brooklyn Greenpoint,30,573
13,417504,Brooklyn,Greenpoint,Private room,372,2,2,222,4136,Brooklyn Greenpoint,30,573
22,417504,Brooklyn,Greenpoint,Hotel room,308,2,22,221,88,Brooklyn Greenpoint,30,573


In [23]:
listings_df=listings_df.drop(columns=['host_id', 'borough', 'neighborhood'])
listings_df.head()

Unnamed: 0,room_type,price,minimum_nights,number_of_reviews,availability_365,days_since_last_review,borough_neighborhood,num_listings,num_neighborhood
0,Hotel room,308,2,42,217,51,Brooklyn Greenpoint,30,573
10,Hotel room,529,2,10,100,298,Brooklyn Greenpoint,30,573
11,Private room,372,2,9,222,2,Brooklyn Greenpoint,30,573
13,Private room,372,2,2,222,4136,Brooklyn Greenpoint,30,573
22,Hotel room,308,2,22,221,88,Brooklyn Greenpoint,30,573


In [28]:
cat_df=pd.get_dummies(listings_df, columns=['room_type', 'borough_neighborhood'])
cat_df.head()

Unnamed: 0,price,minimum_nights,number_of_reviews,availability_365,days_since_last_review,num_listings,num_neighborhood,room_type_Entire home/apt,room_type_Hotel room,room_type_Private room,...,borough_neighborhood_Staten Island South Beach,borough_neighborhood_Staten Island St. George,borough_neighborhood_Staten Island Stapleton,borough_neighborhood_Staten Island Todt Hill,borough_neighborhood_Staten Island Tompkinsville,borough_neighborhood_Staten Island Tottenville,borough_neighborhood_Staten Island West Brighton,borough_neighborhood_Staten Island Westerleigh,borough_neighborhood_Staten Island Willowbrook,borough_neighborhood_Staten Island Woodrow
0,308,2,42,217,51,30,573,0,1,0,...,0,0,0,0,0,0,0,0,0,0
10,529,2,10,100,298,30,573,0,1,0,...,0,0,0,0,0,0,0,0,0,0
11,372,2,9,222,2,30,573,0,0,1,...,0,0,0,0,0,0,0,0,0,0
13,372,2,2,222,4136,30,573,0,0,1,...,0,0,0,0,0,0,0,0,0,0
22,308,2,22,221,88,30,573,0,1,0,...,0,0,0,0,0,0,0,0,0,0


In [54]:
X=cat_df.drop(columns='price')
y=cat_df['price']

In [31]:
rf=RandomForestRegressor()

In [32]:
rf.fit(X, y)

  """Entry point for launching an IPython kernel.


RandomForestRegressor()

In [46]:
rf.score(X, y)

0.874696522364114

In [47]:
sorted(list(zip(rf.feature_importances_, X.columns)), reverse=True)

[(0.24948137828501626, 'days_since_last_review'),
 (0.22272091360919782, 'availability_365'),
 (0.11598679616518762, 'number_of_reviews'),
 (0.08289146736035088, 'minimum_nights'),
 (0.07238321336763583, 'num_listings'),
 (0.03355004796926131, 'borough_neighborhood_Manhattan East Harlem'),
 (0.025864932302527885, 'room_type_Private room'),
 (0.02291730369159598, 'room_type_Entire home/apt'),
 (0.02134009141377597, 'borough_neighborhood_Manhattan Midtown'),
 (0.020532435514123395, 'num_neighborhood'),
 (0.010936555361332772, 'room_type_Shared room'),
 (0.010040869748795919, 'borough_neighborhood_Manhattan Lower East Side'),
 (0.008004807389148128, 'borough_neighborhood_Manhattan West Village'),
 (0.007218898558009118, 'borough_neighborhood_Bronx Longwood'),
 (0.006835692717616953, 'borough_neighborhood_Manhattan Financial District'),
 (0.006316165642422554, 'borough_neighborhood_Manhattan Upper West Side'),
 (0.005970515190499674, 'borough_neighborhood_Manhattan Tribeca'),
 (0.005331772

In [48]:
rf_valid=RandomForestRegressor()

In [53]:
from sklearn.preprocessing import StandardScaler

In [55]:
X_train, X_test, y_train, y_test=train_test_split(X, y)

In [56]:
scaler=StandardScaler()
scaler.fit(X_train)
X_train_scaled=scaler.transform(X_train)
X_test_scaled=scaler.transform(X_test)

In [58]:
rf_valid.fit(X_train_scaled, y_train)

RandomForestRegressor()

In [59]:
print(f'Train R2 Score: {rf_valid.score(X_train_scaled, y_train)}')
print(f'Test R2 Score: {rf_valid.score(X_test_scaled, y_test)}')

Train R2 Score: 0.8750190809938987
Test R2 Score: 0.16306812572348783


In [52]:
sorted(list(zip(rf.feature_importances_, X.columns)), reverse=True)

[(0.24948137828501626, 'days_since_last_review'),
 (0.22272091360919782, 'availability_365'),
 (0.11598679616518762, 'number_of_reviews'),
 (0.08289146736035088, 'minimum_nights'),
 (0.07238321336763583, 'num_listings'),
 (0.03355004796926131, 'borough_neighborhood_Manhattan East Harlem'),
 (0.025864932302527885, 'room_type_Private room'),
 (0.02291730369159598, 'room_type_Entire home/apt'),
 (0.02134009141377597, 'borough_neighborhood_Manhattan Midtown'),
 (0.020532435514123395, 'num_neighborhood'),
 (0.010936555361332772, 'room_type_Shared room'),
 (0.010040869748795919, 'borough_neighborhood_Manhattan Lower East Side'),
 (0.008004807389148128, 'borough_neighborhood_Manhattan West Village'),
 (0.007218898558009118, 'borough_neighborhood_Bronx Longwood'),
 (0.006835692717616953, 'borough_neighborhood_Manhattan Financial District'),
 (0.006316165642422554, 'borough_neighborhood_Manhattan Upper West Side'),
 (0.005970515190499674, 'borough_neighborhood_Manhattan Tribeca'),
 (0.005331772