## ML Experiment 3

In [1]:
# Import dependencies
import pandas as pd
import numpy as np
import psycopg2
import config as creds
import matplotlib.pyplot as plt
from sklearn.preprocessing import LabelEncoder
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.inspection import permutation_importance

In [2]:
# Import data from AWS databse

def connect():
    
    # Set up a connection to the postgres server.
    conn_string = "host="+ creds.PGHOST +" port="+ "5432" +" dbname="+ creds.PGDATABASE +" user=" + creds.PGUSER \
                  +" password="+ creds.PGPASSWORD
    
    conn = psycopg2.connect(conn_string)
    print("Connected!")

    # Create a cursor object
    cursor = conn.cursor()
    
    return conn, cursor

conn, cursor=connect()

Connected!


In [3]:
# Read in the data from the database
listings_df = pd.read_sql_query("select * from nyc_listings", conn)
listings_df.head()

Unnamed: 0,id,name,host_id,host_name,borough,neighborhood,latitude,longitude,room_type,price,minimum_nights,number_of_reviews,availability_365,days_since_last_review
0,77765,Superior @ Box House,417504,The Box House Hotel,Brooklyn,Greenpoint,40.73777,-73.95366,Hotel room,308,2,42,217,51
1,2539,Clean & quiet apt home by the park,2787,John,Brooklyn,Kensington,40.64529,-73.97238,Private room,299,30,9,356,1419
2,45910,Beautiful Queens Brownstone! - 5BR,204539,Mark,Queens,Ridgewood,40.70309,-73.89963,Entire home/apt,425,30,13,365,1030
3,45936,Couldn't Be Closer To Columbia Uni,867225,Rahul,Manhattan,Morningside Heights,40.8063,-73.95985,Private room,75,31,135,219,58
4,80493,Cozy room in East Village with AC,434987,Jennifer,Manhattan,East Village,40.72322,-73.98615,Private room,55,2,207,132,25


In [4]:
# Create a new feature with the average latitude and longitude
listings_df[['latitude', 'longitude']].mean()

latitude     40.727938
longitude   -73.944307
dtype: float64

In [5]:
# Drop unnecessary columns
listings_df=listings_df.drop(columns=['id', 'name', 'host_name', 'neighborhood', 'borough'])
listings_df.head()

Unnamed: 0,host_id,latitude,longitude,room_type,price,minimum_nights,number_of_reviews,availability_365,days_since_last_review
0,417504,40.73777,-73.95366,Hotel room,308,2,42,217,51
1,2787,40.64529,-73.97238,Private room,299,30,9,356,1419
2,204539,40.70309,-73.89963,Entire home/apt,425,30,13,365,1030
3,867225,40.8063,-73.95985,Private room,75,31,135,219,58
4,434987,40.72322,-73.98615,Private room,55,2,207,132,25


In [6]:
num_reviews_host=listings_df.groupby('host_id')['number_of_reviews'].sum()
num_reviews_host.name='num_reviews_host'

In [7]:
num_listings=listings_df.groupby('host_id').size()
num_listings.name='num_listings'

In [8]:
listings_df=listings_df.join(num_listings, on='host_id', how='outer').join(num_reviews_host, on='host_id', how='outer')
listings_df.head()

Unnamed: 0,host_id,latitude,longitude,room_type,price,minimum_nights,number_of_reviews,availability_365,days_since_last_review,num_listings,num_reviews_host
0,417504,40.73777,-73.95366,Hotel room,308,2,42,217,51,30,1366
10,417504,40.73756,-73.9535,Hotel room,529,2,10,100,298,30,1366
11,417504,40.73777,-73.95366,Private room,372,2,9,222,2,30,1366
13,417504,40.73777,-73.95366,Private room,372,2,2,222,4136,30,1366
22,417504,40.73777,-73.95366,Hotel room,308,2,22,221,88,30,1366


In [9]:
listings_df=listings_df.drop(columns=['host_id'])
listings_df.head()

Unnamed: 0,latitude,longitude,room_type,price,minimum_nights,number_of_reviews,availability_365,days_since_last_review,num_listings,num_reviews_host
0,40.73777,-73.95366,Hotel room,308,2,42,217,51,30,1366
10,40.73756,-73.9535,Hotel room,529,2,10,100,298,30,1366
11,40.73777,-73.95366,Private room,372,2,9,222,2,30,1366
13,40.73777,-73.95366,Private room,372,2,2,222,4136,30,1366
22,40.73777,-73.95366,Hotel room,308,2,22,221,88,30,1366


In [10]:
cat_df=pd.get_dummies(listings_df, columns=['room_type'])
cat_df.head()

Unnamed: 0,latitude,longitude,price,minimum_nights,number_of_reviews,availability_365,days_since_last_review,num_listings,num_reviews_host,room_type_Entire home/apt,room_type_Hotel room,room_type_Private room,room_type_Shared room
0,40.73777,-73.95366,308,2,42,217,51,30,1366,0,1,0,0
10,40.73756,-73.9535,529,2,10,100,298,30,1366,0,1,0,0
11,40.73777,-73.95366,372,2,9,222,2,30,1366,0,0,1,0
13,40.73777,-73.95366,372,2,2,222,4136,30,1366,0,0,1,0
22,40.73777,-73.95366,308,2,22,221,88,30,1366,0,1,0,0


In [11]:
avg_lat, avg_long=cat_df[['latitude', 'longitude']].mean().values

In [12]:
# Create new column of distance
cat_df['distance']=((cat_df['latitude']-avg_lat)**2+(cat_df['longitude']-avg_long)**2)**.5
cat_df

Unnamed: 0,latitude,longitude,price,minimum_nights,number_of_reviews,availability_365,days_since_last_review,num_listings,num_reviews_host,room_type_Entire home/apt,room_type_Hotel room,room_type_Private room,room_type_Shared room,distance
0,40.737770,-73.953660,308,2,42,217,51,30,1366,0,1,0,0,0.013570
10,40.737560,-73.953500,529,2,10,100,298,30,1366,0,1,0,0,0.013307
11,40.737770,-73.953660,372,2,9,222,2,30,1366,0,0,1,0,0.013570
13,40.737770,-73.953660,372,2,2,222,4136,30,1366,0,0,1,0,0.013570
22,40.737770,-73.953660,308,2,22,221,88,30,1366,0,1,0,0,0.013570
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
31499,40.655910,-73.998570,145,3,14,0,12,1,14,1,0,0,0,0.090181
31500,40.740227,-74.027504,410,3,2,0,40,1,2,1,0,0,0,0.084099
31502,40.647244,-73.720088,180,1,5,0,14,1,5,1,0,0,0,0.238298
31503,40.787320,-74.004470,240,5,18,152,16,1,18,1,0,0,0,0.084533


In [13]:
# Drop unnecessary columns
cat_df=cat_df.drop(columns=['latitude', 'longitude'])
cat_df

Unnamed: 0,price,minimum_nights,number_of_reviews,availability_365,days_since_last_review,num_listings,num_reviews_host,room_type_Entire home/apt,room_type_Hotel room,room_type_Private room,room_type_Shared room,distance
0,308,2,42,217,51,30,1366,0,1,0,0,0.013570
10,529,2,10,100,298,30,1366,0,1,0,0,0.013307
11,372,2,9,222,2,30,1366,0,0,1,0,0.013570
13,372,2,2,222,4136,30,1366,0,0,1,0,0.013570
22,308,2,22,221,88,30,1366,0,1,0,0,0.013570
...,...,...,...,...,...,...,...,...,...,...,...,...
31499,145,3,14,0,12,1,14,1,0,0,0,0.090181
31500,410,3,2,0,40,1,2,1,0,0,0,0.084099
31502,180,1,5,0,14,1,5,1,0,0,0,0.238298
31503,240,5,18,152,16,1,18,1,0,0,0,0.084533


In [14]:
# Format and assign variables
X=cat_df.drop(columns=['price'])
y=cat_df['price']

In [15]:
# Split into training and testing sets
X_train, X_test, y_train, y_test=train_test_split(X, y)

In [16]:
# Scale the data
scaler=StandardScaler()
scaler.fit(X_train)
X_train_scaled=scaler.transform(X_train)
X_test_scaled=scaler.transform(X_test)

In [17]:
# Create and train the model
rf_valid=RandomForestRegressor()
rf_valid.fit(X_train_scaled, y_train)

RandomForestRegressor()

In [18]:
# Generate the scores
print(f'Train R2 Score: {rf_valid.score(X_train_scaled, y_train)}')
print(f'Test R2 Score: {rf_valid.score(X_test_scaled, y_test)}')

Train R2 Score: 0.8605534917728701
Test R2 Score: 0.15982996888373813


In [19]:
# Sort the features and their importance
sorted(list(zip(rf_valid.feature_importances_, X.columns)), reverse=True)

[(0.268309974796581, 'distance'),
 (0.25895404297126207, 'days_since_last_review'),
 (0.11379869769881382, 'availability_365'),
 (0.11224307892401347, 'num_reviews_host'),
 (0.08084123332242471, 'number_of_reviews'),
 (0.06992185851943888, 'minimum_nights'),
 (0.03604344953649665, 'num_listings'),
 (0.03520907865316375, 'room_type_Private room'),
 (0.01442784153345944, 'room_type_Entire home/apt'),
 (0.00935135463262473, 'room_type_Shared room'),
 (0.000899389411721584, 'room_type_Hotel room')]

### Run a Model Price Prediction 

In [20]:
X_train.columns

Index(['minimum_nights', 'number_of_reviews', 'availability_365',
       'days_since_last_review', 'num_listings', 'num_reviews_host',
       'room_type_Entire home/apt', 'room_type_Hotel room',
       'room_type_Private room', 'room_type_Shared room', 'distance'],
      dtype='object')

In [21]:
user_input = {
'minimum_nights': 3, 
'number_of_reviews': 5, 
'availability_365': 30,
'days_since_last_review': 20, 
'num_listings': 1, 
'num_reviews_host': 10,
'room_type_Entire home/apt':1, 
'room_type_Hotel room': 0,
'room_type_Private room': 0,
'room_type_Shared room': 0,
'distance': .01
}

In [22]:
list(user_input.values())

[3, 5, 30, 20, 1, 10, 1, 0, 0, 0, 0.01]

In [23]:
scaled_user_input=scaler.transform([list(user_input.values())])
scaled_user_input

  "X does not have valid feature names, but"


array([[-0.47073077, -0.47151773, -0.74858577, -0.67863355, -0.28653664,
        -0.24357778,  0.84471284, -0.07054356, -0.81389295, -0.11473492,
        -1.34187358]])

In [24]:
rf_valid.predict(scaled_user_input)

array([317.89])