# Airbnb New York City 2019
## Business Data Analytics, Quantitative Methods and Visualization - Exam

In [None]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsRegressor
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Ridge
from sklearn.linear_model import Lasso
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor


## Constructing the DataFrame

In [None]:
# Load the Airbnb NYC 2019 listings from the CSV file in the working directory
# and preview the first rows.
csv_path = "AB_NYC_2019.csv"
df = pd.read_csv(csv_path)
df.head()

In [None]:
# Print a summary of the loaded frame (columns, non-null counts, dtypes)
df.info()

In [None]:
# Checking that the attribute types are correct and useful
df.dtypes

In [None]:
# Count the missing (NaN) values in every column
df.isna().sum()

# As the output shows, there are 10052 NaN values

In [None]:
# reviews_per_month may contain NaN, which we cannot use,
# so the missing values are replaced with 0.
df["reviews_per_month"] = df["reviews_per_month"].fillna(0)

In [None]:
# Unnecessary columns that are removed before modelling:
#   id, host_id         - identifiers, not used as features
#   name, host_name     - do not affect the prices
#   latitude, longitude - not necessary because we already have neighbourhood_group
#   last_review         - does not affect the prices
unused_columns = ['id', 'host_id', 'name', 'host_name', 'latitude', 'longitude', 'last_review']
df = df.drop(columns=unused_columns)

# Check that they are dropped; the summary also shows how many rows and
# columns are left.
df.info()

In [None]:
# Show the rows whose price is above 1000 (the row count is part of the output)
df.loc[df.price > 1000]

In [None]:
# Show the rows whose price is above 500 (the row count is part of the output)
df.loc[df.price > 500]

In [None]:
# Show the rows whose price is above 300 (the row count is part of the output)
df.loc[df.price > 300]

In [None]:
# Show the rows whose price is above 200 (the row count is part of the output)
df.loc[df.price > 200]

In [None]:
# Only use rows whose price is strictly below 300.
# The explicit copy makes df an independent frame instead of a slice of the
# unfiltered data, so the in-place edit made further down (capping
# minimum_nights) does not raise SettingWithCopyWarning.
df = df[df["price"] < 300].copy()

In [None]:
# Summary statistics of minimum_nights, used to spot extreme values
df.minimum_nights.describe()

In [None]:
# Extreme values such as 1250 minimum_nights are likely to influence our
# model, so list the values above 25.
df.loc[df.minimum_nights > 25, "minimum_nights"]

In [None]:
# List the minimum_nights values above 30
df.loc[df.minimum_nights > 30, "minimum_nights"]

In [None]:
# List the minimum_nights values above 35
df.loc[df.minimum_nights > 35, "minimum_nights"]

In [None]:
# Cap minimum_nights: every value above 35 is set to 35.
exceeds_cap = df["minimum_nights"] > 35
df.loc[exceeds_cap, "minimum_nights"] = 35

In [None]:
# List the unique neighbourhoods together with their number of rows
df.neighbourhood.value_counts()

In [None]:
# (rows, columns) of the frame after the filtering and capping above
df.shape

## Heatmap

In [None]:
# Frame used for the correlation heatmap. neighbourhood is dropped on purpose
# (it has far too many distinct values to plot) and the remaining string
# columns are one-hot encoded, because DataFrame.corr() can only correlate
# numeric columns: without the encoding the categorical features are missing
# from the heatmap, and pandas >= 2.0 raises on the string columns instead.
df_test = pd.get_dummies( df.drop( ['neighbourhood'], axis = 1 ) )

# Creating dummies - converts the strings within the DataFrame to indicator
# columns that can be used for our models
df      = pd.get_dummies( df )

In [None]:
# Correlation heatmap of df_test, the frame prepared without neighbourhood.
# neighbourhood was left out on purpose because it contains way too many
# distinct values to show.
fig, ax = plt.subplots(figsize=(20, 10))
sns.color_palette("husl", 8)
corr_matrix = df_test.corr()
sns.heatmap(corr_matrix, annot=True, linewidths=.5, ax=ax)

In [None]:
# The price column is our target; every other column is a feature
y = df['price']
X = df.drop(columns='price')

# Split the dataset into a training and a test set (fixed seed so the split
# is the same on every run)
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)

In [None]:
# (rows, columns) of the frame after the dummy columns were created
df.shape

## Machine learning models

### KNeighborsRegressor

In [None]:
# Baseline: k-nearest-neighbours regressor with the default settings
knreg = KNeighborsRegressor()
knreg.fit(X_train, y_train)

knn_train_score = knreg.score(X_train, y_train)
knn_test_score = knreg.score(X_test, y_test)
print("Training set score: {:.3f}".format(knn_train_score))
print("Test set score: {:.3f}".format(knn_test_score))

### KNeighborsRegressor (tuned)

In [None]:
# Grid search over the number of neighbours and the weighting scheme.
neighbours      = [85, 90, 95]
weight_options  = ['uniform', 'distance']
accuracy_output = []
# These two lists collect the best training/test score of every tuned model,
# in notebook order, for the comparison chart at the end of the notebook.
training_scores = []
test_scores     = []

for neighbour in neighbours:
    for weight in weight_options:
        # The weighting scheme has to be passed on to the estimator; otherwise
        # every n_neighbors value is simply fitted twice with uniform weights
        # and reported under two different labels.
        knreg = KNeighborsRegressor( n_neighbors = neighbour, weights = weight )
        knreg.fit( X_train, y_train )
        training = knreg.score( X_train, y_train )
        tests    = knreg.score( X_test, y_test )
        accuracy_output.append( (neighbour, weight, tests, training) )

# Report every combination, best test score first.
# NOTE(review): the winner is picked by its score on the test set, so the
# reported test score is an optimistic estimate.
for idx, key in enumerate( sorted( accuracy_output, reverse = True, key = lambda e:e[2] ) ):
    print(f"Neighbours {key[0]}, weight used {key[1]}, test accuracy: {key[2]}, train accuracy: {key[3]} \n")

    # Only the best combination is kept for the comparison chart
    if idx == 0:
        training_scores.append( key[3] )
        test_scores.append( key[2] )

### LinearRegression

In [None]:
# Plain linear regression (it has no hyper-parameters that are tuned here)
lr = LinearRegression()
lr.fit(X_train, y_train)

training_score = lr.score(X_train, y_train)
test_score = lr.score(X_test, y_test)
print("Training set score: {:.2f}".format(training_score))
print("Test set score: {:.2f}".format(test_score))
# The model might be underfitted, or the data simply does not allow a better score

### Ridge

In [None]:
# Baseline: ridge regression with the default settings
ridge = Ridge()
ridge.fit(X_train, y_train)

ridge_train_score = ridge.score(X_train, y_train)
ridge_test_score = ridge.score(X_test, y_test)
print("Training set score: {:.3f}".format(ridge_train_score))
print("Test set score: {:.3f}".format(ridge_test_score))
# The model might be underfitted, or the data simply does not allow a better score

### Ridge (tuned)

In [None]:
# Different inputs that were tried before settling on the grid below:
#   fit_int   = [True, False]
#   normalize = [True, False]
#   alpha     = [0.1, 1, 10, 100]
#   alpha     = [1, 5, 10, 15]
alpha = [1, 5, 6, 7, 8, 9, 10, 11, 12]
accuracy_output = []

# Fit one ridge model per alpha and record (alpha, test score, training score)
for a in alpha:
    ridge = Ridge(alpha=a)
    ridge.fit(X_train, y_train)
    train_result = ridge.score(X_train, y_train)
    test_result = ridge.score(X_test, y_test)
    accuracy_output.append((a, test_result, train_result))

# Report all candidates, best test score first
ranked = sorted(accuracy_output, key=lambda e: e[1], reverse=True)
for key in ranked:
    print(f"alpha: {key[0]}, test accuracy: {key[1]}, train accuracy: {key[2]} \n")

# Only the best candidate is kept for the comparison chart
best = ranked[0]
training_scores.append(best[2])
test_scores.append(best[1])

### Lasso

In [None]:
# Baseline: lasso regression with the default settings
lasso = Lasso()
lasso.fit(X_train, y_train)

lasso_train_score = lasso.score(X_train, y_train)
lasso_test_score = lasso.score(X_test, y_test)
print("Training set score: {:.3f}".format(lasso_train_score))
print("Test set score: {:.3f}".format(lasso_test_score))
# np.sum(lasso.coef_ != 0) gives the number of features that are used.
# The model might be underfitted.

### Lasso (tuned)

In [None]:
# Inputs that were tried before settling on the grid below:
#   alpha    = [0.0001, 0.001, 0.01, 0.1, 1.0]
#   max_iter = [1000, 2000, 3000, 4000]
alpha = [0.0001, 0.001]
accuracy_output = []

# Fit one lasso model per alpha and record (alpha, test score, training score)
for a in alpha:
    lasso = Lasso(alpha=a, max_iter=50000)
    lasso.fit(X_train, y_train)
    train_result = lasso.score(X_train, y_train)
    test_result = lasso.score(X_test, y_test)
    accuracy_output.append((a, test_result, train_result))

# Report all candidates, best test score first
ranked = sorted(accuracy_output, key=lambda e: e[1], reverse=True)
for key in ranked:
    print(f"alpha: {key[0]}, test accuracy: {key[1]}, train accuracy: {key[2]} \n")

# Only the best candidate is kept for the comparison chart
best = ranked[0]
training_scores.append(best[2])
test_scores.append(best[1])

### DecisionTreeRegressor

In [None]:
# Baseline: decision tree with the default settings
tree = DecisionTreeRegressor()
tree.fit(X_train, y_train)

tree_train_score = tree.score(X_train, y_train)
tree_test_score = tree.score(X_test, y_test)
print("Training set score: {:.3f}".format(tree_train_score))
print("Test set score: {:.3f}".format(tree_test_score))
# The model is overfitted

### DecisionTreeRegressor (tuned)

In [None]:
# Grid search over the tree depth, the minimum number of samples needed to
# split a node and the number of features considered per split.
max_depth         = [9, 10, 11, 12, 13, 14]
min_samples_split = [10, 200, 225, 250, 275, 300]
# None considers every feature at each split. It replaces 'auto', which meant
# the same for regression trees but is rejected by scikit-learn >= 1.3.
max_features      = [None, 'sqrt', 'log2']
accuracy_output   = []

for md in max_depth:
    for mss in min_samples_split:
        for mx in max_features:
            # max_features has to be passed on to the estimator; otherwise the
            # three settings only refit the same tree under different labels.
            # The fixed random_state keeps the feature sub-sampling of
            # 'sqrt' / 'log2' reproducible between runs.
            tree = DecisionTreeRegressor( max_depth = md, min_samples_split = mss, max_features = mx, random_state = 0 )
            tree.fit( X_train, y_train )
            training = tree.score( X_train, y_train )
            tests    = tree.score( X_test, y_test )
            accuracy_output.append( (md, mss, mx, tests, training) )

# Report every combination, best test score first.
# NOTE(review): the winner is picked by its score on the test set, so the
# reported test score is an optimistic estimate.
for idx, key in enumerate( sorted( accuracy_output, reverse = True, key = lambda e:e[3] ) ):
    print(f"max depth: {key[0]}, min samples split: {key[1]}, max features {key[2]}, test accuracy: {key[3]}, train accuracy: {key[4]} \n")

    # Only the best combination is kept for the comparison chart
    if idx == 0:
        training_scores.append( key[4] )
        test_scores.append( key[3] )

### RandomForestRegressor

In [None]:
# Baseline: random forest with the default settings
rfr = RandomForestRegressor()
rfr.fit(X_train, y_train)

rfr_train_score = rfr.score(X_train, y_train)
rfr_test_score = rfr.score(X_test, y_test)
print("Training set score: {:.3f}".format(rfr_train_score))
print("Test set score: {:.3f}".format(rfr_test_score))
# The model might be overfitted

### RandomForestRegressor (tuning tests)

In [None]:
# Exploratory grid search for RandomForestRegressor. It is wrapped in a string
# literal, so none of it is executed; it is kept only as a record of the
# parameter ranges that were tried.
# NOTE(review): if this is ever re-enabled, `mx` is looped over but never
# passed to the estimator as max_features, so that dimension has no effect.
"""
max_depth         = [10, 50, 100]
n_estimators      = [300, 400, 500]
min_samples_split = [10, 200, 300]
max_features      = ['auto', 'sqrt', 'log2']
accuracy_output   = []

for md in max_depth:
    for ne in n_estimators:
        for mss in min_samples_split:
            for mx in max_features:        
                rfr = RandomForestRegressor( max_depth = md, n_estimators = ne, min_samples_split = mss ) 
                rfr.fit( X_train, y_train )
                training = rfr.score( X_train, y_train )
                tests = rfr.score( X_test, y_test )
                accuracy_output.append( (md, ne, mss, mx, tests, training) )
for key in sorted ( accuracy_output, reverse = True, key = lambda e:e[4] ):
    print(f"max depth: {key[0]}, n_estimators: {key[1]} min samples split: {key[2]}, max features {key[3]}, test accuracy: {key[4]}, train accuracy: {key[5]} \n")
"""

### RandomForestRegressor (tuned)

In [None]:
# Optimal tuning found through manual trial and error
rfr = RandomForestRegressor( max_depth = 100, n_estimators = 550, min_samples_split = 50 ) 
rfr.fit( X_train, y_train )

train_score = rfr.score( X_train, y_train )
test_score  = rfr.score( X_test, y_test )

print( "Training set score: {:.3f}".format( train_score ) )
print( "Test set score: {:.3f}".format( test_score ) )

training_scores.append( train_score )
test_scores.append( test_score )

### RandomForestRegressor Feature Importances

In [None]:
# Horizontal bar chart of the random forest's feature importances.
# The labels must come from the frame the forest was fitted on: rfr was trained
# on X_train, so feature_importances_ has one entry per X_train column, in the
# same order. (df_test has a different column set and still contains price, so
# its column index cannot be addressed with these positions.)
features    = X_train.columns
importances = rfr.feature_importances_
# Ascending order, so the most important feature ends up at the top of the chart
indices     = np.argsort( importances )

plt.subplots( figsize = (20, 20) )
plt.title( 'Feature Importances' )
plt.barh( range( len( indices ) ), importances[indices], color = 'g', align = 'center' )
plt.yticks( range( len( indices ) ), features[indices] )
plt.xlabel( 'Importance' )
plt.show()

## Test and training results visualized

In [None]:
# Grouped bar chart comparing the training and test score of the six models.
# training_scores / test_scores are expected to hold exactly one entry per
# model, in the same order as the labels below.
model_labels = ["KNeighborsRegressor", "LinearRegression", "Ridge", "Lasso", "DecisionTreeRegressor", "RandomForestRegressor"]
bar_width = 0.5
index = np.arange(6)

fig, ax = plt.subplots(figsize=(15, 10))

# The test bars are shifted by one bar width so they sit next to the training bars
training = ax.bar(index, training_scores, bar_width, label="Training accuracy")
testing = ax.bar(index + bar_width, test_scores, bar_width, label="Test accuracy")

ax.set_title('Comparing our models')
ax.set_xlabel('Models')
ax.set_ylabel('Accuracy')

# Place each tick between the two bars of its model
ax.set_xticks(index + bar_width / 2)
ax.set_xticklabels(model_labels)
ax.legend()

plt.show()