In [1]:
import os
import pandas as pd
import numpy as np
import math 
import matplotlib.pyplot as plt 
from sklearn.linear_model import Lasso
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from datetime import datetime

In [2]:
df = pd.read_csv(os.path.join("..", "nycairbnbtrain.csv"))
df_test = pd.read_csv(os.path.join("..", "nycairbnbtest.csv"))
df.head()

Unnamed: 0,id,name,host_id,host_name,neighbourhood_group,neighbourhood,latitude,longitude,room_type,price,minimum_nights,number_of_reviews,last_review,reviews_per_month,calculated_host_listings_count,availability_365
0,9901706,Cute big one bedroom,1904415,Natalie,Manhattan,Upper West Side,40.77789,-73.97701,Entire home/apt,180,1,0,,,1,0
1,299531,Feel like you never leave your home,1220404,Tom,Brooklyn,East New York,40.66795,-73.89232,Entire home/apt,100,1,119,6/30/2019,1.39,2,289
2,2461439,Pristine Lower East Side Sanctuary,12586492,Sausan,Manhattan,Lower East Side,40.72007,-73.98946,Entire home/apt,133,14,177,5/3/2019,2.82,2,221
3,127387,"Luxe, Spacious 2BR 2BA Nr Trains",23276,Katharine,Brooklyn,Gowanus,40.66862,-73.9926,Entire home/apt,260,30,3,8/4/2014,0.03,1,316
4,629315,1BD brownstone apt in Fort Greene!,2397437,Lauren,Brooklyn,Fort Greene,40.68935,-73.9695,Entire home/apt,120,3,22,10/28/2015,0.27,1,189


In [3]:
def datetime_to_float(d):
    epoch = datetime.utcfromtimestamp(0)
    total_seconds =  (d - epoch).total_seconds()
    # total_seconds will be in decimals (millisecond precision)
    return total_seconds

def float_to_datetime(fl):
    return datetime.fromtimestamp(fl)

In [4]:
# Modifying columns to be more machine readible

def makeAdjustments(df):
    
    # Dropping airbnbs without reviews
    df = df.dropna()
    
    # name --> length of name
    df['name'] =  [str(x) for x in df['name']]
    df['name_len'] = df['name'].apply(lambda x: len(x))
    
    # host name --> length of host name
    df['host_name'] =  [str(x) for x in df['name']]
    df['host_name_len'] = df['host_name'].apply(lambda x: len(x))
        
    df.drop(['name', 'host_name'], axis=1, inplace=True)
    
    # Assigning dummies to borough (neighborhood group), neighborhood, room type
    cat_variables = df[['neighbourhood_group', 'neighbourhood', 'room_type']]
    cat_dummies = pd.get_dummies(cat_variables)
    df.drop(['neighbourhood_group', 'neighbourhood', 'room_type'], axis=1, inplace=True)
    df = pd.concat([df, cat_dummies], axis=1)
    
    # Changing last review to datetime then float
    df['last_review'] =  pd.to_datetime(df['last_review'])
    
    df['last_review'] =  pd.to_datetime(df['last_review'])
    last_review_fl = []
    for i in df['last_review']:
        last_review_fl.append(datetime_to_float(i))
    df['last_review_fl'] = last_review_fl
    
    df.drop(['last_review'], axis=1, inplace=True)
    
    return df

In [None]:
df = makeAdjustments(df)

y = df['price']
data = df.drop("price", axis=1)
feature_names = data.columns
X = df[feature_names]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=1)

In [9]:
# to see which variables provide highest coefficients
clf = Lasso(alpha=0.1)
clf.fit(X_train, y_train)
coefficients = clf.coef_
variables = X_train.columns

coef = pd.DataFrame()
coef['Variables'] = variables
coef['Coefficients'] = coefficients
coef = coef.sort_values('Coefficients', ascending = False)
coef.head(20)

Unnamed: 0,Variables,Coefficients
208,neighbourhood_Tribeca,280.148662
230,room_type_Entire home/apt,98.928166
56,neighbourhood_Clinton Hill,77.807324
13,neighbourhood_group_Manhattan,74.798685
189,neighbourhood_SoHo,73.587977
141,neighbourhood_Midtown,65.087439
106,neighbourhood_Greenwich Village,53.269715
90,neighbourhood_Flatiron District,48.329247
163,neighbourhood_Park Slope,32.550061
40,neighbourhood_Brooklyn Heights,31.657586


In [None]:
# ===============================================================================================================================================================================
# https://towardsdatascience.com/ridge-and-lasso-regression-a-complete-guide-with-python-scikit-learn-e20e34bcbf0b

lasso = Lasso()
lasso.fit(X_train,y_train)
train_score=lasso.score(X_train,y_train)
test_score=lasso.score(X_test,y_test)
coeff_used = np.sum(lasso.coef_!=0)
print ("training score:", train_score)
print ("test score: ", test_score)
print ("number of features used: ", coeff_used)
lasso001 = Lasso(alpha=0.01, max_iter=10e5)
lasso001.fit(X_train,y_train)
train_score001=lasso001.score(X_train,y_train)
test_score001=lasso001.score(X_test,y_test)
coeff_used001 = np.sum(lasso001.coef_!=0)
print ("training score for alpha=0.01:", train_score001)
print ("test score for alpha =0.01: ", test_score001)
print ("number of features used: for alpha =0.01:", coeff_used001)
lasso00001 = Lasso(alpha=0.0001, max_iter=10e5)
lasso00001.fit(X_train,y_train)
train_score00001=lasso00001.score(X_train,y_train)
test_score00001=lasso00001.score(X_test,y_test)
coeff_used00001 = np.sum(lasso00001.coef_!=0)
print ("training score for alpha=0.0001:", train_score00001)
print ("test score for alpha =0.0001: ", test_score00001)
print ("number of features used: for alpha =0.0001:", coeff_used00001)
lr = LinearRegression()
lr.fit(X_train,y_train)
lr_train_score=lr.score(X_train,y_train)
lr_test_score=lr.score(X_test,y_test)
print ("LR training score:", lr_train_score)
print ("LR test score: ", lr_test_score)
plt.subplot(1,2,1)
plt.plot(lasso.coef_,alpha=0.7,linestyle='none',marker='*',markersize=5,color='red',label=r'Lasso; $\alpha = 1$',zorder=7) # alpha here is for transparency
plt.plot(lasso001.coef_,alpha=0.5,linestyle='none',marker='d',markersize=6,color='blue',label=r'Lasso; $\alpha = 0.01$') # alpha here is for transparency

plt.xlabel('Coefficient Index',fontsize=16)
plt.ylabel('Coefficient Magnitude',fontsize=16)
plt.legend(fontsize=13,loc=4)
plt.subplot(1,2,2)
plt.plot(lasso.coef_,alpha=0.7,linestyle='none',marker='*',markersize=5,color='red',label=r'Lasso; $\alpha = 1$',zorder=7) # alpha here is for transparency
plt.plot(lasso001.coef_,alpha=0.5,linestyle='none',marker='d',markersize=6,color='blue',label=r'Lasso; $\alpha = 0.01$') # alpha here is for transparency
plt.plot(lasso00001.coef_,alpha=0.8,linestyle='none',marker='v',markersize=6,color='black',label=r'Lasso; $\alpha = 0.00001$') # alpha here is for transparency
plt.plot(lr.coef_,alpha=0.7,linestyle='none',marker='o',markersize=5,color='green',label='Linear Regression',zorder=2)
plt.xlabel('Coefficient Index',fontsize=16)
plt.ylabel('Coefficient Magnitude',fontsize=16)
plt.legend(fontsize=13,loc=4)
plt.tight_layout()
plt.show()