In [1]:
import pandas as pd
import numpy as np
import warnings

# Install a global "ignore" filter so no warnings are shown by later cells.
# NOTE(review): this is a blanket filter, so it also hides library warnings
# that may be informative — consider narrowing it to specific categories.
warnings.filterwarnings('ignore')

In [2]:
# Read the dataset from AB_NYC_2019.csv (path is relative to the working directory).
df = pd.read_csv(filepath_or_buffer='AB_NYC_2019.csv')

In [3]:
# Columns used in this notebook: `price` is later split off as the target,
# the remaining columns serve as features.
col_use = [
    'neighbourhood_group',
    'room_type',
    'latitude',
    'longitude',
    'price',
    'minimum_nights',
    'number_of_reviews',
    'reviews_per_month',
    'calculated_host_listings_count',
    'availability_365',
]
# Display the dtype of each selected column.
df.dtypes.loc[col_use]

neighbourhood_group                object
room_type                          object
latitude                          float64
longitude                         float64
price                               int64
minimum_nights                      int64
number_of_reviews                   int64
reviews_per_month                 float64
calculated_host_listings_count      int64
availability_365                    int64
dtype: object

In [4]:
# Partition the selected columns by dtype: `object` columns are treated as
# categorical, `float64` / `int64` columns as numerical.
selected_dtypes = df[col_use].dtypes
categorical = [name for name, kind in selected_dtypes.items() if kind == 'object']
numerical = [name for name, kind in selected_dtypes.items() if kind in ('float64', 'int64')]
# `price` is the target, so it must not appear among the numerical features.
numerical.remove('price')

In [5]:
# Question 1: frequency of each `neighbourhood_group` value.
df.neighbourhood_group.value_counts()

Manhattan        21661
Brooklyn         20104
Queens            5666
Bronx             1091
Staten Island      373
Name: neighbourhood_group, dtype: int64

In [6]:
from sklearn.model_selection import train_test_split

In [24]:
np.random.seed(42)

# Replace every missing value in the frame with 0 before splitting.
df = df.fillna(0)

# Hold out 20% as the test set, then 25% of the remainder as validation
# (i.e. 60% / 20% / 20% of the rows overall).
df_full_train, df_test = train_test_split(df, test_size=0.2, random_state=1)
df_train, df_val = train_test_split(df_full_train, test_size=0.25, random_state=1)

# `pop` returns the target column and removes it from the feature frame,
# so `price` cannot leak into the features of any split.
y_train = df_train.pop('price').values
y_val = df_val.pop('price').values
y_test = df_test.pop('price').values

In [25]:
# Question 2: pairwise Pearson correlation between the numerical training features.
df_train.loc[:, numerical].corr(method='pearson')

Unnamed: 0,latitude,longitude,minimum_nights,number_of_reviews,reviews_per_month,calculated_host_listings_count,availability_365
latitude,1.0,0.084313,0.019638,-0.020536,-0.025427,0.019039,-0.012974
longitude,0.084313,1.0,-0.062937,0.058299,0.136407,-0.117029,0.082892
minimum_nights,0.019638,-0.062937,1.0,-0.075564,-0.120368,0.120308,0.140542
number_of_reviews,-0.020536,0.058299,-0.075564,1.0,0.599733,-0.072518,0.173308
reviews_per_month,-0.025427,0.136407,-0.120368,0.599733,1.0,-0.045017,0.168808
calculated_host_listings_count,0.019039,-0.117029,0.120308,-0.072518,-0.045017,1.0,0.229346
availability_365,-0.012974,0.082892,0.140542,0.173308,0.168808,0.229346,1.0


In [48]:
# Feature pairs whose pairwise correlation is reported below.
case = [
    ['calculated_host_listings_count', 'availability_365'],
    ['number_of_reviews', 'availability_365'],
    ['number_of_reviews', 'reviews_per_month'],
    ['minimum_nights', 'calculated_host_listings_count'],
]

# The correlation matrix does not depend on the pair, so compute it once
# rather than once per loop iteration.
corr_matrix = df_train[numerical].corr()

# Iterate over the pairs directly so the loop stays correct if `case` changes length.
for first, second in case:
    print('correlation beetween %s and %s :' % (first, second) + str(corr_matrix.loc[first, second]))

correlation beetween calculated_host_listings_count and availability_365 :0.22934634501783852
correlation beetween number_of_reviews and availability_365 :0.1733077768589699
correlation beetween number_of_reviews and reviews_per_month :0.5997331133807586
correlation beetween minimum_nights and calculated_host_listings_count :0.12030751012784173


In [26]:
# Binarize the price target: 1 when price >= 152, otherwise 0.
# The same fixed threshold is applied to the training and validation targets.
above_average, above_average_val = (
    (prices >= 152).astype('int') for prices in (y_train, y_val)
)

In [27]:
#Question 3
from sklearn.metrics import mutual_info_score

# Mutual information between the binarized training price and each
# categorical feature, rounded to 2 decimals for display.
for col_categorical in categorical:
    print(
        col_categorical,
        'and binary price:',
        round(mutual_info_score(above_average, df_train[col_categorical]), 2),
    )
          

neighbourhood_group and binary price: 0.05
room_type and binary price: 0.14


In [28]:
#Question 4
from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import LogisticRegression

dv = DictVectorizer(sparse=False)

# One dict per row, restricted to the categorical + numerical feature columns.
train_dicts = df_train[categorical + numerical].to_dict(orient='records')
val_dicts = df_val[categorical + numerical].to_dict(orient='records')

# Learn the feature -> column mapping from the training data only, then apply
# that same mapping to the validation data. Refitting on validation would let
# its column layout diverge from the one the model is trained on.
X_train = dv.fit_transform(train_dicts)
X_val = dv.transform(val_dicts)

# Default hyperparameters are used here; an earlier explicit variant of this
# line was LogisticRegression(solver='lbfgs', C=1, random_state=42).
model = LogisticRegression()
model.fit(X_train, above_average)


LogisticRegression()

In [29]:
y_pred = model.predict(X_val)

# Fraction of validation rows whose predicted class matches the binarized price.
# Stored unrounded because it is reused as the baseline when comparing against
# models trained without one feature; rounding here would distort those
# differences. Round only for display.
original_accuracy = (y_pred == above_average_val).mean()
print(round(original_accuracy, 2))

0.79


In [30]:
# Feature elimination: retrain the classifier once per feature with that
# feature left out and record the validation accuracy of each reduced model.
all_features = categorical + numerical
dv = DictVectorizer()
accuracy = []
for feature_unuse in all_features:
    # Every feature except the one under evaluation (order preserved).
    dummy = [feature for feature in all_features if feature != feature_unuse]

    train_dicts = df_train[dummy].to_dict(orient='records')
    val_dicts = df_val[dummy].to_dict(orient='records')

    # Fit the vectorizer on the training rows only and reuse that mapping for
    # validation, so both matrices share the same column layout.
    X_train = dv.fit_transform(train_dicts)
    X_val = dv.transform(val_dicts)

    model = LogisticRegression()
    model.fit(X_train, above_average)

    y_pred = model.predict(X_val)
    accuracy.append((y_pred == above_average_val).mean())

# accuracy[i] / difference[i] correspond to dropping (categorical + numerical)[i].
accuracy = np.array(accuracy)
difference = np.abs(original_accuracy - accuracy)

In [14]:
# Pick the feature whose removal changed validation accuracy the least;
# positions in `difference` follow the order of categorical + numerical.
smallest_difference_feature = (categorical + numerical)[int(np.argmin(difference))]
smallest_difference_feature

'reviews_per_month'

In [42]:
from sklearn.linear_model import Ridge

# Regularization strengths to compare.
list_alpha = [0, 0.01, 0.1, 1, 10]

train_dicts = df_train[categorical + numerical].to_dict(orient='records')
val_dicts = df_val[categorical + numerical].to_dict(orient='records')

# NOTE(review): `dv` is whichever DictVectorizer an earlier cell left behind.
# Fit it on the training rows only and reuse the learned mapping for the
# validation rows, so both matrices share the same column layout.
X_train = dv.fit_transform(train_dicts)
X_val = dv.transform(val_dicts)

# Regress on log(1 + price) instead of the raw price.
log_train = np.log1p(y_train)
log_val = np.log1p(y_val)

def rmse(y,yhat):
    """Return the root mean squared error between targets and predictions.

    Parameters
    ----------
    y : array-like
        Ground-truth values.
    yhat : array-like
        Predicted values, broadcastable against ``y``.

    Returns
    -------
    float
        ``sqrt(mean((y - yhat) ** 2))``.
    """
    # Accept plain sequences as well as numpy arrays.
    residual = np.asarray(y, dtype=float) - np.asarray(yhat, dtype=float)
    # The square root is what turns the mean squared error into an RMSE.
    return np.sqrt((residual ** 2).mean())

# For each regularization strength, fit a ridge model on the log-price target
# and record the validation error returned by `rmse`, rounded to 3 decimals.
list_rmse = []

for alpha in list_alpha:
    model = Ridge(alpha=alpha).fit(X_train, log_train)
    y_pred = model.predict(X_val)

    rmse_now = round(rmse(log_val, y_pred), 3)
    list_rmse += [rmse_now]
    
    
    
    

In [43]:
# Alpha with the lowest recorded validation error (first one wins on ties).
list_alpha[int(np.argmin(list_rmse))]

0