In [42]:
import numpy as np
import pandas as pd
import seaborn as sbn
import matplotlib.pyplot as plt

In [43]:
df = pd.read_csv('AB_NYC_2019.csv')

In [44]:
feature_list = ['neighbourhood_group',
'room_type',
'latitude',
'longitude',
'price',
'minimum_nights',
'number_of_reviews',
'reviews_per_month',
'calculated_host_listings_count',
'availability_365']

In [45]:
df = df[feature_list].fillna(0)

In [46]:
df.head(1)

Unnamed: 0,neighbourhood_group,room_type,latitude,longitude,price,minimum_nights,number_of_reviews,reviews_per_month,calculated_host_listings_count,availability_365
0,Brooklyn,Private room,40.64749,-73.97237,149,1,9,0.21,6,365


### Q1: What is the most frequent observation (mode) for the column 'neighbourhood_group'?

In [47]:
df.neighbourhood_group.value_counts()

Manhattan        21661
Brooklyn         20104
Queens            5666
Bronx             1091
Staten Island      373
Name: neighbourhood_group, dtype: int64

In [48]:
from sklearn.model_selection import train_test_split

In [49]:
df_full_train, df_test = train_test_split(df ,train_size=0.8, random_state=42)

In [50]:
df_train, df_val = train_test_split(df_full_train, train_size=0.75, random_state=42)

In [51]:
df_train = df_train.reset_index(drop=True)
df_val = df_val.reset_index(drop=True)
df_test = df_test.reset_index(drop=True)

In [52]:
df_train.shape[0], df_val.shape[0], df_test.shape[0]

(29337, 9779, 9779)

In [53]:
y_full_train = df_full_train.price.values
y_train = df_train.price.values
y_val = df_val.price.values
y_test = df_test.price.values

In [54]:
del df_full_train['price']
del df_train['price']
del df_val['price']
del df_test['price']

In [55]:
df_train.head(1)

Unnamed: 0,neighbourhood_group,room_type,latitude,longitude,minimum_nights,number_of_reviews,reviews_per_month,calculated_host_listings_count,availability_365
0,Brooklyn,Entire home/apt,40.7276,-73.94495,3,29,0.7,13,50


In [56]:
df_train.corr().abs()

Unnamed: 0,latitude,longitude,minimum_nights,number_of_reviews,reviews_per_month,calculated_host_listings_count,availability_365
latitude,1.0,0.080301,0.027441,0.006246,0.007159,0.019375,0.005891
longitude,0.080301,1.0,0.06066,0.055084,0.134642,0.117041,0.083666
minimum_nights,0.027441,0.06066,1.0,0.07602,0.120703,0.118647,0.138901
number_of_reviews,0.006246,0.055084,0.07602,1.0,0.590374,0.073167,0.174477
reviews_per_month,0.007159,0.134642,0.120703,0.590374,1.0,0.048767,0.165376
calculated_host_listings_count,0.019375,0.117041,0.118647,0.073167,0.048767,1.0,0.225913
availability_365,0.005891,0.083666,0.138901,0.174477,0.165376,0.225913,1.0


In [57]:
from sklearn.metrics import mutual_info_score

In [58]:
categorical = list(df_train.dtypes[df_train.dtypes == 'object'].index)
numerical = list(df_train.dtypes[df_train.dtypes != 'object'].index)

In [59]:
y_full_train = (y_full_train >= 152).astype(int)
y_train = (y_train >= 152).astype(int)
y_val = (y_val >= 152).astype(int)
y_test = (y_test >= 152).astype(int)

In [64]:
def mutual_info_churn_score(series):
    return mutual_info_score(series, y_train)

In [68]:
mi = df_train[categorical].apply(mutual_info_churn_score)
mi.sort_values(ascending=False).round(2)

room_type              0.14
neighbourhood_group    0.05
dtype: float64

In [70]:
round(mi.sort_values(ascending=False)[0], 2)

0.14

In [71]:
from sklearn.feature_extraction import DictVectorizer 

In [73]:
dv = DictVectorizer()
X_train = dv.fit_transform(df_train.to_dict(orient='records'))

In [75]:
from sklearn.linear_model import LogisticRegression

In [76]:
model = LogisticRegression(solver='lbfgs', C=1.0, random_state=42)

In [77]:
model.fit(X_train, y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


LogisticRegression(random_state=42)

In [78]:
X_val = dv.transform(df_val.to_dict(orient='records'))

In [85]:
all_features_acc = (((model.predict_proba(X_val)[:,1] >= 0.5) == y_val).mean()).round(2)

In [97]:
def train_without_feature_get_acc(df_train, df_val, feature):
    df_train_copy = df_train.copy()
    df_val_copy = df_val.copy()
    
    del df_train_copy[feature]
    del df_val_copy[feature]
    
    dv = DictVectorizer()
    X_train = dv.fit_transform(df_train_copy.to_dict(orient='records'))
    
    model = LogisticRegression(solver='lbfgs', C=1.0, random_state=42)
    model.fit(X_train, y_train)
    
    X_val = dv.transform(df_val_copy.to_dict(orient='records'))
    
    return (((model.predict_proba(X_val)[:,1] >= 0.5) == y_val).mean())

In [98]:
features_diff_acc = {}
for column in df_train.columns:
    features_diff_acc[column] = all_features_acc - train_without_feature_get_acc(df_train, df_val, column)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

In [104]:
{k: features_diff_acc[k] for k in ['neighbourhood_group', 'room_type',
'number_of_reviews',
'reviews_per_month']}

{'neighbourhood_group': 0.03900296553839866,
 'room_type': 0.07397586665303202,
 'number_of_reviews': 0.0034165047550874794,
 'reviews_per_month': 0.004541364147663374}

In [106]:
from sklearn import linear_model


In [110]:
df.price = np.log1p(df.price)

In [118]:
df_full_train, df_test = train_test_split(df ,train_size=0.8, random_state=42)
df_train, df_val = train_test_split(df_full_train, train_size=0.75, random_state=42)

In [119]:
df_train = df_train.reset_index(drop=True)
df_val = df_val.reset_index(drop=True)
df_test = df_test.reset_index(drop=True)

In [120]:
y_full_train = df_full_train.price.values
y_train = df_train.price.values
y_val = df_val.price.values
y_test = df_test.price.values

In [121]:
dv = DictVectorizer()

In [123]:
X_train = dv.fit_transform(df_train.to_dict(orient='records'))
X_val = dv.transform(df_val.to_dict(orient='records'))

In [126]:
from sklearn.metrics import mean_squared_error as rmse

In [128]:
for alph in [0, 0.01, 0.1, 1, 10]:
    ridge_reg_model = linear_model.Ridge(alpha=alph)
    ridge_reg_model.fit(X_train, y_train)
    y_pred = ridge_reg_model.predict(X_val)
    print(alph, '\t',rmse(y_pred, y_val))

0 	 0.0003138908831317748
0.01 	 0.00031400237972439767
0.1 	 0.0003139029743098848
1 	 0.0003139244973982071
10 	 0.00031456478307205255
