In [1]:
import numpy as np
import pandas as pd 
import matplotlib.pyplot as plt
import seaborn as sns

%matplotlib inline 

In [57]:
# Columns kept for the analysis. Each name is listed exactly once
# ('price' was previously repeated at the end of the list).
usecols = [
    'neighbourhood_group', 'room_type', 'latitude', 'longitude', 'price',
    'minimum_nights', 'number_of_reviews', 'reviews_per_month',
    'calculated_host_listings_count', 'availability_365',
]
df = pd.read_csv('AB_NYC_2019.csv', usecols=usecols)

In [62]:
# Replace missing `reviews_per_month` values with 0, then count the
# remaining nulls per column to confirm nothing is left unfilled.
df['reviews_per_month'] = df['reviews_per_month'].fillna(0)
df.isnull().sum()

neighbourhood_group               0
latitude                          0
longitude                         0
room_type                         0
price                             0
minimum_nights                    0
number_of_reviews                 0
reviews_per_month                 0
calculated_host_listings_count    0
availability_365                  0
dtype: int64

## Question 1

In [63]:
# Most frequent value(s) of the `neighbourhood_group` column.
df['neighbourhood_group'].mode()

0    Manhattan
dtype: object

#### Answer: Manhattan

## Data split

In [64]:
from sklearn.model_selection import train_test_split

In [65]:
# Hold out 20% of the rows as the test set; the seed keeps the split reproducible.
df_full_train, df_test = train_test_split(df, test_size=0.2, random_state=42)
(len(df_full_train), len(df_test))

(39116, 9779)

In [66]:
# 25% of the remaining 80% is 20% of the full data set, so the
# validation set ends up the same size as the test set.
df_train, df_val = train_test_split(df_full_train, test_size=0.25, random_state=42)
(len(df_train), len(df_val))

(29337, 9779)

In [72]:
# Pull the `price` target out of each split as a plain NumPy array.
y_full_train = df_full_train['price'].values
y_train = df_train['price'].values
y_val = df_val['price'].values
y_test = df_test['price'].values

In [73]:
# Remove the target from every split, not only the training frames, so
# that no feature frame can leak `price` into a model. The targets were
# already extracted into the y_* arrays before this cell.
del df_full_train['price']
del df_train['price']
del df_val['price']
del df_test['price']

## Question 2

In [82]:
# Names of the training columns whose dtype is not `object`.
train_dtypes = df_train.dtypes
numerical_values = train_dtypes[train_dtypes != 'object'].index
numerical_values

Index(['latitude', 'longitude', 'minimum_nights', 'number_of_reviews',
       'reviews_per_month', 'calculated_host_listings_count',
       'availability_365'],
      dtype='object')

In [89]:
# Pairwise Pearson correlation between the numerical features.
# Restricting the frame to `numerical_values` first keeps the object
# (categorical) columns out of the computation; recent pandas versions no
# longer drop them silently. A single `.corr()` call also replaces the
# column-by-column `corrwith` loop.
corr_matrix_df = df_train[numerical_values].corr()

corr_matrix_df

Unnamed: 0,latitude,longitude,minimum_nights,number_of_reviews,reviews_per_month,calculated_host_listings_count,availability_365
latitude,1.0,0.080301,0.027441,-0.006246,-0.007159,0.019375,-0.005891
longitude,0.080301,1.0,-0.06066,0.055084,0.134642,-0.117041,0.083666
minimum_nights,0.027441,-0.06066,1.0,-0.07602,-0.120703,0.118647,0.138901
number_of_reviews,-0.006246,0.055084,-0.07602,1.0,0.590374,-0.073167,0.174477
reviews_per_month,-0.007159,0.134642,-0.120703,0.590374,1.0,-0.048767,0.165376
calculated_host_listings_count,0.019375,-0.117041,0.118647,-0.073167,-0.048767,1.0,0.225913
availability_365,-0.005891,0.083666,0.138901,0.174477,0.165376,0.225913,1.0


In [118]:
# Rank all feature pairs by absolute correlation, strongest first.
c = corr_matrix_df.abs()
s = c.unstack()
so = s.sort_values(kind='quicksort', ascending=False)

# Exclude each feature's correlation with itself by comparing the two index
# labels. Filtering by value (e.g. "< 0.99") would also hide genuinely
# near-collinear pairs of distinct features.
is_self_pair = so.index.get_level_values(0) == so.index.get_level_values(1)
so[~is_self_pair][:2]

number_of_reviews  reviews_per_month    0.590374
reviews_per_month  number_of_reviews    0.590374
dtype: float64

## Make price binary

In [145]:
# Binary training target: 1 when the price is at least 152, otherwise 0.
# NOTE(review): 152 is presumably the rounded mean price -- confirm.
above_average = np.greater_equal(y_train, 152).astype('int')
above_average

array([0, 0, 0, ..., 1, 0, 0])

## Question 3

In [131]:
from sklearn.metrics import mutual_info_score

In [143]:
def mutual_info_above_price_score(series):
    """Return the mutual information between `series` and `above_average`.

    `above_average` is the notebook-level binary training target, so
    `series` must be aligned with the training rows.
    """
    return mutual_info_score(labels_true=series, labels_pred=above_average)

In [153]:
# Names of the training columns stored with the `object` dtype.
train_dtypes = df_train.dtypes
categorical_values = train_dtypes[train_dtypes == 'object'].index

# Mutual information of each categorical column with the binary target,
# displayed from most to least informative.
mi = df_train[categorical_values].apply(mutual_info_above_price_score)
mi.sort_values(ascending=False)

room_type              0.143226
neighbourhood_group    0.046506
dtype: float64

In [154]:
# Largest mutual-information score, rounded to two decimals.
# `.iloc[0]` selects by position explicitly: `mi` is indexed by column
# name, so a bare `[0]` relies on the deprecated positional fallback.
round(mi.sort_values(ascending=False).iloc[0], 2)

0.14

## Question 4

In [155]:
from sklearn.feature_extraction import DictVectorizer

In [156]:
# Show both feature groups before building the design matrix.
(categorical_values, numerical_values)

(Index(['neighbourhood_group', 'room_type'], dtype='object'),
 Index(['latitude', 'longitude', 'minimum_nights', 'number_of_reviews',
        'reviews_per_month', 'calculated_host_listings_count',
        'availability_365'],
       dtype='object'))

In [163]:
# One-hot encode the categorical columns and pass the numerical ones through.
# The vectorizer is fitted on the training rows only and then reused,
# unchanged, for the validation rows.
feature_columns = list(categorical_values) + list(numerical_values)

dv = DictVectorizer(sparse=False)

train_dicts = df_train[feature_columns].to_dict(orient='records')
X_train = dv.fit_transform(train_dicts)

val_dicts = df_val[feature_columns].to_dict(orient='records')
X_val = dv.transform(val_dicts)

In [164]:
from sklearn.linear_model import LogisticRegression

In [181]:
# Baseline classifier on all features.
# NOTE: the recorded output shows lbfgs stopping at its iteration limit,
# so the reported metrics come from a model that has not fully converged.
model = LogisticRegression(solver='lbfgs', C=1.0, random_state=42).fit(X_train, above_average)
model

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


LogisticRegression(random_state=42)

In [185]:
# Binary validation target, using the same 152 threshold as the training target.
above_average_val = (y_val >= 152).astype('int')
y_pred = model.predict(X_val)

# Keep predictions next to the ground truth so individual rows can be inspected.
df_pred = pd.DataFrame({'prediction': y_pred, 'actual': above_average_val})
df_pred['correct'] = df_pred['prediction'] == df_pred['actual']

# Accuracy is the share of rows predicted correctly.
original_accuracy = df_pred['correct'].mean()
round(original_accuracy, 2)

0.79

## Question 5

In [197]:
from sklearn.metrics import accuracy_score
import warnings
# Silence only the solver convergence warnings that the repeated
# LogisticRegression fits emit. A blanket "ignore" would also hide
# unrelated deprecation and numerical warnings for the rest of the session.
from sklearn.exceptions import ConvergenceWarning
warnings.filterwarnings("ignore", category=ConvergenceWarning)

In [230]:
to_eliminate_features = list(categorical_values) + list(numerical_values)


def _accuracy_without(excluded_feature):
    """Validation accuracy of the logistic model trained without one feature.

    Trains on `df_train` / `above_average` using every column in
    `to_eliminate_features` except `excluded_feature`, and scores against
    `above_average_val` on `df_val`.

    Everything is kept in local variables so the notebook-level `dv`,
    `X_train`, `X_val`, `model` and `y_pred` are not overwritten by the
    last ablation run.
    """
    kept_features = [f for f in to_eliminate_features if f != excluded_feature]

    vectorizer = DictVectorizer(sparse=False)
    X_train_subset = vectorizer.fit_transform(
        df_train[kept_features].to_dict(orient='records')
    )
    X_val_subset = vectorizer.transform(
        df_val[kept_features].to_dict(orient='records')
    )

    # Same hyper-parameters as the baseline so the accuracies are comparable.
    classifier = LogisticRegression(solver='lbfgs', C=1.0, random_state=42)
    classifier.fit(X_train_subset, above_average)

    return accuracy_score(above_average_val, classifier.predict(X_val_subset))


# One accuracy per dropped feature, in the same order as `to_eliminate_features`.
accuracies = [_accuracy_without(feature) for feature in to_eliminate_features]

df_results = pd.DataFrame({
    'feature': to_eliminate_features,
    'accuracy': accuracies,
})
# Positive diff: dropping the feature reduced accuracy relative to the baseline.
df_results['diff'] = original_accuracy - df_results.accuracy
df_results

Unnamed: 0,feature,accuracy,diff
0,neighbourhood_group,0.750997,0.035484
1,room_type,0.715206,0.071275
2,latitude,0.786379,0.000102
3,longitude,0.786788,-0.000307
4,minimum_nights,0.785663,0.000818
5,number_of_reviews,0.787095,-0.000614
6,reviews_per_month,0.78505,0.001432
7,calculated_host_listings_count,0.786686,-0.000205
8,availability_365,0.781573,0.004908


In [210]:
# Order features from the smallest to the largest absolute accuracy change.
impact_order = df_results['diff'].abs().sort_values().index
df_results.loc[impact_order, 'feature']

2                          latitude
7    calculated_host_listings_count
3                         longitude
5                 number_of_reviews
4                    minimum_nights
6                 reviews_per_month
8                  availability_365
0               neighbourhood_group
1                         room_type
Name: feature, dtype: object

#### Question 5 answer: number_of_reviews

## Question 6

In [213]:
from sklearn.linear_model import Ridge

In [217]:
# Regress on log(1 + price) rather than on the raw price.
y_train_log, y_val_log = map(np.log1p, (y_train, y_val))

In [218]:
def rmse(y, y_pred):
    """Root-mean-squared error between targets `y` and predictions `y_pred`.

    Both arguments are expected to be array-likes that support elementwise
    subtraction and expose a `.mean()` method (e.g. NumPy arrays).
    """
    residuals = y - y_pred
    return np.sqrt((residuals ** 2).mean())

In [228]:
# Rebuild the full design matrices from all categorical and numerical features.
feature_columns = list(categorical_values) + list(numerical_values)

dv = DictVectorizer(sparse=False)

train_dicts = df_train[feature_columns].to_dict(orient='records')
X_train = dv.fit_transform(train_dicts)

val_dicts = df_val[feature_columns].to_dict(orient='records')
X_val = dv.transform(val_dicts)

# Validation RMSE (on the log-price scale) for each regularisation strength,
# collected in the same order as the alphas are tried.
scores = []

for alpha in (0, 0.01, 0.1, 1, 10):
    model = Ridge(alpha=alpha, random_state=42)
    model.fit(X_train, y_train_log)

    y_pred = model.predict(X_val)

    score = rmse(y_val_log, y_pred)
    scores.append(score)
    print(alpha, round(score, 3))

0 0.497
0.01 0.497
0.1 0.497
1 0.497
10 0.498


In [229]:
# Position (within the list of tried alphas) of the lowest validation RMSE.
np.argmin(np.asarray(scores))

0