# Session #3 Homework

In [1]:
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.metrics import mutual_info_score
from sklearn.feature_extraction import DictVectorizer
from sklearn.preprocessing import normalize
from sklearn.linear_model import LogisticRegression, Ridge
from sklearn.metrics import mean_squared_error

In [2]:
# Load dataset
df = pd.read_csv("AB_NYC_2019.csv").reset_index()

## Features

In [3]:
# Only select specified columns
columns = ["neighbourhood_group", "room_type", "latitude", "longitude", "price", "minimum_nights", "number_of_reviews", "reviews_per_month", "calculated_host_listings_count", "availability_365"]
df = df[columns]
df.head()

Unnamed: 0,neighbourhood_group,room_type,latitude,longitude,price,minimum_nights,number_of_reviews,reviews_per_month,calculated_host_listings_count,availability_365
0,Brooklyn,Private room,40.64749,-73.97237,149,1,9,0.21,6,365
1,Manhattan,Entire home/apt,40.75362,-73.98377,225,1,45,0.38,2,355
2,Manhattan,Private room,40.80902,-73.9419,150,3,0,,1,365
3,Brooklyn,Entire home/apt,40.68514,-73.95976,89,1,270,4.64,1,194
4,Manhattan,Entire home/apt,40.79851,-73.94399,80,10,9,0.1,1,0


In [4]:
# Check missing values
df.isnull().sum()

neighbourhood_group                   0
room_type                             0
latitude                              0
longitude                             0
price                                 0
minimum_nights                        0
number_of_reviews                     0
reviews_per_month                 10052
calculated_host_listings_count        0
availability_365                      0
dtype: int64

In [5]:
# Fill missing values with 0
df.fillna(value={"reviews_per_month": 0}, inplace=True)

## Question 1
What is the most frequent observation (mode) for the column `neighbourhood_group`?

In [6]:
df["neighbourhood_group"].value_counts()

Manhattan        21661
Brooklyn         20104
Queens            5666
Bronx             1091
Staten Island      373
Name: neighbourhood_group, dtype: int64

**Manhattan** is the most frequent observation for the column `neighbourhood_group`.

In [7]:
# Create a variable above_average
df["above_average"] = (df["price"] >= 152).astype(int)

In [8]:
# Split your data in train/val/test sets, with 60%/20%/20% distribution
df_full_train, df_test = train_test_split(df, test_size=0.2, random_state=42)
df_train, df_val = train_test_split(df_full_train, test_size=0.25, random_state=42)

# Reset index
df_train = df_train.reset_index(drop=True)
df_val = df_val.reset_index(drop=True)
df_test = df_test.reset_index(drop=True)

y_train = df_train[["price", "above_average"]]
y_val = df_val[["price", "above_average"]]
y_test = df_test[["price", "above_average"]]

# Delete target variable `price` and `above_average` from the dataframes
df_train.drop(["price", "above_average"], axis=1, inplace=True)
df_val.drop(["price", "above_average"], axis=1, inplace=True)
df_test.drop(["price", "above_average"], axis=1, inplace=True)

## Question 2
Create the correlation matrix for the numerical features of your train dataset.

In [9]:
df_train.iloc[:,2:].corr()

Unnamed: 0,latitude,longitude,minimum_nights,number_of_reviews,reviews_per_month,calculated_host_listings_count,availability_365
latitude,1.0,0.080301,0.027441,-0.006246,-0.007159,0.019375,-0.005891
longitude,0.080301,1.0,-0.06066,0.055084,0.134642,-0.117041,0.083666
minimum_nights,0.027441,-0.06066,1.0,-0.07602,-0.120703,0.118647,0.138901
number_of_reviews,-0.006246,0.055084,-0.07602,1.0,0.590374,-0.073167,0.174477
reviews_per_month,-0.007159,0.134642,-0.120703,0.590374,1.0,-0.048767,0.165376
calculated_host_listings_count,0.019375,-0.117041,0.118647,-0.073167,-0.048767,1.0,0.225913
availability_365,-0.005891,0.083666,0.138901,0.174477,0.165376,0.225913,1.0


What are the two features that have the biggest correlation in this dataset?

**reviews_per_month** and **number_of_reviews** have the biggest correlation, which is 0.59.

## Question 3
Calculate the mutual information score with the (binarized) price for the two categorical variables that we have. Use the training set only.

In [10]:
mutual_info_score(df_train["neighbourhood_group"], y_train["above_average"]).round(2)

0.05

In [11]:
mutual_info_score(df_train["room_type"], y_train["above_average"]).round(2)

0.14

**room_type** has bigger score.

## Question 4

In [12]:
# Apply one hot encoding
dv = DictVectorizer(sparse=False)

train_dict = df_train.to_dict(orient="records")
X_train = dv.fit_transform(train_dict)

val_dict = df_val.to_dict(orient="records")
X_val = dv.fit_transform(val_dict)

In [13]:
# Normalize the array
X_train = normalize(X_train)
X_val = normalize(X_val)

In [14]:
# Train logistic regression
model = LogisticRegression(solver="lbfgs", C=1.0, random_state=42)
model.fit(X_train, y_train["above_average"])

LogisticRegression(random_state=42)

In [15]:
# Model intercept and coefficients
model.intercept_[0], model.coef_[0].round(3)

(-0.6676643961687555,
 array([  0.212,   3.323,  -0.471,   0.127,  -0.937,  -0.895,  -6.185,
         11.166,  -3.88 ,  -0.222,  -0.79 ,  -2.784,  21.159, -20.122,
         -1.053]))

In [16]:
# Model prediction on validation dataset
y_pred = model.predict_proba(X_val)[:, 1]

price_decision = (y_pred >= 152).astype(int)

In [17]:
# Calculate the accuracy on the validation dataset
accuracy = (price_decision == y_val["above_average"]).mean().round(2)
print(accuracy)

0.69


## Question 5

In [18]:
dv = DictVectorizer(sparse=False)

for column in columns:
    features = list(set(columns) - {column, "price"})
    df_train_ex = df_train[features]
    df_val_ex = df_val[features]
    
    # Apply one hot encoding    
    train_dict = df_train_ex.to_dict(orient="records")
    X_train = dv.fit_transform(train_dict)
    
    val_dict = df_val_ex.to_dict(orient="records")
    X_val = dv.fit_transform(val_dict)
    
    # Normalize the array
    X_train = normalize(X_train)
    X_val = normalize(X_val)
    
    # Train logistic regression
    model = LogisticRegression(solver="lbfgs", C=1.0, random_state=42)
    model.fit(X_train, y_train["above_average"])
    
    # Model prediction on validation dataset
    y_pred = model.predict_proba(X_val)[:, 1]
    
    price_decision = (y_pred >= 0.5).astype(int)
    
    # Calculate the accuracy on the validation dataset
    accuracy_ex = (price_decision == y_val["above_average"]).mean()
    print("Exclude column: {}, Accuracy: {}, Difference: {}".format(column, accuracy_ex, np.abs(accuracy-accuracy_ex)))

Exclude column: neighbourhood_group, Accuracy: 0.7097862767154106, Difference: 0.019786276715410667
Exclude column: room_type, Accuracy: 0.707741077819818, Difference: 0.01774107781981804
Exclude column: latitude, Accuracy: 0.7097862767154106, Difference: 0.019786276715410667
Exclude column: longitude, Accuracy: 0.7110133960527661, Difference: 0.021013396052766198
Exclude column: price, Accuracy: 0.7097862767154106, Difference: 0.019786276715410667
Exclude column: minimum_nights, Accuracy: 0.7093772369362921, Difference: 0.01937723693629212
Exclude column: number_of_reviews, Accuracy: 0.7086614173228346, Difference: 0.01866141732283466
Exclude column: reviews_per_month, Accuracy: 0.7097862767154106, Difference: 0.019786276715410667
Exclude column: calculated_host_listings_count, Accuracy: 0.6934246855506698, Difference: 0.0034246855506698815
Exclude column: availability_365, Accuracy: 0.7098885366601903, Difference: 0.019888536660190304


Among `neighbourhood_group`, `room_type`, `number_of_reviews`, `reviews_per_month`, the feature with smallest difference is **room_type**.

## Question 6

In [19]:
# Apply logarithmic transformation on price
train_log_price = np.log1p(y_train["price"])
val_log_price = np.log1p(y_val["price"])

for alpha in [0, 0.01, 0.1, 1, 10]:
    # Train ridge regression
    model = Ridge(alpha=alpha)
    model.fit(X_train, train_log_price)

    # Model prediction on validation dataset
    y_pred = model.predict(X_val)

    # Calcualte the accuarcy on validation dataset
    print("Alpha: {}, Accuracy: {}".format(alpha, np.sqrt(mean_squared_error(y_pred, val_log_price)).round(3)))

Alpha: 0, Accuracy: 0.508
Alpha: 0.01, Accuracy: 0.513
Alpha: 0.1, Accuracy: 0.518
Alpha: 1, Accuracy: 0.551
Alpha: 10, Accuracy: 0.648


**Alpha = 0** leads to the best RMSE on the validation set.