# **Homework - Session 2**

## **Import libs**

In [1]:
import numpy as np
import pandas as pd

## **EDA**

### **Load the Data**

In [7]:
data = pd.read_csv("https://raw.githubusercontent.com/alexeygrigorev/datasets/master/AB_NYC_2019.csv", sep=",")

In [6]:
data.head(2)

Unnamed: 0,id,name,host_id,host_name,neighbourhood_group,neighbourhood,latitude,longitude,room_type,price,minimum_nights,number_of_reviews,last_review,reviews_per_month,calculated_host_listings_count,availability_365
0,2539,Clean & quiet apt home by the park,2787,John,Brooklyn,Kensington,40.64749,-73.97237,Private room,149,1,9,2018-10-19,0.21,6,365
1,2595,Skylit Midtown Castle,2845,Jennifer,Manhattan,Midtown,40.75362,-73.98377,Entire home/apt,225,1,45,2019-05-21,0.38,2,355


In [8]:
data.shape

(48895, 16)

### **`Price` column**

In [9]:
price = data.price

In [12]:
price.describe()

count    48895.000000
mean       152.720687
std        240.154170
min          0.000000
25%         69.000000
50%        106.000000
75%        175.000000
max      10000.000000
Name: price, dtype: float64

## **Features**

In [13]:
df = data[['latitude',
            'longitude',
            'minimum_nights',
            'number_of_reviews',
            'reviews_per_month',
            'calculated_host_listings_count',
            'availability_365',
            'price'
]]

## **Question 1**
Find a feature with missing values.   
How many missing values does it have?

In [20]:
df.isnull().sum()

latitude                              0
longitude                             0
minimum_nights                        0
number_of_reviews                     0
reviews_per_month                 10052
calculated_host_listings_count        0
availability_365                      0
price                                 0
dtype: int64

In [21]:
df.isnull().sum().sum()

10052

## **Question 2**
What's the median (50% percentile) for variable 'minimum_nights'?

In [31]:
np.median(df.minimum_nights)

3.0

In [33]:
df.minimum_nights.describe()

count    48895.000000
mean         7.029962
std         20.510550
min          1.000000
25%          1.000000
50%          3.000000
75%          5.000000
max       1250.000000
Name: minimum_nights, dtype: float64

## **Split the data**

* Shuffle the initial dataset, use seed `42`.
* Split your data in train/val/test sets, with 60%/20%/20% distribution.
* Make sure that the target value ('price') is not in your dataframe.
* Apply the log transformation to the price variable using the `np.log1p()` function.

In [34]:
# Shuffle the initial dataset, use seed `42`
n= len(df)
idx = np.arange(n)
np.random.seed(42)
np.random.shuffle(idx)

# Split your data in train/val/test sets, with 60%/20%/20% distribution
n = len(df)
n_val = int(n * 0.2)
n_test = int(n * 0.2)
n_train = n - n_val - n_test

df_train = df.iloc[idx[:n_train]]
df_val = df.iloc[idx[n_train:n_train + n_val]]
df_test = df.iloc[idx[n_train + n_val:]]

# Reset indexes
df_train = df_train.reset_index(drop = True)
df_val = df_val.reset_index(drop = True)
df_test = df_test.reset_index(drop = True)

# Apply the log transformation to the price variable using the `np.log1p()` function
y_train = np.log1p(df_train['price'].values)
y_val = np.log1p(df_val['price'].values)
y_test = np.log1p(df_test['price'].values)

# Make sure that the target value ('price') is not in your dataframe
df_train.drop('price', axis=1, inplace=True)
df_val.drop('price', axis=1, inplace=True)
df_test.drop('price', axis=1, inplace=True)

In [41]:
# Check
print(f"df_shape: {df.shape[0]}")

print(f"Check_shape: {df_train.shape[0] + df_test.shape[0] + df_val.shape[0]}")

df_shape: 48895
Check_shape: 48895


## **Question 3**

* We need to deal with missing values for the column from Q1.
* We have two options: fill it with 0 or with the mean of this variable.
* Try both options. For each, train a linear regression model without regularization using the code from the lessons.
* For computing the mean, use the training only!
* Use the validation dataset to evaluate the models and compare the RMSE of each option.
* Round the RMSE scores to 2 decimal digits using `round(score, 2)`
* Which option gives better RMSE?


In [42]:
# Imputing Missing values with 0
def imputing_0(df):
    df = df.fillna(0)
    X = df.values
    return X

"""Imputing Missing values with Mean"""
def imputing_mean(df):
    df = df.fillna(0)
    X = df.values
    return X

In [43]:
# Linear regression model without regularization
def linear_regression_reg(X,y,r = 0.01):
    
    # biased term
    ones = np.ones(X.shape[0])
    X = np.column_stack([ones, X])
    
    # Gram Matrix
    XTX = X.T.dot(X)
    
    # Regularization
    XTX = XTX + r * np.eye(XTX.shape[0])
    
    # inverse Gram Matrix
    XTX_inv = np.linalg.inv(XTX)
    w_full = XTX_inv.dot(X.T).dot(y)
    
    return w_full[0], w_full[1:] 

def rmse(y, y_pred):
    error = y - y_pred
    squared = error ** 2
    mean_squared = squared.mean()
    return np.sqrt(mean_squared)

In [51]:
# Model and Prediction : missing value imputation using 0
X_train = imputing_0(df_train)

w0,w = linear_regression_reg(X_train,y_train,r = 0)

# Predict / train
y_pred_train = w0 + X_train.dot(w)

# Predict / val
X_val = imputing_0(df_val)
y_pred_val = w0 + X_val.dot(w)

# rmse
print(f"RMSE_train | missing value imputation using 0: {rmse(y_train, y_pred_train):.2f}")
print(f"RMSE_valid | missing value imputation using 0: {rmse(y_val, y_pred_val):.2f}")

RMSE_train | missing value imputation using 0: 0.64
RMSE_valid | missing value imputation using 0: 0.64


In [50]:
# Model and Prediction : missing value imputation using mean
X_train = imputing_mean(df_train)

w0,w = linear_regression_reg(X_train,y_train, r = 0)

# Predict / train
y_pred_train = w0 + X_train.dot(w)

# Predict / val
X_val = imputing_mean(df_val)
y_pred_val = w0 + X_val.dot(w)

# rmse
print(f"RMSE_train | missing value imputation using mean: {rmse(y_train, y_pred_train):.2f}")
print(f"RMSE_valid | missing value imputation using mean: {rmse(y_val, y_pred_val):.2f}")

RMSE_train | missing value imputation using mean: 0.64
RMSE_valid | missing value imputation using mean: 0.64


## **Question 4**

* Now let's train a regularized linear regression.
* For this question, fill the NAs with 0. 
* Try different values of `r` from this list: `[0, 0.000001, 0.0001, 0.001, 0.01, 0.1, 1, 5, 10]`.
* Use RMSE to evaluate the model on the validation dataset.
* Round the RMSE scores to 2 decimal digits.
* Which `r` gives the best RMSE?

If there are multiple options, select the smallest `r`.

In [49]:
for r in [0, 0.000001, 0.0001, 0.001, 0.01, 0.1, 1, 5, 10]:
    w0,w = linear_regression_reg(X_train,y_train,r)


    # Predicting on Validation set
    X_val = imputing_0(df_val)
    y_pred = w0 + X_val.dot(w)
    score =  (rmse(y_val, y_pred))

    # rmse
    print(f"r: {r}, w: {w0}, score: {score:.2f}")

r: 0, w: -419.91265872941676, score: 0.64
r: 1e-06, w: -419.86271567365674, score: 0.64
r: 0.0001, w: -414.97649243207684, score: 0.64
r: 0.001, w: -375.27365270305717, score: 0.64
r: 0.01, w: -191.783840532472, score: 0.66
r: 0.1, w: -32.562560552266376, score: 0.68
r: 1, w: -3.49921683772921, score: 0.68
r: 5, w: -0.7033623160382074, score: 0.68
r: 10, w: -0.35127675916757595, score: 0.68


## **Question 5**

* We used seed 42 for splitting the data. Let's find out how selecting the seed influences our score.
* Try different seed values: `[0, 1, 2, 3, 4, 5, 6, 7, 8, 9]`.
* For each seed, do the train/validation/test split with 60%/20%/20% distribution.
* Fill the missing values with 0 and train a model without regularization.
* For each seed, evaluate the model on the validation dataset and collect the RMSE scores. 
* What's the standard deviation of all the scores? To compute the standard deviation, use `np.std`.
* Round the result to 3 decimal digits (`round(std, 3)`)   


> Note: Standard deviation shows how different the values are.
> If it's low, then all values are approximately the same.
> If it's high, the values are different. 
> If standard deviation of scores is low, then our model is *stable*.

In [52]:
features = ['latitude',
            'longitude',
            'minimum_nights',
            'number_of_reviews',
            'reviews_per_month',
            'calculated_host_listings_count',
            'availability_365',
            'price']

def train_test_split(df,test_split = 0.2, val_split = 0.2, random_seed = 42):

    n = len(df)
    idx = np.arange(n)
    np.random.seed = random_seed
    np.random.shuffle(idx)

    n = len(df)
    n_val = int(n * val_split)
    n_test = int(n * test_split)
    n_train = n - n_val - n_test

    df_train = df.iloc[idx[:n_train]]
    df_val = df.iloc[idx[n_train:n_train + n_val]] 
    df_test = df.iloc[idx[n_train+n_val:]]

    df_train = df_train.reset_index(drop = True)
    df_val = df_val.reset_index(drop = True)
    df_test = df_test.reset_index(drop = True)

    y_train = np.log1p(df_train['price'].values)
    y_val = np.log1p(df_val['price'].values)
    y_test = np.log1p(df_test['price'].values)

    df_train.drop('price', axis=1, inplace=True)
    df_val.drop('price', axis=1, inplace=True)
    df_test.drop('price', axis=1, inplace=True)
    
    return df_train, df_val, df_test, y_train, y_val, y_test

df_train, df_val, df_test, y_train, y_val, y_test= train_test_split(df[features])

In [57]:
score = []
for s in [0, 1, 2, 3, 4, 5, 6, 7, 8, 9]:
    df_train, df_val, df_test, y_train, y_val, y_test= train_test_split(df[features], random_seed = s)
    
    # Model and Prediction |  missing value imputation using 0"
    X_train = imputing_0(df_train)

    w0,w = linear_regression_reg(X_train,y_train,r=0)

    # Predicting on Validation set
    X_val = imputing_0(df_val)
    y_pred = w0 + X_val.dot(w)
    
    # RMSE Scores
    score.append(rmse(y_val, y_pred).round(2))
    print(f"seed: {s}, w0: {w0}, score: {score}")

print(f"STD - scores: {np.std(score):.3f}")

seed: 0, w0: -435.00359486219253, score: [0.64]
seed: 1, w0: -418.20048889715684, score: [0.64, 0.64]
seed: 2, w0: -418.79893520508693, score: [0.64, 0.64, 0.65]
seed: 3, w0: -415.2825734255888, score: [0.64, 0.64, 0.65, 0.64]
seed: 4, w0: -413.5666689892554, score: [0.64, 0.64, 0.65, 0.64, 0.66]
seed: 5, w0: -425.4217424699481, score: [0.64, 0.64, 0.65, 0.64, 0.66, 0.65]
seed: 6, w0: -425.6221976254481, score: [0.64, 0.64, 0.65, 0.64, 0.66, 0.65, 0.64]
seed: 7, w0: -423.1139161623612, score: [0.64, 0.64, 0.65, 0.64, 0.66, 0.65, 0.64, 0.63]
seed: 8, w0: -427.9800170025963, score: [0.64, 0.64, 0.65, 0.64, 0.66, 0.65, 0.64, 0.63, 0.64]
seed: 9, w0: -432.04981113061024, score: [0.64, 0.64, 0.65, 0.64, 0.66, 0.65, 0.64, 0.63, 0.64, 0.64]
STD - scores: 0.008


## **Question 6**

* Split the dataset like previously, use seed 9.
* Combine train and validation datasets.
* Fill the missing values with 0 and train a model with `r=0.001`. 
* What's the RMSE on the test dataset?

In [58]:
df_train, df_val, df_test, y_train, y_val, y_test= train_test_split(df[features], random_seed = 9)

# Combine train and validation datasets
df_train_full = pd.concat([df_train, df_val])
df_train_full = df_train_full.reset_index(drop = True)

# Fill the missing values with 0 and train a model with `r=0.001`
X_train_full = imputing_0(df_train_full)
y_full_train = np.concatenate([y_train, y_val])

w0,w = linear_regression_reg(X_train,y_train, r = 0.001)

# Predict / test
X_test = imputing_0(df_test)
y_pred = w0 + X_test.dot(w)

#     RMSE Scores
score =rmse(y_test, y_pred)
print(f"seed: {s}, w0: {w0}, score: {score}")

seed: 9, w0: 13.466070934018283, score: 0.6974260793874817
