In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
df = pd.read_csv('https://raw.githubusercontent.com/alexeygrigorev/datasets/master/laptops.csv')

In [3]:
df.head()

Unnamed: 0,Laptop,Status,Brand,Model,CPU,RAM,Storage,Storage type,GPU,Screen,Touch,Final Price
0,ASUS ExpertBook B1 B1502CBA-EJ0436X Intel Core...,New,Asus,ExpertBook,Intel Core i5,8,512,SSD,,15.6,No,1009.0
1,Alurin Go Start Intel Celeron N4020/8GB/256GB ...,New,Alurin,Go,Intel Celeron,8,256,SSD,,15.6,No,299.0
2,ASUS ExpertBook B1 B1502CBA-EJ0424X Intel Core...,New,Asus,ExpertBook,Intel Core i3,8,256,SSD,,15.6,No,789.0
3,MSI Katana GF66 12UC-082XES Intel Core i7-1270...,New,MSI,Katana,Intel Core i7,16,1000,SSD,RTX 3050,15.6,No,1199.0
4,HP 15S-FQ5085NS Intel Core i5-1235U/16GB/512GB...,New,HP,15S,Intel Core i5,16,512,SSD,,15.6,No,669.01


In [4]:
df.shape

(2160, 12)

In [5]:
# normalize the names of columns

df.columns = df.columns.str.lower().str.replace(' ', '_')

In [6]:
# Select the desired columns
df = df[['ram', 'storage', 'screen', 'final_price']]

#### Question 1
There's one column with missing values. What is it?

'ram' 


'storage'


'screen' --> 4 values are missing from 'screen' column


'final_price'

In [7]:
df.isnull().sum()

ram            0
storage        0
screen         4
final_price    0
dtype: int64

#### Question 2
What's the median (50% percentile) for variable 'ram'?


Answer: 16GB

In [8]:
df['ram'].median()

np.float64(16.0)

#### Prepare and split the dataset

- Shuffle the dataset (the filtered one you created above), use seed 42.
- Split your data in train/val/test sets, with 60%/20%/20% distribution.

In [9]:
n = len(df)
idx= np.arange(n)

In [11]:
n_val = int(n * 0.2)
n_test = int(n * 0.2)
n_train = n - n_val - n_test

In [10]:
np.random.seed(42)
np.random.shuffle(idx)

In [18]:
n_train, n_val, n_test

(1296, 432, 432)

In [24]:
# Taking the first n_train observation for df_train
df_train = df.iloc[idx[:n_train]].reset_index(drop=True)

df_val = df.iloc[idx[n_train:n_train + n_val]].reset_index(drop=True)

df_test = df.iloc[idx[n_train + n_val:]].reset_index(drop=True)

In [19]:
df_train.shape[0], df_val.shape[0], df_test.shape[0]

(1296, 432, 432)

In [23]:
df_train.head()

Unnamed: 0,ram,storage,screen,final_price
0,32,1000,15.6,1123.29
1,4,64,14.1,201.05
2,32,1000,14.0,997.74
3,16,512,13.3,1016.0
4,32,1000,16.0,2739.0


In [25]:
# Separate target variable from the df 

y_train = df_train['final_price']
y_val = df_val['final_price']
y_test = df_test['final_price']

In [26]:
del df_train['final_price']
del df_val['final_price']
del df_test['final_price']

In [27]:
df_train.head()

Unnamed: 0,ram,storage,screen
0,32,1000,15.6
1,4,64,14.1
2,32,1000,14.0
3,16,512,13.3
4,32,1000,16.0


#### Question 3
- We need to deal with missing values for the column from Q1.
- We have two options: fill it with 0 or with the mean of this variable.
- Try both options. For each, train a linear regression model without regularization using the code from the lessons.
- For computing the mean, use the training only!
- Use the validation dataset to evaluate the models and compare the RMSE of each option.
- Round the RMSE scores to 2 decimal digits using round(score, 2)
- Which option gives better RMSE?

In [48]:
df_train.isnull().sum()

ram        0
storage    0
screen     3
dtype: int64

The 'screen' columns from training data set containing missing values so we will need to deal with the missing values. 
- Two options are:
- 1. Fill the missing with 0
  2. fill the missing with the mean of screen.

In [32]:
def train_linear_regression(X, y):
    ones = np.ones(X.shape[0])
    X = np.column_stack([ones, X])

    XTX = X.T.dot(X)
    XTX_inv = np.linalg.inv(XTX)
    w_full = XTX_inv.dot(X.T).dot(y)

    return w_full[0], w_full[1:]

In [45]:
def prepare_X(df, fillna_value):
    df = df.fillna(fillna_value)
    X = df.values
    return X

In [33]:
def rmse(y, y_pred):
    se = (y - y_pred) ** 2
    mse = se.mean()
    return np.sqrt(mse)

##### *Option 1*: Fill the missing value with 0

In [29]:
df_train_o1 = df_train.copy()
df_train_o1 = df_train_o1.fillna(0)

In [46]:
X_train_o1 = prepare_X(df_train, fillna_value = 0)

In [47]:
w0, w = train_linear_regression(X_train_o1, y_train)

X_val = df_val.values
y_pred = w0 + X_val.dot(w)

float(round(rmse(y_val, y_pred),2))

596.96

##### *Option 2*: Fill the missing value with the mean

In [49]:
mean_value = df['screen'].mean()
X_train_o2 = prepare_X(df_train, fillna_value = mean_value)

In [50]:
w0, w = train_linear_regression(X_train_o2, y_train)

X_val = df_val.values
y_pred = w0 + X_val.dot(w)

float(round(rmse(y_val, y_pred),2))

597.71

RMSE of option 1 is slightly lower so it is a better method to use.

#### Question 4
- Now let's train a regularized linear regression.
- For this question, fill the NAs with 0.
- Try different values of r from this list: [0, 0.01, 0.1, 1, 5, 10, 100].
- Use RMSE to evaluate the model on the validation dataset.
- Round the RMSE scores to 2 decimal digits.
- Which r gives the best RMSE?

In [44]:
def train_linear_regression_reg(X, y, r=0.01):
    ones = np.ones(X.shape[0])
    X = np.column_stack([ones, X])

    XTX = X.T.dot(X)
    XTX = XTX + r * np.eye(XTX.shape[0])
    
    XTX_inv = np.linalg.inv(XTX)
    w_full = XTX_inv.dot(X.T).dot(y)

    return w_full[0], w_full[1:]

In [51]:
X_train = prepare_X(df_train, fillna_value=0)
X_val = prepare_X(df_val, fillna_value = 0)

for r in [0, 0.01, 0.1, 1, 5, 10, 100]:
    w0, w = train_linear_regression_reg(X_train, y_train, r=r)
    y_pred = w0 + X_val.dot(w)
    print(r, float(round(rmse(y_val, y_pred),2)))

0 597.36
0.01 597.36
0.1 597.35
1 597.21
5 597.01
10 597.06
100 597.9


r=5 has the lowest RMSE score so it is the best value to use.

#### Question 5
- We used seed 42 for splitting the data. Let's find out how selecting the seed influences our score.
- Try different seed values: [0, 1, 2, 3, 4, 5, 6, 7, 8, 9].
- For each seed, do the train/validation/test split with 60%/20%/20% distribution.
- Fill the missing values with 0 and train a model without regularization.
- For each seed, evaluate the model on the validation dataset and collect the RMSE scores.
- What's the standard deviation of all the scores? To compute the standard deviation, use np.std.
- Round the result to 3 decimal digits (round(std, 3))

In [55]:
rmses = []

for seed in range(10):
    n = len(df)
    idx= np.arange(n)
    np.random.seed(seed)
    np.random.shuffle(idx)
    df_shuffled = df.iloc[idx]

    df_train = df_shuffled[:n_train].copy().reset_index(drop=True)
    df_val = df.iloc[idx[n_train:n_train + n_val]].copy().reset_index(drop=True)
    df_test = df.iloc[idx[n_train + n_val:]].copy().reset_index(drop=True)

    y_train = df_train['final_price']
    y_val = df_val['final_price']
    y_test = df_test['final_price']

    del df_train['final_price']
    del df_val['final_price']
    del df_test['final_price']

    X_train = prepare_X(df_train, fillna_value=0)
    X_val = prepare_X(df_val, fillna_value = 0)
    
    w0, w = train_linear_regression(X_train, y_train)
    y_pred = w0 + X_val.dot(w)

    result = rmse(y_val, y_pred)
    print(seed, float(round(result,2)))

    rmses.append(result)
    

0 565.45
1 636.8
2 588.96
3 597.81
4 571.96
5 573.24
6 647.34
7 550.44
8 587.33
9 576.1


In [60]:
# Calculate the standard deviation of all the RMSE score
float(round(np.std(rmses),3))

29.176

#### Question 6
- Split the dataset like previously, use seed 9.
- Combine train and validation datasets.
- Fill the missing values with 0 and train a model with r=0.001.
- What's the RMSE on the test dataset?

In [65]:
n = len(df)
idx= np.arange(n)
np.random.seed(9)
np.random.shuffle(idx)
df_shuffled = df.iloc[idx]

df_train = df_shuffled[:n_train].copy().reset_index(drop=True)
df_val = df.iloc[idx[n_train:n_train + n_val]].copy().reset_index(drop=True)
df_test = df.iloc[idx[n_train + n_val:]].copy().reset_index(drop=True)

y_train = df_train['final_price']
y_val = df_val['final_price']
y_test = df_test['final_price']

del df_train['final_price']
del df_val['final_price']
del df_test['final_price']

In [66]:
df_train_full = pd.concat([df_train, df_val])

In [67]:
# Combine the y_train and y_val 
y_train_full = np.concatenate([y_train, y_val])

In [68]:
X_train_full = prepare_X(df_train_full, fillna_value=0)
w0, w = train_linear_regression_reg(X_train_full, y_train_full, r=0.001)

X_test = prepare_X(df_test, fillna_value=0)
y_pred = w0 + X_test.dot(w)
rmse(y_test, y_pred)

np.float64(608.6099822049601)