In [1]:
# Dataset

In [None]:
# Download the laptops dataset CSV into the current working directory (needs wget and network access).
!wget https://raw.githubusercontent.com/alexeygrigorev/datasets/master/laptops.csv

In [3]:
import pandas as pd
# Load the previously downloaded laptops CSV into a DataFrame.
df = pd.read_csv(filepath_or_buffer="laptops.csv")

In [4]:
# Preparing the dataset

In [5]:
# Normalise column names to lower snake_case, then keep only the columns
# used in the rest of the notebook (three features plus the target).
df.columns = [name.lower().replace(' ', '_') for name in df.columns]
df = df.loc[:, ["ram", "storage", "screen", "final_price"]]

In [6]:
# Question 1

In [7]:
# Names of the columns that contain at least one missing value.
[column for column in df.columns if df[column].isna().any()]

['screen']

In [8]:
# Question 2

In [9]:
# Median RAM read from the 50th percentile of the summary statistics.
int(df["ram"].describe()["50%"])

16

In [10]:
# Same median, computed directly as a cross-check of the describe() value.
int(df.ram.median())

16

In [11]:
# Prepare and split the dataset

In [12]:
import numpy as np

In [13]:
# Split sizes: 20% validation, 20% test, and whatever remains for training.
n = len(df)
n_val = int(n * 0.2)
n_test = int(n * 0.2)
n_train = n - n_val - n_test

# Shuffle the row positions with a fixed seed so the split is reproducible.
np.random.seed(42)
idx = np.arange(n)
np.random.shuffle(idx)

# Slice the shuffled positions into the three partitions and give each a
# fresh 0..k-1 index.
df_train = df.iloc[idx[:n_train]].reset_index(drop=True)
df_val = df.iloc[idx[n_train:n_train + n_val]].reset_index(drop=True)
df_test = df.iloc[idx[n_train + n_val:]].reset_index(drop=True)

# Pull the target out of each partition; pop() returns the column and removes
# it from the frame so the remaining columns are features only.
y_train = df_train.pop('final_price').values
y_val = df_val.pop('final_price').values
y_test = df_test.pop('final_price').values

In [14]:
# Question 3

In [15]:
def train_linear_regression(X, y):
    """Fit a linear regression with the normal equation.

    A column of ones is prepended to ``X`` so that the first solved weight
    plays the role of the intercept.

    Parameters
    ----------
    X : numpy array of features, one row per sample (``X.shape[0]`` rows).
    y : target values, one per row of ``X``.

    Returns
    -------
    tuple
        ``(w0, w)`` where ``w0`` is the intercept and ``w`` holds the
        remaining weights, one per column of ``X``.

    Raises
    ------
    numpy.linalg.LinAlgError
        If the Gram matrix cannot be inverted (raised by ``np.linalg.inv``).
    """
    bias_column = np.ones(X.shape[0])
    design = np.column_stack((bias_column, X))

    gram_inverse = np.linalg.inv(np.dot(design.T, design))
    # Keep the (inverse . X^T) . y association of the original computation.
    weights = np.dot(np.dot(gram_inverse, design.T), y)

    intercept, coefficients = weights[0], weights[1:]
    return intercept, coefficients


def rmse(y, y_pred):
    """Return the root mean squared error between ``y`` and ``y_pred``.

    Both arguments are expected to support element-wise subtraction and
    multiplication and a ``.mean()`` method (e.g. numpy arrays).
    """
    residuals = y - y_pred
    mean_squared_error = (residuals * residuals).mean()
    return np.sqrt(mean_squared_error)

In [16]:
# Training rows that have a missing value in any column; their index labels
# are kept so the same rows can be inspected again after imputation.
rows_with_missing_values = df_train.loc[df_train.isna().any(axis="columns")]
print(rows_with_missing_values)
indexes_of_rows_with_missing_values = rows_with_missing_values.index.to_list()

      ram  storage  screen
124    16      512     NaN
1062   16      512     NaN
1238    8      256     NaN


In [17]:
# filling missing values with 0

def prepare_X(df):
    """Return the frame's values as a numpy array with missing entries set to 0.

    ``fillna`` builds a new frame, so the caller's ``df`` is left untouched.
    """
    return df.fillna(0).values

# Build both feature matrices with the zero-filling prepare_X.
X_train = prepare_X(df_train)
X_val = prepare_X(df_val)

# Show the previously incomplete training rows to confirm the imputation.
print("Rows with missing values after filling them:")
print(X_train[indexes_of_rows_with_missing_values])

# Fit on the training split and score on the validation split.
w0, w = train_linear_regression(X_train, y_train)
y_pred = np.dot(X_val, w) + w0

rmse_with_zero = float(round(rmse(y_val, y_pred), 2))
print(f"\nRMSE: {rmse_with_zero}")

Rows with missing values after filling them:
[[ 16. 512.   0.]
 [ 16. 512.   0.]
 [  8. 256.   0.]]

RMSE: 597.36


In [18]:
# filling missing values with mean

# Imputation value: mean screen size computed from the training split only.
df_train_screen_mean = df_train["screen"].mean()

def prepare_X(df):
    """Return the frame's values with missing ``screen`` entries mean-filled.

    Only the ``screen`` column is imputed, using the training-split screen
    mean. A scalar ``fillna`` would also write the screen mean into missing
    values of unrelated columns (e.g. ``ram``), which is never the intended
    value for them; the per-column dict form avoids that.

    ``fillna`` builds a new frame, so the caller's ``df`` is left untouched.
    """
    return df.fillna({"screen": df_train_screen_mean}).values

# Build both feature matrices with the mean-filling prepare_X.
X_train = prepare_X(df_train)
X_val = prepare_X(df_val)

# Show the previously incomplete training rows to confirm the imputation.
print("Rows with missing values after filling them:")
print(X_train[indexes_of_rows_with_missing_values])

# Fit on the training split and score on the validation split.
w0, w = train_linear_regression(X_train, y_train)
y_pred = np.dot(X_val, w) + w0

rmse_with_mean = float(round(rmse(y_val, y_pred), 2))
print(f"\nRMSE: {rmse_with_mean}")

Rows with missing values after filling them:
[[ 16.         512.          15.16353442]
 [ 16.         512.          15.16353442]
 [  8.         256.          15.16353442]]

RMSE: 600.27


In [19]:
# Report which imputation strategy gave the lower (rounded) validation RMSE.
if rmse_with_zero < rmse_with_mean:
    print("With 0")
elif rmse_with_zero > rmse_with_mean:
    print("With mean")
else:
    print("Both are equally good")

With 0


In [20]:
# Question 4

In [21]:
def train_linear_regression_reg(X, y, r=0.001):
    """Fit a linear regression with the regularised normal equation.

    A column of ones is prepended to ``X`` and ``r`` is added to every
    diagonal entry of the Gram matrix (including the one belonging to the
    intercept column) before it is inverted.

    Parameters
    ----------
    X : numpy array of features, one row per sample (``X.shape[0]`` rows).
    y : target values, one per row of ``X``.
    r : amount added to the Gram matrix diagonal; defaults to 0.001.

    Returns
    -------
    tuple
        ``(w0, w)`` where ``w0`` is the intercept and ``w`` holds the
        remaining weights, one per column of ``X``.
    """
    bias_column = np.ones(X.shape[0])
    design = np.column_stack((bias_column, X))

    gram = np.dot(design.T, design)
    regularised_gram = gram + r * np.identity(gram.shape[0])

    gram_inverse = np.linalg.inv(regularised_gram)
    # Keep the (inverse . X^T) . y association of the original computation.
    weights = np.dot(np.dot(gram_inverse, design.T), y)

    intercept, coefficients = weights[0], weights[1:]
    return intercept, coefficients

def prepare_X(df):
    """Return the frame's values as a numpy array with missing entries set to 0.

    ``fillna`` builds a new frame, so the caller's ``df`` is left untouched.
    """
    return df.fillna(0).values

In [22]:
# Candidate regularisation strengths to compare on the validation split.
rs = [0, 0.01, 1, 10, 100]
scores = []

# The feature matrices do not depend on r, so build them once instead of
# recomputing them on every iteration of the sweep.
X_train = prepare_X(df_train)
X_val = prepare_X(df_val)

for r in rs:
    w0, w = train_linear_regression_reg(X_train, y_train, r=r)

    y_pred = w0 + X_val.dot(w)
    scores.append(rmse(y_val, y_pred))

# One row per r with its validation RMSE, rounded to two decimals for display.
print(round(pd.DataFrame({"r": rs, "score": scores}), 2))

        r   score
0    0.00  597.36
1    0.01  597.36
2    1.00  597.21
3   10.00  597.06
4  100.00  597.90


In [23]:
# Row (r and unrounded score) with the lowest validation RMSE; on ties the
# first such row is kept.
r_scores = pd.DataFrame({"r": rs, "score": scores})
r_scores.nsmallest(1, "score").iloc[0]

r         10.000000
score    597.058768
Name: 3, dtype: float64

In [24]:
# Question 5

In [25]:
def split_data(df, seed):
    """Shuffle ``df`` with ``seed`` and split it 60/20/20 into train/val/test.

    The validation and test parts each get ``int(len(df) * 0.2)`` rows and the
    training part gets the remainder. The ``final_price`` column is removed
    from each part and returned separately as a numpy array. Each part gets a
    fresh 0..k-1 index.

    Note: this seeds numpy's global random state via ``np.random.seed``.

    Returns
    -------
    tuple
        ``(df_train, df_val, df_test, y_train, y_val, y_test)``.
    """
    total = len(df)
    holdout = int(total * 0.2)  # size of the validation part and of the test part
    n_train = total - 2 * holdout

    order = np.arange(total)
    np.random.seed(seed)
    np.random.shuffle(order)

    # (start, stop) positions of train, validation and test within `order`.
    boundaries = [
        (0, n_train),
        (n_train, n_train + holdout),
        (n_train + holdout, total),
    ]

    frames = []
    targets = []
    for start, stop in boundaries:
        part = df.iloc[order[start:stop]].reset_index(drop=True)
        # pop() hands back the target column and drops it from the features.
        targets.append(part.pop('final_price').values)
        frames.append(part)

    return (*frames, *targets)

In [26]:
def prepare_X(df):
    """Return the frame's values as a numpy array with missing entries set to 0.

    ``fillna`` builds a new frame, so the caller's ``df`` is left untouched.
    """
    return df.fillna(0).values

In [27]:
# Repeat the split/fit/score cycle for seeds 0..9 to see how much the
# validation RMSE depends on the random split.
seeds = list(range(10))
scores = []

for seed in seeds:
    df_train, df_val, df_test, y_train, y_val, y_test = split_data(df, seed)

    X_train = prepare_X(df_train)
    X_val = prepare_X(df_val)

    w0, w = train_linear_regression(X_train, y_train)
    y_pred = np.dot(X_val, w) + w0
    scores.append(rmse(y_val, y_pred))

print(pd.DataFrame({"seed": seeds, "score": scores}).round(2))

   seed   score
0     0  565.45
1     1  636.80
2     2  588.96
3     3  597.81
4     4  571.96
5     5  573.24
6     6  647.34
7     7  550.44
8     8  587.33
9     9  576.10


In [28]:
# Spread of the validation RMSE across seeds, rounded to three decimals.
float(round(np.std(scores), 3))

29.176

In [29]:
# Question 6

In [30]:
def prepare_X(df):
    """Return the frame's values as a numpy array with missing entries set to 0.

    ``fillna`` builds a new frame, so the caller's ``df`` is left untouched.
    """
    return df.fillna(0).values

In [31]:
# Re-split with seed 9, then merge train and validation into one training set.
df_train, df_val, df_test, y_train, y_val, y_test = split_data(df, 9)

df_full_train = pd.concat([df_train, df_val], ignore_index=True)
y_full_train = np.concatenate((y_train, y_val))

X_full_train = prepare_X(df_full_train)
X_test = prepare_X(df_test)

# Fit the regularised model (r=0.001) on train+val and score it on the test split.
w0, w = train_linear_regression_reg(X_full_train, y_full_train, r=0.001)
y_pred = np.dot(X_test, w) + w0

score = rmse(y_test, y_pred)
float(score)

608.609982204956