In [46]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

In [47]:
# Read the laptop listings and normalise the headers to snake_case
# (lower-cased, spaces replaced by underscores).
df = pd.read_csv('laptops.csv')
df.columns = [col.lower().replace(' ', '_') for col in df.columns]
df

Unnamed: 0,laptop,status,brand,model,cpu,ram,storage,storage_type,gpu,screen,touch,final_price
0,ASUS ExpertBook B1 B1502CBA-EJ0436X Intel Core...,New,Asus,ExpertBook,Intel Core i5,8,512,SSD,,15.6,No,1009.00
1,Alurin Go Start Intel Celeron N4020/8GB/256GB ...,New,Alurin,Go,Intel Celeron,8,256,SSD,,15.6,No,299.00
2,ASUS ExpertBook B1 B1502CBA-EJ0424X Intel Core...,New,Asus,ExpertBook,Intel Core i3,8,256,SSD,,15.6,No,789.00
3,MSI Katana GF66 12UC-082XES Intel Core i7-1270...,New,MSI,Katana,Intel Core i7,16,1000,SSD,RTX 3050,15.6,No,1199.00
4,HP 15S-FQ5085NS Intel Core i5-1235U/16GB/512GB...,New,HP,15S,Intel Core i5,16,512,SSD,,15.6,No,669.01
...,...,...,...,...,...,...,...,...,...,...,...,...
2155,Razer Blade 17 FHD 360Hz Intel Core i7-11800H/...,Refurbished,Razer,Blade,Intel Core i7,16,1000,SSD,RTX 3060,17.3,No,2699.99
2156,Razer Blade 17 FHD 360Hz Intel Core i7-11800H/...,Refurbished,Razer,Blade,Intel Core i7,16,1000,SSD,RTX 3070,17.3,No,2899.99
2157,Razer Blade 17 FHD 360Hz Intel Core i7-11800H/...,Refurbished,Razer,Blade,Intel Core i7,32,1000,SSD,RTX 3080,17.3,No,3399.99
2158,Razer Book 13 Intel Evo Core i7-1165G7/16GB/1T...,Refurbished,Razer,Book,Intel Evo Core i7,16,1000,SSD,,13.4,Yes,1899.99


In [48]:
# Count the nulls per column, then show only the columns that have any.
missing_columns = df.isna().sum()
print(missing_columns.loc[missing_columns > 0])

storage_type      42
gpu             1371
screen             4
dtype: int64


In [49]:
# Median amount of RAM across the whole dataset.
ram_median = df.ram.median()
print("Median of RAM column:", ram_median)


Median of RAM column: 16.0


In [160]:
# Reproducible 60/20/20 train/validation/test split.
np.random.seed(9)

n = len(df)

n_val = int(0.2 * n)
n_test = int(0.2 * n)
n_train = n - (n_val + n_test)

# Shuffle the row positions, then carve consecutive slices out of the
# shuffled frame.
idx = np.arange(n)
np.random.shuffle(idx)

df_shuffled = df.iloc[idx]

df_train = df_shuffled.iloc[:n_train].copy()
df_val = df_shuffled.iloc[n_train:n_train + n_val].copy()
df_test = df_shuffled.iloc[n_train + n_val:].copy()

# Train + validation rows, re-indexed from 0, used to fit the final model.
df_combined = pd.concat([df_train, df_val], ignore_index=True)

# Every row must land in exactly one split.
assert len(df_train) + len(df_val) + len(df_test) == n

print("Size of combined dataset:", df_combined.shape)



Size of combined dataset: (1728, 12)


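Prices can vary a great deal, so a logarithmic function (`np.log1p`) can be applied to them. Note: the next cell currently keeps the raw prices as the target; the `log1p` variant is commented out.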

In [161]:
# Pull the target out of each split. `pop` returns the column and removes it
# from the frame in one step.
# Raw prices are used here; wrap each array in np.log1p(...) to train on
# log-prices instead.
y_train = df_train.pop('final_price').values
y_val = df_val.pop('final_price').values
y_test = df_test.pop('final_price').values
y_combined = df_combined.pop('final_price').values

Linear Regression

In [162]:
def train_linear_regression(X, y):
    """Fit ordinary least squares via the normal equation.

    A column of ones is prepended to ``X`` so the first learned weight acts
    as the intercept.

    Parameters
    ----------
    X : 2-D numpy array of features, one row per sample.
    y : 1-D numpy array of targets, one value per row of ``X``.

    Returns
    -------
    tuple
        ``(w_0, w)`` where ``w_0`` is the intercept and ``w`` holds one
        weight per feature column.

    Raises
    ------
    numpy.linalg.LinAlgError
        If the Gram matrix is singular and cannot be inverted.
    """
    n_rows = X.shape[0]
    design = np.column_stack([np.ones(n_rows), X])

    gram = np.dot(design.T, design)
    # Same multiplication order as (X^T X)^-1 X^T y, evaluated left to right.
    weights = np.dot(np.dot(np.linalg.inv(gram), design.T), y)

    bias, coefs = weights[0], weights[1:]
    return bias, coefs

Baseline solution

In [163]:
# Numeric columns selected as features for the baseline model.
base = [
    'ram',
    'storage',
    'screen',
]

In [164]:
# Median screen size over the full dataset.
screen_median = df.screen.median()
print(screen_median)

15.6


In [165]:
def prepare_X(df, fill_value=0):
    """Build the feature matrix from the columns listed in ``base``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing at least the columns named in the notebook-level
        ``base`` list. It is not modified.
    fill_value : scalar, default 0
        Replacement for missing values in the selected columns. Pass another
        value (for example a median computed beforehand) to try a different
        imputation strategy without editing this function.

    Returns
    -------
    numpy.ndarray
        2-D array with one row per row of ``df`` and one column per entry
        of ``base``.
    """
    df_num = df[base]
    # fillna returns a new frame, so the caller's data keeps its NaNs.
    df_num = df_num.fillna(fill_value)
    return df_num.values

In [166]:
# Fit the baseline on the training split and predict back on the same rows.
X_train = prepare_X(df_train)
w_0, w = train_linear_regression(X_train, y_train)
y_pred = np.dot(X_train, w) + w_0

In [167]:
def rmse(y, y_pred):
    """Return the root-mean-squared error between targets and predictions.

    Parameters
    ----------
    y : numpy array of true values.
    y_pred : numpy array of predicted values, same shape as ``y``.

    Returns
    -------
    The square root of the mean of the squared differences.
    """
    residual = y_pred - y
    return np.sqrt((residual ** 2).mean())

In [168]:
# Training RMSE, rounded to two decimals for reporting.
score = round(rmse(y_train, y_pred), 2)
score

591.38

Validation 

In [169]:
# Score the already-fitted weights on the validation split.
X_val = prepare_X(df_val)
y_pred = np.dot(X_val, w) + w_0
score = round(rmse(y_val, y_pred), 2)
score

576.1

Regularization

In [170]:
def train_linear_regression_reg(X, y, r=0.0):
    """Fit linear regression via the normal equation with a ridge term.

    A column of ones is prepended to ``X`` for the intercept, and ``r`` is
    added to every diagonal entry of the Gram matrix (including the entry
    for the intercept column) before it is inverted.

    Parameters
    ----------
    X : 2-D numpy array of features, one row per sample.
    y : 1-D numpy array of targets, one value per row of ``X``.
    r : float, default 0.0
        Amount added to the diagonal of ``X^T X``.

    Returns
    -------
    tuple
        ``(w_0, w)`` where ``w_0`` is the intercept and ``w`` holds one
        weight per feature column.

    Raises
    ------
    numpy.linalg.LinAlgError
        If the regularised Gram matrix is singular.
    """
    n_rows = X.shape[0]
    design = np.column_stack([np.ones(n_rows), X])

    gram = np.dot(design.T, design)
    gram = gram + r * np.eye(gram.shape[0])

    # Same multiplication order as (X^T X + rI)^-1 X^T y, left to right.
    weights = np.dot(np.dot(np.linalg.inv(gram), design.T), y)

    bias, coefs = weights[0], weights[1:]
    return bias, coefs

In [171]:
# Refit on train + validation with the chosen regularisation strength and
# report the RMSE on the held-out test split. Add more values to the list
# to sweep other strengths.
X_combined = prepare_X(df_combined)
X_test = prepare_X(df_test)

for r in [0.01]:
    w_0, w = train_linear_regression_reg(X_combined, y_combined, r=r)
    y_pred = w_0 + X_test.dot(w)
    score = round(rmse(y_test, y_pred), 2)
    print('%6s' % r, score)

  0.01 608.61
