### Load the training and test data

In [1]:
import os.path as path
import pandas as pd
import numpy as np

# Training split: the two price bounds are the targets, every other column is a
# (raw) feature. The CSV is read from the parent directory of the notebook.
train_data_path = path.join(path.pardir, 'new_train.csv')
data = pd.read_csv(train_data_path)
y_train = data[['min_price', 'max_price']]
X_train = data.drop(columns=['min_price', 'max_price'])

# Test split: same layout as the training file. Note that `data` is rebound here
# and refers to the test frame from this point on.
test_data_path = path.join(path.pardir, 'new_test.csv')
data = pd.read_csv(test_data_path)
y_test = data[['min_price', 'max_price']]
X_test = data.drop(columns=['min_price', 'max_price'])

X_train.head()

Unnamed: 0.1,Unnamed: 0,id,name,brand,base_name,screen_size,pixels_x,pixels_y,screen_surface,touchscreen,...,cpu_details,detachable_keyboard,discrete_gpu,gpu,os,os_details,ram,ssd,storage,weight
0,174,30511,Dell Inspiron 15 5593: 10th Gen Core i5-1035G1...,Dell,Dell Inspiron 5593,15.6,1920,1080,Matte,0,...,Intel Core i5-1035G1 1.0 GHz (10th gen Ice Lak...,0.0,0,Intel UHD,Windows,Windows 10 Home,8,256,256,4.52
1,231,22136,ASUS ROG Zephyrus GX501 Ultra Slim Gaming Lapt...,Asus,Asus ROG Zephyrus GX501,15.6,1920,1080,Matte,0,...,Intel Core i7-8750H 2.2 GHz (8th gen Coffee La...,0.0,1,NVIDIA GeForce GTX 1080,Windows,Windows 10 Pro,16,512,512,5.0
2,439,26958,Apple MacBook Air MJVE2LL/A 13-inch Laptop 1.6...,Apple,Apple MacBook Air 2015,13.3,1440,900,Glossy,0,...,Intel Core i5-5250U 1.6 GHz (5th gen Broadwell...,0.0,0,Intel HD 6000,macOS,OS X El Capitan,8,128,128,2.96
3,262,8282,"ASUS ZenBook UX305CA-EHM1 Laptop (Windows 10, ...",Asus,Asus ZenBook UX305CA-EHM1,13.3,1920,1080,Matte,0,...,Intel Core m3-6Y30 0.9 GHz (6th gen Skylake Du...,0.0,0,Intel HD,Windows,Windows 10,8,256,256,2.65
4,307,29844,"Lenovo 100E Chromebook 2ND Gen Laptop, 11.6"" H...",Lenovo,Lenovo 100e Chromebook (2nd Generation),11.6,1366,768,Matte,0,...,MediaTek MT8173C 2.1 GHz (Quad-Core),0.0,0,Imagination PowerVR GX6250,Chrome OS,Chrome OS,4,16,16,2.68


### Feature engineering pipeline

#### Add `screen_ratio` column

In [2]:
def add_screen_ratio_column(df):
    """Add a `screen_ratio` column (horizontal / vertical pixel count) to `df`.

    The frame is modified in place; nothing is returned.
    """
    horizontal = df['pixels_x']
    vertical = df['pixels_y']
    df['screen_ratio'] = horizontal.div(vertical)

# Try the helper on a copy: it writes the new column in place, and X_train
# should stay untouched until the full pipeline is run.
df = X_train.copy()
add_screen_ratio_column(df)
df['screen_ratio'].head()

0    1.777778
1    1.777778
2    1.600000
3    1.777778
4    1.778646
Name: screen_ratio, dtype: float64

#### Add `clock_speed` column

In [3]:
def add_clock_speed_column(df):
    """Add a numeric `clock_speed` column (in GHz) parsed from `cpu_details`.

    The first number immediately followed by " GHz" is extracted, with or
    without a decimal part (e.g. "2.2 GHz" -> 2.2, "2 GHz" -> 2.0). Rows whose
    `cpu_details` contain no such number get NaN.

    The frame is modified in place; nothing is returned.
    """
    # Raw string so `\d` is a regex escape rather than an (invalid) string escape.
    # The dot is escaped so it only matches a literal decimal point; left
    # unescaped it matches any character, so text like "3 2 GHz" was captured
    # as "3 2" and made pd.to_numeric raise.
    speed_text = df['cpu_details'].str.extract(r'(\d+(?:\.\d+)?) GHz', expand=False)
    df['clock_speed'] = pd.to_numeric(speed_text)

# Preview the parsed clock speeds on a scratch copy of the training features.
df = X_train.copy()
add_clock_speed_column(df)
df['clock_speed'].head()

0    1.0
1    2.2
2    1.6
3    0.9
4    2.1
Name: clock_speed, dtype: float64

#### Add `cpu_gen` column

In [4]:
def add_cpu_gen_column(df):
    """Add a numeric `cpu_gen` column parsed from `cpu_details`.

    The generation is the number in front of an ordinal suffix followed by
    " gen" (e.g. "10th gen" -> 10, "3rd gen" -> 3). Rows without such a marker
    get NaN.

    The frame is modified in place; nothing is returned.
    """
    # Raw string so `\d` is a regex escape. All four English ordinal suffixes
    # are accepted; matching only "th" silently dropped 1st/2nd/3rd generations.
    generation_text = df['cpu_details'].str.extract(r'(\d+)(?:st|nd|rd|th) gen', expand=False)
    df['cpu_gen'] = pd.to_numeric(generation_text)

df = X_train.copy()
# Total number of missing cells in the frame before the new column is added ...
print(df.isnull().values.sum())
add_cpu_gen_column(df)
# ... and after. Only one column was added, so the increase is the number of
# rows for which no CPU generation could be extracted.
print(df.isnull().values.sum())
df['cpu_gen'].head()

25
136


0    10.0
1     8.0
2     5.0
3     6.0
4     NaN
Name: cpu_gen, dtype: float64

####  Add `glossy_screen` column 

In [5]:
def add_glossy_screen_column(df):
    """Add a `glossy_screen` flag derived from `screen_surface`.

    The surface name is compared case-insensitively: "glossy" -> 1.0,
    "matte" -> 0.0. Anything else (empty string, missing value, or a surface
    name that is neither of the two) becomes NaN.

    The frame is modified in place; nothing is returned.
    """
    surface = df['screen_surface'].str.lower()
    # `map` (rather than `replace`) guarantees a float column: values absent from
    # the mapping turn into NaN instead of staying behind as strings in a column
    # that is later treated as a numeric boolean feature.
    df['glossy_screen'] = surface.map({'glossy': 1.0, 'matte': 0.0})

df = X_train.copy()
# Missing-cell count before and after adding the column; the difference is the
# number of rows whose glossy_screen value is missing.
print(df.isnull().values.sum())
add_glossy_screen_column(df)
print(df.isnull().values.sum())
df['glossy_screen'].head()

25
35


0    0.0
1    0.0
2    1.0
3    0.0
4    0.0
Name: glossy_screen, dtype: float64

#### Add `gpu_brand` column

In [6]:
def add_gpu_brand_column(df):
    """Add a `gpu_brand` column holding the first word of the `gpu` description.

    The brand "Imagination" is renamed to "PowerVR". The frame is modified in
    place; nothing is returned.
    """
    first_word = df['gpu'].str.split().str[0]
    df['gpu_brand'] = first_word
    df['gpu_brand'] = df['gpu_brand'].replace({'Imagination': 'PowerVR'})

df = X_train.copy()
# Missing-cell count before and after; the difference is the number of rows for
# which no GPU brand could be derived.
print(df.isnull().values.sum())
add_gpu_brand_column(df)
print(df.isnull().values.sum())
df['gpu_brand'].head()

25
26


0      Intel
1     NVIDIA
2      Intel
3      Intel
4    PowerVR
Name: gpu_brand, dtype: object

#### Add `hdd` column

In [7]:
def add_hdd_column(df):
    """Add an `hdd` column: the part of `storage` that is not accounted for by `ssd`.

    The frame is modified in place; nothing is returned.
    """
    non_ssd_capacity = df['storage'].sub(df['ssd'])
    df['hdd'] = pd.to_numeric(non_ssd_capacity)

# Preview the derived HDD capacity on a scratch copy of the training features.
df = X_train.copy()
add_hdd_column(df)
df['hdd'].head()

0    0
1    0
2    0
3    0
4    0
Name: hdd, dtype: int64

#### Keep only feature columns

In [8]:
# Feature names grouped by how they are preprocessed later on.
boolean_features = [
    'glossy_screen',
    'touchscreen',
    'detachable_keyboard',
    'discrete_gpu',
]
categorical_features = [
    'brand',
    'cpu',
    'cpu_gen',
    'gpu_brand',
    'os_details',
]
numerical_features = [
    'screen_size',
    'pixels_x',
    'screen_ratio',
    'clock_speed',
    'ram',
    'ssd',
    'hdd',
    'weight',
]
# Column order of the engineered frame: boolean, then categorical, then numerical.
all_features = [*boolean_features, *categorical_features, *numerical_features]

def keep_only_feature_columns(df):
    """Return a new frame with just the `all_features` columns, in that order.

    Empty strings are converted to NaN so that they count as missing values.
    """
    selected = df[all_features]
    return selected.replace({'': np.nan})
    

#### Define pipeline as a function

In [9]:
def engineer_features(df):
    """Run every feature-engineering step and return only the model's feature columns.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw laptop data containing the columns read by the `add_*_column` helpers.

    Returns
    -------
    pandas.DataFrame
        A new frame restricted to `all_features`, with empty strings replaced by NaN.
    """
    # Each helper below writes its column in place. Working on a copy keeps the
    # caller's frame unchanged, so the function can be called repeatedly on the
    # same raw input without side effects.
    df = df.copy()
    add_screen_ratio_column(df)
    add_clock_speed_column(df)
    add_cpu_gen_column(df)
    add_glossy_screen_column(df)
    add_gpu_brand_column(df)
    add_hdd_column(df)
    return keep_only_feature_columns(df)
    

#### Run the pipeline on the training data

In [10]:
# Replace the raw training frame with its engineered feature columns.
# Re-running this cell without reloading the data fails: after the first run
# X_train no longer has the raw columns (e.g. pixels_y) the helpers read.
X_train = engineer_features(X_train)
X_train.head()

Unnamed: 0,glossy_screen,touchscreen,detachable_keyboard,discrete_gpu,brand,cpu,cpu_gen,gpu_brand,os_details,screen_size,pixels_x,screen_ratio,clock_speed,ram,ssd,hdd,weight
0,0.0,0,0.0,0,Dell,Intel Core i5,10.0,Intel,Windows 10 Home,15.6,1920,1.777778,1.0,8,256,0,4.52
1,0.0,0,0.0,1,Asus,Intel Core i7,8.0,NVIDIA,Windows 10 Pro,15.6,1920,1.777778,2.2,16,512,0,5.0
2,1.0,0,0.0,0,Apple,Intel Core i5,5.0,Intel,OS X El Capitan,13.3,1440,1.6,1.6,8,128,0,2.96
3,0.0,0,0.0,0,Asus,Intel Core m3,6.0,Intel,Windows 10,13.3,1920,1.777778,0.9,8,256,0,2.65
4,0.0,0,0.0,0,Lenovo,MediaTek,,PowerVR,Chrome OS,11.6,1366,1.778646,2.1,4,16,0,2.68


### Prepare data for training

Now that we have defined the features we will use, we need to encode our categorical data using one-hot (dummy) encoding and deal with missing values. We define these transformations as a scikit-learn pipeline so that they are fitted on the training data only (learning, e.g., the median or most frequent value used to impute missing data) and then reused unchanged during testing and for our actual predictions.

In [11]:
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

# Categories are inferred from the training data ('auto'); handle_unknown='ignore'
# means a category that was not seen during fit does not raise at transform time.
# The stricter alternative -- explicit `categories`, drop='first',
# handle_unknown='error' -- would first require gathering all the possible values
# of every categorical feature.
one_hot_encoder = OneHotEncoder(handle_unknown='ignore', drop=None, categories='auto')

# Both imputers treat NaN as missing and also emit missing-value indicator columns.
median_imputer = SimpleImputer(strategy='median',
                               missing_values=np.nan,
                               add_indicator=True)
most_frequent_imputer = SimpleImputer(strategy='most_frequent',
                                      missing_values=np.nan,
                                      add_indicator=True)

# Numerical columns: median imputation only (StandardScaler is imported but not applied).
numeric_transformer = Pipeline([('imputer', median_imputer)])

# Categorical columns: fill gaps with the most frequent value, then one-hot encode.
categorical_transformer = Pipeline([
    ('imputer', most_frequent_imputer),
    ('onehot', one_hot_encoder),
])

# Boolean columns: fill gaps with the most frequent value.
boolean_transformer = Pipeline([('imputer', most_frequent_imputer)])

# Route each group of feature columns to its transformer.
preprocessor = ColumnTransformer([
    ('num', numeric_transformer, numerical_features),
    ('cat', categorical_transformer, categorical_features),
    ('bool', boolean_transformer, boolean_features),
])

### Train a regression model

In [12]:
# LogisticRegression (wrapped in MultiOutputRegressor) was tried first, but it
# rejected the continuous price targets
# ("ValueError: Unknown label type: 'continuous'").
# Random forests etc. don't need the MultiOutputRegressor wrapper for the
# two-column target.

from sklearn.ensemble import RandomForestRegressor

# random_state pins the forest's randomness so that a fresh "Restart & Run All"
# reproduces the same model and the same evaluation score.
# NOTE(review): newer scikit-learn releases spell this criterion "absolute_error";
# confirm against the installed version before upgrading.
regressor = RandomForestRegressor(n_estimators = 200, criterion = "mae", n_jobs = -1,
                                  random_state = 42)

# The step is named 'classifier' for historical reasons; it holds the regressor.
clf = Pipeline(steps=[('preprocessor', preprocessor),
                      ('classifier', regressor)])

clf.fit(X_train, y_train)

Pipeline(memory=None,
         steps=[('preprocessor',
                 ColumnTransformer(n_jobs=None, remainder='drop',
                                   sparse_threshold=0.3,
                                   transformer_weights=None,
                                   transformers=[('num',
                                                  Pipeline(memory=None,
                                                           steps=[('imputer',
                                                                   SimpleImputer(add_indicator=True,
                                                                                 copy=True,
                                                                                 fill_value=None,
                                                                                 missing_values=nan,
                                                                                 strategy='median',
                                                              

## Evaluation

### Prepare the test data

In [13]:
# Apply the same feature engineering to the test frame as was applied to the
# training frame, replacing the raw X_test with its feature columns.
X_test = engineer_features(X_test)
X_test.head()

Unnamed: 0,glossy_screen,touchscreen,detachable_keyboard,discrete_gpu,brand,cpu,cpu_gen,gpu_brand,os_details,screen_size,pixels_x,screen_ratio,clock_speed,ram,ssd,hdd,weight
0,1.0,1,0.0,0,Microsoft,Intel Core i5,7.0,Intel,Windows 10 S,13.5,2256,1.5,2.5,8,256,0,2.76
1,1.0,1,0.0,0,Dell,AMD A6,7.0,AMD,Windows 10 Home,11.6,1366,1.778646,1.6,4,32,0,3.41
2,1.0,1,1.0,1,Microsoft,Intel Core i7,8.0,NVIDIA,Windows 10 Pro,13.5,3000,1.5,1.9,16,512,0,3.62
3,1.0,1,0.0,1,Razer,Intel Core i7,8.0,NVIDIA,Windows 10 Home,13.3,3840,1.777778,1.8,16,512,0,3.04
4,1.0,0,,0,Apple,Intel Core i5,5.0,Intel,OS X Yosemite,11.6,1366,1.778646,1.6,4,128,0,2.38


### Evaluate the model

In [17]:
from sklearn.metrics import mean_absolute_error

# Mean absolute error of the fitted pipeline on the held-out test set; the
# targets are the two price columns (min_price, max_price) and a single number
# is reported for both.
mean_absolute_error(y_test, clf.predict(X_test))

152.42513615196071