### Load the training data

In [1]:
import os.path as path
import pandas as pd
import numpy as np

def split_features_targets(data):
    """Separate the price targets from the remaining columns.

    Returns a ``(features, targets)`` tuple. ``features`` is ``data`` without
    the ``min_price`` and ``max_price`` columns; ``targets`` contains exactly
    those two columns, in that order. ``data`` itself is not modified.
    """
    target_columns = ['min_price', 'max_price']
    targets = data[target_columns]
    features = data.drop(columns=target_columns)
    return features, targets

# The training CSV is looked up one directory above the current working directory.
train_data_path = path.join(path.pardir, 'train.csv')
data = pd.read_csv(train_data_path)
# Split off the two price targets (min_price, max_price) from the feature columns.
X_train, y_train = split_features_targets(data)

# Preview the raw (not yet engineered) feature columns.
X_train.head()

Unnamed: 0,id,name,brand,base_name,screen_size,pixels_x,pixels_y,screen_surface,touchscreen,cpu,cpu_details,detachable_keyboard,discrete_gpu,gpu,os,os_details,ram,ssd,storage,weight
0,7774,Lenovo Flex 3 15.6-Inch Touchscreen Laptop (Co...,Lenovo,Lenovo Flex 3 80JM002CUS,15.6,1920,1080,Glossy,1,Intel Core i7,Intel Core i7-5500U 2.40 GHz (5th gen Broadwel...,0.0,0,Intel HD,Windows,Windows 10,8,0,1000,4.6
1,25926,Razer Blade 15 Gaming Laptop: Intel Core i7-87...,Razer,Razer Blade 15,15.6,1920,1080,Matte,0,Intel Core i7,Intel Core i7-8750H 2.2 GHz (8th gen Coffee La...,0.0,1,NVIDIA GeForce RTX 2070 Max-Q,Windows,Windows 10 Home,16,512,512,4.63
2,25267,Hp 15.6 Inch HD Thin and Light Laptop ( 7th Ge...,HP,HP,15.6,1366,768,,0,AMD A6,AMD A6-9220 2.5 GHz (7th gen Stoney Ridge Dual...,0.0,0,AMD Radeon R4,Windows,Windows 10,8,0,500,4.63
3,22367,"Acer Aspire E 15, 15.6"" Full HD, 8th Gen Intel...",Acer,Acer Aspire E5-576,15.6,1920,1080,Matte,0,Intel Core i3,Intel Core i3-8130U 2.2 GHz (8th gen Kaby Lake...,0.0,0,Intel UHD 620,Windows,Windows 10 Home,6,0,1000,5.3
4,17471,"HP 17.3 inch (1600 x 900) HD+ Laptop PC, Intel...",HP,HP 17,17.3,1600,900,Glossy,0,Intel Core i5,Intel Core i5-7200U 2.5 GHz (7th gen Kaby Lake...,0.0,0,Intel HD 620,Windows,Windows 10,8,0,1000,5.8


### Feature engineering pipeline

#### Add `screen_ratio` column

In [2]:
def add_screen_ratio_column(df):
    """Add a ``screen_ratio`` column (``pixels_x`` divided by ``pixels_y``).

    The column is written into ``df`` in place; nothing is returned.
    """
    width, height = df['pixels_x'], df['pixels_y']
    df['screen_ratio'] = width.div(height)

# Try the step on a scratch copy so that X_train itself is left untouched.
df = X_train.copy()
add_screen_ratio_column(df)
df['screen_ratio'].head()

0    1.777778
1    1.777778
2    1.778646
3    1.777778
4    1.777778
Name: screen_ratio, dtype: float64

#### Add `clock_speed` column

In [3]:
def add_clock_speed_column(df):
    """Add a numeric ``clock_speed`` column parsed from ``cpu_details``.

    The first number directly followed by `` GHz`` in ``cpu_details`` is
    taken as the clock speed. Both fractional (``2.40 GHz``) and whole
    (``3 GHz``) values are recognised. Rows without such a number (or with a
    missing ``cpu_details``) get NaN.

    The column is written into ``df`` in place; nothing is returned.
    """
    # Raw string: ``\d`` in a normal string literal is an invalid escape.
    # The dot is escaped so that it only matches a decimal point, and the
    # fractional part is optional so that integer speeds are not skipped or
    # mis-captured together with a preceding number (e.g. "i5 3 GHz").
    clock_speed = df['cpu_details'].str.extract(r'(\d+(?:\.\d+)?) GHz', expand=False)
    df['clock_speed'] = pd.to_numeric(clock_speed)

# Try the extraction on a scratch copy and preview the parsed clock speeds.
df = X_train.copy()
add_clock_speed_column(df)
df['clock_speed'].head()

0    2.4
1    2.2
2    2.5
3    2.2
4    2.5
Name: clock_speed, dtype: float64

#### Add `cpu_gen` column

In [4]:
def add_cpu_gen_column(df):
    """Add a numeric ``cpu_gen`` column parsed from ``cpu_details``.

    Looks for an ordinal followed by `` gen`` (``1st gen``, ``2nd gen``,
    ``3rd gen``, ``5th gen``, ...) and stores the number. Rows without such a
    pattern (or with a missing ``cpu_details``) get NaN.

    The column is written into ``df`` in place; nothing is returned.
    """
    # Accept every English ordinal suffix; matching only "th" would silently
    # drop 1st/2nd/3rd generation CPUs. Raw string avoids the invalid ``\d``
    # escape of a normal string literal.
    generation = df['cpu_details'].str.extract(r'(\d+)(?:st|nd|rd|th) gen', expand=False)
    df['cpu_gen'] = pd.to_numeric(generation)

df = X_train.copy()
# Total number of missing cells before the new column is added ...
print(df.isnull().values.sum())
add_cpu_gen_column(df)
# ... and after: the increase is the number of rows where no generation was parsed.
print(df.isnull().values.sum())
df['cpu_gen'].head()

32
170


0    5.0
1    8.0
2    7.0
3    8.0
4    7.0
Name: cpu_gen, dtype: float64

####  Add `glossy_screen` column 

In [5]:
def add_glossy_screen_column(df):
    """Add a ``glossy_screen`` flag derived from ``screen_surface``.

    ``glossy`` maps to 1.0 and ``matte`` to 0.0 (case-insensitive, ignoring
    surrounding whitespace). Every other value, including empty strings and
    missing entries, becomes NaN so that the column is always numeric
    (float64) and can be imputed later.

    The column is written into ``df`` in place; nothing is returned.
    """
    surface = df['screen_surface'].str.strip().str.lower()
    # ``map`` (unlike ``replace``) turns values without a mapping into NaN,
    # so an unexpected surface name cannot leak into the flag as a string.
    df['glossy_screen'] = surface.map({'glossy': 1.0, 'matte': 0.0})

df = X_train.copy()
# Total number of missing cells before the new column is added ...
print(df.isnull().values.sum())
add_glossy_screen_column(df)
# ... and after: the increase is the number of missing values in glossy_screen.
print(df.isnull().values.sum())
df['glossy_screen'].head()

32
44


0    1.0
1    0.0
2    NaN
3    0.0
4    1.0
Name: glossy_screen, dtype: float64

#### Add `gpu_brand` column

In [6]:
def add_gpu_brand_column(df):
    """Add a ``gpu_brand`` column holding the first word of ``gpu``.

    The brand name ``Imagination`` is renamed to ``PowerVR``. The column is
    written into ``df`` in place; nothing is returned.
    """
    brand_aliases = {'Imagination': 'PowerVR'}
    first_word = df['gpu'].str.split().str[0]
    df['gpu_brand'] = first_word.replace(brand_aliases)

df = X_train.copy()
# Total number of missing cells before the new column is added ...
print(df.isnull().values.sum())
add_gpu_brand_column(df)
# ... and after: the increase is the number of rows without a GPU brand.
print(df.isnull().values.sum())
df['gpu_brand'].head()

32
34


0     Intel
1    NVIDIA
2       AMD
3     Intel
4     Intel
Name: gpu_brand, dtype: object

#### Add `hdd` column

In [7]:
def add_hdd_column(df):
    """Add an ``hdd`` column: ``storage`` minus ``ssd``.

    Both inputs are converted to numbers *before* the subtraction, so the
    step also works when the CSV columns were read as strings. Values that
    cannot be parsed raise ``ValueError`` (the ``pd.to_numeric`` default).

    The column is written into ``df`` in place; nothing is returned.
    """
    # Converting after the subtraction would be too late: ``str - str``
    # already fails before ``to_numeric`` gets a chance to run.
    storage = pd.to_numeric(df['storage'])
    ssd = pd.to_numeric(df['ssd'])
    df['hdd'] = storage - ssd

# Try the step on a scratch copy and preview the derived hdd values.
df = X_train.copy()
add_hdd_column(df)
df['hdd'].head()

0    1000
1       0
2     500
3    1000
4    1000
Name: hdd, dtype: int64

#### Keep only feature columns

In [8]:
# Feature columns, grouped by the kind of preprocessing each group receives.
boolean_features = [
    'glossy_screen',
    'touchscreen',
    'detachable_keyboard',
    'discrete_gpu',
]
categorical_features = [
    'brand',
    'cpu',
    'gpu_brand',
    'os_details',
]
numerical_features = [
    'screen_size',
    'pixels_x',
    'screen_ratio',
    'clock_speed',
    'cpu_gen',
    'ram',
    'ssd',
    'hdd',
    'weight',
]
all_features = [*boolean_features, *categorical_features, *numerical_features]

def keep_only_feature_columns(df):
    """Return a new frame with only the ``all_features`` columns.

    Columns appear in the order of ``all_features``; empty strings are
    replaced by NaN. ``df`` itself is not modified.
    """
    selected = df[all_features]
    return selected.replace({'': np.nan})
    

#### Define pipeline as a function

In [9]:
def engineer_features(df):
    """Run every feature-engineering step and return only the model features.

    Adds the ``screen_ratio``, ``clock_speed``, ``cpu_gen``,
    ``glossy_screen``, ``gpu_brand`` and ``hdd`` columns and then reduces the
    frame with ``keep_only_feature_columns``.

    The individual steps write their columns in place, so they are applied to
    a copy: the frame passed in by the caller is left unchanged.
    """
    # Work on a copy so that calling this function has no side effect on the
    # caller's DataFrame (e.g. the raw data that was displayed earlier).
    df = df.copy()
    add_screen_ratio_column(df)
    add_clock_speed_column(df)
    add_cpu_gen_column(df)
    add_glossy_screen_column(df)
    add_gpu_brand_column(df)
    add_hdd_column(df)
    return keep_only_feature_columns(df)
    

#### Run the pipeline on the training data

In [10]:
# NOTE: this rebinds X_train to the engineered frame, so the cell is not
# idempotent: the result no longer contains raw columns such as pixels_y or
# cpu_details that engineer_features reads, and running the cell a second time
# on the same kernel fails. Re-run the loading cell first if needed.
X_train = engineer_features(X_train)
X_train.head()

Unnamed: 0,glossy_screen,touchscreen,detachable_keyboard,discrete_gpu,brand,cpu,gpu_brand,os_details,screen_size,pixels_x,screen_ratio,clock_speed,cpu_gen,ram,ssd,hdd,weight
0,1.0,1,0.0,0,Lenovo,Intel Core i7,Intel,Windows 10,15.6,1920,1.777778,2.4,5.0,8,0,1000,4.6
1,0.0,0,0.0,1,Razer,Intel Core i7,NVIDIA,Windows 10 Home,15.6,1920,1.777778,2.2,8.0,16,512,0,4.63
2,,0,0.0,0,HP,AMD A6,AMD,Windows 10,15.6,1366,1.778646,2.5,7.0,8,0,500,4.63
3,0.0,0,0.0,0,Acer,Intel Core i3,Intel,Windows 10 Home,15.6,1920,1.777778,2.2,8.0,6,0,1000,5.3
4,1.0,0,0.0,0,HP,Intel Core i5,Intel,Windows 10,17.3,1600,1.777778,2.5,7.0,8,0,1000,5.8


### Prepare data for training

Now that we have defined the features we will use, we need to encode our categorical data using dummy (one-hot) encoding and also deal with missing values in our data. We define these transformations as a scikit-learn pipeline, so that we can fit it to the data while training (and have it learn, e.g., the most frequent value that we will use for imputing missing data) and reuse it during testing and for our actual predictions.

In [11]:
from sklearn.preprocessing import StandardScaler, OneHotEncoder
# from sklearn.experimental import enable_iterative_imputer 
from sklearn.impute import SimpleImputer, KNNImputer #, IterativeImputer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

# one_hot_encoder = OneHotEncoder(categories = *Not "auto"*, drop = 'first', handle_unknown = 'error')
# To use the above setting, we must first gather all the possible values for our categorical features
# Current setting: categories are determined from the data the encoder is fitted on,
# no category is dropped, and values not seen during fit are ignored instead of raising.
one_hot_encoder = OneHotEncoder(categories = 'auto', drop = None, handle_unknown = 'ignore')
# add_indicator = True makes the imputer also output missing-value indicator columns.
most_frequent_imputer = SimpleImputer(missing_values = np.nan, strategy = 'most_frequent', 
                                             add_indicator = True)
# median_imputer = SimpleImputer(missing_values = np.nan, strategy = 'median', 
#                                       add_indicator = True)
# iterative_imputer = IterativeImputer(missing_values = np.nan, add_indicator = True)
knn_imputer = KNNImputer(missing_values = np.nan, add_indicator = True)

# Numerical columns: KNN imputation only (scaling is currently disabled).
numeric_transformer = Pipeline(steps=[
    ('imputer', knn_imputer),
#     ('scaler', StandardScaler())
])

# Categorical columns: fill gaps with the most frequent value, then one-hot encode.
categorical_transformer = Pipeline(steps=[
    ('imputer', most_frequent_imputer),
    ('onehot', one_hot_encoder)
])

# Boolean columns: fill gaps with the most frequent value.
boolean_transformer = Pipeline(steps=[
    ('imputer', most_frequent_imputer)
])

# Route each group of feature columns to its transformer.
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numerical_features),
        ('cat', categorical_transformer, categorical_features),
        ('bool', boolean_transformer, boolean_features)
    ])

### Train a regressor

In [12]:
# LogisticRegression was tried first and rejected: it is a classifier and fails on
# the continuous price targets (ValueError: Unknown label type: 'continuous').

from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.multioutput import MultiOutputRegressor

# Fixed seed so that re-running the notebook reproduces the same fitted model.
RANDOM_STATE = 42

# Alternative that was tried (random forests handle multiple targets natively and
# therefore do not need the MultiOutputRegressor wrapper):
# regressor = RandomForestRegressor(n_estimators = 200, criterion = "mae", n_jobs = -1)

# NOTE(review): criterion = "mae" is only accepted by older scikit-learn releases
# (it was deprecated and later removed); confirm against the installed version
# before upgrading. It is kept unchanged here so the model definition stays the same.
base_regressor = GradientBoostingRegressor(n_estimators = 100, criterion = "mae", max_depth = 6,
                                           random_state = RANDOM_STATE)
# One copy of the base regressor is fitted per target column (min_price, max_price).
regressor = MultiOutputRegressor(base_regressor, n_jobs = -1)

# The last step keeps its historical name 'classifier' although it holds a regressor.
clf = Pipeline(steps=[('preprocessor', preprocessor),
                      ('classifier', regressor)])

clf.fit(X_train, y_train)

Pipeline(memory=None,
         steps=[('preprocessor',
                 ColumnTransformer(n_jobs=None, remainder='drop',
                                   sparse_threshold=0.3,
                                   transformer_weights=None,
                                   transformers=[('num',
                                                  Pipeline(memory=None,
                                                           steps=[('imputer',
                                                                   KNNImputer(add_indicator=True,
                                                                              copy=True,
                                                                              metric='nan_euclidean',
                                                                              missing_values=nan,
                                                                              n_neighbors=5,
                                                                          

## Prediction time!

If we are satisfied by the results of our evaluation, we are ready to make some predictions on the unknown targets.

### Load the prediction data

In [13]:
# The prediction CSV is looked up one directory above the current working directory.
prediction_data_path = path.join(path.pardir, 'test.csv')
# NOTE: this rebinds `data`, which held the training CSV until now.
data = pd.read_csv(prediction_data_path)
data.head()

Unnamed: 0,id,name,brand,base_name,screen_size,pixels_x,pixels_y,screen_surface,touchscreen,cpu,cpu_details,detachable_keyboard,discrete_gpu,gpu,os,os_details,ram,ssd,storage,weight
0,28807,ASUS ZenBook Flip S Touchscreen Convertible La...,Asus,Asus ZenBook Flip S UX370UA,13.3,1920.0,1080.0,Glossy,1,Intel Core i7,Intel Core i7-8550U 1.8 GHz (8th gen Kaby Lake...,0,0,Intel UHD 620,Windows,Windows 10 Pro,16,512,512,2.42
1,22559,Dell Inspiron 15 Intel Core i3-7130U 8GB 1TB H...,Dell,Dell Inspiron 3567,15.6,1366.0,768.0,Matte,0,Intel Core i3,Intel Core i3-7130U 2.7 GHz (7th gen Kaby Lake...,0,0,Intel HD 620,Windows,Windows 10 Home,8,0,1000,4.95
2,28647,Asus Vivobook S15 S512 Thin and Light 15.6â€ ...,Asus,Asus VivoBook S512FA,15.6,1920.0,1080.0,Matte,0,Intel Core i7,Intel Core i7-8565U 1.8 GHz (8th gen Whiskey L...,0,0,Intel UHD 620,Windows,Windows 10 Home,8,256,1256,4.0
3,22141,"HP 15.6"" HD Touchscreen Laptop PC, Intel Core ...",HP,HP,15.6,1366.0,768.0,Glossy,1,Intel Core i5,Intel Core i5-7200U 2.5 GHz (7th gen Kaby Lake...,0,0,Intel HD 620,Windows,Windows 10 Home,8,128,2128,4.52
4,26116,"MSI GS75 Stealth-093 17.3"" Razor Thin Bezel Ga...",MSI,MSI GS75 Stealth,17.3,1920.0,1080.0,Matte,0,Intel Core i7,Intel Core i7-8750H 2.2 GHz (8th gen Coffee La...,0,1,NVIDIA GeForce RTX 2080 Max-Q,Windows,Windows 10 Home,32,512,512,4.96


### Prepare the prediction data

In [14]:
# Apply the same feature-engineering steps that were used for the training data.
X_pred = engineer_features(data)
X_pred.head()

Unnamed: 0,glossy_screen,touchscreen,detachable_keyboard,discrete_gpu,brand,cpu,gpu_brand,os_details,screen_size,pixels_x,screen_ratio,clock_speed,cpu_gen,ram,ssd,hdd,weight
0,1.0,1,0,0,Asus,Intel Core i7,Intel,Windows 10 Pro,13.3,1920.0,1.777778,1.8,8.0,16,512,0,2.42
1,0.0,0,0,0,Dell,Intel Core i3,Intel,Windows 10 Home,15.6,1366.0,1.778646,2.7,7.0,8,0,1000,4.95
2,0.0,0,0,0,Asus,Intel Core i7,Intel,Windows 10 Home,15.6,1920.0,1.777778,1.8,8.0,8,256,1000,4.0
3,1.0,1,0,0,HP,Intel Core i5,Intel,Windows 10 Home,15.6,1366.0,1.778646,2.5,7.0,8,128,2000,4.52
4,0.0,0,0,1,MSI,Intel Core i7,NVIDIA,Windows 10 Home,17.3,1920.0,1.777778,2.2,8.0,32,512,0,4.96


### Predict targets

In [15]:
# The fitted pipeline returns one column per target, in the order of y_train's
# columns (min_price, max_price).
y_pred = clf.predict(X_pred)
# Wrap the raw array in a frame with the output column names MIN and MAX.
y_pred = pd.DataFrame({'MIN': y_pred[:, 0], 'MAX': y_pred[:, 1]})
y_pred.head()

Unnamed: 0,MIN,MAX
0,1059.011824,1219.116489
1,350.045737,371.562607
2,804.5261,790.021957
3,534.080469,583.093171
4,1389.70746,1587.241108


### "Fix" values

* No min value should be larger than the corresponding max value.
* We should round the result to the first or second decimal point.

In [16]:
# The two targets are predicted independently, so a predicted minimum can end up
# above its maximum. Wherever MIN <= MAX does not hold, MIN falls back to MAX.
min_is_valid = y_pred['MIN'].le(y_pred['MAX'])
y_pred['MIN'] = y_pred['MIN'].where(min_is_valid, y_pred['MAX'])
# Round both price columns to two decimals.
y_pred = y_pred.round(2)
y_pred.head()

Unnamed: 0,MIN,MAX
0,1059.01,1219.12
1,350.05,371.56
2,790.02,790.02
3,534.08,583.09
4,1389.71,1587.24


#### Write results to a CSV file

In [17]:
# The assignment aligns on the index: `data` (read_csv without index_col) and
# `y_pred` (built from plain arrays) both carry the default 0..n-1 index.
y_pred['ID'] = data['id']
# Put the columns into the output order ID, MIN, MAX.
y_pred = y_pred[['ID', 'MIN', 'MAX']]
y_pred.head()

Unnamed: 0,ID,MIN,MAX
0,28807,1059.01,1219.12
1,22559,350.05,371.56
2,28647,790.02,790.02
3,22141,534.08,583.09
4,26116,1389.71,1587.24


In [18]:
# Write the predictions to the current working directory, without the index column.
y_pred.to_csv('predictions_whole_training_set.csv', index = False)