In [8]:
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder

from jcopml.pipeline import num_pipe, cat_pipe
from jcopml.utils import save_model, load_model
from jcopml.plot import plot_missing_value
from jcopml.feature_importance import mean_score_decrease

In [9]:
# Load the training split of the Mobile Price Classification dataset.
df = pd.read_csv("data/Mobile Price Classification/train.csv")

# Label rows with a 1-based "id" index. The range is derived from the frame's
# length rather than a hard-coded 2001, so the cell keeps working if the CSV
# gains or loses rows (the fixed range raised a length-mismatch ValueError).
df.index = pd.RangeIndex(start=1, stop=len(df) + 1, name="id")
df.head()

Unnamed: 0_level_0,battery_power,blue,clock_speed,dual_sim,fc,four_g,int_memory,m_dep,mobile_wt,n_cores,...,px_height,px_width,ram,sc_h,sc_w,talk_time,three_g,touch_screen,wifi,price_range
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,842,0,2.2,0,1,0,7,0.6,188,2,...,20,756,2549,9,7,19,0,0,1,1
2,1021,1,0.5,1,0,1,53,0.7,136,3,...,905,1988,2631,17,3,7,1,1,0,2
3,563,1,0.5,1,2,1,41,0.9,145,5,...,1263,1716,2603,11,2,9,1,1,0,2
4,615,1,2.5,0,0,0,10,0.8,131,6,...,1216,1786,2769,16,8,11,1,0,0,2
5,1821,1,1.2,0,13,1,44,0.6,141,2,...,1208,1212,1411,8,2,15,1,1,0,1


In [10]:
# Separate the target column from the predictor columns.
y = df["price_range"]
X = df.drop(columns=["price_range"])

# Hold out 20% of the rows for testing; stratifying on the target keeps the
# class proportions the same in both parts, and the fixed seed makes the
# split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

# Display the shape of every part as a quick sanity check.
tuple(part.shape for part in (X_train, X_test, y_train, y_test))

((1600, 20), (400, 20), (1600,), (400,))

In [19]:
# List the predictor column names (the frame left after dropping the target),
# used as a reference when assigning columns to the preprocessing branches.
X.columns

Index(['battery_power', 'blue', 'clock_speed', 'dual_sim', 'fc', 'four_g',
       'int_memory', 'm_dep', 'mobile_wt', 'n_cores', 'pc', 'px_height',
       'px_width', 'ram', 'sc_h', 'sc_w', 'talk_time', 'three_g',
       'touch_screen', 'wifi'],
      dtype='object')

In [11]:
# Columns routed through the numeric branch (mean imputation only).
numeric_columns = [
    'battery_power', 'clock_speed', 'fc', 'int_memory', 'm_dep', 'mobile_wt',
    'n_cores', 'pc', 'px_height', 'px_width', 'ram', 'sc_h', 'sc_w',
    'talk_time',
]

# Columns routed through the categorical branch (mode imputation + one-hot).
categorical_columns = [
    'blue', 'dual_sim', 'four_g', 'three_g', 'touch_screen', 'wifi',
]

# Numeric branch: fill missing values with the column mean.
numerical_pipeline = Pipeline(steps=[
    ("impute", SimpleImputer(strategy="mean")),
])

# Categorical branch: fill missing values with the most frequent value,
# then one-hot encode.
categorical_pipeline = Pipeline(steps=[
    ("impute", SimpleImputer(strategy="most_frequent")),
    ("onehot", OneHotEncoder()),
])

# Apply each branch to its own column group; columns not listed in either
# group are not passed to a branch.
preprocessor = ColumnTransformer(transformers=[
    ('numeric', numerical_pipeline, numeric_columns),
    ("categorical", categorical_pipeline, categorical_columns),
])

In [15]:
# `gsp` was previously only imported by the later model-fitting cell, so this
# cell raised NameError on a fresh kernel (Restart & Run All). Import it here
# so the cell is self-sufficient.
from jcopml.tuning import grid_search_params as gsp

# Inspect the KNN hyper-parameter grid that jcopml provides.
gsp.knn_params

{'algo__n_neighbors': array([ 1,  3,  5,  7,  9, 11, 13, 15, 17, 19, 21, 23, 25, 27, 29]),
 'algo__weights': ['uniform', 'distance'],
 'algo__p': [1, 1.5, 2]}

In [16]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import GridSearchCV
from jcopml.tuning import grid_search_params as gsp

# Full workflow: the shared preprocessing step followed by a
# k-nearest-neighbours classifier.
pipeline = Pipeline(steps=[
    ('prep', preprocessor),
    ('algo', KNeighborsClassifier()),
])

# Search space: a single candidate (k = 5).
# NOTE(review): the cv_results_ table displayed later in the notebook has
# p/weights parameter columns and three CV splits, which this 1-candidate,
# 2-fold search cannot produce -- presumably a stale output from an earlier
# run with gsp.knn_params and cv=3; confirm which configuration is intended.
params = {
    'algo__n_neighbors': [5],
}

model = GridSearchCV(
    estimator=pipeline,
    param_grid=params,
    cv=2,
    n_jobs=-1,
    verbose=1,
)
model.fit(X_train, y_train)

# Report the winning parameters, then train / cross-validation / test scores.
print(model.best_params_)
train_score = model.score(X_train, y_train)
test_score = model.score(X_test, y_test)
print(train_score, model.best_score_, test_score)

Fitting 2 folds for each of 1 candidates, totalling 2 fits
{'algo__n_neighbors': 5}
0.94625 0.9043749999999999 0.935


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 out of   2 | elapsed:    0.0s finished


In [25]:
# Tabulate every grid-search candidate and show them best-ranked first.
cv_results = pd.DataFrame(model.cv_results_)
cv_results.sort_values(by="rank_test_score")

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_algo__n_neighbors,param_algo__p,param_algo__weights,params,split0_test_score,split1_test_score,split2_test_score,mean_test_score,std_test_score,rank_test_score
41,0.015621,5.150430e-07,0.015621,0.000000e+00,13,2,distance,"{'algo__n_neighbors': 13, 'algo__p': 2, 'algo_...",0.940075,0.924953,0.921201,0.928743,0.008158,1
35,0.020828,7.363897e-03,0.015621,1.946680e-07,11,2,distance,"{'algo__n_neighbors': 11, 'algo__p': 2, 'algo_...",0.936330,0.921201,0.926829,0.928120,0.006243,2
33,0.015622,1.123916e-07,0.093728,1.946680e-07,11,1.5,distance,"{'algo__n_neighbors': 11, 'algo__p': 1.5, 'alg...",0.938202,0.919325,0.926829,0.928119,0.007761,3
34,0.015622,2.247832e-07,0.031243,2.247832e-07,11,2,uniform,"{'algo__n_neighbors': 11, 'algo__p': 2, 'algo_...",0.932584,0.921201,0.926829,0.926871,0.004647,4
32,0.015621,1.123916e-07,0.104142,7.364010e-03,11,1.5,uniform,"{'algo__n_neighbors': 11, 'algo__p': 1.5, 'alg...",0.936330,0.919325,0.924953,0.926869,0.007073,5
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3,0.062484,0.000000e+00,0.052073,7.364459e-03,1,1.5,distance,"{'algo__n_neighbors': 1, 'algo__p': 1.5, 'algo...",0.904494,0.904315,0.889306,0.899372,0.007118,85
4,0.020829,7.364010e-03,0.036448,7.362830e-03,1,2,uniform,"{'algo__n_neighbors': 1, 'algo__p': 2, 'algo__...",0.900749,0.898687,0.893058,0.897498,0.003250,87
5,0.020827,7.363448e-03,0.005206,7.362998e-03,1,2,distance,"{'algo__n_neighbors': 1, 'algo__p': 2, 'algo__...",0.900749,0.898687,0.893058,0.897498,0.003250,87
1,0.062484,0.000000e+00,0.020831,7.363111e-03,1,1,distance,"{'algo__n_neighbors': 1, 'algo__p': 1, 'algo__...",0.898876,0.893058,0.889306,0.893747,0.003937,89
