In [1]:
import pandas as pd
import numpy as np

In [3]:
# Load the cleaned smartphone dataset. The path is relative, so it is resolved
# against the kernel's current working directory.
df = pd.read_csv("smartphone_cleaned_model.csv")

In [4]:
# Preview the first rows to sanity-check the loaded columns.
df.head()

Unnamed: 0,brand_names,model,price,rating,has_5g,has_nfc,has_ir_blaster,processor_brand,num_cores,processor_speed,...,screen_size,resolution,refresh_rate,num_rear_cameras,num_front_cameras,os,primary_camera_rear,primary_camera_front,extended_memory_available,extended_upto
0,oppo,OPPO A59 5G,14999,71.0,True,False,False,dimensity,8.0,2.2,...,6.56,720 x 1612,90.0,2.0,1.0,android,13.0,8.0,1.0,1024.0
1,xiaomi,Xiaomi Redmi 13C 5G,10999,74.0,True,False,False,dimensity,8.0,2.2,...,6.74,720 x 1600,90.0,2.0,1.0,android,50.0,5.0,1.0,1024.0
2,motorola,Motorola Edge 40 Neo,22999,84.0,True,True,False,dimensity,8.0,2.5,...,6.55,1080 x 2400,144.0,2.0,1.0,android,50.0,32.0,0.0,1024.0
3,motorola,Motorola Moto G54 5G,13999,84.0,True,False,False,dimensity,8.0,2.2,...,6.5,1080 x 2400,120.0,2.0,1.0,android,50.0,16.0,1.0,1024.0
4,oneplus,OnePlus Nord CE 3 5G,24679,84.0,True,True,True,snapdragon,8.0,2.7,...,6.7,1080 x 2412,120.0,3.0,1.0,android,50.0,16.0,1.0,1024.0


In [5]:
# Summarise column dtypes and non-null counts before any transformation.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 841 entries, 0 to 840
Data columns (total 25 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   brand_names                841 non-null    object 
 1   model                      841 non-null    object 
 2   price                      841 non-null    int64  
 3   rating                     841 non-null    float64
 4   has_5g                     841 non-null    bool   
 5   has_nfc                    841 non-null    bool   
 6   has_ir_blaster             841 non-null    bool   
 7   processor_brand            841 non-null    object 
 8   num_cores                  841 non-null    float64
 9   processor_speed            841 non-null    float64
 10  ram_capacity               841 non-null    float64
 11  internal_memory            841 non-null    float64
 12  battery_capacity           841 non-null    float64
 13  fast_charging_available    841 non-null    float64

In [6]:
# Remove the 'model' column before modelling. Rebinding `df` (instead of
# inplace=True) together with errors='ignore' keeps this cell idempotent:
# running it again once the column is gone no longer raises KeyError.
df = df.drop(columns=['model'], errors='ignore')

In [7]:
# Cast the boolean feature flags to integers (True -> 1, False -> 0).
for flag_column in ('has_5g', 'has_nfc', 'has_ir_blaster'):
    df[flag_column] = df[flag_column].astype(int)

In [8]:
# Re-check the schema after dropping 'model' and casting the boolean flags.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 841 entries, 0 to 840
Data columns (total 24 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   brand_names                841 non-null    object 
 1   price                      841 non-null    int64  
 2   rating                     841 non-null    float64
 3   has_5g                     841 non-null    int32  
 4   has_nfc                    841 non-null    int32  
 5   has_ir_blaster             841 non-null    int32  
 6   processor_brand            841 non-null    object 
 7   num_cores                  841 non-null    float64
 8   processor_speed            841 non-null    float64
 9   ram_capacity               841 non-null    float64
 10  internal_memory            841 non-null    float64
 11  battery_capacity           841 non-null    float64
 12  fast_charging_available    841 non-null    float64
 13  fast_charging              841 non-null    float64

In [9]:
from sklearn.model_selection import KFold, cross_val_score
from sklearn.linear_model import LinearRegression , Ridge, Lasso
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler, OrdinalEncoder
from sklearn.compose import ColumnTransformer
from sklearn.svm import SVR

from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error

from sklearn.decomposition import PCA
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, ExtraTreesRegressor, GradientBoostingRegressor, AdaBoostRegressor
from sklearn.neural_network import MLPRegressor
from xgboost import XGBRegressor
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import randint, uniform

In [14]:
# Separate the target from the features; the target is modelled on the
# log1p scale.
y = df['price']
X = df.drop(columns='price')
y_transformed = np.log1p(y)

In [15]:
# Columns routed to each branch of the preprocessing step.
numeric_features = [
    'rating', 'has_5g', 'num_cores', 'processor_speed', 'ram_capacity',
    'internal_memory', 'fast_charging', 'screen_size', 'refresh_rate',
    'primary_camera_rear', 'primary_camera_front',
]
categorical_features = ['brand_names', 'processor_brand', 'os', 'resolution']

# Standardise the numeric columns and one-hot encode the categorical ones.
# No `remainder` is passed, so columns of X that are not listed above are
# dropped by ColumnTransformer's default.
one_hot = OneHotEncoder(drop='first', sparse_output=False, handle_unknown='ignore')
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numeric_features),
        ('cat1', one_hot, categorical_features),
    ]
)

In [16]:
def scorer(model_name, model, n_splits=10, test_size=0.2, random_state=42):
    """Evaluate one regressor and return ``[model_name, mean_cv_r2, holdout_mae]``.

    The regressor is chained after the notebook-level ``preprocessor`` in a
    Pipeline and evaluated on the notebook-level ``X`` / ``y_transformed``:

    * R^2, averaged over a shuffled K-fold cross-validation on the log1p
      target scale.
    * Mean absolute error on a single train/test split, computed after
      undoing the log1p transform with ``np.expm1`` on both the predictions
      and the held-out targets.

    Parameters
    ----------
    model_name : str
        Label stored as the first element of the result.
    model : estimator
        Regressor used as the final pipeline step. The hold-out fit calls
        ``pipeline.fit`` on a pipeline that wraps this very object.
    n_splits : int, default 10
        Number of cross-validation folds.
    test_size : float, default 0.2
        Fraction of rows held out for the MAE estimate.
    random_state : int, default 42
        Seed used for both the K-fold shuffle and the train/test split.

    Returns
    -------
    list
        ``[model_name, mean cross-validated R^2, hold-out MAE]``.
    """
    pipeline = Pipeline([
        ('preprocessor', preprocessor),
        ('regressor', model),
    ])

    # Cross-validated R^2 on the transformed target.
    kfold = KFold(n_splits=n_splits, shuffle=True, random_state=random_state)
    cv_scores = cross_val_score(pipeline, X, y_transformed, cv=kfold, scoring='r2')

    # Single hold-out split for an error figure on the un-transformed scale.
    X_train, X_test, y_train, y_test = train_test_split(
        X, y_transformed, test_size=test_size, random_state=random_state
    )
    pipeline.fit(X_train, y_train)

    predicted = np.expm1(pipeline.predict(X_test))
    actual = np.expm1(y_test)
    holdout_mae = mean_absolute_error(actual, predicted)

    return [model_name, cv_scores.mean(), holdout_mae]

In [17]:
# Candidate regressors compared by `scorer`. Estimators with a stochastic
# fitting procedure get a fixed seed so that the comparison table can be
# reproduced on a fresh kernel; the remaining ones keep their defaults.
RANDOM_STATE = 42

model_dict = {
    'linear_reg': LinearRegression(),
    'svr': SVR(),
    'ridge': Ridge(),
    'LASSO': Lasso(),
    'decision tree': DecisionTreeRegressor(random_state=RANDOM_STATE),
    'random forest': RandomForestRegressor(random_state=RANDOM_STATE),
    'extra trees': ExtraTreesRegressor(random_state=RANDOM_STATE),
    'gradient boosting': GradientBoostingRegressor(random_state=RANDOM_STATE),
    'adaboost': AdaBoostRegressor(random_state=RANDOM_STATE),
    'mlp': MLPRegressor(random_state=RANDOM_STATE),
    'xgboost': XGBRegressor(random_state=RANDOM_STATE),
}

In [18]:
# Score every candidate; each entry is the list returned by `scorer`.
model_output = [
    scorer(label, estimator) for label, estimator in model_dict.items()
]









In [19]:
# Tabulate the per-model results and show them ordered by ascending MAE.
model_df = pd.DataFrame(data=model_output, columns=['name', 'r2', 'mae'])
model_df.sort_values(by='mae')

Unnamed: 0,name,r2,mae
7,gradient boosting,0.932513,2998.874582
6,extra trees,0.942687,3009.013172
10,xgboost,0.935924,3017.15017
5,random forest,0.92198,3029.222781
0,linear_reg,0.911322,4010.470946
4,decision tree,0.88061,4158.033241
2,ridge,0.917071,4167.81333
1,svr,0.907274,4264.476867
8,adaboost,0.868362,4722.909778
9,mlp,0.535043,7949.730181


# Hyperparameter tuning (grid search over ExtraTreesRegressor)

In [20]:
# Search space for the pipeline step named 'regressor' (hence the
# 'regressor__' prefix on every key): 3 * 4 * 3 * 3 * 2 = 216 combinations.
param_grid = dict(
    regressor__n_estimators=[100, 200, 300],
    regressor__max_depth=[None, 10, 20, 30],
    regressor__min_samples_split=[2, 5, 10],
    regressor__min_samples_leaf=[1, 2, 4],
    regressor__bootstrap=[True, False],
)


In [21]:
# Build a fresh preprocessor for the tuning pipeline, with the same column
# routing as the one used for the model comparison.
numeric_features = [
    'rating', 'has_5g', 'num_cores', 'processor_speed', 'ram_capacity',
    'internal_memory', 'fast_charging', 'screen_size', 'refresh_rate',
    'primary_camera_rear', 'primary_camera_front',
]
categorical_features = ['brand_names', 'processor_brand', 'os', 'resolution']

scale_numeric = ('num', StandardScaler(), numeric_features)
encode_categorical = (
    'cat1',
    OneHotEncoder(drop='first', sparse_output=False, handle_unknown='ignore'),
    categorical_features,
)
preprocessor = ColumnTransformer(transformers=[scale_numeric, encode_categorical])

In [22]:
 # Pipeline tuned by the grid search: preprocessing followed by an
 # ExtraTreesRegressor. The regressor is seeded so that repeated searches
 # score every candidate on identical forests and the selected
 # hyperparameters are reproducible.
 pipeline = Pipeline([
     ('preprocessor', preprocessor),
     ('regressor', ExtraTreesRegressor(random_state=42)),
 ])

In [23]:
# Exhaustive search over `param_grid`, scored by R^2 on shuffled 10-fold
# splits of the log1p-transformed target, using all available cores.
kfold = KFold(n_splits=10, random_state=42, shuffle=True)
search = GridSearchCV(
    estimator=pipeline,
    param_grid=param_grid,
    scoring='r2',
    cv=kfold,
    n_jobs=-1,
    verbose=4,
)
search.fit(X, y_transformed)

Fitting 10 folds for each of 216 candidates, totalling 2160 fits
