In [2]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.model_selection import RandomizedSearchCV
from sklearn.svm import SVR
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.neighbors import KNeighborsRegressor
from xgboost import XGBRegressor
from sklearn.ensemble import AdaBoostRegressor
from sklearn.metrics import r2_score,mean_squared_error,mean_absolute_error
import joblib

In [3]:
dataset=pd.read_csv("C:\\Users\\Nasr\\Downloads\\SpecsData.csv")
dataset.head(10)

Unnamed: 0,Phone Company,Model Name,RAM,Storage,Processor,Battery Capacity,Operating System,External Condition,5g Support,PTA Approved,Year of Release,Original Price,Current Price
0,Oneplus,N10 (BE2028),6,128,Snapdragon,4300,Android,Good,Yes,Yes,2020,40900,29000
1,Samsung,A14,4,64,Exynos,5000,Android,Good,No,Yes,2023,33000,30000
2,Apple,iphone 13,4,256,Apple Bionic,3240,iOS,Good,Yes,Yes,2021,330000,190000
3,Samsung,S24,8,256,Snapdragon,4000,Android,Good,Yes,No,2024,290000,219000
4,Xiaomi,Redmi Note 11,4,128,Snapdragon,5000,Android,Good,No,Yes,2022,60000,31500
5,Samsung,S22 Ultra,12,256,Snapdragon,5000,Android,Good,Yes,No,2022,425000,190000
6,Samsung,A50,4,128,Exynos,4000,Android,Good,Yes,Yes,2019,49000,24000
7,Apple,iPhone 14ProMax,6,256,Apple Bionic,4323,iOS,Good,Yes,Yes,2022,520000,340000
8,Apple,iphone X,3,64,Apple Bionic,2716,iOS,Good,No,Yes,2017,145000,54000
9,Vivo,Y53S,8,128,Mediatek Helio,5000,Android,Good,No,Yes,2021,41000,35000


In [4]:
dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 13 columns):
 #   Column              Non-Null Count  Dtype 
---  ------              --------------  ----- 
 0   Phone Company       1000 non-null   object
 1   Model Name          1000 non-null   object
 2   RAM                 1000 non-null   int64 
 3   Storage             1000 non-null   int64 
 4   Processor           1000 non-null   object
 5   Battery Capacity    1000 non-null   int64 
 6   Operating System    1000 non-null   object
 7   External Condition  1000 non-null   object
 8   5g Support          1000 non-null   object
 9   PTA Approved        1000 non-null   object
 10  Year of Release     1000 non-null   int64 
 11  Original Price      1000 non-null   int64 
 12  Current Price       1000 non-null   int64 
dtypes: int64(6), object(7)
memory usage: 101.7+ KB


In [5]:
dataset.nunique()

Phone Company          13
Model Name            438
RAM                     8
Storage                 8
Processor               9
Battery Capacity       87
Operating System        2
External Condition      3
5g Support              2
PTA Approved            2
Year of Release        12
Original Price        245
Current Price         129
dtype: int64

In [6]:
dataset['Phone Company'].value_counts()

Phone Company
Samsung    224
Apple      184
Vivo       138
Oneplus     98
Google      83
Xiaomi      76
Infinix     45
Oppo        32
Tecno       31
LG          25
Huawei      22
Poco        22
Realme      20
Name: count, dtype: int64

In [7]:
dataset['Processor'].value_counts()

Processor
Snapdragon            319
Mediatek Helio        218
Apple Bionic          176
Exynos                143
Mediatek Dimensity     59
Google Tensor          41
Unisoc                 21
Kirin                  15
Apple Pro               8
Name: count, dtype: int64

In [8]:
dataset=dataset.drop(['Model Name'],axis=1)

In [9]:
X= dataset.iloc[:, :-1].values
X

array([['Oneplus', 6, 128, ..., 'Yes', 2020, 40900],
       ['Samsung', 4, 64, ..., 'Yes', 2023, 33000],
       ['Apple', 4, 256, ..., 'Yes', 2021, 330000],
       ...,
       ['Poco', 4, 64, ..., 'Yes', 2024, 35000],
       ['Poco', 4, 64, ..., 'Yes', 2022, 30000],
       ['Huawei', 6, 128, ..., 'Yes', 2023, 46000]], dtype=object)

In [10]:
y= dataset.iloc[:,-1].values

In [11]:
labelencode_X=LabelEncoder()
(X[:,0])=labelencode_X.fit_transform(X[:,0])
(X[:,3])=labelencode_X.fit_transform(X[:,3])
(X[:,5])=labelencode_X.fit_transform(X[:,5])
(X[:,6])=labelencode_X.fit_transform(X[:,6])
(X[:,7])=labelencode_X.fit_transform(X[:,7])
(X[:,8])=labelencode_X.fit_transform(X[:,8])

In [12]:
X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.2,random_state=0)

In [13]:
scoring = {
    'neg_mean_squared_error': 'neg_mean_squared_error',
    'neg_root_mean_squared_error': 'neg_root_mean_squared_error',
    'r2': 'r2'
}

In [14]:
dt_params = {
    "max_depth": [None, 10, 20, 30],
    "min_samples_split": [2, 5, 10],
    "min_samples_leaf": [1, 2, 4]
}

In [15]:
regressorDT = RandomizedSearchCV(
    estimator=DecisionTreeRegressor(),
    param_distributions=dt_params,
    n_iter=10,
    scoring=scoring,
    refit="r2",
    cv=5,
    random_state=42,
    n_jobs=-1,
    verbose=1
)

In [16]:
regressorDT.fit(X_train, y_train)

Fitting 5 folds for each of 10 candidates, totalling 50 fits


In [17]:
y_pred=regressorDT.predict(X_test)

In [18]:
print("R2 Score is : ",r2_score(y_test, y_pred))
print("MSE is : ",mean_squared_error(y_test, y_pred))
print("MAE is : ",mean_absolute_error(y_test, y_pred))

R2 Score is :  0.6611630063828754
MSE is :  793462446.8537414
MAE is :  14616.011904761903


In [19]:
print("Best parameters for Decision Tree:", regressorDT.best_params_)
print("Best CV score for Decision Tree:", regressorDT.best_score_)

Best parameters for Decision Tree: {'min_samples_split': 2, 'min_samples_leaf': 2, 'max_depth': 20}
Best CV score for Decision Tree: 0.8029121319627048


In [20]:
rf_params = {
    "n_estimators": [50, 100, 200],
    "max_depth": [None, 10, 20],
    "min_samples_split": [2, 5],
    "min_samples_leaf": [1, 2]
}

In [21]:
regressorRF = RandomizedSearchCV(
    estimator=RandomForestRegressor(),
    param_distributions=rf_params,
    n_iter=10,
    scoring=scoring,
    refit="r2",
    cv=5,
    random_state=42,
    n_jobs=-1,
    verbose=1
)

In [22]:
regressorRF.fit(X_train, y_train)

Fitting 5 folds for each of 10 candidates, totalling 50 fits


In [23]:
y_pred=regressorRF.predict(X_test)

In [24]:
print("R2 Score is : ",r2_score(y_test, y_pred))
print("MSE is : ",mean_squared_error(y_test, y_pred))
print("MAE is : ",mean_absolute_error(y_test, y_pred))

R2 Score is :  0.8578058614140072
MSE is :  332979312.34212136
MAE is :  11145.507693440379


In [25]:
print("Best parameters for Random Forest:", regressorRF.best_params_)
print("Best CV score for Random Forest:", regressorRF.best_score_)

Best parameters for Random Forest: {'n_estimators': 50, 'min_samples_split': 2, 'min_samples_leaf': 1, 'max_depth': 10}
Best CV score for Random Forest: 0.8619837767602588


In [26]:
knn_params = {
    "n_neighbors": [3, 5, 7, 10],
    "weights": ["uniform", "distance"],
    "metric": ["euclidean", "manhattan"]
}

In [27]:
regressorKNN = RandomizedSearchCV(
    estimator=KNeighborsRegressor(),
    param_distributions=knn_params,
    n_iter=10,
    scoring=scoring,
    refit="r2",
    cv=5,
    random_state=42,
    n_jobs=-1,
    verbose=1
)

In [28]:
regressorKNN.fit(X_train, y_train)

Fitting 5 folds for each of 10 candidates, totalling 50 fits


In [29]:
y_pred=regressorKNN.predict(X_test)

In [30]:
print("R2 Score is : ",r2_score(y_test, y_pred))
print("MSE is : ",mean_squared_error(y_test, y_pred))
print("MAE is : ",mean_absolute_error(y_test, y_pred))

R2 Score is :  0.7613691110070383
MSE is :  558807487.5
MAE is :  14691.75


In [31]:
print("Best parameters for KNN:", regressorKNN.best_params_)
print("Best CV score for KNN:", regressorKNN.best_score_)

Best parameters for KNN: {'weights': 'uniform', 'n_neighbors': 10, 'metric': 'manhattan'}
Best CV score for KNN: 0.6759789473402317


In [32]:
xgb_params = {
    "n_estimators": [50, 100, 200],
    "learning_rate": [0.01, 0.1, 0.2],
    "max_depth": [3, 5, 7],
    "subsample": [0.8, 1]
}

In [33]:
regressorXGB = RandomizedSearchCV(
    estimator=XGBRegressor(objective="reg:squarederror", random_state=42),
    param_distributions=xgb_params,
    n_iter=10,
    scoring=scoring,
    refit="r2",
    cv=5,
    random_state=42,
    n_jobs=-1,
    verbose=1
)

In [34]:
regressorXGB.fit(X_train, y_train)

Fitting 5 folds for each of 10 candidates, totalling 50 fits


In [35]:
y_pred=regressorXGB.predict(X_test)

In [36]:
print("R2 Score is : ",r2_score(y_test, y_pred))
print("MSE is : ",mean_squared_error(y_test, y_pred))
print("MAE is : ",mean_absolute_error(y_test, y_pred))

R2 Score is :  0.8815225615940244
MSE is :  277441365.45136005
MAE is :  9774.559858398437


In [37]:
print("Best parameters for XGBoost:", regressorXGB.best_params_)
print("Best CV score for XGBoost:", regressorXGB.best_score_)

Best parameters for XGBoost: {'subsample': 0.8, 'n_estimators': 100, 'max_depth': 7, 'learning_rate': 0.1}
Best CV score for XGBoost: 0.8812438156260285


In [38]:
adaboost_params = {
    "n_estimators": [50, 100, 200],
    "learning_rate": [0.01, 0.1, 1]
}

In [39]:
regressorADA = RandomizedSearchCV(
    estimator=AdaBoostRegressor(),
    param_distributions=adaboost_params,
    n_iter=10,
    scoring=scoring,
    refit="r2",
    cv=5,
    random_state=42,
    n_jobs=-1,
    verbose=1
)

In [40]:
regressorADA.fit(X_train,y_train)



Fitting 5 folds for each of 9 candidates, totalling 45 fits


In [41]:
y_pred=regressorADA.predict(X_test)

In [42]:
print("R2 Score is : ",r2_score(y_test, y_pred))
print("MSE is : ",mean_squared_error(y_test, y_pred))
print("MAE is : ",mean_absolute_error(y_test, y_pred))

R2 Score is :  0.7748989903384012
MSE is :  527124255.26110876
MAE is :  15926.96801224831


In [43]:
print("Best parameters for AdaBoost:", regressorADA.best_params_)
print("Best CV score for AdaBoost:", regressorADA.best_score_)

Best parameters for AdaBoost: {'n_estimators': 100, 'learning_rate': 0.1}
Best CV score for AdaBoost: 0.7859400363355695


In [None]:
joblib.dump(regressorXGB,'model.pkl')