In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score
from scipy.stats import chi2_contingency

from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from xgboost import XGBRegressor

pd.set_option("display.max_columns", None)

import warnings
warnings.filterwarnings("ignore")

In [2]:
df = pd.read_csv("dataset.csv")
df.head()

Unnamed: 0,brand_name,model,price,rating,has_5g,has_nfc,has_ir_blaster,processor_brand,num_cores,processor_speed,battery_capacity,fast_charging_available,fast_charging,ram_capacity,internal_memory,screen_size,refresh_rate,num_rear_cameras,num_front_cameras,os,primary_camera_rear,primary_camera_front,extended_memory_available,extended_upto,resolution_width,resolution_height
0,oneplus,OnePlus 11 5G,54999,89.0,True,True,False,snapdragon,8.0,3.2,5000.0,1,100.0,12.0,256.0,6.7,120,3,1.0,android,50.0,16.0,0,,1440,3216
1,oneplus,OnePlus Nord CE 2 Lite 5G,19989,81.0,True,False,False,snapdragon,8.0,2.2,5000.0,1,33.0,6.0,128.0,6.59,120,3,1.0,android,64.0,16.0,1,1024.0,1080,2412
2,samsung,Samsung Galaxy A14 5G,16499,75.0,True,False,False,exynos,8.0,2.4,5000.0,1,15.0,4.0,64.0,6.6,90,3,1.0,android,50.0,13.0,1,1024.0,1080,2408
3,motorola,Motorola Moto G62 5G,14999,81.0,True,False,False,snapdragon,8.0,2.2,5000.0,1,,6.0,128.0,6.55,120,3,1.0,android,50.0,16.0,1,1024.0,1080,2400
4,realme,Realme 10 Pro Plus,24999,82.0,True,False,False,dimensity,8.0,2.6,5000.0,1,67.0,6.0,128.0,6.7,120,3,1.0,android,108.0,16.0,0,,1080,2412


In [3]:
df.drop(df[df["price"]==650000].index, axis=0, inplace=True)

In [4]:
df.drop(["model", "extended_upto"], axis=1, inplace=True)

In [5]:
df["brand_name"] = df["brand_name"].replace({"oneplus":"oppo",
                                             "realme":"xiaomi", 
                                             "redmi":"xiaomi",
                                             "poco":"xiaomi"})

In [6]:
df_copy = df.copy()
temp = df_copy.groupby(["brand_name"])["price"].mean()
df_copy = df_copy.merge(temp.reset_index(), how="left", on="brand_name")

In [7]:
dct = {}
for i in range (df_copy["price_y"].nunique()): 
    if df_copy["brand_name"].unique()[i] not in dct:
        dct[df_copy["brand_name"].unique()[i]] = df_copy["price_y"].unique()[i]

In [8]:
bins = [0, 25000, 50000, 130000]
label = ["budget_friendly", "middle_range", "expensive"]
df["category"] = pd.cut(df_copy["price_y"], bins, right=False, labels=label)

In [9]:
df.head()

Unnamed: 0,brand_name,price,rating,has_5g,has_nfc,has_ir_blaster,processor_brand,num_cores,processor_speed,battery_capacity,fast_charging_available,fast_charging,ram_capacity,internal_memory,screen_size,refresh_rate,num_rear_cameras,num_front_cameras,os,primary_camera_rear,primary_camera_front,extended_memory_available,resolution_width,resolution_height,category
0,oppo,54999,89.0,True,True,False,snapdragon,8.0,3.2,5000.0,1,100.0,12.0,256.0,6.7,120,3,1.0,android,50.0,16.0,0,1440,3216,middle_range
1,oppo,19989,81.0,True,False,False,snapdragon,8.0,2.2,5000.0,1,33.0,6.0,128.0,6.59,120,3,1.0,android,64.0,16.0,1,1080,2412,middle_range
2,samsung,16499,75.0,True,False,False,exynos,8.0,2.4,5000.0,1,15.0,4.0,64.0,6.6,90,3,1.0,android,50.0,13.0,1,1080,2408,middle_range
3,motorola,14999,81.0,True,False,False,snapdragon,8.0,2.2,5000.0,1,,6.0,128.0,6.55,120,3,1.0,android,50.0,16.0,1,1080,2400,budget_friendly
4,xiaomi,24999,82.0,True,False,False,dimensity,8.0,2.6,5000.0,1,67.0,6.0,128.0,6.7,120,3,1.0,android,108.0,16.0,0,1080,2412,budget_friendly


In [10]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 979 entries, 0 to 979
Data columns (total 25 columns):
 #   Column                     Non-Null Count  Dtype   
---  ------                     --------------  -----   
 0   brand_name                 979 non-null    object  
 1   price                      979 non-null    int64   
 2   rating                     878 non-null    float64 
 3   has_5g                     979 non-null    bool    
 4   has_nfc                    979 non-null    bool    
 5   has_ir_blaster             979 non-null    bool    
 6   processor_brand            959 non-null    object  
 7   num_cores                  973 non-null    float64 
 8   processor_speed            937 non-null    float64 
 9   battery_capacity           968 non-null    float64 
 10  fast_charging_available    979 non-null    int64   
 11  fast_charging              769 non-null    float64 
 12  ram_capacity               979 non-null    float64 
 13  internal_memory            979 non-null 

In [11]:
df["fast_charging_available"] = df["fast_charging_available"].astype("bool")
df["extended_memory_available"] = df["extended_memory_available"].astype("bool")
df["category"] = df["category"].astype("object")

In [12]:
num_col = df.select_dtypes(exclude=["object", "bool"]).columns
cat_col = df.select_dtypes(include=["object", "bool"]).columns

In [13]:
imputer_mean = SimpleImputer(strategy="mean")
imputer_median = SimpleImputer(strategy="median")
imputer_mode = SimpleImputer(strategy="most_frequent")

df["rating"] = imputer_mean.fit_transform(df["rating"].values.reshape(-1, 1))
df["processor_brand"] = imputer_mode.fit_transform(df["processor_brand"].values.reshape(-1, 1)).reshape(-1)
df["num_cores"] = imputer_median.fit_transform(df["num_cores"].values.reshape(-1, 1))
df["processor_speed"] = imputer_mean.fit_transform(df["processor_speed"].values.reshape(-1, 1))
df["battery_capacity"] = imputer_mean.fit_transform(df["battery_capacity"].values.reshape(-1, 1))
df["fast_charging"] = imputer_mean.fit_transform(df["fast_charging"].values.reshape(-1, 1))
df["num_front_cameras"] = imputer_median.fit_transform(df["num_front_cameras"].values.reshape(-1, 1))
df["os"] = imputer_mode.fit_transform(df["os"].values.reshape(-1, 1)).reshape(-1)
df["primary_camera_front"] = imputer_median.fit_transform(df["primary_camera_front"].values.reshape(-1, 1))

In [14]:
for i in range (len(cat_col)):
    for j in range (i+1, len(cat_col)):
        cont_table = pd.crosstab(df[cat_col[i]], df[cat_col[j]])
        chi2, p, dof, ex = chi2_contingency(cont_table)
        print(f"Chi-square test between {cat_col[i]} and {cat_col[j]}: p-value: {p}")

Chi-square test between brand_name and has_5g: p-value: 1.0425532823486493e-15
Chi-square test between brand_name and has_nfc: p-value: 1.7757923145284457e-39
Chi-square test between brand_name and has_ir_blaster: p-value: 1.1544505926318781e-54
Chi-square test between brand_name and processor_brand: p-value: 0.0
Chi-square test between brand_name and fast_charging_available: p-value: 1.0469690197120382e-23
Chi-square test between brand_name and os: p-value: 6.2699228057520695e-233
Chi-square test between brand_name and extended_memory_available: p-value: 2.0705117745449093e-31
Chi-square test between brand_name and category: p-value: 4.53015183021637e-60
Chi-square test between has_5g and has_nfc: p-value: 2.4854431771619072e-51
Chi-square test between has_5g and has_ir_blaster: p-value: 0.0013666967311859574
Chi-square test between has_5g and processor_brand: p-value: 9.636272516355459e-100
Chi-square test between has_5g and fast_charging_available: p-value: 3.989111800502643e-28
Chi

In [15]:
dummies = pd.get_dummies(data=df[cat_col], drop_first=True, dtype="int8")
dummies.shape

(979, 61)

In [16]:
dummies = dummies.astype("int64")

In [17]:
dataset = pd.concat([df[cat_col], dummies], axis=1)
dataset.shape

(979, 70)

In [18]:
from statsmodels.stats.outliers_influence import variance_inflation_factor
vif_data = pd.DataFrame()
vif_data["features"] = dummies.columns
vif_data["vif"] = [variance_inflation_factor(dummies.values, i) for i in range(dummies.shape[1])]

necessary_cat_col = vif_data[vif_data["vif"]<10]["features"]

In [19]:
necessary_cat_col = list(necessary_cat_col)
necessary_cat_col.remove("has_ir_blaster")

In [20]:
necessary_cat_col

['has_5g',
 'has_nfc',
 'fast_charging_available',
 'extended_memory_available',
 'processor_brand_fusion',
 'os_ios',
 'os_other',
 'category_expensive',
 'category_middle_range']

In [21]:
necessary_num_col = ["price", "rating", "processor_speed", "fast_charging", "ram_capacity",
                 "internal_memory", "screen_size", "primary_camera_front", "resolution_height"]

In [22]:
data = pd.concat([df[necessary_num_col], dummies[necessary_cat_col]], axis=1)

In [23]:
ss_price = StandardScaler()
ss_feature = StandardScaler()
features = ["rating", "processor_speed", "fast_charging", "ram_capacity",
                 "internal_memory", "screen_size", "primary_camera_front", "resolution_height"]

data[features] = ss_feature.fit_transform(data[features])
# data["price"] = ss_price.fit_transform(data["price"].values.reshape(-1, 1))

In [24]:
test = data.iloc[-10:]
data = data.iloc[:-10]

In [25]:
y = data["price"]
X = data.drop("price", axis=1)

In [26]:
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.82, random_state=42)

In [27]:
model = RandomForestRegressor()
model.fit(X_train, y_train)
prediction = model.predict(X_test)
score = r2_score(y_test, prediction)
print(f"r2_score is: {score}")
score_train = model.score(X_train, y_train)
print(f"Training score is: {score_train}")

r2_score is: 0.41578418618291535
Training score is: 0.979778018436038


In [28]:
test_y = test["price"]
test_x = test.drop("price", axis=1)

In [29]:
preds = model.predict(test_x)

In [30]:
# preds = pd.DataFrame(preds)

In [31]:
# preds_reverse = ss_price.inverse_transform(preds.values.reshape(-1, 1))

In [32]:
r2_score(test_y, preds)

0.9191995738476946

In [33]:
test_y

970      8720
971     69990
972      4787
973     70990
974    119990
975     34990
976     14990
977     28990
978     19990
979     24990
Name: price, dtype: int64

In [34]:
list(preds)

[9494.09,
 45878.47,
 7246.72,
 86613.57,
 116078.79,
 27812.55,
 16244.903333333332,
 29232.48,
 27652.95,
 25354.58]

In [35]:
# test_y_reverse = ss_price.inverse_transform(test_y.values.reshape(-1, 1))

In [36]:
# test_y_reverse

In [37]:
# preds_reverse

In [38]:
# r2_score(test_y_reverse, preds_reverse)