# Importing Necessary Libraries

In [1]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.impute import SimpleImputer
from sklearn.metrics import r2_score

from sklearn.neighbors import KNeighborsRegressor

pd.set_option("display.max_columns", None)
import warnings
warnings.filterwarnings("ignore")

# Reading the Dataset

In [2]:
df = pd.read_csv("dataset.csv")
df.head()

Unnamed: 0,brand_name,model,price,rating,has_5g,has_nfc,has_ir_blaster,processor_brand,num_cores,processor_speed,battery_capacity,fast_charging_available,fast_charging,ram_capacity,internal_memory,screen_size,refresh_rate,num_rear_cameras,num_front_cameras,os,primary_camera_rear,primary_camera_front,extended_memory_available,extended_upto,resolution_width,resolution_height
0,oneplus,OnePlus 11 5G,54999,89.0,True,True,False,snapdragon,8.0,3.2,5000.0,1,100.0,12.0,256.0,6.7,120,3,1.0,android,50.0,16.0,0,,1440,3216
1,oneplus,OnePlus Nord CE 2 Lite 5G,19989,81.0,True,False,False,snapdragon,8.0,2.2,5000.0,1,33.0,6.0,128.0,6.59,120,3,1.0,android,64.0,16.0,1,1024.0,1080,2412
2,samsung,Samsung Galaxy A14 5G,16499,75.0,True,False,False,exynos,8.0,2.4,5000.0,1,15.0,4.0,64.0,6.6,90,3,1.0,android,50.0,13.0,1,1024.0,1080,2408
3,motorola,Motorola Moto G62 5G,14999,81.0,True,False,False,snapdragon,8.0,2.2,5000.0,1,,6.0,128.0,6.55,120,3,1.0,android,50.0,16.0,1,1024.0,1080,2400
4,realme,Realme 10 Pro Plus,24999,82.0,True,False,False,dimensity,8.0,2.6,5000.0,1,67.0,6.0,128.0,6.7,120,3,1.0,android,108.0,16.0,0,,1080,2412


# Data Preprocessing 

In [3]:
df.drop(df[df["price"]==650000].index, axis=0, inplace=True)

In [4]:
df.drop(["model", "extended_upto"], axis=1, inplace=True)

In [5]:
df["brand_name"] = df["brand_name"].replace({"oneplus":"oppo",
                                             "realme":"xiaomi", 
                                             "redmi":"xiaomi",
                                             "poco":"xiaomi"})

### Generation of a column

In [6]:
df_copy = df.copy()
temp = df_copy.groupby(["brand_name"])["price"].mean()
df_copy = df_copy.merge(temp.reset_index(), how="left", on="brand_name")

In [7]:
dct = {}
for i in range (df_copy["price_y"].nunique()): 
    if df_copy["brand_name"].unique()[i] not in dct:
        dct[df_copy["brand_name"].unique()[i]] = df_copy["price_y"].unique()[i]

In [8]:
bins = [0, 25000, 50000, 130000]
label = ["budget_friendly", "middle_range", "expensive"]
df["category"] = pd.cut(df_copy["price_y"], bins, right=False, labels=label)

In [9]:
df["fast_charging_available"] = df["fast_charging_available"].astype("bool")
df["extended_memory_available"] = df["extended_memory_available"].astype("bool")
df["category"] = df["category"].astype("object")

In [10]:
num_col = df.select_dtypes(exclude=["object", "bool"]).columns
cat_col = df.select_dtypes(include=["object", "bool"]).columns

### Imputing 

In [11]:
imputer_mean = SimpleImputer(strategy="mean")
imputer_median = SimpleImputer(strategy="median")
imputer_mode = SimpleImputer(strategy="most_frequent")

df["rating"] = imputer_mean.fit_transform(df["rating"].values.reshape(-1, 1))
df["processor_brand"] = imputer_mode.fit_transform(df["processor_brand"].values.reshape(-1, 1)).reshape(-1)
df["num_cores"] = imputer_median.fit_transform(df["num_cores"].values.reshape(-1, 1))
df["processor_speed"] = imputer_mean.fit_transform(df["processor_speed"].values.reshape(-1, 1))
df["battery_capacity"] = imputer_mean.fit_transform(df["battery_capacity"].values.reshape(-1, 1))
df["fast_charging"] = imputer_mean.fit_transform(df["fast_charging"].values.reshape(-1, 1))
df["num_front_cameras"] = imputer_median.fit_transform(df["num_front_cameras"].values.reshape(-1, 1))
df["os"] = imputer_mode.fit_transform(df["os"].values.reshape(-1, 1)).reshape(-1)
df["primary_camera_front"] = imputer_median.fit_transform(df["primary_camera_front"].values.reshape(-1, 1))

### Encoding 

In [12]:
df_encoded = pd.get_dummies(df, drop_first=True, dtype="int32")

In [13]:
data = df_encoded[['rating', 'processor_speed', 'fast_charging', 'ram_capacity', 'internal_memory',
                'screen_size', 'primary_camera_front', 'resolution_height', 'has_nfc', 'os_ios', "price"]]

In [14]:
data["has_nfc"] = data["has_nfc"].astype("int64")

In [15]:
# test = data.iloc[-10:]
# data = data.iloc[:-10]

### Scaling 

In [16]:
test = data.iloc[np.arange(0, df.shape[0], 150)]
data = data.drop(np.arange(0, df.shape[0], 150), axis=0)

In [17]:
# ss = StandardScaler()
# data[["rating", "processor_speed", "fast_charging", "ram_capacity",
#         "internal_memory", "screen_size", "primary_camera_front", "resolution_height"]] = ss.fit_transform(data[["rating", "processor_speed", "fast_charging", "ram_capacity",
#         "internal_memory", "screen_size", "primary_camera_front", "resolution_height"]])

In [18]:
# test[["rating", "processor_speed", "fast_charging", "ram_capacity",
#         "internal_memory", "screen_size", "primary_camera_front", "resolution_height"]] = ss.fit_transform(test[["rating", "processor_speed", "fast_charging", "ram_capacity",
#         "internal_memory", "screen_size", "primary_camera_front", "resolution_height"]])

In [19]:
test_y = test["price"]
test_x = test.drop("price", axis=1)

In [20]:
y = data["price"]
X = data.drop("price", axis=1)

### Splitting the Training and Test Sets

In [21]:
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.75, random_state=42)

In [22]:
X_train

Unnamed: 0,rating,processor_speed,fast_charging,ram_capacity,internal_memory,screen_size,primary_camera_front,resolution_height,has_nfc,os_ios
854,84.0,2.05,120.000000,8.0,128.0,6.70,16.0,2400,0,0
358,84.0,2.40,33.000000,8.0,128.0,6.40,16.0,2400,0,0
462,70.0,1.60,10.000000,4.0,64.0,6.52,8.0,1600,0,0
532,89.0,3.00,65.000000,8.0,128.0,6.80,16.0,2480,1,0
470,86.0,2.96,25.000000,6.0,128.0,6.71,10.0,3200,1,0
...,...,...,...,...,...,...,...,...,...,...
107,86.0,3.00,25.000000,8.0,128.0,6.10,10.0,2340,1,0
272,77.0,2.20,18.000000,6.0,128.0,6.58,5.0,2408,0,0
867,64.0,2.30,46.126138,3.0,32.0,6.52,5.0,1600,0,0
439,80.0,2.00,15.000000,6.0,128.0,6.40,20.0,2400,0,0


# Model Development 

In [23]:
# parameters_knn = {"n_neighbors": [5, 10, 15, 20],
#                  "weights": ["uniform", "distance"],
#                  "algorithm": ["auto", "ball_tree", "kd_tree", "brute"],
#                  "leaf_size": [20, 30, 40]}
# gscv_knn = GridSearchCV(KNeighborsRegressor(), param_grid=parameters_knn, n_jobs=-1)

In [24]:
# gscv_knn.fit(X_train, y_train)

In [25]:
# gscv_knn.best_params_

In [26]:
knn = KNeighborsRegressor(n_neighbors=10, weights="distance", leaf_size=40, algorithm="ball_tree")
knn.fit(X_train, y_train)

In [27]:
pr = knn.predict(X_test)
r2_score(y_test, pr)

0.7443257955159903

In [28]:
pr_1 = knn.predict(test_x)

In [29]:
r2_score(test_y, pr_1)

0.8342569587404735

In [30]:
compare = pd.DataFrame()
compare["Actual"] = test_y
compare["Predicted"] = pr_1
compare

Unnamed: 0,Actual,Predicted
0,54999,69752.933312
150,19988,13807.430626
300,6299,6136.109663
451,7999,7999.0
601,27999,25743.173884
751,24990,24990.0
901,19999,19999.0
