### **1. Loading Data/Libraries**

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import joblib, json, os
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.preprocessing import FunctionTransformer
from sklearn.model_selection import train_test_split, KFold, cross_val_score
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
import tensorflow as tf
import warnings
warnings.filterwarnings('ignore')

In [2]:
df = pd.read_csv('laptop_data.csv')

In [3]:
df.head()

Unnamed: 0,brand,processor_brand,processor_name,processor_gnrtn,ram_gb,ram_type,ssd,hdd,os,os_bit,graphic_card_gb,weight,warranty,Touchscreen,msoffice,Price,rating,Number of Ratings,Number of Reviews
0,ASUS,Intel,Core i3,10th,4 GB,DDR4,0 GB,1024 GB,Windows,64-bit,0 GB,Casual,No warranty,No,No,34649,2 stars,3,0
1,Lenovo,Intel,Core i3,10th,4 GB,DDR4,0 GB,1024 GB,Windows,64-bit,0 GB,Casual,No warranty,No,No,38999,3 stars,65,5
2,Lenovo,Intel,Core i3,10th,4 GB,DDR4,0 GB,1024 GB,Windows,64-bit,0 GB,Casual,No warranty,No,No,39999,3 stars,8,1
3,ASUS,Intel,Core i5,10th,8 GB,DDR4,512 GB,0 GB,Windows,32-bit,2 GB,Casual,No warranty,No,No,69990,3 stars,0,0
4,ASUS,Intel,Celeron Dual,Not Available,4 GB,DDR4,0 GB,512 GB,Windows,64-bit,0 GB,Casual,No warranty,No,No,26990,3 stars,0,0


In [4]:
print(df.shape)

(823, 19)


### **2. Data Checks**

In [5]:
df.isna().sum()

Unnamed: 0,0
brand,0
processor_brand,0
processor_name,0
processor_gnrtn,0
ram_gb,0
ram_type,0
ssd,0
hdd,0
os,0
os_bit,0


In [6]:
df.duplicated().sum()

np.int64(21)

In [7]:
df = df.drop_duplicates()

In [8]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 802 entries, 0 to 822
Data columns (total 19 columns):
 #   Column             Non-Null Count  Dtype 
---  ------             --------------  ----- 
 0   brand              802 non-null    object
 1   processor_brand    802 non-null    object
 2   processor_name     802 non-null    object
 3   processor_gnrtn    802 non-null    object
 4   ram_gb             802 non-null    object
 5   ram_type           802 non-null    object
 6   ssd                802 non-null    object
 7   hdd                802 non-null    object
 8   os                 802 non-null    object
 9   os_bit             802 non-null    object
 10  graphic_card_gb    802 non-null    object
 11  weight             802 non-null    object
 12  warranty           802 non-null    object
 13  Touchscreen        802 non-null    object
 14  msoffice           802 non-null    object
 15  Price              802 non-null    int64 
 16  rating             802 non-null    object
 17  Nu

In [9]:
df.nunique()

Unnamed: 0,0
brand,8
processor_brand,3
processor_name,11
processor_gnrtn,8
ram_gb,4
ram_type,6
ssd,7
hdd,4
os,3
os_bit,2


In [10]:
df.describe()

Unnamed: 0,Price,Number of Ratings,Number of Reviews
count,802.0,802.0,802.0
mean,76625.543641,299.84414,36.089776
std,45232.984422,1001.78442,118.313553
min,16990.0,0.0,0.0
25%,45990.0,0.0,0.0
50%,63990.0,17.0,2.0
75%,89525.0,140.25,18.0
max,441990.0,15279.0,1947.0


In [11]:
numeric_features = [feature for feature in df.columns if df[feature].dtype != 'object']
cat_features = [feature for feature in df.columns if df[feature].dtype == 'object']
print("Numerical features: ", numeric_features)
print("Categorical featues:", cat_features)

Numerical features:  ['Price', 'Number of Ratings', 'Number of Reviews']
Categorical featues: ['brand', 'processor_brand', 'processor_name', 'processor_gnrtn', 'ram_gb', 'ram_type', 'ssd', 'hdd', 'os', 'os_bit', 'graphic_card_gb', 'weight', 'warranty', 'Touchscreen', 'msoffice', 'rating']


### **3. Transformation Pipeline**

In [12]:
class TransformationPipeline:
    def __init__(self, df):
        self.df = df

    def preprocess(self):
        cat_cols = self.df.select_dtypes(include='object').columns.tolist()
        num_cols = ['Number of Ratings', 'Number of Reviews']
        num_pipeline = Pipeline(steps=[
            ("imputer", SimpleImputer(strategy="median")),
            ("scaler",  StandardScaler())
        ])
        cat_pipeline = Pipeline(steps=[
            ("imputer",        SimpleImputer(strategy="most_frequent")),
            ("one_hot",        OneHotEncoder(handle_unknown='ignore')),
            ("scale_no_mean",  StandardScaler(with_mean=False))
        ])
        preprocessor = ColumnTransformer(
            transformers=[
                ("num", num_pipeline, num_cols),
                ("cat", cat_pipeline, cat_cols)
            ],
            remainder='drop'
        )
        return preprocessor

In [13]:
X = df.drop('Price', axis = 1)
y = df.Price

In [14]:
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.1,random_state= 1)
X_train.shape, X_test.shape

((721, 18), (81, 18))

In [15]:
tp = TransformationPipeline(df)
preprocessor = tp.preprocess()

In [16]:
X_train = preprocessor.fit_transform(X_train)
X_test  = preprocessor.transform(X_test)


In [17]:
feature_names = preprocessor.get_feature_names_out()

In [18]:
import pandas as pd
X_df = pd.DataFrame.sparse.from_spmatrix(X_train, columns=feature_names)
print(X_df.columns)
print(X_df.shape)
X_df.head()

Index(['num__Number of Ratings', 'num__Number of Reviews', 'cat__brand_APPLE',
       'cat__brand_ASUS', 'cat__brand_Avita', 'cat__brand_DELL',
       'cat__brand_HP', 'cat__brand_Lenovo', 'cat__brand_MSI',
       'cat__brand_acer', 'cat__processor_brand_AMD',
       'cat__processor_brand_Intel', 'cat__processor_brand_M1',
       'cat__processor_name_Celeron Dual', 'cat__processor_name_Core i3',
       'cat__processor_name_Core i5', 'cat__processor_name_Core i7',
       'cat__processor_name_Core i9', 'cat__processor_name_M1',
       'cat__processor_name_Pentium Quad', 'cat__processor_name_Ryzen 3',
       'cat__processor_name_Ryzen 5', 'cat__processor_name_Ryzen 7',
       'cat__processor_name_Ryzen 9', 'cat__processor_gnrtn_10th',
       'cat__processor_gnrtn_11th', 'cat__processor_gnrtn_12th',
       'cat__processor_gnrtn_4th', 'cat__processor_gnrtn_7th',
       'cat__processor_gnrtn_8th', 'cat__processor_gnrtn_9th',
       'cat__processor_gnrtn_Not Available', 'cat__ram_gb_16 GB',
 

Unnamed: 0,num__Number of Ratings,num__Number of Reviews,cat__brand_APPLE,cat__brand_ASUS,cat__brand_Avita,cat__brand_DELL,cat__brand_HP,cat__brand_Lenovo,cat__brand_MSI,cat__brand_acer,...,cat__warranty_No warranty,cat__Touchscreen_No,cat__Touchscreen_Yes,cat__msoffice_No,cat__msoffice_Yes,cat__rating_1 star,cat__rating_2 stars,cat__rating_3 stars,cat__rating_4 stars,cat__rating_5 stars
0,-0.252577,-0.296979,0,0.0,0,0,0,2.593425,0,0,...,0.0,3.133184,0,0.0,2.111294,0,0,2.022395,0.0,0
1,-0.298887,-0.305165,0,2.17451,0,0,0,0.0,0,0,...,2.111294,3.133184,0,2.111294,0.0,0,0,2.022395,0.0,0
2,-0.298887,-0.305165,0,2.17451,0,0,0,0.0,0,0,...,2.111294,3.133184,0,2.111294,0.0,0,0,2.022395,0.0,0
3,-0.234246,-0.27242,0,0.0,0,0,0,2.593425,0,0,...,0.0,3.133184,0,0.0,2.111294,0,0,0.0,2.006279,0
4,-0.022956,-0.149625,0,0.0,0,0,0,2.593425,0,0,...,2.111294,3.133184,0,2.111294,0.0,0,0,0.0,2.006279,0


### **4. Model Training/Testing**

In [19]:
amodel = tf.keras.Sequential()

amodel.add(tf.keras.layers.Dense(79))
amodel.add(tf.keras.layers.Dense(200))
amodel.add(tf.keras.layers.Dense(200))
amodel.add(tf.keras.layers.Dense(200))
amodel.add(tf.keras.layers.Dense(1))

amodel.compile(
    loss = 'mse',
    optimizer = tf.keras.optimizers.Adam(),
    metrics = [tf.keras.metrics.RootMeanSquaredError(name='rmse')]
)

amodel.fit(X_train.toarray(), y_train, validation_data=(X_test.toarray(), y_test), epochs=50)

Epoch 1/50
[1m23/23[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 19ms/step - loss: 8343044608.0000 - rmse: 91240.3281 - val_loss: 8652305408.0000 - val_rmse: 93017.7656
Epoch 2/50
[1m23/23[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 7ms/step - loss: 6518376960.0000 - rmse: 80636.6953 - val_loss: 3931725568.0000 - val_rmse: 62703.4727
Epoch 3/50
[1m23/23[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 10ms/step - loss: 2458427648.0000 - rmse: 49533.4453 - val_loss: 1534089728.0000 - val_rmse: 39167.4570
Epoch 4/50
[1m23/23[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 7ms/step - loss: 1541099776.0000 - rmse: 39184.4531 - val_loss: 1026130240.0000 - val_rmse: 32033.2676
Epoch 5/50
[1m23/23[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 7ms/step - loss: 1092965120.0000 - rmse: 32971.3555 - val_loss: 750804224.0000 - val_rmse: 27400.8066
Epoch 6/50
[1m23/23[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 7ms/step - loss: 1026955136.0000 -

<keras.src.callbacks.history.History at 0x780f461edf10>

In [20]:
RMSE_ann = amodel.evaluate(X_test.toarray(), y_test)[1]
RMSE_ann

[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 17ms/step - loss: 1251394688.0000 - rmse: 35296.0625


33332.69921875

### **5. Model Export**

In [21]:
os.makedirs("model", exist_ok=True)

joblib.dump(preprocessor, "model/preprocessor.joblib")
amodel.save("model/laptop_price_model.h5")

RAW_COLS = X_df.columns.tolist()
json.dump({"raw_cols": RAW_COLS},
          open("model/meta.json", "w", encoding="utf-8"), indent=2)

