In [2]:
import numpy as np
import pandas as pd
import seaborn as sns
import pickle
import matplotlib.pyplot as plt
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import MinMaxScaler
from sklearn.pipeline import Pipeline
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import  make_pipeline
import re
from sklearn.model_selection import train_test_split

In [3]:
df = pd.read_csv('../data/laptop_big-data.csv')

In [36]:
df.head(5)

Unnamed: 0,Brand,Price,Processor_Name,RAM_Expandable,RAM,RAM_TYPE,Ghz,Display_type,Display,GPU,GPU_Brand,SSD,HDD,Adapter
0,HP,22990,MediaTek Octa-core,Not Expandable,4 GB,DDR4 RAM,2.0 Ghz Processor,LED,11.6,Integrated Graphics,MediaTek,64 GB SSD Storage,No HDD,45.0
1,Lenovo,36289,AMD Hexa-Core Ryzen 5,12 GB Expandable,8 GB,DDR4 RAM,4.0 Ghz Processor,LCD,15.6,Radeon,AMD,512 GB SSD Storage,No HDD,65.0
2,Dell,78500,Intel Core i5 (12th Gen),32 GB Expandable,16 GB,DDR5 RAM,3.3 Ghz Processor,LCD,15.6,"GeForce RTX 3050 GPU, 4 GB",NVIDIA,512 GB SSD Storage,No HDD,56.0
3,HP,55490,Intel Core i5 (12th Gen),8 GB Expandable,8 GB,DDR4 RAM,4.2 Ghz Processor,LCD,15.6,Iris Xe,Intel,512 GB SSD Storage,No HDD,64.599598
4,Infinix,21990,Intel Core i3 (11th Gen),Not Expandable,8 GB LP,LPDDR4X RAM,1.7 Ghz Processor,LCD,15.6,UHD,Intel,512 GB SSD Storage,No HDD,45.0


In [39]:
df.shape

(3976, 14)

In [41]:
with pd.option_context('display.max_rows', None, 'display.max_columns', None):

    print(df['GPU_Brand'].value_counts())

GPU_Brand
Intel        1943
NVIDIA       1380
AMD           616
Apple          21
Nvidia          8
MediaTek        3
Qualcomm        1
NIVIDIA         1
ARM             1
Microsoft       1
ATI             1
Name: count, dtype: int64


In [6]:
new_row = df.iloc[1:2:]
new_row = new_row.drop('Price' , axis=1)

In [7]:
new_row

Unnamed: 0.1,Unnamed: 0,Brand,Name,Processor_Name,Processor_Brand,RAM_Expandable,RAM,RAM_TYPE,Ghz,Display_type,Display,GPU,GPU_Brand,SSD,HDD,Adapter,Battery_Life
1,1,Lenovo,Lenovo Ideapad Slim 3 (82KU017KIN) Laptop (15....,AMD Hexa-Core Ryzen 5,AMD,12 GB Expandable,8 GB,DDR4 RAM,4.0 Ghz Processor,LCD,15.6,Radeon,AMD,512 GB SSD Storage,No HDD,65,Upto 11 Hrs Battery Life


In [8]:
df = df.drop('Unnamed: 0',axis=1)
df = df.drop('Battery_Life',axis=1)
df = df.drop('Name',axis=1)
df = df.drop('Processor_Brand',axis=1)
df['GPU'] = df['GPU'].fillna(df['GPU'].mode()[0])
df['GPU_Brand'] = df['GPU_Brand'].fillna(df['GPU_Brand'].mode()[0])
df['Display'] = df['Display'].str.replace('OLED Display With Touchscreen','15.6')
df['Display'] = df['Display'].astype(float)
df['Adapter'] = df['Adapter'].str.replace('no' , '0')
df['Adapter'] = pd.to_numeric(df['Adapter'])
df['Adapter'] = df['Adapter'].replace(0,df['Adapter'].mean())

In [9]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3976 entries, 0 to 3975
Data columns (total 14 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   Brand           3976 non-null   object 
 1   Price           3976 non-null   int64  
 2   Processor_Name  3976 non-null   object 
 3   RAM_Expandable  3976 non-null   object 
 4   RAM             3976 non-null   object 
 5   RAM_TYPE        3976 non-null   object 
 6   Ghz             3976 non-null   object 
 7   Display_type    3976 non-null   object 
 8   Display         3976 non-null   float64
 9   GPU             3976 non-null   object 
 10  GPU_Brand       3976 non-null   object 
 11  SSD             3976 non-null   object 
 12  HDD             3976 non-null   object 
 13  Adapter         3976 non-null   float64
dtypes: float64(2), int64(1), object(11)
memory usage: 435.0+ KB


In [10]:
class ProcessorFeaturesExtractor(BaseEstimator, TransformerMixin):
    def fit(self, X, y=None):
        return self

    def transform(self, X):
        df = X.copy()

        # Step 1: Clean text
        df['Processor_Name'] = df['Processor_Name'].str.lower()
        df['Processor_Name'] = df['Processor_Name'].str.replace(r'\s+', ' ', regex=True)
        df['Processor_Name'] = df['Processor_Name'].str.replace(r'\s*processor$', '', regex=True)
        df['Processor_Name'] = df['Processor_Name'].str.strip()

        # Step 2: CPU_Brand
        df['CPU_Brand'] = df['Processor_Name'].apply(
            lambda x: 'intel' if 'intel' in x else (
                'amd' if 'amd' in x else (
                    'apple' if 'apple' in x else 'other'
                )
            )
        )

        # Step 3: CPU_Series
        def extract_series(proc):
            match = re.search(r'(core\s*i\d|ryzen\s*\d|pentium|celeron|athlon|m\d|snapdragon|mediatek)', proc)
            return match.group(0) if match else 'other'

        df['CPU_Series'] = df['Processor_Name'].apply(extract_series)

        # Step 4: CPU_Gen
        df['CPU_Gen'] = df['Processor_Name'].str.extract(r'\((\d+)(?:st|nd|rd|th)\s*gen\)', expand=False)
        df['CPU_Gen'] = pd.to_numeric(df['CPU_Gen'], errors='coerce').fillna(0).astype(int)

        # Step 5: CPU_Cores
        def extract_core_count(proc):
            proc = proc.lower()
            if 'dual-core' in proc: return 2
            if 'quad-core' in proc: return 4
            if 'hexa-core' in proc: return 6
            if 'octa-core' in proc: return 8
            if 'hexadeca-core' in proc: return 16
            return 0

        df['CPU_Cores'] = df['Processor_Name'].apply(extract_core_count)

        # Step 6: Apple chip
        df['Is_Apple_Chip'] = df['Processor_Name'].str.contains('apple', case=False).astype(int)
        df['Apple_Chip_Type'] = df['Processor_Name'].str.extract(r'(m1|m2|max|pro)', expand=False).fillna('none')

        # Return only new columns
        return df[['CPU_Brand', 'CPU_Series', 'CPU_Gen', 'CPU_Cores', 'Is_Apple_Chip', 'Apple_Chip_Type']]


In [11]:
class CleanRam(BaseEstimator, TransformerMixin):
    def fit(self, X, y=None):
        return self
    
    def transform(self, X):
        # Use X.iloc[:, 0] to get the first (and only) column as a Series
        return X.iloc[:, 0].str.replace('RAM', '', regex=False)\
                           .str.replace('LP', '', regex=False)\
                           .str.replace('GB', '', regex=False)\
                           .str.extract('(\d+)').astype(float)



  .str.extract('(\d+)').astype(float)


In [12]:
class GPUFeatureExtractor(BaseEstimator, TransformerMixin):
    def fit(self, X, y=None):
        return self

    def transform(self, X):
        df = X.copy()
        df['GPU_MEMORY'] = df['GPU'].str.extract(r'(\d+)\s*GB', expand=False).fillna(0).astype(int)
        return df[['GPU_MEMORY','GPU']]

In [13]:

class RAMExpandableExtractor(BaseEstimator, TransformerMixin):
    def fit(self, X, y=None):
        return self

    def transform(self, X):
        # Ensure input is always a Series (single column)
        if isinstance(X, pd.DataFrame):
            series = X.iloc[:, 0]
        else:
            series = X  # already a Series

        extracted = series.astype(str).str.extract(r'(\d+)')[0].fillna(0).astype(int)
        return pd.DataFrame(extracted)
    

In [14]:

class RAMTypeCleaner(BaseEstimator, TransformerMixin):
    def fit(self, X, y=None):
        return self

    def transform(self, X):
        # Handle input as a Series (whether X is a DataFrame or a string)
        if isinstance(X, pd.DataFrame):
            series = X.iloc[:, 0]
        else:
            series = pd.Series([X]) if isinstance(X, str) else X

        cleaned = (
            series.astype(str)
            .str.replace(r'\s*RAM$', '', regex=True)
            .str.strip()
        )

        return pd.DataFrame(cleaned)


In [15]:

class GhzExtractor(BaseEstimator, TransformerMixin):
    def fit(self, X, y=None):
        return self

    def transform(self, X):
        # Always operate on the first column as a Series
        if isinstance(X, pd.DataFrame):
            series = X.iloc[:, 0]
        else:
            series = pd.Series(X)

        # Extract numeric part (e.g., 4.2 from "4.2 Ghz")
        ghz = (
            series.astype(str)
            .str.extract(r'([\d.]+)')
            .fillna(0)
            .astype(float)
        )

        return pd.DataFrame(ghz)


In [16]:

class GPUSeriesExtractor(BaseEstimator, TransformerMixin):
    def __init__(self, top_n=10):
        self.top_n = top_n
        self.top_series = None

    def fit(self, X, y=None):
        # Always extract from first column
        series = X.iloc[:, 0].astype(str).str.extract(
            r'(iris xe|uhd \d*|uhd|radeon(?: [a-z0-9]+)*|geforce rtx \d{3,4}|geforce gtx \d{3,4}|geforce mx\d+|hd \d+|integrated)',
            flags=re.IGNORECASE,
            expand=False
        ).fillna('other').str.lower().str.strip()

        self.top_series = series.value_counts().head(self.top_n).index.tolist()
        return self

    def transform(self, X):
        series = X.iloc[:, 0].astype(str).str.extract(
            r'(iris xe|uhd \d*|uhd|radeon(?: [a-z0-9]+)*|geforce rtx \d{3,4}|geforce gtx \d{3,4}|geforce mx\d+|hd \d+|integrated)',
            flags=re.IGNORECASE,
            expand=False
        ).fillna('other').str.lower().str.strip()

        simplified = series.apply(lambda x: x if x in self.top_series else 'other')
        return simplified.to_frame()


In [17]:

class StorageSizeExtractor(BaseEstimator, TransformerMixin):
    def fit(self, X, y=None):
        return self

    def transform(self, X):
        def convert(value):
            value = str(value).lower()

            if 'tb' in value:
                num = re.search(r'(\d+(\.\d+)?)', value)
                return int(float(num.group(0)) * 1024) if num else 0
            elif 'gb' in value:
                num = re.search(r'(\d+)', value)
                return int(num.group(0)) if num else 0
            elif 'no' in value or value.strip() in ['none', 'nan', 'null']:
                return 0
            elif value.isdigit():
                return int(value)
            else:
                return 0

        # Ensure it's a DataFrame
        if isinstance(X, pd.Series):
            X = X.to_frame()

        return X.applymap(convert)

In [18]:
from sklearn.compose import ColumnTransformer

preprocessor = ColumnTransformer(
    transformers=[
        ('ram_cleaner', CleanRam(), ['RAM']),
        ('processor_cleaner', ProcessorFeaturesExtractor(), ['Processor_Name']),
        ('Gpu_memory_extractor', GPUFeatureExtractor(), ['GPU']),
        ('Ram_expandable', RAMExpandableExtractor(), ['RAM_Expandable']),
        ('Ram_type_cleaner', RAMTypeCleaner(), ['RAM_TYPE']),
        ('Ghz_number', GhzExtractor(), ['Ghz']),
        ('Gpu_series', GPUSeriesExtractor(), ['GPU']),
        ('ssd_cleaner', StorageSizeExtractor(), ['SSD']),
        ('hdd_cleaner', StorageSizeExtractor(), ['HDD']),
    ],
    remainder='passthrough'
)


In [19]:
X = df.drop('Price' , axis=1)
Y = df['Price']


In [20]:
print(X.shape)
print(Y.shape)

(3976, 13)
(3976,)


In [21]:
x_train ,x_test , y_train , y_test = train_test_split(X , Y , test_size=0.2 , random_state=22)

In [22]:
x_test.info()

<class 'pandas.core.frame.DataFrame'>
Index: 796 entries, 1574 to 905
Data columns (total 13 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   Brand           796 non-null    object 
 1   Processor_Name  796 non-null    object 
 2   RAM_Expandable  796 non-null    object 
 3   RAM             796 non-null    object 
 4   RAM_TYPE        796 non-null    object 
 5   Ghz             796 non-null    object 
 6   Display_type    796 non-null    object 
 7   Display         796 non-null    float64
 8   GPU             796 non-null    object 
 9   GPU_Brand       796 non-null    object 
 10  SSD             796 non-null    object 
 11  HDD             796 non-null    object 
 12  Adapter         796 non-null    float64
dtypes: float64(2), object(11)
memory usage: 87.1+ KB


In [23]:
new_row.info()
new_row_df = pd.DataFrame(new_row) 

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1 entries, 1 to 1
Data columns (total 17 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   Unnamed: 0       1 non-null      int64 
 1   Brand            1 non-null      object
 2   Name             1 non-null      object
 3   Processor_Name   1 non-null      object
 4   Processor_Brand  1 non-null      object
 5   RAM_Expandable   1 non-null      object
 6   RAM              1 non-null      object
 7   RAM_TYPE         1 non-null      object
 8   Ghz              1 non-null      object
 9   Display_type     1 non-null      object
 10  Display          1 non-null      object
 11  GPU              1 non-null      object
 12  GPU_Brand        1 non-null      object
 13  SSD              1 non-null      object
 14  HDD              1 non-null      object
 15  Adapter          1 non-null      object
 16  Battery_Life     1 non-null      object
dtypes: int64(1), object(16)
memory usage: 2

In [24]:
X_transformed = preprocessor.fit_transform(x_train)
pickle.dump(preprocessor , open('../models/preprocessor' , 'wb'))
X_test_transformed = preprocessor.transform(x_test)
new_row_transformed = preprocessor.transform(new_row)


  return X.applymap(convert)
  return X.applymap(convert)
  return X.applymap(convert)
  return X.applymap(convert)
  df['GPU_MEMORY'] = df['GPU'].str.extract(r'(\d+)\s*GB', expand=False).fillna(0).astype(int)
  return X.applymap(convert)
  return X.applymap(convert)


In [25]:
print(X_transformed.shape)
print(X_test_transformed.shape)
print(new_row_transformed.shape)

(3180, 20)
(796, 20)
(1, 20)


In [26]:
new_row_t_df = pd.DataFrame(new_row_transformed)

In [27]:
# Step 1: Convert to DataFrame
df_transformed = pd.DataFrame(X_transformed)
df_test_transformed = pd.DataFrame(X_test_transformed)

# Step 2: Show all rows and columns (temporarily override display settings)
with pd.option_context('display.max_rows', None, 'display.max_columns', None):
    print(df_transformed.sample(5))

        0      1        2   3  4  5     6  7                      8   9   \
2472   8.0  intel  core i5  12  0  0  none  0                Iris Xe  12   
2110  16.0  intel  core i5  13  0  0  none  0                    UHD   0   
422   16.0    amd  ryzen 9   0  8  0  none  6  Radeon 680M GPU, 6 GB   0   
2030   4.0  intel  pentium   0  4  0  none  0                     HD   8   
3053   4.0  intel  core i3   6  0  0  none  0                 HD 520   0   

            10   11       12    13    14    15   16    17     18         19  
2472  DDR4 RAM  1.3  iris xe   512     0  ASUS  LCD  15.6  Intel       45.0  
2110      DDR4  3.4      uhd   512     0    HP  LCD  14.0  Intel  64.599598  
422       DDR5  4.8    other  1024     0  ASUS  LCD  16.0    AMD      240.0  
2030  DDR3 RAM  1.6    other     0   500  Acer  LED  11.6  Intel       45.0  
3053      DDR4  2.0    other     0  1024    HP  LED  15.6  Intel       65.0  


In [28]:
X_test_transformed.shape

(796, 20)

In [29]:
X_transformed.shape

(3180, 20)

In [30]:
# Step 1: Create a column transformer for encoding only categorical features
categorical_indices = [1, 2, 6, 8, 10, 12, 15, 16, 18]  # Based on your table
categorical_encoder = ColumnTransformer(
    
    transformers=[
        ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_indices)
    ],
    remainder='passthrough'  # Keep all other columns
)

# Step 2: Apply it to the already transformed dataset
X_final = categorical_encoder.fit_transform(df_transformed)
pickle.dump(categorical_encoder , open('../models/cat_en' , 'wb'))
X_test_final = categorical_encoder.transform(df_test_transformed)
new_row_final  = categorical_encoder.transform(new_row_t_df)

In [40]:
X_final.shape

(3180, 379)

In [32]:
X_test_final.shape

(796, 379)

In [33]:
new_row_final.shape

(1, 379)

In [34]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
from sklearn.metrics import  r2_score

# 1. Train the model
model = LinearRegression()
model.fit(X_final, y_train)
pickle.dump(model , open('../models/model' , 'wb'))

# 2. Predict on test set
y_pred = model.predict(X_test_final)

# 3. Evaluate the model
r2 = r2_score(y_test, y_pred)

print(f"R² Score: {r2:.2f}")


R² Score: 0.76


In [35]:
y_pred = model.predict(new_row_final)
print(y_pred)

[38920.95738893]
