In [300]:
import numpy as np 
import pandas as pd 
import matplotlib.pyplot as plt
import seaborn as sns
import re

from ydata_profiling import ProfileReport
from sklearn.preprocessing import StandardScaler,OneHotEncoder
from sklearn.model_selection import train_test_split,cross_val_score
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestRegressor,GradientBoostingRegressor
from xgboost import XGBRegressor
from sklearn.linear_model import LinearRegression,Ridge
from sklearn.svm import SVR

from sklearn.metrics import r2_score,mean_absolute_error

In [218]:
# Read the raw laptop listings from data.csv (path is relative to the
# notebook's working directory) and preview the first rows.
df = pd.read_csv('data.csv')
df.head()

Unnamed: 0.2,Unnamed: 0.1,Unnamed: 0,brand,name,price,spec_rating,processor,CPU,Ram,Ram_type,ROM,ROM_type,GPU,display_size,resolution_width,resolution_height,OS,warranty
0,0,0,HP,Victus 15-fb0157AX Gaming Laptop,49900,73.0,5th Gen AMD Ryzen 5 5600H,"Hexa Core, 12 Threads",8GB,DDR4,512GB,SSD,4GB AMD Radeon RX 6500M,15.6,1920.0,1080.0,Windows 11 OS,1
1,1,1,HP,15s-fq5007TU Laptop,39900,60.0,12th Gen Intel Core i3 1215U,"Hexa Core (2P + 4E), 8 Threads",8GB,DDR4,512GB,SSD,Intel UHD Graphics,15.6,1920.0,1080.0,Windows 11 OS,1
2,2,2,Acer,One 14 Z8-415 Laptop,26990,69.323529,11th Gen Intel Core i3 1115G4,"Dual Core, 4 Threads",8GB,DDR4,512GB,SSD,Intel Iris Xe Graphics,14.0,1920.0,1080.0,Windows 11 OS,1
3,3,3,Lenovo,Yoga Slim 6 14IAP8 82WU0095IN Laptop,59729,66.0,12th Gen Intel Core i5 1240P,"12 Cores (4P + 8E), 16 Threads",16GB,LPDDR5,512GB,SSD,Intel Integrated Iris Xe,14.0,2240.0,1400.0,Windows 11 OS,1
4,4,4,Apple,MacBook Air 2020 MGND3HN Laptop,69990,69.323529,Apple M1,Octa Core (4P + 4E),8GB,DDR4,256GB,SSD,Apple M1 Integrated Graphics,13.3,2560.0,1600.0,Mac OS,1


In [219]:
# Remove the two 'Unnamed' columns (presumably index columns left over from
# earlier CSV round-trips — verify) and the free-text 'name' column.
df = df.drop(columns=['Unnamed: 0.1','Unnamed: 0','name'])

In [220]:
# Normalise the column labels to lower case ('CPU' -> 'cpu', 'Ram_type' ->
# 'ram_type', 'OS' -> 'os', ...). Lower-casing the existing labels yields the
# same 15 names as a hand-written positional list, but cannot mislabel a
# column if the CSV's column order or count ever changes.
df.columns = df.columns.str.lower()

In [221]:
def _capacity_to_gb(value):
    """Convert a capacity label to a whole number of gigabytes.

    Accepts labels such as '8GB', '512GB', '1TB' or '2TB' (any TB size is
    multiplied by 1024). A bare number is treated as already being in GB, so
    re-running this cell on converted data is a no-op.

    Raises
    ------
    ValueError
        If the value is not a whole number optionally followed by GB/TB.
    """
    match = re.fullmatch(r'(\d+)\s*(GB|TB)?', str(value).strip().upper())
    if match is None:
        raise ValueError(f'Unrecognised capacity: {value!r}')
    amount = int(match.group(1))
    return amount * 1024 if match.group(2) == 'TB' else amount


# RAM and storage sizes become integer gigabytes.
df['ram'] = df['ram'].map(_capacity_to_gb)
df['rom'] = df['rom'].map(_capacity_to_gb)

# Two decimals are enough for the rating.
df['spec_rating'] = df['spec_rating'].round(2)

In [222]:
# Preview the frame after the column rename and RAM/ROM conversion.
df.head()

Unnamed: 0,brand,price,spec_rating,processor,cpu,ram,ram_type,rom,rom_type,gpu,display_size,resolution_width,resolution_height,os,warranty
0,HP,49900,73.0,5th Gen AMD Ryzen 5 5600H,"Hexa Core, 12 Threads",8,DDR4,512,SSD,4GB AMD Radeon RX 6500M,15.6,1920.0,1080.0,Windows 11 OS,1
1,HP,39900,60.0,12th Gen Intel Core i3 1215U,"Hexa Core (2P + 4E), 8 Threads",8,DDR4,512,SSD,Intel UHD Graphics,15.6,1920.0,1080.0,Windows 11 OS,1
2,Acer,26990,69.32,11th Gen Intel Core i3 1115G4,"Dual Core, 4 Threads",8,DDR4,512,SSD,Intel Iris Xe Graphics,14.0,1920.0,1080.0,Windows 11 OS,1
3,Lenovo,59729,66.0,12th Gen Intel Core i5 1240P,"12 Cores (4P + 8E), 16 Threads",16,LPDDR5,512,SSD,Intel Integrated Iris Xe,14.0,2240.0,1400.0,Windows 11 OS,1
4,Apple,69990,69.32,Apple M1,Octa Core (4P + 4E),8,DDR4,256,SSD,Apple M1 Integrated Graphics,13.3,2560.0,1600.0,Mac OS,1


In [223]:
def _intel_gen_from_model(digits):
    """Return the Core generation encoded in the digits of an Intel model number.

    Five-digit numbers (12450) and four-digit numbers starting with '1'
    (1035, 1165, 1240) belong to the 10th generation or later, so the first
    two digits are the generation. Other four-digit numbers (2450, 8250) carry
    the generation in the first digit only. Three-digit numbers (750) are
    first-generation parts.
    """
    if len(digits) == 5 or (len(digits) == 4 and digits[0] == '1'):
        return int(digits[:2])
    if len(digits) == 4:
        return int(digits[0])
    return 1


def parse_processor(p):
    """Split a raw processor description into (brand, tier, generation).

    Parameters
    ----------
    p : object
        Processor label, e.g. '12th Gen Intel Core i5 1240P'. It is passed
        through ``str()`` and upper-cased before matching.

    Returns
    -------
    tuple
        ``(brand, tier, gen)``:

        * brand - 'Intel', 'AMD', 'Apple', 'Qualcomm' or 'Other'.
        * tier  - 'i3'/'i5'/'i7'/'i9', 'R3'/'R5'/'R7'/'R9', 'Low'
          (Celeron / Pentium / Athlon), 'M1'/'M2'/'M3' or 'Other'.
        * gen   - generation as an int, 0 for the 'Low' tier, or ``np.nan``
          when no generation could be extracted.
    """
    p = str(p).upper()

    # Brand: the first vendor keyword found wins.
    if 'INTEL' in p:
        brand = 'Intel'
    elif 'AMD' in p or 'RYZEN' in p:
        brand = 'AMD'
    elif 'APPLE' in p or 'M1' in p or 'M2' in p or 'M3' in p:
        brand = 'Apple'
    elif 'SNAPDRAGON' in p or 'QUALCOMM' in p:
        brand = 'Qualcomm'
    else:
        brand = 'Other'

    # Tier
    if 'I3' in p: tier = 'i3'
    elif 'I5' in p: tier = 'i5'
    elif 'I7' in p: tier = 'i7'
    elif 'I9' in p: tier = 'i9'
    elif 'RYZEN 3' in p: tier = 'R3'
    elif 'RYZEN 5' in p: tier = 'R5'
    elif 'RYZEN 7' in p: tier = 'R7'
    elif 'RYZEN 9' in p: tier = 'R9'
    elif 'CELERON' in p or 'PENTIUM' in p or 'ATHLON' in p:
        tier = 'Low'
    elif 'M1' in p: tier = 'M1'
    elif 'M2' in p: tier = 'M2'
    elif 'M3' in p: tier = 'M3'
    else:
        tier = 'Other'

    # Generation
    gen = np.nan

    # Explicit ordinal: "12TH GEN", and also "1ST GEN", "2ND GEN", "3RD GEN".
    m = re.search(r'(\d{1,2})(?:ST|ND|RD|TH) GEN', p)
    if m:
        gen = int(m.group(1))

    # Intel model number, e.g. "I5-12450H", "I7 1165G7", "I5 8250U".
    # Up to five digits are captured so the helper can tell 8250 (gen 8)
    # apart from 12450 (gen 12).
    m2 = re.search(r'I[3579]\s*[- ]?(\d{3,5})', p)
    if pd.isna(gen) and m2:
        gen = _intel_gen_from_model(m2.group(1))

    # Ryzen 5600, 7640: the first digit of the four-digit model is used.
    m3 = re.search(r'RYZEN\s*\d\s*(\d{4})', p)
    if pd.isna(gen) and m3:
        gen = int(m3.group(1)[0])

    # Apple M1/M2/M3. Restricted to Apple chips so a stray "M3" token in
    # another vendor's label cannot overwrite a generation found above.
    m4 = re.search(r'\bM(\d)\b', p)
    if brand == 'Apple' and m4:
        gen = int(m4.group(1))

    # Low-end CPUs: force gen = 0
    if tier == 'Low':
        gen = 0

    return brand, tier, gen

# Run parse_processor on every processor string and store the returned
# (brand, tier, generation) tuple as three new columns. Wrapping the tuple in
# pd.Series is what makes .apply expand it into a three-column frame.
df[['proc_brand','proc_tier','proc_gen']] = (
    df['processor'].apply(lambda x: pd.Series(parse_processor(x)))
)

In [224]:
def parse_gpu(g):
    """Derive (brand, tier, vram) from a free-text GPU description.

    The text is upper-cased and scanned for keywords; the first matching rule
    in each table wins. ``brand`` and ``tier`` fall back to 'Other' when
    nothing matches. ``vram`` is the first '<digits>GB' figure found, or 0.
    """
    text = str(g).upper()

    brand_rules = (
        (('NVIDIA',), 'NVIDIA'),
        (('AMD', 'RADEON'), 'AMD'),
        (('INTEL',), 'Intel'),
    )
    tier_rules = (
        (('RTX',), 'RTX'),
        (('GTX',), 'GTX'),
        (('MX',), 'MX'),
        (('INTEGRATED', 'UHD', 'IRIS'), 'Integrated'),
    )

    def first_label(rules):
        # Return the label of the first rule with a keyword present in the text.
        for keywords, label in rules:
            if any(keyword in text for keyword in keywords):
                return label
        return 'Other'

    memory = re.search(r'(\d+)GB', text)
    vram = 0 if memory is None else int(memory.group(1))

    return first_label(brand_rules), first_label(tier_rules), vram

# Run parse_gpu on every GPU string and store the returned
# (brand, tier, vram) tuple as three new columns.
df[['gpu_brand','gpu_tier','gpu_vram']] = df['gpu'].apply(lambda x: pd.Series(parse_gpu(x)))


In [225]:
def parse_cpu(s):
    """Extract (cores, threads) from a CPU description.

    Cores come from a numeric form ("12 Cores") when present, otherwise from
    a word form ("Dual/Quad/Hexa/Octa Core"). Threads come from
    "<n> Threads". Either value is ``np.nan`` when it cannot be found.
    """
    text = str(s).upper()
    named_counts = (("DUAL", 2), ("QUAD", 4), ("HEXA", 6), ("OCTA", 8))

    # Numeric core count takes precedence over the word form.
    numeric_cores = re.search(r'(\d+)\s*CORES?', text)
    if numeric_cores is not None:
        cores = int(numeric_cores.group(1))
    else:
        cores = next(
            (count for word, count in named_counts if f"{word} CORE" in text),
            np.nan,
        )

    thread_hit = re.search(r'(\d+)\s*THREADS?', text)
    threads = np.nan if thread_hit is None else int(thread_hit.group(1))

    return cores, threads


# Run parse_cpu on every CPU string and store the returned
# (cores, threads) pair as two new columns; unparsed values stay NaN.
df[['cpu_cores','cpu_threads']] = (
    df['cpu'].apply(lambda x: pd.Series(parse_cpu(x)))
)



In [226]:
# Normalise RAM type labels: upper-case them, strip every character that is
# not A-Z or 0-9, then relabel a bare 'DDR' as 'DDR3'. Labels such as DDR4,
# DDR5 and the LPDDR variants pass through unchanged.
df['ram_type'] = (
    df['ram_type']
    .str.upper()
    .str.replace(r'[^A-Z0-9]', '', regex=True)
    .replace('DDR', 'DDR3')
)


In [227]:
def clean_os(s):
    """Map a raw operating-system label to a coarse OS family.

    Matching is case-insensitive and by substring; the rules are tried in
    order and the first hit wins. Anything unmatched becomes 'Other'.
    """
    lowered = str(s).lower()
    families = (
        (('windows',), 'Windows'),
        (('mac',), 'Mac'),
        (('chrome',), 'ChromeOS'),
        (('ubuntu', 'linux'), 'Linux'),
        (('android',), 'Android'),
        (('dos',), 'DOS'),
    )
    for needles, family in families:
        if any(needle in lowered for needle in needles):
            return family
    return 'Other'

# Bucket each raw OS label into its family with clean_os.
df['os_family'] = df['os'].map(clean_os)


In [228]:
# The raw text columns have been replaced by the parsed features above.
df = df.drop(columns=['gpu', 'cpu', 'processor', 'os'])

In [229]:
# Discard every row in which the thread count, processor generation or core
# count could not be parsed.
df = df.dropna(subset=['cpu_threads', 'proc_gen', 'cpu_cores'])

In [236]:
# Number of listings per brand.
brand_counts = df['brand'].value_counts()

# Brands with at least 15 listings keep their own label.
keep = brand_counts.loc[brand_counts.ge(15)].index

# Every other brand is relabelled 'Other'.
df['brand'] = df['brand'].where(df['brand'].isin(keep), 'Other')

df['brand'].value_counts()


brand
HP         186
Lenovo     168
Asus       154
Dell        99
Acer        80
Other       66
MSI         64
Samsung     28
Infinix     15
Name: count, dtype: int64

In [243]:
# Preview the engineered feature frame.
df.head()

Unnamed: 0,brand,price,spec_rating,ram,ram_type,rom,rom_type,display_size,resolution_width,resolution_height,warranty,proc_brand,proc_tier,proc_gen,gpu_brand,gpu_tier,gpu_vram,cpu_cores,cpu_threads,os_family
0,HP,49900,73.0,8,DDR4,512,SSD,15.6,1920.0,1080.0,1,AMD,R5,5.0,AMD,Other,4,6.0,12.0,Windows
1,HP,39900,60.0,8,DDR4,512,SSD,15.6,1920.0,1080.0,1,Intel,i3,12.0,Intel,Integrated,0,6.0,8.0,Windows
2,Acer,26990,69.32,8,DDR4,512,SSD,14.0,1920.0,1080.0,1,Intel,i3,11.0,Intel,Integrated,0,2.0,4.0,Windows
3,Lenovo,59729,66.0,16,LPDDR5,512,SSD,14.0,2240.0,1400.0,1,Intel,i5,12.0,Intel,Integrated,0,12.0,16.0,Windows
5,Acer,39990,62.0,8,DDR4,512,SSD,14.0,1920.0,1080.0,1,Intel,i5,12.0,Intel,Integrated,0,12.0,16.0,Windows


In [244]:
# Build a ydata-profiling report of the cleaned frame and write it to
# laptop.html in the working directory.
profile = ProfileReport(df,title='Laptop price prediction',explorative=True)
profile.to_file('laptop.html')

100%|██████████| 20/20 [00:00<00:00, 241.29it/s]<00:00, 42.83it/s, Describe variable: os_family] 
Summarize dataset: 100%|██████████| 150/150 [00:17<00:00,  8.75it/s, Completed]                                  
Generate report structure: 100%|██████████| 1/1 [00:06<00:00,  6.54s/it]
Render HTML: 100%|██████████| 1/1 [00:00<00:00,  1.20it/s]
Export report to file: 100%|██████████| 1/1 [00:00<00:00,  7.97it/s]


In [251]:
# The target is the listing price; every remaining column is a feature.
y = df['price']
X = df.drop(columns='price')

In [252]:
# Split the feature columns by dtype so each group gets its own transformer.
# 'number' matches every numeric width (int32, float32, ...), not only
# int64/float64, and the categorical group is defined as "everything else".
# Each column therefore lands in exactly one group, and none is silently
# discarded by a ColumnTransformer whose remainder is 'drop'.
num_cols = X.select_dtypes(include='number').columns
cat_cols = X.select_dtypes(exclude='number').columns

In [255]:
# Hold out 20% of the rows for testing; the fixed random_state makes the
# split reproducible.
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=.2,random_state=42)

In [271]:
# Numeric features are standardised; categorical features are one-hot
# encoded into a dense array, ignoring categories not seen during fit.
num_pipeline = Pipeline([('scaler', StandardScaler())])
cat_pipeline = Pipeline([
    ('ohe', OneHotEncoder(sparse_output=False, handle_unknown='ignore')),
])

In [272]:
# Route each column group through its own sub-pipeline.
preprocessor = ColumnTransformer([
    ('num', num_pipeline, num_cols),
    ('cat', cat_pipeline, cat_cols),
])

In [288]:
# Baseline: a 300-tree random forest with unlimited depth (the default),
# using all available cores.
model = RandomForestRegressor(n_estimators=300, random_state=42, n_jobs=-1)

# Preprocessing and model are chained so both are fitted on training data only.
pipe = Pipeline([
    ('preprocessor', preprocessor),
    ('model', model),
])

In [289]:
# Fit the preprocessing steps and the random forest on the training split.
pipe.fit(X_train,y_train)

0,1,2
,steps,"[('preprocessor', ...), ('model', ...)]"
,transform_input,
,memory,
,verbose,False

0,1,2
,transformers,"[('num', ...), ('cat', ...)]"
,remainder,'drop'
,sparse_threshold,0.3
,n_jobs,
,transformer_weights,
,verbose,False
,verbose_feature_names_out,True
,force_int_remainder_cols,'deprecated'

0,1,2
,copy,True
,with_mean,True
,with_std,True

0,1,2
,categories,'auto'
,drop,
,sparse_output,False
,dtype,<class 'numpy.float64'>
,handle_unknown,'ignore'
,min_frequency,
,max_categories,
,feature_name_combiner,'concat'

0,1,2
,n_estimators,300
,criterion,'squared_error'
,max_depth,
,min_samples_split,2
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_features,1.0
,max_leaf_nodes,
,min_impurity_decrease,0.0
,bootstrap,True


In [293]:
# Hold-out R^2 of the fitted baseline pipeline.
y_pred = pipe.predict(X_test)
r2_score(y_true=y_test, y_pred=y_pred)

0.8951798526889032

In [298]:

# Candidate regressors to compare, keyed by display name.
models = dict(
    LinearRegression=LinearRegression(),
    Ridge=Ridge(alpha=1.0),
    RandomForest=RandomForestRegressor(n_estimators=300, random_state=42, n_jobs=-1),
    GradientBoosting=GradientBoostingRegressor(
        n_estimators=300, learning_rate=0.05, max_depth=3, random_state=42,
    ),
    XGBoost=XGBRegressor(
        n_estimators=500, max_depth=6, learning_rate=0.05,
        subsample=0.8, colsample_bytree=0.8,
        objective="reg:squarederror", random_state=42, n_jobs=-1,
    ),
)


In [299]:

# Fit every candidate behind the shared preprocessor and record its
# hold-out R^2 by model name.
results = {}

for name, model in models.items():
    pipe = Pipeline(steps=[("preprocess", preprocessor), ("model", model)])
    pipe.fit(X_train, y_train)

    preds = pipe.predict(X_test)
    r2 = r2_score(y_test, preds)
    results[name] = r2

    print(f"{name}: R2 = {r2:.4f}")


LinearRegression: R2 = 0.8896
Ridge: R2 = 0.8925
RandomForest: R2 = 0.8952
GradientBoosting: R2 = 0.9285
XGBoost: R2 = 0.9253


In [306]:


# Same XGBoost configuration as in the model comparison, scored with
# 5-fold cross-validation on the training split only.
pipe_xg = Pipeline(steps=[
    ("preprocess", preprocessor),
    ("model", XGBRegressor(
        objective="reg:squarederror",
        n_estimators=500,
        learning_rate=0.05,
        max_depth=6,
        subsample=0.8,
        colsample_bytree=0.8,
        random_state=42,
        n_jobs=-1,
    )),
])

scores = cross_val_score(estimator=pipe_xg, X=X_train, y=y_train, scoring="r2", cv=5)

print(scores)
print("Mean:", scores.mean())


[0.85488433 0.88703978 0.68836248 0.79291344 0.80581242]
Mean: 0.8058024883270264
