In [8]:
import numpy as np
import pandas as pd
from sklearn.base import clone
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error
from sklearn.model_selection import train_test_split, KFold
from sklearn.model_selection import cross_val_score
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder

# Read the raw laptop listings
data = pd.read_csv("laptopPrice.csv")

# Capacity columns: remove the 'GB' marker from each value, then cast to int.
# The .str accessor requires these columns to hold text.
for capacity_column in ('ram_gb', 'ssd', 'hdd', 'graphic_card_gb'):
    data[capacity_column] = data[capacity_column].str.replace('GB', '').astype(int)

# Keep only the first whitespace-separated token of 'rating' and cast it to int
rating_tokens = data['rating'].str.split()
data['rating'] = rating_tokens.str[0].astype(int)

# Remove the columns that are not passed to the models
data = data.drop(columns=['weight', 'Touchscreen', 'msoffice'])

# Target is 'Price'; every remaining column is a feature
y = data['Price']
X = data.drop(columns=['Price'])

# Columns routed to each preprocessing branch
numeric_features = ['ram_gb', 'ssd', 'hdd', 'graphic_card_gb', 'Number of Ratings', 'Number of Reviews', 'rating']
categorical_features = ['brand', 'processor_brand', 'processor_name', 'ram_type', 'os', 'os_bit', 'warranty']

# Numeric branch: standardise each column
numeric_transformer = Pipeline(steps=[
    ('scaler', StandardScaler())
])

# Categorical branch: one-hot encode; handle_unknown='ignore' encodes
# categories unseen during fit as all zeros instead of raising
categorical_transformer = Pipeline(steps=[
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

# Unfitted template combining both branches
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features)
    ])

# Define the model pipelines.
# Pipeline keeps a reference to each step and fits it in place, so handing the
# same `preprocessor` object to both pipelines would make them share one
# fitted transformer: refitting either pipeline would silently change the
# other. Each pipeline therefore gets its own unfitted clone of the template.
rf_pipeline = Pipeline(steps=[
    ('preprocessor', clone(preprocessor)),
    ('regressor', RandomForestRegressor(n_estimators=100, random_state=42))
])

gb_pipeline = Pipeline(steps=[
    ('preprocessor', clone(preprocessor)),
    ('regressor', GradientBoostingRegressor(n_estimators=100, random_state=42))
])

# Hold out 20% of the rows for testing; the fixed seed makes the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Fit both pipelines on the training portion (Random Forest first)
for pipeline in (rf_pipeline, gb_pipeline):
    pipeline.fit(X_train, y_train)

# Random Forest: predictions and error metrics on the held-out rows
rf_predictions = rf_pipeline.predict(X_test)
rf_mae = mean_absolute_error(y_test, rf_predictions)
rf_rmse = np.sqrt(mean_squared_error(y_test, rf_predictions))

# Gradient Boosting: predictions and error metrics on the held-out rows
gb_predictions = gb_pipeline.predict(X_test)
gb_mae = mean_absolute_error(y_test, gb_predictions)
gb_rmse = np.sqrt(mean_squared_error(y_test, gb_predictions))

# Report MAE for both models, then RMSE for both models
print(f'Random Forest MAE: {rf_mae}')
print(f'Gradient Boosting MAE: {gb_mae}')

print(f'Random Forest RMSE: {rf_rmse}')
print(f'Gradient Boosting RMSE: {gb_rmse}')

# Cross-validation
def cross_validate_rmse(model, X, y, cv=5):
    """Return the per-fold validation RMSE of ``model`` under shuffled K-fold CV.

    Every fold is scored with a fresh, unfitted copy of ``model``, so the
    object passed in is never refitted here and keeps whatever fitted state
    the caller gave it.

    Args:
        model: Estimator exposing ``fit(X, y)`` and ``predict(X)``.
        X: Feature table; rows are selected with ``.iloc``.
        y: Target values aligned row-for-row with ``X``; selected with ``.iloc``.
        cv: Number of folds, passed to ``KFold`` as ``n_splits``.

    Returns:
        A list holding one RMSE value per fold, in fold order.
    """
    # Fixed seed keeps the fold assignment reproducible between runs.
    kf = KFold(n_splits=cv, shuffle=True, random_state=42)
    rmse_scores = []

    for train_index, val_index in kf.split(X):
        X_fold_train, X_fold_val = X.iloc[train_index], X.iloc[val_index]
        y_fold_train, y_fold_val = y.iloc[train_index], y.iloc[val_index]

        # Fit a copy rather than the caller's object; safe=False falls back to
        # a deep copy for objects that are not scikit-learn estimators.
        fold_model = clone(model, safe=False)
        fold_model.fit(X_fold_train, y_fold_train)

        y_pred = fold_model.predict(X_fold_val)
        rmse_scores.append(np.sqrt(mean_squared_error(y_fold_val, y_pred)))

    return rmse_scores

# 5-fold cross-validated RMSE for the Random Forest pipeline on the full data
cv_scores_rf = cross_validate_rmse(model=rf_pipeline, X=X, y=y, cv=5)
print(f'Random Forest Cross-Validation RMSE Scores: {cv_scores_rf}')
print(f'Mean Random Forest RMSE: {np.mean(cv_scores_rf)}')

# 5-fold cross-validated RMSE for the Gradient Boosting pipeline on the full data
cv_scores_gb = cross_validate_rmse(model=gb_pipeline, X=X, y=y, cv=5)
print(f'Gradient Boosting Cross-Validation RMSE Scores: {cv_scores_gb}')
print(f'Mean Gradient Boosting RMSE: {np.mean(cv_scores_gb)}')


Random Forest MAE: 14160.79245659845
Gradient Boosting MAE: 14393.640719855086
Random Forest RMSE: 24584.912705002534
Gradient Boosting RMSE: 24592.511781977737
Random Forest Cross-Validation RMSE Scores: [25133.87553169773, 19816.92236681857, 20215.304226001856, 27681.00827946222, 21838.608644035357]
Mean Random Forest RMSE: 22937.143809603145
Gradient Boosting Cross-Validation RMSE Scores: [24592.511781977737, 18319.91051517601, 18941.95850493747, 27285.607036852816, 21235.749462926164]
Mean Gradient Boosting RMSE: 22075.14746037404
