In [9]:
import joblib
import pandas as pd
from sklearn.base import clone
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OrdinalEncoder
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.svm import SVC

In [10]:

# Load the training data (path is relative to the notebook's working directory).
train_df = pd.read_csv("train.csv")

# Encode the target as 0/1. Series.map turns every label that is missing from
# the mapping into NaN, so fail loudly here instead of handing NaN targets to
# the stratified split and the classifiers further down.
price_labels = {"non-expensive": 0, "expensive": 1}
encoded_price = train_df["price"].map(price_labels)
if encoded_price.isna().any():
    unexpected = train_df.loc[encoded_price.isna(), "price"].unique().tolist()
    raise ValueError(f"Unexpected 'price' labels in train.csv: {unexpected}")
train_df["price"] = encoded_price

# Split into the feature matrix (everything except the target) and the target.
X = train_df.drop(["price"], axis=1)
y = train_df["price"]



In [11]:

# Feature groups; each list is routed to its own transformer when the
# preprocessing ColumnTransformer is built.
numerical_cols = [
    "rating", "Core_Count", "Clock_Speed_GHz", "RAM Size GB",
    "Storage Size GB", "battery_capacity",
    "Screen_Size", "Resolution_Width", "Resolution_Height",
    "Refresh_Rate", "primary_rear_camera_mp", "num_rear_cameras",
    "primary_front_camera_mp", "num_front_cameras"
]

binary_cols = ["Dual_Sim", "4G", "5G", "Vo5G", "NFC", "IR_Blaster", "memory_card_support"]

categorical_cols = [
    "Processor_Brand", "Performance_Tier", "RAM Tier",
    "Notch_Type", "os_name", "os_version", "brand",
    "Processor_Series", "memory_card_size"
]

# Encode the Yes/No flags as 1/0. The values are read from train_df (whose
# binary columns are not modified anywhere in the visible cells) rather than
# from X itself, so re-running this cell gives the same result. Mapping X in
# place would turn the already encoded 1/0 values into NaN on a second run.
for col in binary_cols:
    X[col] = train_df[col].map({"Yes": 1, "No": 0})


In [12]:

# Column-wise preprocessing, one transformer per feature group:
#   num -> standardised, cat -> one-hot (categories unseen during fit are
#   ignored instead of raising), bin -> ordinal-encoded.
numeric_step = ("num", StandardScaler(), numerical_cols)
categorical_step = ("cat", OneHotEncoder(handle_unknown="ignore"), categorical_cols)
binary_step = ("bin", OrdinalEncoder(), binary_cols)

preprocessor = ColumnTransformer(
    transformers=[numeric_step, categorical_step, binary_step]
)

In [13]:

# Hold out 20% of the training data, stratified on the target so both parts
# keep the same class balance; the seed makes the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=90,
)


In [14]:

# One pipeline per classifier. Each pipeline receives its own clone of the
# preprocessor: Pipeline does not copy its steps, so handing the same
# ColumnTransformer instance to all three would make them share (and
# repeatedly refit) a single fitted object.
# NOTE: despite its name, DT_model wraps a RandomForestClassifier.
DT_model = Pipeline(
    steps=[
        ("preprocessor", clone(preprocessor)),
        ("classifier", RandomForestClassifier(n_estimators=100, random_state=90)),
    ]
)

LogR_model = Pipeline(
    steps=[
        ("preprocessor", clone(preprocessor)),
        ("classifier", LogisticRegression(max_iter=1000)),
    ]
)

SVC_model = Pipeline(
    steps=[
        ("preprocessor", clone(preprocessor)),
        ("classifier", SVC(kernel='rbf', C=1.0, gamma='scale', random_state=90)),
    ]
)

# Fit every pipeline on the training part of the split.
DT_model.fit(X_train, y_train)
LogR_model.fit(X_train, y_train)
SVC_model.fit(X_train, y_train)


In [15]:
# 2. EVALUATE ON test.csv
# The file is assumed to contain the labelled 'price' column.
test_df = pd.read_csv("test.csv")

# Target, encoded with the same 0/1 mapping used for the training labels.
y_test_final = test_df["price"].map({"non-expensive": 0, "expensive": 1})

# Features: drop the target, then encode the Yes/No flags as 1/0 exactly as
# was done for the training features.
X_test_final = test_df.drop(["price"], axis=1).assign(
    **{col: test_df[col].map({"Yes": 1, "No": 0}) for col in binary_cols}
)


In [16]:

# Evaluate every fitted pipeline on the held-out test file and persist it.
# Each model is written to its own file: dumping all three to the single path
# "Models" made every save overwrite the previous one, so only the last (SVM)
# pipeline survived on disk. Code that used to load "Models" should load
# "SVC_model.joblib" to get the same object.
fitted_models = [
    ("Tree", DT_model, "DT_model.joblib"),
    ("Logistic Regression", LogR_model, "LogR_model.joblib"),
    ("SVM", SVC_model, "SVC_model.joblib"),
]

for label, model, model_path in fitted_models:
    # Predict
    y_pred_final = model.predict(X_test_final)
    print(f"{label} Test Set Accuracy:", accuracy_score(y_test_final, y_pred_final))
    print(classification_report(y_test_final, y_pred_final))
    joblib.dump(model, model_path)
    print("Model Saved✅")


Tree Test Set Accuracy: 0.9477124183006536
              precision    recall  f1-score   support

           0       0.97      0.95      0.96       110
           1       0.89      0.93      0.91        43

    accuracy                           0.95       153
   macro avg       0.93      0.94      0.94       153
weighted avg       0.95      0.95      0.95       153

Model Saved✅
Logistic Regression Test Set Accuracy: 0.9281045751633987
              precision    recall  f1-score   support

           0       0.95      0.95      0.95       110
           1       0.86      0.88      0.87        43

    accuracy                           0.93       153
   macro avg       0.91      0.91      0.91       153
weighted avg       0.93      0.93      0.93       153

Model Saved✅
SVM Test Set Accuracy: 0.9019607843137255
              precision    recall  f1-score   support

           0       0.93      0.94      0.93       110
           1       0.83      0.81      0.82        43

    accuracy 