In [1]:
import sklearn
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score
import time

In [2]:
data = pd.read_csv("cleaned_data.csv")

In [3]:
data.shape

(748211, 975)

In [4]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 748211 entries, 0 to 748210
Columns: 975 entries, mileage to personal_use_only_1.0
dtypes: float64(9), int64(966)
memory usage: 5.4 GB


In [5]:
print("Missing values in dataset:", data.isnull().sum().sum())

Missing values in dataset: 0


In [6]:
X = data.drop(columns=['price'])
y = data['price']

In [None]:
num_bins = 8 


data['binned_label'] = pd.cut(data['price'], bins=num_bins)

data['binned_label'] = data['binned_label'].astype('category')

data['numerical_label'] = data['binned_label'].cat.codes

y = data['numerical_label']

print("Unique values:", len(y.unique()))


# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

<h2>Naive Bayes<h2>

In [8]:
from sklearn.naive_bayes import GaussianNB

In [9]:
model = GaussianNB()

In [10]:
model.fit(X_train, y_train)

In [11]:
y_pred = model.predict(X_test)

In [12]:
r2 = r2_score(y_test, y_pred)
print("R-squared score:", r2)

R-squared score: -2.32748395649043


In [13]:
from sklearn.metrics import classification_report
report = classification_report(y_test, y_pred)

print(report)

              precision    recall  f1-score   support

           0       0.81      0.28      0.42     29146
           1       0.64      0.24      0.35     62533
           2       0.20      0.21      0.20     34680
           3       0.08      0.14      0.10     12885
           4       0.03      0.09      0.05      5222
           5       0.03      0.10      0.05      2376
           6       0.03      0.96      0.06      1132
           7       0.09      0.13      0.11      1669

    accuracy                           0.23    149643
   macro avg       0.24      0.27      0.17    149643
weighted avg       0.48      0.23      0.29    149643



<h2>Logistic Regression<h2>

In [14]:
from sklearn.linear_model import LogisticRegression

In [15]:
model = LogisticRegression(max_iter=300, verbose=1)

In [16]:
model.fit(X_train, y_train)

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


RUNNING THE L-BFGS-B CODE

           * * *

Machine precision = 2.220D-16
 N =         7800     M =           10

At X0         0 variables are exactly at the bounds

At iterate    0    f=  1.24469D+06    |proj g|=  3.51687D+05


 This problem is unconstrained.



At iterate   50    f=  4.22833D+05    |proj g|=  4.18047D+03

At iterate  100    f=  3.72877D+05    |proj g|=  1.89415D+03

At iterate  150    f=  3.51321D+05    |proj g|=  1.62601D+03

At iterate  200    f=  3.40747D+05    |proj g|=  8.80257D+02

At iterate  250    f=  3.35490D+05    |proj g|=  3.64743D+03

At iterate  300    f=  3.31697D+05    |proj g|=  6.52786D+02

           * * *

Tit   = total number of iterations
Tnf   = total number of function evaluations
Tnint = total number of segments explored during Cauchy searches
Skip  = number of BFGS updates skipped
Nact  = number of active bounds at final generalized Cauchy point
Projg = norm of the final projected gradient
F     = final function value

           * * *

   N    Tit     Tnf  Tnint  Skip  Nact     Projg        F
 7800    300    328      1     0     0   6.528D+02   3.317D+05
  F =   331696.87688449037     

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT                 


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed: 28.1min finished


In [17]:
y_pred = model.predict(X_test)

In [18]:
r2 = r2_score(y_test, y_pred)
print("R-squared score:", r2)

R-squared score: 0.7763279514758318


In [19]:
report = classification_report(y_test, y_pred)

print(report)

              precision    recall  f1-score   support

           0       0.88      0.86      0.87     29146
           1       0.84      0.87      0.85     62533
           2       0.71      0.76      0.74     34680
           3       0.57      0.54      0.55     12885
           4       0.52      0.40      0.45      5222
           5       0.45      0.24      0.31      2376
           6       0.46      0.15      0.23      1132
           7       0.59      0.70      0.64      1669

    accuracy                           0.78    149643
   macro avg       0.63      0.56      0.58    149643
weighted avg       0.77      0.78      0.77    149643

