# Jupyter notebook for working with the car price prediction dataset

In [22]:
import csv
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.neural_network import MLPRegressor
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score


## Describe and analyze data

In [2]:
# Load the raw car listings from a CSV file; the path is relative to the
# current working directory.
data = pd.read_csv("car_price_prediction.csv")

In [3]:
# Preview the first rows to sanity-check how the columns were parsed.
data.head()

Unnamed: 0,Price,Manufacturer,Model,Prod. year,Category,Leather interior,Fuel type,Engine volume,Mileage,Cylinders,Gear box type,Drive wheels,Doors,Wheel,Color
0,13328,LEXUS,RX 450,2010,Jeep,Yes,Hybrid,3.5,186005,6.0,Automatic,4x4,4,Left wheel,Silver
1,16621,CHEVROLET,Equinox,2011,Jeep,No,Petrol,3.0,192000,6.0,Tiptronic,4x4,4,Left wheel,Black
2,8467,HONDA,FIT,2006,Hatchback,No,Petrol,1.3,200000,4.0,Variator,Front,4,Right-hand drive,Black
3,3607,FORD,Escape,2011,Jeep,Yes,Hybrid,2.5,168966,4.0,Automatic,4x4,4,Left wheel,White
4,11726,HONDA,FIT,2014,Hatchback,Yes,Petrol,1.3,91901,4.0,Automatic,Front,4,Left wheel,Silver


In [4]:
# Column dtypes and non-null counts.
# NOTE(review): in the recorded output 'Engine volume' is parsed as object
# (string) rather than a number — confirm before treating it as numeric.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 19237 entries, 0 to 19236
Data columns (total 15 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   Price             19237 non-null  int64  
 1   Manufacturer      19237 non-null  object 
 2   Model             19237 non-null  object 
 3   Prod. year        19237 non-null  int64  
 4   Category          19237 non-null  object 
 5   Leather interior  19237 non-null  object 
 6   Fuel type         19237 non-null  object 
 7   Engine volume     19237 non-null  object 
 8   Mileage           19237 non-null  int64  
 9   Cylinders         19237 non-null  float64
 10  Gear box type     19237 non-null  object 
 11  Drive wheels      19237 non-null  object 
 12  Doors             19237 non-null  int64  
 13  Wheel             19237 non-null  object 
 14  Color             19237 non-null  object 
dtypes: float64(1), int64(4), object(10)
memory usage: 2.2+ MB


In [5]:
# Summary statistics for the numeric columns.
# NOTE(review): the recorded output shows maxima for Price and Mileage that are
# orders of magnitude above the 75th percentile — likely outliers worth
# handling before modelling.
data.describe()

Unnamed: 0,Price,Prod. year,Mileage,Cylinders,Doors
count,19237.0,19237.0,19237.0,19237.0,19237.0
mean,18555.93,2010.912824,1532236.0,4.582991,3.925872
std,190581.3,5.668673,48403870.0,1.199933,0.403399
min,1.0,1939.0,0.0,1.0,2.0
25%,5331.0,2009.0,70139.0,4.0,4.0
50%,13172.0,2012.0,126000.0,4.0,4.0
75%,22075.0,2015.0,188888.0,4.0,4.0
max,26307500.0,2020.0,2147484000.0,16.0,5.0


In [6]:
# List the column labels of the loaded frame.
data.columns

Index(['Price', 'Manufacturer', 'Model', 'Prod. year', 'Category',
       'Leather interior', 'Fuel type', 'Engine volume', 'Mileage',
       'Cylinders', 'Gear box type', 'Drive wheels', 'Doors', 'Wheel',
       'Color'],
      dtype='object')

In [7]:
# (number of rows, number of columns) of the loaded frame.
data.shape

(19237, 15)

In [8]:
# NOTE(review): this encoder instance is not used in this cell — its only use
# (the line below) is commented out.
label_encoder = LabelEncoder()
# data['Manufacturer_encoded'] = label_encoder.fit_transform(data['Manufacturer'])
# Display the frame; Jupyter truncates long frames to the first and last rows.
data

Unnamed: 0,Price,Manufacturer,Model,Prod. year,Category,Leather interior,Fuel type,Engine volume,Mileage,Cylinders,Gear box type,Drive wheels,Doors,Wheel,Color
0,13328,LEXUS,RX 450,2010,Jeep,Yes,Hybrid,3.5,186005,6.0,Automatic,4x4,4,Left wheel,Silver
1,16621,CHEVROLET,Equinox,2011,Jeep,No,Petrol,3,192000,6.0,Tiptronic,4x4,4,Left wheel,Black
2,8467,HONDA,FIT,2006,Hatchback,No,Petrol,1.3,200000,4.0,Variator,Front,4,Right-hand drive,Black
3,3607,FORD,Escape,2011,Jeep,Yes,Hybrid,2.5,168966,4.0,Automatic,4x4,4,Left wheel,White
4,11726,HONDA,FIT,2014,Hatchback,Yes,Petrol,1.3,91901,4.0,Automatic,Front,4,Left wheel,Silver
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
19232,8467,MERCEDES-BENZ,CLK 200,1999,Coupe,Yes,CNG,2.0 Turbo,300000,4.0,Manual,Rear,2,Left wheel,Silver
19233,15681,HYUNDAI,Sonata,2011,Sedan,Yes,Petrol,2.4,161600,4.0,Tiptronic,Front,4,Left wheel,Red
19234,26108,HYUNDAI,Tucson,2010,Jeep,Yes,Diesel,2,116365,4.0,Automatic,Front,4,Left wheel,Grey
19235,5331,CHEVROLET,Captiva,2007,Jeep,Yes,Diesel,2,51258,4.0,Automatic,Front,4,Left wheel,Black


In [9]:
# Disabled inspection cell: if re-enabled it would show the value -> integer
# code mapping held by `label_encoder`.
# NOTE(review): the only fit_transform call before this point is commented out,
# so `label_encoder` would not be fitted here — confirm before re-enabling.
# dict(zip(label_encoder.classes_, label_encoder.transform(label_encoder.classes_)))
# label_encoder.classes_

In [10]:
# Re-display the frame; no statement since the previous display has modified it.
data

Unnamed: 0,Price,Manufacturer,Model,Prod. year,Category,Leather interior,Fuel type,Engine volume,Mileage,Cylinders,Gear box type,Drive wheels,Doors,Wheel,Color
0,13328,LEXUS,RX 450,2010,Jeep,Yes,Hybrid,3.5,186005,6.0,Automatic,4x4,4,Left wheel,Silver
1,16621,CHEVROLET,Equinox,2011,Jeep,No,Petrol,3,192000,6.0,Tiptronic,4x4,4,Left wheel,Black
2,8467,HONDA,FIT,2006,Hatchback,No,Petrol,1.3,200000,4.0,Variator,Front,4,Right-hand drive,Black
3,3607,FORD,Escape,2011,Jeep,Yes,Hybrid,2.5,168966,4.0,Automatic,4x4,4,Left wheel,White
4,11726,HONDA,FIT,2014,Hatchback,Yes,Petrol,1.3,91901,4.0,Automatic,Front,4,Left wheel,Silver
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
19232,8467,MERCEDES-BENZ,CLK 200,1999,Coupe,Yes,CNG,2.0 Turbo,300000,4.0,Manual,Rear,2,Left wheel,Silver
19233,15681,HYUNDAI,Sonata,2011,Sedan,Yes,Petrol,2.4,161600,4.0,Tiptronic,Front,4,Left wheel,Red
19234,26108,HYUNDAI,Tucson,2010,Jeep,Yes,Diesel,2,116365,4.0,Automatic,Front,4,Left wheel,Grey
19235,5331,CHEVROLET,Captiva,2007,Jeep,Yes,Diesel,2,51258,4.0,Automatic,Front,4,Left wheel,Black


In [14]:
# Build `train_frame`, the all-numeric table used for modelling.
#
# Every column listed in `categorical_columns` is label-encoded with its own
# LabelEncoder: each distinct value is replaced by an integer code. The learned
# value -> code mapping of each column is stored in `transform_dict[column]`
# so that codes can be translated back to the original values later.
#
# All remaining columns of `data` are copied over unchanged, except 'Model',
# which is skipped entirely.
#
# NOTE(review): label encoding imposes an arbitrary order on nominal columns,
# and 'Engine volume' holds strings such as "2.0 Turbo", so its codes do not
# follow the numeric engine size — consider parsing it into numeric features.
categorical_columns = [
    "Manufacturer",
    "Category",
    "Leather interior",
    "Fuel type",
    "Engine volume",
    "Gear box type",
    "Drive wheels",
    "Wheel",
    "Color",
]

transform_dict = {}
encoded_columns = {}

for key in categorical_columns:
    # A fresh encoder per column keeps the mappings independent of each other.
    label_encoder = LabelEncoder()
    encoded_columns[key] = label_encoder.fit_transform(data[key])
    # The code of a value is its position in `classes_`, so the mapping can be
    # read off directly instead of round-tripping through transform().
    transform_dict[key] = {
        label: code for code, label in enumerate(label_encoder.classes_)
    }

# Build the frame on data's own index. The encoded arrays are positional, while
# the columns copied below are aligned by index label; sharing the index keeps
# both in step even if `data` has been filtered without resetting its index.
train_frame = pd.DataFrame(encoded_columns, index=data.index)

# Copy every column that was not encoded above, keeping data's column order.
for column in data.columns:
    if column not in train_frame.columns and column != "Model":
        train_frame[column] = data[column]

In [15]:
# Inspect the modelling frame: label-encoded categorical columns first,
# followed by the columns copied unchanged from `data`.
train_frame

Unnamed: 0,Manufacturer,Category,Leather interior,Fuel type,Engine volume,Gear box type,Drive wheels,Wheel,Color,Price,Prod. year,Mileage,Cylinders,Doors
0,32,4,1,2,63,0,0,0,12,13328,2010,186005,6.0,4
1,8,4,0,5,56,2,0,0,1,16621,2011,192000,6.0,4
2,21,3,0,5,22,3,1,1,1,8467,2006,200000,4.0,4
3,16,4,1,2,46,0,0,0,14,3607,2011,168966,4.0,4
4,21,3,1,5,22,0,1,0,12,11726,2014,91901,4.0,4
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
19232,36,1,1,0,37,1,2,0,12,8467,1999,300000,4.0,2
19233,23,9,1,5,44,2,1,0,11,15681,2011,161600,4.0,4
19234,23,4,1,1,36,0,1,0,7,26108,2010,116365,4.0,4
19235,8,4,1,1,36,0,1,0,1,5331,2007,51258,4.0,4


In [23]:
# Target is the listing price. The feature matrix is every other column of
# train_frame except 'Manufacturer', which is dropped together with the target.
y = train_frame['Price']
X = train_frame.drop(columns=['Price', 'Manufacturer'])

# Hold out 20% of the rows for evaluation; the fixed random_state keeps the
# split identical between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Fit a plain linear regression on the training portion.
model = LinearRegression()
model.fit(X_train, y_train)

# Predict prices for the held-out rows and score them.
# NOTE(review): the recorded run of this cell reported a negative R-squared,
# i.e. worse than always predicting the mean price — worth revisiting the
# features and the extreme Price/Mileage values.
y_pred = model.predict(X_test)
r2 = r2_score(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)

# Report both metrics.
print('Mean Squared Error:', mse)
print('R-squared:', r2)

Mean Squared Error: 361625448.0923238
R-squared: -0.16055640867980636
