In [29]:
# Import required libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import mean_squared_error
from sklearn.ensemble import RandomForestRegressor



# Load the datafram from CSV
df = pd.read_csv('processed_car_ads.csv')

# Print the dataframe
print(df)

     maker    model  mileage  manufacture_year  engine_displacement  \
0     ford   galaxy   151000            2011.0                 2000   
1    skoda  octavia   143476            2012.0                 2000   
2      bmw      NaN    97676            2010.0                 1995   
3    skoda    fabia   111970            2004.0                 1200   
4    skoda    fabia   128886            2004.0                 1200   
..     ...      ...      ...               ...                  ...   
222   ford    focus   136074            2011.0                 1600   
223   ford   mondeo   137382            2011.0                 2000   
224   ford   fiesta   138000            2004.0                 1398   
225    NaN      NaN   288000            1998.0                 1896   
226    kia      rio    92080            2006.0                 1400   

     engine_power  body_type  color_slug  stk_year transmission  door_count  \
0           103.0        NaN         NaN       NaN          man     

In [30]:
# Convert date strings to datetime objects and extract useful features
for col in ['date_created', 'date_last_seen']:
    df[col] = pd.to_datetime(df[col])
    df[col+'_year'] = df[col].dt.year
    df[col+'_month'] = df[col].dt.month
    df[col+'_day'] = df[col].dt.day
    df[col+'_hour'] = df[col].dt.hour
    df[col+'_minute'] = df[col].dt.minute

# Drop the original date columns
df = df.drop(columns=['date_created', 'date_last_seen'])

# Print the dataframe
print(df)

# Convert all columns to string
df = df.astype(str)

# Convert non-numeric values to numeric using LabelEncoder
label_encoder = LabelEncoder()
for column in df.columns:
    df[column] = label_encoder.fit_transform(df[column])



     maker    model  mileage  manufacture_year  engine_displacement  \
0     ford   galaxy   151000            2011.0                 2000   
1    skoda  octavia   143476            2012.0                 2000   
2      bmw      NaN    97676            2010.0                 1995   
3    skoda    fabia   111970            2004.0                 1200   
4    skoda    fabia   128886            2004.0                 1200   
..     ...      ...      ...               ...                  ...   
222   ford    focus   136074            2011.0                 1600   
223   ford   mondeo   137382            2011.0                 2000   
224   ford   fiesta   138000            2004.0                 1398   
225    NaN      NaN   288000            1998.0                 1896   
226    kia      rio    92080            2006.0                 1400   

     engine_power  body_type  color_slug  stk_year transmission  ...  \
0           103.0        NaN         NaN       NaN          man  ...   
1  

In [31]:
# Split the data into features and target variable
X = df.drop('price_eur', axis=1)
y = df['price_eur']

# Split the data into training and validation sets
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

# Define the model and parameters
model = RandomForestRegressor(random_state=42)
parameters = {'n_estimators': [50, 100, 200, 500]}




In [32]:
# Define the GridSearchCV object
grid_obj = GridSearchCV(model, parameters, cv=3, scoring='neg_mean_squared_error')

# Fit the grid search object to the data
grid_obj = grid_obj.fit(X_train, y_train)

# Get the estimator
model = grid_obj.best_estimator_


In [33]:
# Fit the best model to the training data
model.fit(X_train, y_train)

# Make predictions on validation set
predictions = model.predict(X_val)

In [34]:
# Calculate mean squared error
mse = mean_squared_error(y_val, predictions)
print(f'The calculated mean Squared Error: {mse}')

The calculated mean Squared Error: 1258.3428320652174
