# ML Notebook

In [148]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import LabelEncoder

In [149]:
df = pd.read_csv('db/vw/jetta.csv')
df = df.dropna(subset=['year', 'range'])

In [150]:
df_cleaned = df.drop_duplicates()

df_cleaned['date'] = pd.to_datetime(df_cleaned['date'])
df_cleaned['year'] = df_cleaned['year'].astype(int)
df_cleaned['hp'] = df_cleaned['hp'].astype(int)
df_cleaned['range'] = df_cleaned['range'].astype(int)
df_cleaned['kp'] = df_cleaned['kp'].astype(int)
df_cleaned['dtp'] = df_cleaned['dtp'].astype(int)
df_cleaned['subtitle'] = df_cleaned['subtitle'].astype(str)

In [151]:
df_cleaned = df_cleaned[df_cleaned['year'] >= 2004]
df_cleaned.loc[df_cleaned['kp'] == -1, 'kp'] = 0
df_cleaned.loc[df_cleaned['hp'] == -1, 'hp'] = df_cleaned['hp'].mean()

  df_cleaned.loc[df_cleaned['hp'] == -1, 'hp'] = df_cleaned['hp'].mean()


In [152]:
def split_subtitle(subtitle):
  return subtitle.split('•')[0]

df_cleaned['new_subtitle1'] = df_cleaned['subtitle'].apply(split_subtitle)
df_cleaned['new_subtitle1'] = df_cleaned['new_subtitle1'].str.strip()

df_cleaned = df_cleaned[df_cleaned['new_subtitle1'].map(df_cleaned['new_subtitle1'].value_counts()) >= 10]

# Encode the new_subtitle column
label_encoder = LabelEncoder()
df_cleaned['new_subtitle1_encoded'] = label_encoder.fit_transform(df_cleaned['new_subtitle1'])


In [153]:
def split_subtitle(subtitle):
    parts = subtitle.split('•')
    return parts[1] if len(parts) > 1 else ''

df_cleaned['new_subtitle2'] = df_cleaned['subtitle'].apply(split_subtitle)
df_cleaned['new_subtitle2'] = df_cleaned['new_subtitle2'].str.strip()


# Encode the new_subtitle column
df_cleaned['new_subtitle2_encoded'] = label_encoder.fit_transform(df_cleaned['new_subtitle2'])


In [154]:
df_cleaned['mileage_per_year'] = df_cleaned['range'] / (2024 - df_cleaned['year'])

In [155]:
X = df_cleaned[['year', 'range', 'kp', 'hp', 'dtp', 'new_subtitle1_encoded', 'mileage_per_year']]
y = df_cleaned['price']

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

# Create and train the model
model = RandomForestRegressor(n_estimators=500)
model.fit(X_train, y_train)

# Make predictions
y_pred = model.predict(X_test)

# Evaluate the model
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
r2 = model.score(X_test, y_test)
mape = np.mean(np.abs((y_test - y_pred) / y_test)) * 100
print(f'Mean Squared Error: {mse}')
print(f'Root Mean Squared Error: {rmse}')
print(f'Mean Absolute Percentage Error: {mape}')
print(f'R^2: {r2}')

Mean Squared Error: 2646701.988554963
Root Mean Squared Error: 1626.8687680802539
Mean Absolute Percentage Error: 12.241516971493914
R^2: 0.6852617344537364


In [156]:
model.feature_importances_

array([0.71171097, 0.09275274, 0.01826847, 0.06728032, 0.01347844,
       0.01667153, 0.07983753])

In [157]:
from sklearn.model_selection import cross_val_score

# Perform cross-validation
cv_scores = cross_val_score(model, X, y, cv=5, scoring='neg_mean_squared_error')

# Convert negative MSE to positive
cv_scores = -cv_scores

# Calculate mean and standard deviation of the cross-validation scores
mean_cv_score = np.mean(cv_scores)
std_cv_score = np.std(cv_scores)

print(f'Cross-Validation Mean Squared Error: {mean_cv_score}')
print(f'Root Cross-Validation Mean Squared Error: {np.sqrt(mean_cv_score)}')

Cross-Validation Mean Squared Error: 2512393.1188795017
Root Cross-Validation Mean Squared Error: 1585.053033459607
