# Regression analysis

## Libraries and settings

In [59]:
# Libraries
import os
import numpy as np
import pandas as pd
import scipy.stats as stats
import statsmodels.api as sm
import matplotlib.pyplot as plt
from sklearn import linear_model
from sklearn.metrics import r2_score
from sklearn.datasets import make_regression
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split

# Ignore warnings
import warnings
warnings.filterwarnings('ignore')

# Show current working directory
print(os.getcwd())

/Users/sauternicolas/git/data_analytics/WTYK/LC_08


## Import data

In [60]:
# Columns
columns = ['Offer_Id', 
           'Price_num', 
           'Mileage_num', 
           'HP_num', 
           'Make', 
           'Fuel_Type', 
           'Transmission', 
           'Init_Regist_MY']

# Read the data
df_orig = pd.read_csv('autoscout24_data_prepared.csv', 
                      sep=",", 
                      encoding='utf-8',
                      usecols=columns)

# Rename variable 'web-scraper-order' to 'apmt_id'
df_orig = df_orig.rename(columns={'Mileage_num': 'Mileage',
                                  'HP_num': 'HP',
                                  'Price_num': 'Price'})

# Extract year
df_orig['Init_Regist_MY'] = df_orig['Init_Regist_MY'].astype(str)
df_orig['Year'] = df_orig['Init_Regist_MY'].str.split('.').str[1].astype(int)
df_orig.drop('Init_Regist_MY', axis=1, inplace=True)

# Remove missing values
df = df_orig.dropna()
df.head(5)

# Remove duplicates
df = df.drop_duplicates()
df.head(5)

# Remove some extreme prices; keep only fuel type 'Diesel' and 'Benzin'
df = df.loc[(df['Price'] >= 10000) & (df['Price'] <= 100000) & (df['Fuel_Type'].isin(['Diesel','Benzin']))]
df


Unnamed: 0,Offer_Id,Fuel_Type,Transmission,Make,Mileage,HP,Price,Year
0,7324420,Diesel,Automatisiertes Schaltgetriebe,AUDI,75000,245,22500,2014
1,7512768,Benzin,Automat sequentiell,MERCEDES-BENZ,46655,184,23749,2013
2,7512034,Benzin,Automat sequentiell,MERCEDES-BENZ,138955,306,18500,2011
3,7512728,Benzin,Automatisiertes Schaltgetriebe,MERCEDES-BENZ,43000,360,36000,2015
4,7490242,Benzin,Automatisiertes Schaltgetriebe,AUDI,43300,252,48500,2018
...,...,...,...,...,...,...,...,...
3964,6058456,Benzin,Automat,BENTLEY,26200,405,38500,2001
3966,7137137,Benzin,Automatisiertes Schaltgetriebe,FERRARI,78000,400,58400,2002
3967,6758841,Benzin,Automatik-Getriebe,MERCEDES-BENZ,315000,232,15900,1987
3968,7461330,Benzin,Automat sequentiell,MERCEDES-BENZ,85000,525,42000,2007


## One-hot encoding of categorical variables

In [61]:
# Perform one-hot encoding
one_hot_enconded = pd.get_dummies(df, columns=['Make', 'Fuel_Type', 'Transmission'])

# Concatenate the new one-hot encoded columns with the original DataFrame
one_hot_enconded = pd.concat([df, one_hot_enconded], axis=1)
one_hot_enconded

Unnamed: 0,Offer_Id,Fuel_Type,Transmission,Make,Mileage,HP,Price,Year,Offer_Id.1,Mileage.1,...,Transmission_Automat,Transmission_Automat sequentiell,Transmission_Automat stufenlos,"Transmission_Automat stufenlos, sequentiell",Transmission_Automatik-Getriebe,Transmission_Automatisiertes Schaltgetriebe,Transmission_Hinterradantrieb,Transmission_Schaltgetriebe,Transmission_Schaltgetriebe manuell,Transmission_Schaltgetriebe sequentiell
0,7324420,Diesel,Automatisiertes Schaltgetriebe,AUDI,75000,245,22500,2014,7324420,75000,...,False,False,False,False,False,True,False,False,False,False
1,7512768,Benzin,Automat sequentiell,MERCEDES-BENZ,46655,184,23749,2013,7512768,46655,...,False,True,False,False,False,False,False,False,False,False
2,7512034,Benzin,Automat sequentiell,MERCEDES-BENZ,138955,306,18500,2011,7512034,138955,...,False,True,False,False,False,False,False,False,False,False
3,7512728,Benzin,Automatisiertes Schaltgetriebe,MERCEDES-BENZ,43000,360,36000,2015,7512728,43000,...,False,False,False,False,False,True,False,False,False,False
4,7490242,Benzin,Automatisiertes Schaltgetriebe,AUDI,43300,252,48500,2018,7490242,43300,...,False,False,False,False,False,True,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3964,6058456,Benzin,Automat,BENTLEY,26200,405,38500,2001,6058456,26200,...,True,False,False,False,False,False,False,False,False,False
3966,7137137,Benzin,Automatisiertes Schaltgetriebe,FERRARI,78000,400,58400,2002,7137137,78000,...,False,False,False,False,False,True,False,False,False,False
3967,6758841,Benzin,Automatik-Getriebe,MERCEDES-BENZ,315000,232,15900,1987,6758841,315000,...,False,False,False,False,True,False,False,False,False,False
3968,7461330,Benzin,Automat sequentiell,MERCEDES-BENZ,85000,525,42000,2007,7461330,85000,...,False,True,False,False,False,False,False,False,False,False


## Create train and test samples (train = 80%, test = 20% of the data)

In [62]:
# Create train and test samples
X_train, X_test, y_train, y_test = train_test_split(one_hot_enconded.drop(['Price', "HP", "Year", "Bezin"], axis=1), 
                                                    one_hot_enconded['Price'], 
                                                    test_size=0.20, 
                                                    random_state=42)

# Show X_train
print(X_train)
print(X_train.head(), '\n')

# Show y_train
print("y_train")
print(y_train.head())

KeyError: "['Price' 'HP' 'Year' 'Bezin'] not found in axis"

## Multiple linear regression

In [None]:
# Add constant to the model
x_train_const = sm.add_constant(X_train)

# Create the multiple regression model
olsmod = sm.OLS(y_train, x_train_const)
olsres = olsmod.fit()

# Prinprint the summary
print(olsres.summary())

### Plot histogram of multiple linear regression residuals

In [None]:
# Plot histogram of residuals


# Set axes labels and title


# Show the plot


## Random forest regression

### Fit Random forest regression model

In [None]:
# Initialize rand forest regressor


# Fit the model


# Calculate coefficient of determination (R-squared)


### Plot histogram of random forest regression residuals

In [None]:
# Calculate predictions from random forest model based on training data



# Calculate residuals by claculating the difference between actual and predicted values



# Plot histogram of residuals


# Set labels


### Show feature importance

In [None]:
# Derive columns from X_train


# Derive feature importance from random forest


# Print col-names and importances-values


# Barplot with feature importance


### Jupyter notebook --footer info-- (please always provide this at the end of each submitted notebook)

In [None]:
import os
import platform
import socket
from platform import python_version
from datetime import datetime

print('-----------------------------------')
print(os.name.upper())
print(platform.system(), '|', platform.release())
print('Datetime:', datetime.now().strftime("%Y-%m-%d %H:%M:%S"))
print('Python Version:', python_version())
print('-----------------------------------')

-----------------------------------
POSIX
Darwin | 24.0.0
Datetime: 2024-11-04 17:18:06
Python Version: 3.10.14
-----------------------------------
