# Regression analysis

## Libraries and settings

In [1]:
# Libraries
import os
import numpy as np
import pandas as pd
import scipy.stats as stats
import statsmodels.api as sm
import matplotlib.pyplot as plt
from sklearn import linear_model
from sklearn.metrics import r2_score
from sklearn.datasets import make_regression
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split

# Ignore warnings
import warnings
warnings.filterwarnings('ignore')

# Show current working directory
print(os.getcwd())

/workspaces/data_analytics/WTYK/LC_08


## Import car data

In [2]:
# Columns to load from the prepared car data file
columns = ['Offer_Id',
           'Price_num',
           'Mileage_num',
           'HP_num',
           'Make',
           'Fuel_Type',
           'Transmission',
           'Init_Regist_MY']

# Read the data.
# 'Init_Regist_MY' is forced to text: the split on '.' below implies values of
# the form '<month>.<year>' (e.g. '10.2010' -- TODO confirm against the CSV).
# Left to type inference, such a value is parsed as the float 10.201 and the
# extracted year silently loses its trailing zero (2010 -> 201).
df_orig = pd.read_csv('autoscout24_data_prepared.csv',
                      sep=",",
                      encoding='utf-8',
                      usecols=columns,
                      dtype={'Init_Regist_MY': str})

# Strip the '_num' suffix from the numeric columns
df_orig = df_orig.rename(columns={'Mileage_num': 'Mileage',
                                  'HP_num': 'HP',
                                  'Price_num': 'Price'})

# Extract the year (the part after the '.') from 'Init_Regist_MY',
# then drop the source column (reassignment instead of inplace=True so that
# re-running the statement chain from df_orig stays predictable)
df_orig['Year'] = df_orig['Init_Regist_MY'].str.split('.').str[1].astype(int)
df_orig = df_orig.drop(columns='Init_Regist_MY')

# Remove missing values
df = df_orig.dropna()

# Remove duplicates
df = df.drop_duplicates()

# Remove some extreme prices; keep only fuel type 'Diesel' and 'Benzin'
df = df.loc[(df['Price'] >= 10000) & (df['Price'] <= 100000) & (df['Fuel_Type'].isin(['Diesel','Benzin']))]

# Show dimensions (rows and columns)
print(df.shape)

# Show the data
df.head()


(2845, 8)


Unnamed: 0,Offer_Id,Fuel_Type,Transmission,Make,Mileage,HP,Price,Year
0,7324420,Diesel,Automatisiertes Schaltgetriebe,AUDI,75000,245,22500,2014
1,7512768,Benzin,Automat sequentiell,MERCEDES-BENZ,46655,184,23749,2013
2,7512034,Benzin,Automat sequentiell,MERCEDES-BENZ,138955,306,18500,2011
3,7512728,Benzin,Automatisiertes Schaltgetriebe,MERCEDES-BENZ,43000,360,36000,2015
4,7490242,Benzin,Automatisiertes Schaltgetriebe,AUDI,43300,252,48500,2018


## One-hot encoding of categorical variables

In [3]:
# Perform one-hot encoding on the 'Fuel_Type' column using the get_dummies() function


# Concatenate the new one-hot encoded columns with the original DataFrame


## Create train and test samples (train = 80%, test = 20% of the data)

In [4]:
# Create train and test samples


# Show X_train


# Show y_train


## Multiple linear regression

In [5]:
# Add constant to the model


# Create the multiple regression model using the Ordinary Least Squares (OLS) method


# Print full model output


### Plot histogram of multiple linear regression residuals

In [6]:
# Plot histogram of residuals


# Set axes labels and plot title


# Show the plot


## Random forest regression

### Fit Random forest regression model

In [7]:
# Initialize the random forest regressor



# Fit the model



# Calculate coefficient of determination (R-squared), round to 4 decimal places with f-string formatting


### Plot histogram of random forest regression residuals

In [8]:
# Predict the target values (needed to calculate the residuals)


# Calculate residuals by subtracting actual target values from predicted values


# Plot histogram of residuals



### Show feature importance

In [9]:
# Get column names of X_train


# Derive feature importance from random forest


# Print col-names and importances-values


# Barplot with feature importance


### Jupyter notebook --footer info-- (please always provide this at the end of each submitted notebook)

In [10]:
import os
import platform
import socket
from platform import python_version
from datetime import datetime

# Horizontal rule that frames the footer
separator = '-----------------------------------'

# Timestamp of this notebook run
timestamp = datetime.now().strftime("%Y-%m-%d %H:%M:%S")

# Footer: OS family, platform and kernel release, run time, interpreter version
print(separator)
print(os.name.upper())
print(platform.system(), '|', platform.release())
print('Datetime:', timestamp)
print('Python Version:', python_version())
print(separator)

-----------------------------------
POSIX
Linux | 6.5.0-1025-azure
Datetime: 2024-11-13 10:14:51
Python Version: 3.11.10
-----------------------------------
