### The goal is to focus on the Driving Style output

In [2]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

## Data preprocessing

In [3]:
# load the files
opel1=pd.read_csv('ML Assignment1/opel_corsa_01.csv',delimiter=';')
opel2=pd.read_csv('ML Assignment1/opel_corsa_02.csv',delimiter=';')
peugeot1=pd.read_csv('ML Assignment1/peugeot_207_01.csv',delimiter=';')
peugeot2=pd.read_csv('ML Assignment1/peugeot_207_02.csv',delimiter=';')

In [4]:
# explore basic information
opel1.info()
print('\n')
opel2.info()
print('\n')
peugeot1.info()
print('\n')
peugeot2.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7038 entries, 0 to 7037
Data columns (total 18 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   Unnamed: 0                 7038 non-null   int64  
 1   AltitudeVariation          7038 non-null   float64
 2   VehicleSpeedInstantaneous  7038 non-null   float64
 3   VehicleSpeedAverage        7038 non-null   float64
 4   VehicleSpeedVariance       7038 non-null   float64
 5   VehicleSpeedVariation      7038 non-null   float64
 6   LongitudinalAcceleration   7038 non-null   float64
 7   EngineLoad                 7038 non-null   float64
 8   EngineCoolantTemperature   7038 non-null   int64  
 9   ManifoldAbsolutePressure   7038 non-null   int64  
 10  EngineRPM                  7038 non-null   int64  
 11  MassAirFlow                7038 non-null   float64
 12  IntakeAirTemperature       7038 non-null   int64  
 13  VerticalAcceleration       7038 non-null   float

In [5]:
# check if datasets are imbalanced for the Driving Style output
balance_opel1 = opel1['drivingStyle'].value_counts()
print(balance_opel1, '\n')

balance_opel2 = opel2['drivingStyle'].value_counts()
print(balance_opel2,'\n')

balance_peugeot1 = peugeot1['drivingStyle'].value_counts()
print(balance_peugeot1,'\n')

balance_peugeot2 = peugeot2['drivingStyle'].value_counts()
print(balance_peugeot2,'\n')

drivingStyle
EvenPaceStyle      5751
AggressiveStyle    1287
Name: count, dtype: int64 

drivingStyle
EvenPaceStyle      3290
AggressiveStyle     802
Name: count, dtype: int64 

drivingStyle
EvenPaceStyle      7716
AggressiveStyle     483
Name: count, dtype: int64 

drivingStyle
EvenPaceStyle      4259
AggressiveStyle     187
Name: count, dtype: int64 



In [6]:
# adding 'make' columns before joining the datasets
opel1['make'] = 'opel'
opel2['make'] = 'opel'
peugeot1['make'] = 'peugeot'
peugeot2['make'] = 'peugeot'

In [7]:
# joining the datasets
df = pd.concat([opel1,opel2,peugeot1,peugeot2])
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 23775 entries, 0 to 4445
Data columns (total 19 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   Unnamed: 0                 23775 non-null  int64  
 1   AltitudeVariation          23775 non-null  float64
 2   VehicleSpeedInstantaneous  23766 non-null  float64
 3   VehicleSpeedAverage        23775 non-null  float64
 4   VehicleSpeedVariance       23775 non-null  float64
 5   VehicleSpeedVariation      23775 non-null  float64
 6   LongitudinalAcceleration   23775 non-null  float64
 7   EngineLoad                 23770 non-null  float64
 8   EngineCoolantTemperature   23770 non-null  float64
 9   ManifoldAbsolutePressure   23770 non-null  float64
 10  EngineRPM                  23770 non-null  float64
 11  MassAirFlow                23770 non-null  float64
 12  IntakeAirTemperature       23770 non-null  float64
 13  VerticalAcceleration       23775 non-null  float64
 

In [8]:
df.shape

(23775, 19)

In [9]:
# check for null values
df.isnull().sum()

Unnamed: 0                   0
AltitudeVariation            0
VehicleSpeedInstantaneous    9
VehicleSpeedAverage          0
VehicleSpeedVariance         0
VehicleSpeedVariation        0
LongitudinalAcceleration     0
EngineLoad                   5
EngineCoolantTemperature     5
ManifoldAbsolutePressure     5
EngineRPM                    5
MassAirFlow                  5
IntakeAirTemperature         5
VerticalAcceleration         0
FuelConsumptionAverage       5
roadSurface                  0
traffic                      0
drivingStyle                 0
make                         0
dtype: int64

In [10]:
# drop null values, the total amount is 44, which is a small percentage of 23775
df.dropna(inplace=True)

In [11]:
# double check for null values
df.isnull().sum()

Unnamed: 0                   0
AltitudeVariation            0
VehicleSpeedInstantaneous    0
VehicleSpeedAverage          0
VehicleSpeedVariance         0
VehicleSpeedVariation        0
LongitudinalAcceleration     0
EngineLoad                   0
EngineCoolantTemperature     0
ManifoldAbsolutePressure     0
EngineRPM                    0
MassAirFlow                  0
IntakeAirTemperature         0
VerticalAcceleration         0
FuelConsumptionAverage       0
roadSurface                  0
traffic                      0
drivingStyle                 0
make                         0
dtype: int64

In [12]:
# drop duplicates if any
df = df.drop_duplicates()

In [13]:
#convert the 'Unnamed: 0' column to float to match the rest of the columns with numerical value
df['Unnamed: 0'] = df['Unnamed: 0'].astype('float') 

In [14]:
from sklearn.preprocessing import OneHotEncoder

In [15]:
# encode 'roadSurface', 'traffic' and 'make' columns using OneHotEncoder

# check unique value for each column
print('Unique values for each column:\n')
print('roadSurface column: ',df['roadSurface'].unique())
print('traffic column: ',df['traffic'].unique())
print('make column: ',df['make'].unique())

# encode the column into a new dataframe
encoder = OneHotEncoder(sparse_output=False).set_output(transform='pandas')
encoded_columns = encoder.fit_transform(df[['roadSurface','traffic','make']])
encoded_columns

Unique values for each column:

roadSurface column:  ['SmoothCondition' 'UnevenCondition' 'FullOfHolesCondition']
traffic column:  ['LowCongestionCondition' 'NormalCongestionCondition'
 'HighCongestionCondition']
make column:  ['opel' 'peugeot']


Unnamed: 0,roadSurface_FullOfHolesCondition,roadSurface_SmoothCondition,roadSurface_UnevenCondition,traffic_HighCongestionCondition,traffic_LowCongestionCondition,traffic_NormalCongestionCondition,make_opel,make_peugeot
0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0
1,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0
2,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0
3,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0
4,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0
...,...,...,...,...,...,...,...,...
4441,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0
4442,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0
4443,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0
4444,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0


In [16]:
#dropping transformed columns
df=df.drop(columns=['roadSurface','traffic','make'],axis=1)
df

Unnamed: 0.1,Unnamed: 0,AltitudeVariation,VehicleSpeedInstantaneous,VehicleSpeedAverage,VehicleSpeedVariance,VehicleSpeedVariation,LongitudinalAcceleration,EngineLoad,EngineCoolantTemperature,ManifoldAbsolutePressure,EngineRPM,MassAirFlow,IntakeAirTemperature,VerticalAcceleration,FuelConsumptionAverage,drivingStyle
0,59.0,-2.299988,25.670519,13.223501,121.592690,-2.476980,0.3555,4.705883,68.0,106.0,1796.0,15.810000,24.0,-0.1133,19.497335,EvenPaceStyle
1,60.0,-2.099976,24.094259,13.638919,120.422571,-1.576260,0.4492,10.588236,68.0,103.0,1689.0,14.650000,22.0,-0.1289,19.515722,EvenPaceStyle
2,61.0,-1.500000,22.743179,14.031043,118.456769,-1.351080,0.4258,27.450981,68.0,103.0,1599.0,11.850000,21.0,-0.1328,19.441765,EvenPaceStyle
3,62.0,0.100037,22.292820,14.171073,117.571308,-0.450359,0.4140,24.313726,69.0,104.0,1620.0,12.210000,20.0,-0.0859,19.388769,EvenPaceStyle
4,63.0,0.099976,23.643900,14.328954,117.074149,1.351080,0.3945,20.000000,69.0,104.0,1708.0,11.910000,21.0,-0.0664,19.301638,EvenPaceStyle
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4441,4618.0,1.000000,28.799999,28.559999,57.190571,3.600000,-0.0292,25.882353,81.0,115.0,1755.5,20.469999,25.0,-0.1661,14.578003,EvenPaceStyle
4442,4619.0,1.699997,30.599998,28.529999,57.010266,1.799999,-0.0304,11.764706,81.0,106.0,736.5,17.740000,25.0,-0.1987,14.585642,EvenPaceStyle
4443,4620.0,1.800003,29.699999,28.499999,56.883045,-0.900000,-0.1684,98.039215,81.0,106.0,1254.0,9.520000,24.0,-0.1156,14.547294,EvenPaceStyle
4444,4621.0,2.100006,29.699999,28.409999,56.160910,0.000000,-0.0644,79.607841,80.0,112.0,1254.0,14.910000,23.0,-0.0760,14.546828,EvenPaceStyle


In [17]:
# joining encoded column with the main dataframe
df = pd.concat([df,encoded_columns], axis=1)
df

Unnamed: 0.1,Unnamed: 0,AltitudeVariation,VehicleSpeedInstantaneous,VehicleSpeedAverage,VehicleSpeedVariance,VehicleSpeedVariation,LongitudinalAcceleration,EngineLoad,EngineCoolantTemperature,ManifoldAbsolutePressure,...,FuelConsumptionAverage,drivingStyle,roadSurface_FullOfHolesCondition,roadSurface_SmoothCondition,roadSurface_UnevenCondition,traffic_HighCongestionCondition,traffic_LowCongestionCondition,traffic_NormalCongestionCondition,make_opel,make_peugeot
0,59.0,-2.299988,25.670519,13.223501,121.592690,-2.476980,0.3555,4.705883,68.0,106.0,...,19.497335,EvenPaceStyle,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0
1,60.0,-2.099976,24.094259,13.638919,120.422571,-1.576260,0.4492,10.588236,68.0,103.0,...,19.515722,EvenPaceStyle,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0
2,61.0,-1.500000,22.743179,14.031043,118.456769,-1.351080,0.4258,27.450981,68.0,103.0,...,19.441765,EvenPaceStyle,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0
3,62.0,0.100037,22.292820,14.171073,117.571308,-0.450359,0.4140,24.313726,69.0,104.0,...,19.388769,EvenPaceStyle,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0
4,63.0,0.099976,23.643900,14.328954,117.074149,1.351080,0.3945,20.000000,69.0,104.0,...,19.301638,EvenPaceStyle,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4441,4618.0,1.000000,28.799999,28.559999,57.190571,3.600000,-0.0292,25.882353,81.0,115.0,...,14.578003,EvenPaceStyle,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0
4442,4619.0,1.699997,30.599998,28.529999,57.010266,1.799999,-0.0304,11.764706,81.0,106.0,...,14.585642,EvenPaceStyle,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0
4443,4620.0,1.800003,29.699999,28.499999,56.883045,-0.900000,-0.1684,98.039215,81.0,106.0,...,14.547294,EvenPaceStyle,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0
4444,4621.0,2.100006,29.699999,28.409999,56.160910,0.000000,-0.0644,79.607841,80.0,112.0,...,14.546828,EvenPaceStyle,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0


In [194]:
# transform the target 'drivingStyle' column using label encoder to keep the values in one column
# check unique values
driving_styles = df['drivingStyle'].unique()
driving_styles

drivingStyle
1    21003
0     2759
Name: count, dtype: int64

In [19]:
from sklearn import preprocessing
label_encoder = preprocessing.LabelEncoder()
# feed driving_styles array into the label encoder
label_encoder.fit(driving_styles)

In [20]:
# transform the column
df['drivingStyle'] = label_encoder.transform(df['drivingStyle'])

In [216]:
# check modified column
df['drivingStyle'].unique()

array([1, 0])

Based on the index position of the two arrays, 'EvenPaceStyle'= 1 and 'AggressiveStyle' = 0

## Data split and balancing

In [145]:
# split the data into train and test dataset
train, test = train_test_split(df, test_size=0.2, random_state=1)

In [147]:
#check the shape of each dataset
print(train.shape)
print(test.shape)

(19009, 24)
(4753, 24)


In [148]:
# define features and target variables for train and test
x_train = train.drop('drivingStyle',axis=1)
y_train = train['drivingStyle']

x_test = test.drop('drivingStyle',axis=1)
y_test = test['drivingStyle']

In [228]:
from imblearn.under_sampling import RandomUnderSampler

In [246]:
#balance the datasets
#check class count before balancing
print('y_train class count: ',y_train.value_counts())
print('y_test class count: ',y_test.value_counts())

y_train class count:  drivingStyle
1    16835
0     2174
Name: count, dtype: int64
y_test class count:  drivingStyle
1    4168
0     585
Name: count, dtype: int64


In [313]:
#balance train and test data
undersampler = RandomUnderSampler()

x_train_balanced, y_train_balanced = undersampler.fit_resample(x_train, y_train)
x_test_balanced, y_test_balanced = undersampler.fit_resample(x_test, y_test)

In [250]:
#check class count balancing
print('y_train_balanced class count: ',y_train_balanced.value_counts())
print('y_test_balanced class count: ',y_test_balanced.value_counts())

y_train_balanced class count:  drivingStyle
0    2174
1    2174
Name: count, dtype: int64
y_test_balanced class count:  drivingStyle
0    585
1    585
Name: count, dtype: int64


## Feature selection and scaling

In [149]:
from sklearn.ensemble import RandomForestClassifier

In [252]:
# determining useful features

# create the model
model = RandomForestClassifier(n_jobs=-1)

# train the model
model.fit(x_train_balanced, y_train_balanced)

In [319]:
# check the accuracy of the model
model.score(x_test_balanced,y_test_balanced)

0.8948717948717949

In [321]:
# get the importances
importances = model.feature_importances_

In [323]:
# Create a data frame
forest_df = pd.DataFrame({"Features": x_train.columns, "Importances":importances})
forest_df.set_index('Importances')

# Sort in descending order 
forest_df = forest_df.sort_values('Importances', ascending=False)

In [325]:
forest_df

Unnamed: 0,Features,Importances
2,VehicleSpeedInstantaneous,0.115506
4,VehicleSpeedVariance,0.096505
3,VehicleSpeedAverage,0.083347
14,FuelConsumptionAverage,0.067639
6,LongitudinalAcceleration,0.062397
13,VerticalAcceleration,0.061296
0,Unnamed: 0,0.055458
10,EngineRPM,0.047101
11,MassAirFlow,0.046193
1,AltitudeVariation,0.046161


In [262]:
# creating a list of the selected features
Features = ['VehicleSpeedInstantaneous',
            'VehicleSpeedVariance',
            'VehicleSpeedAverage']

# create new feature and test dataframes
x_training = x_train_balanced[Features]
x_testing = x_test_balanced[Features]

In [264]:
from sklearn.preprocessing import MinMaxScaler

In [266]:
# scaling selected features

# create scaler
scaler = MinMaxScaler()

x_train_scaled = scaler.fit_transform(x_training)
x_test_scaled = scaler.fit_transform(x_testing)

## Model trainign and evaluation

In [268]:
from sklearn.linear_model import LogisticRegression

In [272]:
# crete Logistic regression model
log_model = LogisticRegression(solver='liblinear')

# train the model
log_model.fit(x_train_scaled, y_train_balanced)

In [330]:
# print the score 
lr_score = log_model.score(x_test_scaled,y_test_balanced)
print('Logistic Regression score: ',lr_score)

Logistic Regression score:  0.6034188034188034


In [340]:
#coefficient
coef = log_model.coef_
print('Coefficients: ',coef)

Coefficients:  [[ 2.30105406 -3.67054962  0.49016587]]


In [346]:
#prediction
log_predict = log_model.predict(x_test_scaled)

#create result df
log_result = pd.Series(log_predict)

In [358]:
# result count
log_count = log_result.value_counts()
print(log_count)
print('EvenPaceStyle= 1')
print('AggressiveStyle = 0')

0    665
1    505
Name: count, dtype: int64
EvenPaceStyle= 1
AggressiveStyle = 0


In [332]:
from sklearn.svm import SVC

In [334]:
# create SVM model
svm_model = SVC(kernel='linear', C=3.0)

#train the model
svm_model.fit(x_train_scaled, y_train_balanced)

In [336]:
# print the score 
svm_score = svm_model.score(x_test_scaled,y_test_balanced)
print('SVM score: ',svm_score)

SVM score:  0.6162393162393163


In [360]:
#prediction
svm_predict = svm_model.predict(x_test_scaled)

#create result df
svm_result = pd.Series(log_predict)

In [362]:
# result count
svm_count = svm_result.value_counts()
print(svm_count)
print('EvenPaceStyle= 1')
print('AggressiveStyle = 0')

0    665
1    505
Name: count, dtype: int64
EvenPaceStyle= 1
AggressiveStyle = 0
