# Creating a machine learning model from the data prep in BigG_AC 

**Two methodologies will be implemented in the following models: one that builds the model per truck, and one where the model takes the timestamps into consideration.**

In [17]:
import numpy as np 
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import LogisticRegression
from sklearn.impute import SimpleImputer
from sklearn.metrics import classification_report
from sklearn.metrics import accuracy_score
from sklearn.tree import DecisionTreeClassifier
import seaborn as sns
from sklearn.neural_network import MLPClassifier, MLPRegressor
from sklearn.preprocessing import MinMaxScaler
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

In [2]:
# Read the two prepared diagnostics extracts from the local data/ folder;
# the order of the paths matches the order of the names being assigned.
csv_paths = ('data/Diagnostics_5246.csv', 'data/Diagnostics_1569.csv')
Diagnostics_5246, Diagnostics_1569 = (pd.read_csv(path) for path in csv_paths)

In [3]:
# Preview the loaded frame (the notebook renders a truncated view of its rows and columns).
Diagnostics_5246

Unnamed: 0.1,Unnamed: 0,RecordID,ESS_Id,EventTimeStamp,eventDescription,ecuSoftwareVersion,ecuSerialNumber,ecuModel,ecuMake,ecuSource,...,FuelRate,FuelTemperature,IntakeManifoldTemperature,LampStatus,ServiceDistance,Speed,SwitchedBatteryVoltage,Throttle,TurboBoostPressure,target
0,0,1,990349,2015-02-21 10:47:13,Low (Severity Low) Engine Coolant Level,unknown,unknown,unknown,unknown,0,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,
1,1,2,990360,2015-02-21 11:34:34,,unknown,unknown,unknown,unknown,11,...,25.395354,25.395354,25.395354,25.395354,25.395354,25.395354,25.395354,25.395354,25.395354,
2,2,4,990370,2015-02-21 11:35:33,Incorrect Data Steering Wheel Angle,unknown,unknown,unknown,unknown,11,...,39.459260,39.459260,39.459260,39.459260,39.459260,39.459260,39.459260,39.459260,39.459260,
3,3,6,990431,2015-02-21 11:40:22,Low (Severity Low) Engine Coolant Level,04993120*00025921*082113134117*07700053*I0*BBZ*,79466580,6X1u10D1500000000,CMMNS,0,...,13.602200,13.602200,13.602200,13.602200,13.602200,13.602200,13.602200,13.602200,13.602200,
4,4,7,990439,2015-02-21 11:40:52,Low (Severity Low) Engine Coolant Level,unknown,unknown,unknown,unknown,0,...,41.534780,41.534780,41.534780,41.534780,41.534780,41.534780,41.534780,41.534780,41.534780,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
546659,546659,1248448,123899434,2020-03-06 13:12:43,High Voltage (Fuel Level),,,CECU3B-NAMUX4,PACCR,49,...,0.941766,0.941766,0.941766,0.941766,0.941766,0.941766,0.941766,0.941766,0.941766,
546660,546660,1248452,123901805,2020-03-06 13:42:48,Low (Severity Medium) Engine Coolant Level,04358814*06030918*051718174436*09401683*G1*BDR*,79904453,6X1u13D1500000000,CMMNS,0,...,5.932153,5.932153,5.932153,5.932153,5.932153,5.932153,5.932153,5.932153,5.932153,
546661,546661,1248455,123905139,2020-03-06 14:04:23,Condition Exists Engine Protection Torque Derate,04358814*06099720*030816202706*09400153*G1*BDR*,79932020,6X1u13D1500000000,CMMNS,0,...,65.010960,65.010960,65.010960,65.010960,65.010960,65.010960,65.010960,65.010960,65.010960,
546662,546662,1248456,123905996,2020-03-06 14:13:38,Abnormal Rate of Change Aftertreatment 1 Intak...,05317106*05100987*050719120655*09401585*G1*BDR*,79880653,6X1u13D1500000000,CMMNS,0,...,66.574100,66.574100,66.574100,66.574100,66.574100,66.574100,66.574100,66.574100,66.574100,


Splitting the trucks in Diagnostics_5246 between derate and non-derate.

In [4]:
# A truck counts as a "derate" truck if at least one of its rows carries spn 5246.
is_derate_row = Diagnostics_5246['spn'] == 5246
derate_trucks = Diagnostics_5246.loc[is_derate_row, 'EquipmentID'].unique()

# Every other truck seen in the data is a "no derate" truck.
all_trucks = Diagnostics_5246['EquipmentID'].unique()
no_derate_trucks = all_trucks[~np.isin(all_trucks, derate_trucks)]

- Put the two lists together (marking 1 for trucks with a derate, 0 for trucks without):

In [5]:
# One row per truck: EquipmentID plus a binary `derate` label
# (1 = truck is in derate_trucks, 0 = truck is in no_derate_trucks).
# ignore_index=True gives the combined frame a fresh 0..n-1 index; without it
# both halves keep their own 0-based labels and the index contains duplicates,
# which makes label-based lookups on trucks_df ambiguous.
trucks_df = pd.concat(
    [
        pd.DataFrame({'EquipmentID': derate_trucks, 'derate': 1}),
        pd.DataFrame({'EquipmentID': no_derate_trucks, 'derate': 0}),
    ],
    ignore_index=True,
)

- Using train_test_split with “stratify” to ensure the proportions of derate/non-derate trucks stay the same in both samples:

In [6]:
# Split at truck level (80% train / 20% test), stratified on the derate label
# so both parts keep the same derate / non-derate proportions.
strata = trucks_df['derate']
trucks_train, trucks_test = train_test_split(
    trucks_df,
    train_size=0.8,
    test_size=0.2,
    stratify=strata,
    random_state=42,
)

- Then use the truck-level split to extract the diagnostics rows that belong to the training trucks:

In [7]:
# Keep only the diagnostics rows whose truck landed in the training split.
in_train_split = Diagnostics_5246['EquipmentID'].isin(trucks_train['EquipmentID'])
train_data = Diagnostics_5246.loc[in_train_split]

- Dropping the columns that will not be considered in the model

In [8]:
# Drop the columns that are not considered in the model.
# errors='ignore' keeps this cell re-runnable: both frames are reassigned to
# their trimmed versions, so a second run would otherwise raise KeyError
# because the columns are already gone.
ecu_columns = ['Unnamed: 0', 'ecuSoftwareVersion', 'ecuSerialNumber', 'ecuModel', 'ecuMake',
               'ecuSource']
Diagnostics_5246 = Diagnostics_5246.drop(
    columns=ecu_columns + ['eventDescription', 'EventTimeStamp_DateOnly',
                           'LocationTimeStamp', 'LocationTimeStamp_DateOnly'],
    errors='ignore',
)
Diagnostics_1569 = Diagnostics_1569.drop(columns=ecu_columns, errors='ignore')


- Splitting the data into train and test sets to try to get a classifier model

In [15]:
# Split each truck-level frame into a feature matrix (everything except the
# label) and the `derate` label vector.
# NOTE(review): trucks_df is built with only EquipmentID and derate, so the
# feature matrix appears to contain just the truck ID — confirm this is intended.
label_col = 'derate'
X_train, y_train = trucks_train.drop(columns=label_col), trucks_train[label_col]
X_test, y_test = trucks_test.drop(columns=label_col), trucks_test[label_col]

In [18]:
# Fit a decision tree on the training trucks, using a fixed random_state.
tree_settings = {'random_state': 42}
classifier = DecisionTreeClassifier(**tree_settings)
classifier.fit(X_train, y_train)

In [19]:
# make predictions on the testing set
# (y_pred holds the fitted tree's predicted derate label for each row of X_test)
y_pred = classifier.predict(X_test)

In [20]:
# Score the predictions against the true test labels and report the accuracy.
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
print('Accuracy:', accuracy)

Accuracy: 0.75


- Trying to improve the model with a different classifier.

In [None]:
# Min-max scale the features, then feed them to a small neural network
# (one hidden layer of 2 units, tanh activation, up to 10000 iterations).
# random_state is fixed so the network's random initialisation — and therefore
# the fitted model — is the same on every run, matching the seeded split and
# decision tree used elsewhere in this notebook.
pipe = Pipeline(
    steps = [
        ('scaler', MinMaxScaler()),
        ('nn', MLPClassifier(hidden_layer_sizes = (2,),
                             activation = 'tanh',
                             max_iter = 10000,
                             random_state = 42))
    ]
)

In [None]:
# Fit the scaler + neural-network pipeline on the training split.
pipe.fit(X_train, y_train) 

In [None]:
# `numeric_cols` was referenced here without being defined in any earlier cell,
# which raises NameError on a fresh kernel.
# NOTE(review): presumably this is meant to list the numeric columns of the
# diagnostics frame — confirm the intended definition.
numeric_cols = Diagnostics_5246.select_dtypes(include='number').columns.tolist()
numeric_cols