# Creating a machine learning model from the data prep in BigG_AC 

**Two metholodogies will be implemented in the following models, one taking in consideration the model per truck, or one where the model takes into consideration the timestamps**

In [1]:
import numpy as np 
import pandas as pd
from matplotlib import pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import LogisticRegression
from sklearn.impute import SimpleImputer
from sklearn.metrics import classification_report
from sklearn.metrics import accuracy_score
from sklearn.tree import DecisionTreeClassifier
from sklearn.preprocessing import OneHotEncoder
import seaborn as sns
from sklearn.neural_network import MLPClassifier, MLPRegressor
from sklearn.preprocessing import MinMaxScaler
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import confusion_matrix
from sklearn.utils import shuffle

import matplotlib.pyplot as plt
from sklearn.datasets import load_iris

In [2]:
Diagnostics_5246 = pd.read_csv('data/Diagnostics_5246.csv')
Diagnostics_1569 =  pd.read_csv('data/Diagnostics_1569.csv')

- Dropping the columns that will not be considered in the model

In [3]:
#Diagnostics_5246 = Diagnostics_5246.drop(columns = ['RecordID', 'EquipmentID','Unnamed: 0', 'ecuSoftwareVersion', 'ecuSerialNumber', 'ecuModel', 'ecuMake',
 #      'ecuSource', 'eventDescription', 'EventTimeStamp_DateOnly', 'LocationTimeStamp', 'LocationTimeStamp_DateOnly''active', 'FaultId','spn', 'fmi'])
#Diagnostics_1569 = Diagnostics_1569.drop(columns = ['Unnamed: 0','ecuSoftwareVersion', 'ecuSerialNumber', 'ecuModel', 'ecuMake',
  #     'ecuSource'])

Splitting the trucks between derate and non derate. For Diagnostics_5246

In [4]:
all_trucks = Diagnostics_5246['EquipmentID'].unique()
derate_trucks = Diagnostics_5246.loc[Diagnostics_5246['spn'] == 5246]['EquipmentID'].unique()
no_derate_trucks = all_trucks[np.isin(all_trucks, derate_trucks, invert=True)]

- put the two lists together (marking 1 for trucks with derate, 0 with non):

In [5]:
trucks_df = pd.concat([
            pd.DataFrame({'EquipmentID': derate_trucks, 'derate': 1}),
            pd.DataFrame({'EquipmentID': no_derate_trucks, 'derate': 0}) 
            ])

Using trest_train_split and “stratify” to ensure the proportions of derate/non-derate stay same in the samples:


- And then you’d use something like below to extract and train:

In [6]:
def test_split_data_custom(Diagnostics_5246, spn, percentage):
    all_trucks = Diagnostics_5246['EquipmentID'].unique()
    derate_trucks = Diagnostics_5246.loc[Diagnostics_5246['spn'] == spn]['EquipmentID'].unique()
    no_derate_trucks = all_trucks[np.isin(all_trucks, derate_trucks, invert=True)]


    #shuffle(sklearn) the array so that we get ramdom sequence in sample split
    all_Equip_s = shuffle(all_trucks, random_state=42)
    derate_Equip_s = shuffle(derate_trucks, random_state=42)
    no_derate_Equip_s = shuffle(no_derate_trucks, random_state=42)


    print(len(all_Equip_s))
    print(len(derate_Equip_s))
    print(len(no_derate_Equip_s))

    #convert to dataframe to locate rows
    df_all_Equip = pd.DataFrame(all_Equip_s, columns = ['EquipmentID'])
    df_derate_Equip = pd.DataFrame(derate_Equip_s, columns = ['EquipmentID'])
    df_no_derate_Equip = pd.DataFrame(no_derate_Equip_s, columns = ['EquipmentID'])


    #get the equipments % based on the passed percentage
    #keep_rows = int(total_rows * percentage)
    keep_rows_nde = int(len(no_derate_Equip_s) * percentage)
    print(keep_rows_nde)

    #============================================
    #STEP- 4 Get X1 for no_derate_Equip
    #============================================
    df_nde_pct_X1_train = df_no_derate_Equip.iloc[:keep_rows_nde, :]
    print(df_nde_pct_X1_train)

    #get rest of the percentage by doing not isin  lookup
    df_nde_pct_X1_test = df_no_derate_Equip[np.isin(df_no_derate_Equip,df_nde_pct_X1_train, invert=True)]
    print(df_nde_pct_X1_test) 

    #Make sure data is correct by checking we dont have equip id in both dataframes
    should_be_zero_nde= df_nde_pct_X1_train[np.isin(df_nde_pct_X1_train,df_nde_pct_X1_test)]
    print("should_be_zero nde = "  + str(should_be_zero_nde.size)) 

    #============================================
    #STEP 5 get X2 or train and Test
    #============================================
    keep_rows_de = int(len(derate_Equip_s) * percentage)
    print(keep_rows_de)

    df_de_pct_X2_train = df_derate_Equip.iloc[:keep_rows_de, :]
    print(df_de_pct_X2_train)

    #get rest of the percentage by doing not isin  lookup
    df_de_pct_X2_test = df_derate_Equip[np.isin(df_derate_Equip,df_de_pct_X2_train, invert=True)]
    print(df_de_pct_X2_test) 

    #Make sure data is correct by checking we dont have equip id in both dataframes
    should_be_zero_de= df_de_pct_X2_train[np.isin(df_de_pct_X2_train,df_de_pct_X2_test)]
    print("should_be_zero derate = "  + str(should_be_zero_de.size)) 


    # now we have the id ready for trian and test from both derate and no derate data frames. 
    # combine df_nde_pct_X1_train & df_de_pct_X2_train, combine df_nde_pct_X1_test and df_de_pct_X2_test
    combined_df_X_train = pd.concat([df_nde_pct_X1_train, df_de_pct_X2_train], ignore_index=True)
    print(combined_df_X_train)


    # combine df_nde_pct_X1_test & df_de_pct_X2_test, combine df_de_pct_X2_test and df_de_pct_X2_test
    combined_df_X_test = pd.concat([df_nde_pct_X1_test, df_de_pct_X2_test], ignore_index=True)
    print(combined_df_X_test)



    #STEP 6 - Get X_train, y_train and X_test. y_test 
    # Next filter the data from original dataframe for X, y

    df_train = Diagnostics_5246[Diagnostics_5246['EquipmentID'].isin(combined_df_X_train['EquipmentID'].tolist())]
    X_train= df_train.drop(columns=['target'])
    y_train = df_train['target']

    df_test = Diagnostics_5246[Diagnostics_5246['EquipmentID'].isin(combined_df_X_test['EquipmentID'].tolist())]
    X_test =df_test.drop(columns=['target'])
    y_test = df_test['target']

    # x_train, x_test, y_train, y_test

    return X_train, X_test, y_train, y_test

- Spliting the data to train and test to try to get a classifier model 

In [7]:
Diagnostics_5246['spn_fmi'] = ['_'.join(i) for i in zip(Diagnostics_5246['spn'].astype(str), Diagnostics_5246['fmi'].astype(str))]

Diagnostics_5246 = pd.get_dummies(Diagnostics_5246, columns=['spn_fmi'], prefix='spn_fmi')

#Diagnostics_5246 = Diagnostics_5246.sort_values(by=['EquipmentID', 'EventTimeStamp'])

# to obtain the one hot encoded columns since there are so many
#faults_cols = ['EventTimeStamp'] + [col for col in Diagnostics_5246.columns if 'spn_fmi' in col] 

##diagnostics_cols = ['EventTimeStamp', 'activeTransitionCount', 'AcceleratorPedal',
#         'BarometricPressure', 'CruiseControlSetSpeed', 'DistanceLtd',
#         'EngineCoolantTemperature', 'EngineLoad', 'EngineOilPressure', 
#        'EngineOilTemperature', 'EngineRpm', 'EngineTimeLtd', 'FuelLevel', 'FuelLtd', 
#        'FuelRate', 'FuelTemperature', 'IntakeManifoldTemperature', 'LampStatus',
#        'Speed', 'SwitchedBatteryVoltage', 'Throttle', 'TurboBoostPressure', 'target']

In [8]:
cols_to_drop = ['Unnamed: 0','RecordID', 'EquipmentID','EventTimeStamp', 
                'EventTimeStamp_DateOnly',  'LocationTimeStamp_DateOnly', 'LocationTimeStamp',
               'active', 'FaultId','spn', 'fmi', 'eventDescription',
       'eventDescription', 'ecuSoftwareVersion', 'ecuSerialNumber', 'ecuModel',
       'ecuMake', 'ecuSource']

In [9]:
#faults_1 = (
#    Diagnostics_5246
#    .groupby('EquipmentID')[faults_cols]
 #   .max()
  #  )

#faults_2 = (
#     Diagnostics_5246
#    .groupby('EquipmentID')[diagnostics_cols]
#    .mean()
#    )

#faults_1 = faults_1.reset_index()
#faults_2 = faults_2.reset_index()

In [10]:
#faults_merged = pd.merge(Diagnostics_5246['RecordID'], #[['RecordID'] + diagnostics_cols]
 #                         faults_1,
  #                        left_index= True,
   #                       right_on = 'EquipmentID').drop(columns='EquipmentID')

In [11]:
#faults_merged_2 = pd.merge(Diagnostics_5246['RecordID'], #[['RecordID'] + diagnostics_cols]
 #                         faults_2,
  #                        left_index= True,
   #                       right_on = 'EquipmentID').drop(columns='EquipmentID')


In [12]:
#faults_diagnostics =  faults_merged.merge(faults_merged_2, on = 'RecordID')

In [13]:
#faults_diagnostics = Diagnostics_5246.iloc[:, :-1]

In [15]:
# Split the data into training and testing sets
X_train, X_test, y_train, y_test = test_split_data_custom(Diagnostics_5246, 1569, 0.8)

1039
491
548
438
     EquipmentID
0           2093
1           1695
2           2052
3           2188
4           2302
..           ...
433         2327
434         1770
435         1760
436         1672
437         1767

[438 rows x 1 columns]
     EquipmentID
438         2371
439         1674
440         2169
441         2046
442         1412
..           ...
543         1336
544         1518
545         2135
546         2248
547         1333

[110 rows x 1 columns]
should_be_zero nde = 0
392
     EquipmentID
0           1944
1           1517
2           1921
3           1949
4           1763
..           ...
387         1888
388         1718
389         1744
390         1971
391         1561

[392 rows x 1 columns]
     EquipmentID
392         1887
393         1557
394         1812
395         1758
396         1883
..           ...
486         1450
487         1615
488         1593
489         1891
490         1815

[99 rows x 1 columns]
should_be_zero derate = 0
     EquipmentID
0 

In [16]:
from imblearn.over_sampling import SMOTE

oversampler = SMOTE(k_neighbors=5, random_state=42)
X_smote, y_smote = oversampler.fit_resample(X_train.drop(columns=cols_to_drop), y_train.drop(columns=cols_to_drop))
y_smote.value_counts()

0    429488
1    429488
Name: target, dtype: int64

In [17]:
from sklearn.ensemble import GradientBoostingClassifier

# Initialize the model
gbm = GradientBoostingClassifier(verbose=True)

In [18]:
# Fit the model to the training data
gbm.fit(X_train, y_train)

ValueError: could not convert string to float: '2015-02-21 11:35:33'

In [None]:
# Predict the labels of the test data
y_pred = gbm.predict(X_test)

In [None]:
# train a decision tree classifier on the training set
classifier = DecisionTreeClassifier(random_state=42)
classifier.fit(X_train, y_train)

In [None]:
# Evaluate the accuracy of the model
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy: {:.2f}%".format(accuracy * 100))

In [None]:
importances = pd.Series(gbm.feature_importances_, index=X_test.columns)
importances

In [None]:
from sklearn.ensemble import RandomForestClassifier

clf = RandomForestClassifier(n_estimators=100)
clf.fit(X_train, y_train)

In [None]:
# make predictions on the testing set
y_pred = classifier.predict(X_test)

In [None]:
# Calculate the classification report
report = classification_report(y_test, y_pred)
print(report)

In [None]:
# evaluate the performance of the classifier using accuracy score
accuracy = accuracy_score(y_test, y_pred)
print('Accuracy:', accuracy)

In [None]:
# Get feature importances
importances = classifier.feature_importances_
print(importances)

In [None]:
# Get feature importances
importances = pd.Series(clf.feature_importances_, index=X_test.columns)
print(importances)

In [None]:
importances_sorted = importances.sort_values(ascending=False)

In [None]:
plt.figure()
importances_sorted.plot(kind='barh')
plt.title('Feature importances')
plt.xlabel('Importance')
plt.ylabel('Features')
plt.show()

In [None]:
from sklearn import tree
from sklearn.tree import plot_tree

text_representation = tree.export_text(classifier)
print(text_representation)

In [None]:
# Calculate the confusion matrix
conf_matrix = confusion_matrix(y_test, y_pred)
print(conf_matrix)

In [None]:
# Create a heatmap of the confusion matrix
sns.heatmap(conf_matrix, annot=True, cmap='Blues', fmt='.0f')
plt.xlabel('Predicted')
plt.ylabel('Actual')
plt.show()

- Tring to improve the model with different classifier.

In [None]:
pipe = Pipeline(
    steps = [
        ('scaler', MinMaxScaler()),
        ('nn', MLPClassifier(hidden_layer_sizes = (2,),
                             activation = 'tanh',
                             max_iter = 10000))
    ]
)

In [None]:
pipe.fit(X_train, y_train) 


In [None]:
accuracy_2 = accuracy_score(y_test, y_pred)
print('Accuracy:', accuracy_2)

In [None]:
#from imblearn.over_sampling import SMOTE

#oversampler = SMOTE(k_neighbors=5, random_state=321)

In [None]:
#X_smote, y_smote = oversampler.fit_resample(X_train, y_train)

In [None]:
#y_smote.value_counts()

In [None]:
lr = LogisticRegression()
lr.fit(X_smote, y_smote)

In [None]:
y_pred = lr.predict(X_test)
accuracy_score(y_test, y_pred)

In [None]:
coefficients = lr.coef_

feature_names = X_test.columns

coefficients_df = pd.DataFrame({'Feature': feature_names, 'Coefficient': coefficients[0]})
coefficients_df

In [None]:
cm = confusion_matrix(y_test, y_pred)

In [None]:
group_names = ['True Neg','False Pos','False Neg','True Pos']
group_counts = ["{0:0.0f}".format(value) for value in
                cm.flatten()]
group_percentages = ["{0:.2%}".format(value) for value in
                     cm.flatten()/np.sum(cm)]
labels = [f"{v1}\n{v2}\n{v3}" for v1, v2, v3 in
          zip(group_names,group_counts,group_percentages)]
labels = np.asarray(labels).reshape(2,2)
sns.heatmap(cm, annot=labels, fmt='', cmap='Blues');


In [None]:
# Create a heatmap of the confusion matrix
sns.heatmap(cm, annot=True, cmap='Blues', fmt='.0f')
plt.xlabel('Predicted')
plt.ylabel('Actual')
plt.show()

In [None]:
print(classification_report(y_test, y_pred))

In [None]:
# Define the pipeline
pipeline = Pipeline([
    ('scaler', StandardScaler()),
    ('logreg', LogisticRegression())
])

# Fit the pipeline on the training data
pipeline.fit(X_train, y_train)

# Make predictions on the test data
y_pred = pipeline.predict(X_test)

# Evaluate the accuracy of the model
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy: {:.2f}%".format(accuracy * 100))