In [1]:
import pandas as pd
# NewFeatures.csv has trucks labeled as 1 for failure and 2 for nonfailures
# 15-30 days after service.
DATA_PATH = 'NewFeatures.csv'
df = pd.read_csv(DATA_PATH)


In [2]:
# Counts for the "Failure" feature of values 0, 1, and 2.
# 1 represents failure, 2 represents the new set of nonfailures.
df.loc[:, 'Failure'].value_counts()

0    108971
1      1380
2      1356
Name: Failure, dtype: int64

In [4]:
# Removes truck entries for nonactive days (rows whose distance_miles is 0).
df = df.loc[df['distance_miles'].ne(0)]

In [5]:
# Removes the original set of nonfailures (label 0), leaving only labels 1 and 2.
only1and2 = df.loc[df['Failure'].ne(0)]

In [6]:
# Drop irrelevant features (the two regen-inhibit-switch columns).
only1and2 = only1and2.drop(
    columns=[
        'dpf_regen_inhibit_switch_not_active_duration_mins',
        'min_regen_inhibit_switch_not_active_per_mile',
    ]
)

In [7]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn import metrics


# Seed shared by the split and the forest so the printed metrics are reproducible
# under Restart Kernel -> Run All. NOTE: the metrics and importances recorded in
# the saved cell outputs were produced by an unseeded, unstratified run and will
# differ from what this cell prints.
RANDOM_STATE = 42

# Model inputs: the five engineered features used by the classifier.
X=only1and2[['percent_fuel_lost','fuels_used_per_mile',
             'miles_per_minutes', 'mins_idle_per_mile', 'fuel_lost_per_mile']]

y=only1and2['Failure']  # Labels: 1 = failure, 2 = the new set of nonfailures

# Split dataset into training set (70%) and test set (30%).
# Stratifying on y keeps the 1-vs-2 label ratio the same in both parts.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, stratify=y, random_state=RANDOM_STATE)
clf=RandomForestClassifier(n_estimators=200, random_state=RANDOM_STATE)

clf.fit(X_train,y_train)

y_pred=clf.predict(X_test)

print("Accuracy:",metrics.accuracy_score(y_test, y_pred))
# Labels are 1/2 rather than 0/1, so the positive class is named explicitly:
# recall and precision are reported for failures (label 1). This matches
# scikit-learn's default pos_label, so the metric definition is unchanged.
print("Recall:",metrics.recall_score(y_test, y_pred, pos_label=1))
print("Precision:",metrics.precision_score(y_test, y_pred, pos_label=1))

Accuracy: 0.5474956822107081
Recall: 0.6014492753623188
Precision: 0.5220125786163522


In [8]:
# Pair each fitted importance with its column name and rank from highest to lowest.
feature_imp = (
    pd.Series(data=clf.feature_importances_, index=X.columns)
    .sort_values(ascending=False)
)
feature_imp  # Importances of the 5 model features, largest first

miles_per_minutes      0.212897
fuels_used_per_mile    0.208518
percent_fuel_lost      0.193991
fuel_lost_per_mile     0.193292
mins_idle_per_mile     0.191302
dtype: float64