# Importing Libraries

In [1]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from sklearn.externals import joblib

# Importing Dataset

In [2]:
# Load the engine data set; the path is relative to this notebook's directory.
dataset = pd.read_csv('../../data/engine_data.csv')

# Features: the first seven columns (positions 0-6) as a NumPy array.
X = dataset.iloc[:, 0:7].values

# Target: the column at position 11. It is selected with a scalar index so that
# y is a 1-D array of shape (n_samples,). Selecting with the list [11] yields an
# (n_samples, 1) column vector, which makes scikit-learn emit a
# DataConversionWarning when the classifier is fitted.
y = dataset.iloc[:, 11].values

# Data Preprocessing

In [3]:
# Encoding categorical data
# Multiplying by 1 converts a boolean target (True/False) to 1s and 0s.
y = y * 1

# Positions (within X) of the categorical feature columns.
categories = [0, 1]

# Replace each categorical column with integer codes 0..k-1.
labelencoder_X_0 = LabelEncoder()
X[:, 0] = labelencoder_X_0.fit_transform(X[:, 0])
labelencoder_X_1 = LabelEncoder()
X[:, 1] = labelencoder_X_1.fit_transform(X[:, 1])

# One-hot encode only the categorical columns, then place the resulting dummy
# columns in front of the remaining (numeric) features. This reproduces the
# column layout of the former `OneHotEncoder(categorical_features=[0, 1])`
# call; that keyword was deprecated in scikit-learn 0.20 and removed in 0.22,
# so the selection is done explicitly here.
onehotencoder = OneHotEncoder()
X_dummies = onehotencoder.fit_transform(X[:, categories].astype(int)).toarray()
X_rest = np.delete(X, categories, 1).astype(float)
X = np.hstack((X_dummies, X_rest))

# avoid dummy variable trap
# Drop the first dummy column of every categorical feature. The first dummy of
# a feature sits at the running total of the levels of all features before it
# (0 for the first feature, n_levels(feature 0) for the second, ...). The
# previous formula multiplied the level count by the column index, which
# pointed at the wrong column whenever the two features had a different number
# of levels.
dummies = []
dummies_sum = 0

for encoder in (labelencoder_X_0, labelencoder_X_1):
    dummies.append(dummies_sum)
    dummies_sum += len(encoder.classes_)

X = np.delete(X, dummies, 1)

# Splitting the dataset into the Training set and Test set
# (80 % train / 20 % test, fixed seed so the split is reproducible)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 0)

# Feature Scaling
# The scaler is fitted on the training set only and then applied to the test
# set, so no test-set statistics leak into training.
sc = StandardScaler()
X_train = sc.fit_transform(X_train)
X_test = sc.transform(X_test)

# Training and Predicting the dataset with Random Forest Algorithm

In [4]:
# Build a random forest of 10 trees that split on entropy, seeded for
# reproducibility, and fit it to the training data in one expression
# (`fit` returns the estimator itself).
classifier = RandomForestClassifier(
    n_estimators=10,
    criterion='entropy',
    random_state=0,
).fit(X_train, y_train)

# Predict labels for the held-out test set.
y_pred = classifier.predict(X_test)

# Result

In [5]:
# Evaluate the fitted classifier on the held-out test set.
# `accuracy` and `cm` are kept as notebook variables for inspection; only the
# per-class report is printed.
accuracy = classifier.score(X_test, y_test)
cm = confusion_matrix(y_true=y_test, y_pred=y_pred)
report = classification_report(y_true=y_test, y_pred=y_pred)
print(report)

             precision    recall  f1-score   support

          0       0.98      1.00      0.99     18697
          1       1.00      0.73      0.84      1303

avg / total       0.98      0.98      0.98     20000



# Saving the Model

In [6]:
# Serialise the fitted classifier to disk in the notebook's working directory.
# NOTE(review): only the classifier is written in this cell; the fitted
# encoders and StandardScaler are not, so a consumer of this file has to
# reproduce the same preprocessing before calling predict.
joblib.dump(value=classifier, filename='rf_classifier.pkl')