In [1]:
import pandas as pd 
import numpy as np

import plotly.express as px
import matplotlib.pyplot as plt

In [2]:
# Load the dataset from the CSV next to the notebook.
df = pd.read_csv(filepath_or_buffer="dataset_main.csv")

In [3]:
# One row per failure cause; every other column holds its non-null count
# within that cause.
dfg = (
    df.groupby(by='failure_cause')
      .count()
      .reset_index()
)
dfg

Unnamed: 0,failure_cause,serial_number,manufacturer,kva,age,region,failure_date
0,Animal,21,21,21,21,21,21
1,Car Hit Pole,21,21,21,21,21,21
2,Corrosion,16,16,16,16,16,16
3,Fire,18,18,18,18,18,18
4,Overload,24,24,24,24,24,24


In [None]:
# Preview the first five rows of the frame.
df.head(n=5)

In [None]:
def _leading_int(date_text):
    """Return the integer that precedes the first '/' in ``date_text``."""
    return int(date_text.partition('/')[0])


# NOTE(review): treats the first '/'-separated field of failure_date as the
# month -- confirm the file really stores dates month-first.
df['failure_month'] = df['failure_date'].map(_leading_int)

In [None]:
# failure_month has already been derived from failure_date; drop the raw date
# and the serial number from the frame.
df = df.drop(columns=['serial_number', 'failure_date'])

In [None]:
# One-hot encode manufacturer and region together, then kva on its own
# (passed as a single Series with its own prefix), and join the two
# encodings on the row index.
category_dummies = pd.get_dummies(
    df[['manufacturer', 'region']],
    prefix=['manufacturer', 'region'],
)
kva_dummies = pd.get_dummies(df['kva'], prefix='kva')
df_dummies = category_dummies.join(kva_dummies)
df_dummies.head()

In [None]:
# Swap the three source columns for their one-hot encoded counterparts.
df = df.drop(columns=['manufacturer', 'kva', 'region']).join(df_dummies)

In [None]:
# Inspect the first 20 rows of the encoded frame.
df.head(n=20)

In [None]:
# Distinct serial numbers in first-seen order, and how many there are.
# dict.fromkeys de-duplicates in a single pass instead of rescanning a list
# for every row.
# 'serial_number' is dropped from df by an earlier cell, so when the notebook
# runs top to bottom the column is already gone; fall back to an empty result
# instead of raising KeyError.
if 'serial_number' in df.columns:
  big_list = list(dict.fromkeys(df['serial_number']))
else:
  big_list = []
count = len(big_list)

# Constant bar height used by the plots. A scalar broadcasts to every row, so
# this no longer requires the frame to contain exactly 100 rows.
df['Dummy'] = 1


In [None]:
# Grouped bars of age, coloured by manufacturer; every row contributes a bar
# segment of height 1 via the constant 'Dummy' column.
# The encoding cells replace 'manufacturer' with one-hot columns, so when the
# notebook runs top to bottom df no longer has it. In that case plot from a
# fresh read of the raw file instead of failing on a missing column.
if 'manufacturer' in df.columns:
  plot_df = df
else:
  plot_df = pd.read_csv("dataset_main.csv")
  plot_df['Dummy'] = 1

fig = px.bar(plot_df, x="age", y="Dummy", color='manufacturer', barmode='group')
fig.show()

In [None]:
# Grouped bars coloured by failure cause; every row contributes a bar segment
# of height 1 via the constant 'Dummy' column.
# The colour column is 'failure_cause' (the name used by the groupby cell);
# "Failure Cause" is not a column of df, and an empty x is not one either.
# NOTE(review): x="age" mirrors the previous chart -- confirm this is the
# intended axis.
fig = px.bar(df, x="age", y="Dummy", color="failure_cause", barmode='group')
fig.show()

In [None]:
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn import metrics


def _tfidf_pipeline(classifier):
  """Return a two-step pipeline: a TF-IDF vectorizer ('tfidf') feeding ``classifier`` ('clf')."""
  return Pipeline([
    ('tfidf', TfidfVectorizer()),
    ('clf', classifier),
  ])


# Text-classification pipelines, one per estimator. Each gets its own
# vectorizer instance.
text_clf_nb = _tfidf_pipeline(MultinomialNB())         # Naive Bayes
text_clf_lr = _tfidf_pipeline(LogisticRegression())    # Logistic Regression
text_clf_lsvc = _tfidf_pipeline(LinearSVC())           # Linear SVC

In [None]:
# Integer code for each failure-cause label (Fire=0 ... Corrosion=4).
mapper = {
  cause: code
  for code, cause in enumerate(
    ['Fire', 'Overload', 'Car Hit Pole', 'Animal', 'Corrosion']
  )
}

# Encode the target on a copy so df keeps the original string labels.
df_test = df.copy()

# Strict lookup: a label missing from mapper raises KeyError instead of
# silently becoming NaN.
df_test['failure_cause'] = df_test['failure_cause'].apply(lambda cause: mapper[cause])

In [None]:
from sklearn.model_selection import train_test_split

# Features are every column except the target. 'Dummy' is the constant
# bar-height helper added for the plots and carries no information, so it is
# removed from the feature matrix when present (errors='ignore' keeps this
# cell working if the plotting cell was never run).
X = df_test.drop(columns='failure_cause').drop(columns='Dummy', errors='ignore')
y = df_test['failure_cause']

# 70/30 split with a fixed seed so the split is reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=69)

In [None]:
# Number of training labels, shown as a one-element shape tuple.
np.shape(y_train)

In [None]:
# Instantiate every candidate classifier with library defaults.
nb, lr, lsvc = MultinomialNB(), LogisticRegression(), LinearSVC()
knn, dt = KNeighborsClassifier(), DecisionTreeClassifier()

# Only nb, lr and lsvc are fitted in this cell; knn and dt are just created.
for estimator in (nb, lr):
  estimator.fit(X_train, y_train)
# Kept as the final expression so the fitted LinearSVC is what the cell displays.
lsvc.fit(X_train, y_train)

In [None]:
# Naive Bayes: predict on the held-out set, then print the confusion matrix
# followed by the per-class precision/recall/F1 report.
pred_nb = nb.predict(X_test)

nb_confusion = metrics.confusion_matrix(y_test, pred_nb)
nb_report = metrics.classification_report(y_test, pred_nb)
print(nb_confusion, nb_report, sep='\n')

In [None]:
# Logistic Regression: predict on the held-out set and report.
# Stored under its own name so the Naive Bayes predictions in pred_nb are not
# overwritten by another model's output.
pred_lr = lr.predict(X_test)

print(metrics.confusion_matrix(y_test, pred_lr))
print(metrics.classification_report(y_test, pred_lr))

In [None]:
# Linear SVC: predict on the held-out set and report.
# Stored under its own name so the Naive Bayes predictions in pred_nb are not
# overwritten by another model's output.
pred_lsvc = lsvc.predict(X_test)

print(metrics.confusion_matrix(y_test, pred_lsvc))
print(metrics.classification_report(y_test, pred_lsvc))

In [None]:
# Sweep K from 1 to 10 and record the test-set misclassification rate for each.
k_values = range(1, 11)
error_rate = []

for k in k_values:
  # Build a classifier for this K. Refitting one default-configured model on
  # every pass would yield the same error rate ten times and a flat curve.
  knn = KNeighborsClassifier(n_neighbors=k)
  knn.fit(X_train, y_train)
  pred_knn = knn.predict(X_test)
  # Fraction of test rows whose predicted label differs from the true label.
  error_rate.append(np.mean(pred_knn != y_test))

plt.figure(figsize=(10,6))
plt.plot(k_values, error_rate, color='blue', linestyle='dashed', marker='o',
         markerfacecolor='red', markersize=10)
plt.title('Error Rate vs. K Value')
plt.xlabel('K')
plt.ylabel('Error Rate')