In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px

%matplotlib inline

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory tree and print the full path of every file found.
for root, _subdirs, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(root, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

<img src="https://www.elprocus.com/wp-content/uploads/2014/02/Copy-of-DSCN9922.jpg" width=800 />

# <font color = 'Fire'><center> <b>Electrical Faults Detection and Classification</b> </center></font>

### What are Electrical Faults?
> Normally, a power system operates under balanced conditions. When the system becomes unbalanced due to the failure of insulation at any point or due to the contact of live wires, a short-circuit, or fault, is said to occur in the line. Faults may occur in the power system for a number of reasons, such as natural disturbances (lightning, high-speed winds, earthquakes), insulation breakdown, falling trees, bird shorting, etc.


### Types of Faults?
> Faults can be broadly categorised into two types:
> 1. Open-circuit Fault and
> 2. Short-Circuit Faults
    * Symmetrical and 
    * Asymmetrical Faults
    
### Symmetrical and Asymmetrical Faults
> #### Symmetrical 
> * In symmetrical faults, all phases are shorted to each other or to earth (L-L-L) or (L-L-L-G).
> * The nature of this type of fault is balanced.
> * In this type of fault, the fault currents in all phases are symmetrical, i.e. their magnitudes are equal and they are equally displaced by an angle of 120 degrees.
> * It is the more severe type of fault, but it occurs rarely.

> #### Asymmetrical 
> * These faults involve only one or two phases.
> * In this type of fault, three phase lines become unbalanced.
> * There are mainly three types namely line to ground (L-G), line to line (L-L) and double line to ground (LL-G) faults.
> * These types of faults are the ones that occur most often in power systems.





<center> <img src="https://ars.els-cdn.com/content/image/1-s2.0-S2314717217300065-gr1.jpg" width="600" height="200"> </center>
<br>
So here we are trying to classify Short-Circuit faults into further categories based on the values of line voltages and Line Currents.

In [None]:
# Load the two CSV files of the Kaggle dataset:
#   ds - detection data (detect_dataset.csv)
#   cs - classification data (classData.csv)
DATA_DIR = "/kaggle/input/electrical-fault-detection-and-classification"
ds = pd.read_csv(f"{DATA_DIR}/detect_dataset.csv")
cs = pd.read_csv(f"{DATA_DIR}/classData.csv")

ds.head()

In [None]:
# Preview the first rows of the classification dataset.
cs.head()

#### So we have two files (datasets) here, loaded as ds and cs:
* ds is used to train the model to detect whether any type of fault is present, and<br>
* cs is used for the classification of shunt faults.


**The file _cs_ contains the dataset used to classify the types of fault.**<br>

> A, B and C are the 3 phases of the electrical system. Most electricity transmission happens via a 3-phase system,<br>
and hence **Ia** represents the current (I) in phase A, **Va** represents the voltage (V) in phase A, and so on for phases B and C.

Inputs - [Ia,Ib,Ic,Va,Vb,Vc]<br>
Outputs - [G C B A]<br>

Examples :<br>
[0 0 0 0] - No Fault<br>
[1 0 0 1] - LG fault (Between Phase A and Gnd)<br>
[0 0 1 1] - LL fault (Between Phase A and Phase B)<br>
[1 0 1 1] - LLG Fault (Between Phases A,B and ground)<br>
[0 1 1 1] - LLL Fault(Between all three phases)<br>
[1 1 1 1] - LLLG fault( Three phase symmetrical fault)<br>

In [None]:
# Report the size (rows x columns) of the detection and classification datasets.
ds_rows, ds_cols = ds.shape
cs_rows, cs_cols = cs.shape
print(f"The Detect Dataset has {ds_rows} rows and {ds_cols} columns.")
print(f"The Dataset to be classified has {cs_rows} rows and {cs_cols} columns.")

In [None]:
# Drop the last two columns ('Unnamed: 7', 'Unnamed: 8') from the detection dataset.
# The labels are passed by name via `columns=`; errors='ignore' keeps the cell
# idempotent, so re-running it after the columns are gone is a no-op instead of
# raising a KeyError.
ds = ds.drop(columns=['Unnamed: 7', 'Unnamed: 8'], errors='ignore')
ds.head()

### Let's have a quick look at the data types and values of our datasets

In [None]:
# Summary of the detection dataset's columns (dtypes and non-null counts).
ds.info()

In [None]:
# Summary of the classification dataset's columns (dtypes and non-null counts).
cs.info()

We don't have any type of null values in our dataset.

In [None]:
# Descriptive statistics for the numeric columns of the detection dataset.
ds.describe()

In [None]:
# Descriptive statistics for the numeric columns of the classification dataset.
cs.describe()

If there is any confusion regarding the values of the Line voltages, then let me clarify it that they are most probably in p.u.<br>
i.e.<br>
> ## $V_{p.u.} = \frac{V}{V_{base}} $

In practice, the power system consists of 4 generators of 11 × 10^3 V each, so the per-unit values can be converted to volts by multiplying them by $11000$ V, provided 11 kV was taken as the base.

.........too much of info!

In [None]:
# Count fully duplicated rows in each dataset.
for label, frame in (("Detect", ds), ("Classify", cs)):
    print(f"Dataset {label} has {frame.duplicated().sum()} duplicate values.")

In [None]:
# Class balance of the fault flag: print the counts, then draw them as a bar chart.
fault_flag = ds['Output (S)']
print(fault_flag.value_counts(),"\n")
sns.countplot(x=fault_flag)
plt.show()

We have a balanced dataset.

In [None]:
# List the column labels of the detection dataset.
ds.columns

In [None]:
# For each of columns 1-3 of ds (the line currents), draw a KDE next to a histogram.
for current_col in ds.columns[1:4]:

    print(f"For Current {current_col}\n")
    fig, (ax_kde, ax_hist) = plt.subplots(1, 2, figsize=(14, 4))
    sns.kdeplot(x=ds[current_col], color='green', ax=ax_kde)
    ax_hist.hist(x=ds[current_col], color='darkgreen')
    plt.show()

In [None]:
# For each column from index 4 onwards (the line voltages), draw a KDE next to a histogram.
for voltage_col in ds.columns[4:]:

    print(f"For Voltage {voltage_col}\n")
    fig, (ax_kde, ax_hist) = plt.subplots(1, 2, figsize=(14, 4))
    sns.kdeplot(x=ds[voltage_col], ax=ax_kde)
    ax_hist.hist(x=ds[voltage_col])
    plt.show()

In [None]:
# Pair each current column with its voltage column and draw their box plots side by side.
for current_col, voltage_col in zip(ds.columns[1:4], ds.columns[4:]):
    fig, (ax_current, ax_voltage) = plt.subplots(1, 2, figsize=(12, 5))
    # The second character of the current column's name is the phase letter.
    print(f"For Line {str(current_col)[1].upper()}\n")
    sns.boxplot(y=ds[current_col], color='azure', ax=ax_current)
    sns.boxplot(y=ds[voltage_col], color='azure', ax=ax_voltage)
    plt.show()

All the data is somewhat normally distributed.

In [None]:
# Va against Ia, coloured by the fault flag.
px.line(
    ds,
    x="Ia",
    y="Va",
    color="Output (S)",
    title="For Line A",
).show()


In [None]:
# Vb against Ib, coloured by the fault flag.
px.line(
    ds,
    x="Ib",
    y="Vb",
    color="Output (S)",
    title="For Line B",
).show()

In [None]:
# Vc against Ic, coloured by the fault flag.
px.line(
    ds,
    x="Ic",
    y="Vc",
    color="Output (S)",
    title="For Line C",
).show()

#### We can observe that:
* Normally the Line current varies from **-100 to 100 Amp** and Voltage p.u. between **-0.6 and 0.6**.
* While during fault, we notice some absurd and random behaviour and the value of Line current even touches **-/+800 Amp** mark.

#### Classification Dataset

In [None]:
# Concatenate the four indicator columns, in G, C, B, A order, into one code string
# per row (e.g. '1001') and store it in a new 'fault_types' column.
indicator_cols = ['G', 'C', 'B', 'A']
cs['fault_types'] = cs[indicator_cols].astype('str').apply(''.join, axis=1)
cs.head()

In [None]:
# Print the legend for the fault codes, then show their frequencies as a donut chart.
print("[G C B A]\n[0 0 0 0] -> No fault \n[1 0 0 1] -> LG fault\n[0 1 1 0] -> LL fault\n[1 0 1 1] -> LLG Fault\n[0 1 1 1] -> LLL Fault\n[1 1 1 1] -> LLLG fault\n")
fig, ax = plt.subplots(figsize=(8, 5))
cs.fault_types.value_counts().plot.pie(ax=ax)
# A white disc drawn over the centre turns the pie into a donut.
ax.add_artist(plt.Circle((0, 0), 0.7, color='white'))
ax.set_title("Type of Faults")
ax.set_ylabel("")
plt.show()

In [None]:
# One sub-frame per fault code, in the order:
# no fault, line-ground, line-line, line-line-ground, three-line, three-line-ground.
fault_codes = ('0000', '1001', '0110', '1011', '0111', '1111')
NF, LG, LL, LLG, LLL, LLLG = (
    cs[cs['fault_types'] == code] for code in fault_codes
)

In [None]:
# Voltage-vs-current scatter for each phase (A, B, C) of the no-fault samples.
print("For No Fault")
fig, (ax_a, ax_b, ax_c) = plt.subplots(1, 3, figsize=(14, 5))
sns.scatterplot(x=NF['Ia'], y=NF['Va'], color='red', ax=ax_a)
sns.scatterplot(x=NF['Ib'], y=NF['Vb'], color='green', ax=ax_b)
sns.scatterplot(x=NF['Ic'], y=NF['Vc'], ax=ax_c)

plt.show()

In [None]:
# Voltage-vs-current scatter for each phase (A, B, C) of the line-to-ground samples.
print("For Line Ground Fault\n\nSince the fault has occured between Phase A and Ground we can notice the amount of current flowing in line A \nwhich is alomst 10 times the normal operating current.")
fig, (ax_a, ax_b, ax_c) = plt.subplots(1, 3, figsize=(15, 5))
sns.scatterplot(x=LG['Ia'], y=LG['Va'], color='red', ax=ax_a)
sns.scatterplot(x=LG['Ib'], y=LG['Vb'], color='green', ax=ax_b)
sns.scatterplot(x=LG['Ic'], y=LG['Vc'], ax=ax_c)

fig.tight_layout()
plt.show()

In [None]:
# Voltage-vs-current scatter for each phase (A, B, C) of the line-to-line samples.
print("For Line to Line Fault")
fig, (ax_a, ax_b, ax_c) = plt.subplots(1, 3, figsize=(15, 5))
sns.scatterplot(x=LL['Ia'], y=LL['Va'], color='red', ax=ax_a)
sns.scatterplot(x=LL['Ib'], y=LL['Vb'], color='green', ax=ax_b)
sns.scatterplot(x=LL['Ic'], y=LL['Vc'], ax=ax_c)

fig.tight_layout()
plt.show()

In [None]:
# Voltage-vs-current scatter for each phase (A, B, C) of the line-line-ground samples.
print("For Line Line Ground Fault")
fig, (ax_a, ax_b, ax_c) = plt.subplots(1, 3, figsize=(15, 5))
sns.scatterplot(x=LLG['Ia'], y=LLG['Va'], color='red', ax=ax_a)
sns.scatterplot(x=LLG['Ib'], y=LLG['Vb'], color='green', ax=ax_b)
sns.scatterplot(x=LLG['Ic'], y=LLG['Vc'], ax=ax_c)

fig.tight_layout()
plt.show()

In [None]:
# Voltage-vs-current scatter for each phase (A, B, C) of the three-line fault samples.
print("For Line Line Line Fault")
fig, (ax_a, ax_b, ax_c) = plt.subplots(1, 3, figsize=(15, 5))
sns.scatterplot(x=LLL['Ia'], y=LLL['Va'], color='red', ax=ax_a)
sns.scatterplot(x=LLL['Ib'], y=LLL['Vb'], color='green', ax=ax_b)
sns.scatterplot(x=LLL['Ic'], y=LLL['Vc'], ax=ax_c)

fig.tight_layout()
plt.show()

In [None]:
# Voltage-vs-current scatter for each phase (A, B, C) of the three-line-to-ground samples.
print("For Line Line Line Ground Fault")
fig, (ax_a, ax_b, ax_c) = plt.subplots(1, 3, figsize=(15, 5))
sns.scatterplot(x=LLLG['Ia'], y=LLLG['Va'], color='red', ax=ax_a)
sns.scatterplot(x=LLLG['Ib'], y=LLLG['Vb'], color='green', ax=ax_b)
sns.scatterplot(x=LLLG['Ic'], y=LLLG['Vc'], ax=ax_c)

fig.tight_layout()
plt.show()

In [None]:
# Pairwise relationships between all detection-dataset columns, coloured by the fault flag.
sns.pairplot(data=ds, hue='Output (S)')
plt.show()

In [None]:
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from xgboost import XGBClassifier

from sklearn import metrics

## Detection dataset

In [None]:
# Multiply the three phase-voltage columns by 11000
# (assumes the stored values are per-unit on an 11 kV base — TODO confirm).
# NOTE(review): this overwrites columns of `ds`, so running the cell twice
# applies the factor twice; re-run from the data-loading cell if needed.
voltage_cols = ['Va', 'Vb', 'Vc']
ds[voltage_cols] = ds[voltage_cols] * 11000

ds.head()

In [None]:
# Build the model inputs: every column except the fault flag, rescaled with MinMaxScaler.
# NOTE(review): the scaler is fitted on the whole dataset before the train/test split,
# so the test rows influence the scaling — consider fitting on the training split only.
feature_frame = ds.drop(columns=['Output (S)'])
dipc = feature_frame.columns

mms = MinMaxScaler()
df_dip = mms.fit_transform(feature_frame)

# fit_transform returns a plain array; wrap it back into a DataFrame with the original column names.
dip = pd.DataFrame(df_dip, columns=dipc)

In [None]:
# Target vector: the fault flag. It is selected by name (the same label that is
# dropped from the inputs) rather than by position, so the selection does not
# silently change if the column order of `ds` ever does.
dop = ds['Output (S)']

In [None]:
# Descriptive statistics of the scaled input features.
dip.describe()

In [None]:
from sklearn.model_selection import GridSearchCV,RandomizedSearchCV

In [None]:
# Hold out 25% of the rows as a test set; the fixed random_state makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    dip,
    dop,
    test_size=0.25,
    random_state=67,
)

### SVM model

In [None]:
# Support-vector classifier with C=1000.
sv = SVC(C=1000)

# cross_val_score clones the estimator and refits it on each fold, so it must be
# given the TRAINING split: running it on the test split would score models that
# were trained on test data and would say nothing about the model fitted below.
scores = cross_val_score(sv, X_train, y_train, cv=10)
print("CV score (train):", np.mean(scores))

# Fit on the full training split and evaluate once on the held-out test split.
sv.fit(X_train,y_train)
print("Test score:", sv.score(X_test, y_test))

# metrics.plot_confusion_matrix was removed in scikit-learn 1.2;
# ConfusionMatrixDisplay.from_estimator is its replacement.
metrics.ConfusionMatrixDisplay.from_estimator(sv, X_test, y_test)
plt.show()

### Decision Tree Model

In [None]:
# Cost-complexity-pruned decision tree; random_state fixes the tie-breaking between
# equally good splits so the fitted tree (and the plot of it) is repeatable.
dtc = DecisionTreeClassifier(criterion='gini',ccp_alpha=0.0012, random_state=67)

# cross_val_score clones the estimator and refits it on each fold, so it must be
# given the TRAINING split: running it on the test split would score models that
# were trained on test data and would say nothing about the model fitted below.
scores = cross_val_score(dtc, X_train, y_train, cv=10)
print("CV score (train):", np.mean(scores),"\n")

# Fit on the full training split and evaluate once on the held-out test split.
dtc.fit(X_train,y_train)
print("Test score:", dtc.score(X_test, y_test),"\n")

# metrics.plot_confusion_matrix was removed in scikit-learn 1.2;
# ConfusionMatrixDisplay.from_estimator is its replacement.
metrics.ConfusionMatrixDisplay.from_estimator(dtc, X_test, y_test)
plt.show()

Value *ccp_alpha* has been calculated via decision tree pruning.

In [None]:
from sklearn.tree import plot_tree

# Draw the fitted decision tree, labelling the split features with the six input names.
input_names = ['Ia', 'Ib', 'Ic', 'Va', 'Vb', 'Vc']
fig, ax = plt.subplots(figsize=(15, 7))
plot_tree(dtc, feature_names=input_names, filled=True, ax=ax)
plt.show()

### KNN Model

Value of hyperparameters has been evaluated using GridSearchCV

In [None]:
# k-nearest-neighbours classifier (5 neighbours, Minkowski power parameter p=1).
knn = KNeighborsClassifier(leaf_size= 1, n_neighbors= 5,p= 1)

# cross_val_score clones the estimator and refits it on each fold, so it must be
# given the TRAINING split: running it on the test split would score models that
# were trained on test data and would say nothing about the model fitted below.
scores = cross_val_score(knn, X_train, y_train, cv=10)
print("CV score (train):", np.mean(scores),"\n")

# Fit on the full training split and evaluate once on the held-out test split.
knn.fit(X_train,y_train)
print("Test score:", knn.score(X_test, y_test),"\n")

# metrics.plot_confusion_matrix was removed in scikit-learn 1.2;
# ConfusionMatrixDisplay.from_estimator is its replacement.
metrics.ConfusionMatrixDisplay.from_estimator(knn, X_test, y_test)
plt.show()

### Random Forest Classifier

In [None]:
# Disabled grid search over RandomForest hyper-parameters, kept for reference only.
# NOTE(review): presumably the source of the settings used for the forest in this
# notebook — confirm. It references `rf`, which is not defined anywhere in the
# visible notebook, so it would need an estimator instance before being re-enabled.
# n_estimators = [10,20,30,40,50,60]
# max_features = ['auto','sqrt']
# max_depth = [3,4,5,6]
# min_samples_split = [5,10,15]
# min_samples_leaf = [5,10]

# param_grid = {"n_estimators": n_estimators,
#             "max_features": max_features,
#             "max_depth": max_depth,
#             "min_samples_split":min_samples_split,
#             "min_samples_leaf":min_samples_leaf}

# rf_grid=GridSearchCV(rf,param_grid=param_grid,verbose=2,n_jobs=4)
# rf_grid.fit(X_train,y_train)

# rf_grid.best_params_

# rf_grid.best_estimator_

In [None]:
# Small random forest; random_state makes the bootstrap sampling and feature
# selection repeatable, so the scores do not change between runs.
rfclf = RandomForestClassifier(max_depth=6, min_samples_leaf=10, min_samples_split=15,
                               n_estimators=10, random_state=67)

# cross_val_score clones the estimator and refits it on each fold, so it must be
# given the TRAINING split: running it on the test split would score models that
# were trained on test data and would say nothing about the model fitted below.
scores=cross_val_score(rfclf, X_train, y_train, cv=10)
print("CV score (train):", np.mean(scores),"\n")

# Fit on the full training split and evaluate once on the held-out test split.
rfclf.fit(X_train,y_train)
print("Test score:", rfclf.score(X_test, y_test),"\n")

In [None]:
# Confusion matrix of the fitted random forest on the held-out test split.
# metrics.plot_confusion_matrix was removed in scikit-learn 1.2;
# ConfusionMatrixDisplay.from_estimator is its replacement.
metrics.ConfusionMatrixDisplay.from_estimator(rfclf, X_test, y_test)
plt.show()

#### So far, SVM is doing a better job at fault detection than the rest of the models: it predicts almost all the signals correctly, whereas with the other models there are cases where a fault is actually present but the model fails to identify it.