# Predição Machine Learning 

## Importando bibliotecas e dados

In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

sns.set_style("darkgrid")

In [None]:
# Predictive-maintenance dataset, read directly from its raw GitHub URL.
DATA_URL = 'https://raw.githubusercontent.com/nicksonrock/machinelearninguff/main/Base%20de%20Dados%20-%20Machine%20Learning_UFF%20-%20Manuten%C3%A7%C3%A3o%20Preditiva.csv'
data = pd.read_csv(DATA_URL)

In [None]:
data.head()

In [None]:
data.info()

# Pré-processando dados

As colunas `UDI` e `ID Produto` são apenas identificadores e não carregam informações úteis para a análise; por isso são removidas.

In [None]:
# Remove the two identifier columns before any analysis.
identifier_columns = ["UDI", 'ID Produto']
data = data.drop(columns=identifier_columns)
data.head(3)

# Quantidade Total de falhas

In [None]:
# Number of rows for every (Falha, Tipo de falha) combination, shown as a
# single 'count' column. size() counts rows directly, so the result does not
# depend on which other columns happen to be present in `data`.
data.groupby(['Falha', 'Tipo de falha']).size().to_frame('count')

# Visualização dos dados

In [None]:
# Bar chart with the number of rows for each value of the failure flag.
balance_fig, balance_ax = plt.subplots(figsize=(10, 8))
sns.countplot(data=data, x="Falha", ax=balance_ax)

In [None]:
# Restrict to rows flagged as failures, then count them per failure type.
failures_only = data[data['Falha'] == 1]
plt.figure(figsize=(10, 5))
sns.countplot(data=failures_only, x="Tipo de falha")

In [None]:
# Pairwise relationships between the columns of `data`, coloured by the
# failure flag.
sns.pairplot(data,hue='Falha')

### Focaremos na detecção do alvo (`Falha`) e ignoraremos o tipo de falha.

In [None]:
data.columns

In [None]:
# One box plot per numeric feature on a 3x2 grid, split by product type on
# the x axis and by the failure flag as hue.
boxplot_features = [
    'Temperatura do Ar [K]',
    'Temperatura do Processo [K]',
    'Velocidade de Rotação [rpm]',
    'Torque [Nm]',
    'Tempo de uso [min]',
]
plt.figure(figsize=(20, 15))
for slot, column in enumerate(boxplot_features, start=1):
    plt.subplot(3, 2, slot)
    sns.boxplot(data=data, y=column, x="Tipo", hue="Falha")

In [None]:
import plotly.express as px

# Interactive 3-D scatter of usage time, torque and rotation speed, coloured
# by the failure flag.
axes_3d = {
    'x': 'Tempo de uso [min]',
    'y': 'Torque [Nm]',
    'z': 'Velocidade de Rotação [rpm]',
}
fig = px.scatter_3d(data, color='Falha', **axes_3d)
fig.show()

In [None]:
def feat_prob(feature, data):
    """Return the failure rate among rows at or above each value of a feature.

    For every distinct value ``v`` of ``data[feature]`` (in order of first
    appearance), the rows with ``data[feature] >= v`` are selected and the
    mean of their ``Falha`` column is expressed as a percentage rounded to
    two decimals.

    Args:
        feature: Name of the column used as the threshold variable.
        data: DataFrame containing ``feature`` and a ``Falha`` column.

    Returns:
        A ``(thresholds, percentages)`` tuple of two equally long lists.
    """
    thresholds = []
    failure_pct = []
    for threshold in data[feature].unique():
        at_or_above = data[data[feature] >= threshold]
        thresholds.append(threshold)
        failure_pct.append(round(at_or_above.Falha.mean() * 100, 2))
    return (thresholds, failure_pct)

In [None]:
# One line chart per numeric feature on a 3x2 grid: x is each distinct
# feature value, y is the failure percentage returned by feat_prob for it.
curve_features = [
    'Temperatura do Ar [K]',
    'Temperatura do Processo [K]',
    'Velocidade de Rotação [rpm]',
    'Torque [Nm]',
    'Tempo de uso [min]',
]
plt.figure(figsize=(15, 17))
for slot, column in enumerate(curve_features, start=1):
    axis = plt.subplot(3, 2, slot)
    axis.set_title(label=f"Probabilidade de falha wrt {column}")
    thresholds, failure_pct = feat_prob(column, data)
    plt.xlabel(column)
    plt.ylabel("Probabilidade de falha (%)")
    sns.lineplot(y=failure_pct, x=thresholds)

# Pré-processamento para predição

## Codificação

In [None]:
from sklearn.preprocessing import LabelEncoder

label_encoder = LabelEncoder()

# Integer-encode the product type into a new 'Type' column.
data['Type'] = label_encoder.fit_transform(data['Tipo'])
# Remove the raw column once it is encoded. The feature matrix is later built
# by dropping only the target columns, so a leftover 'Tipo' would be passed to
# the models as well (presumably as strings, which estimators cannot fit on;
# confirm against data.info()).
data = data.drop(columns=['Tipo'])

# Re-fit the same encoder on the target; after this cell `label_encoder`
# holds the classes of 'Falha'.
data['Falha'] = label_encoder.fit_transform(data['Falha'])

In [None]:
data.tail()

# Give the numeric features short names without spaces or brackets. The keys
# must match the dataset's actual (Portuguese) headers, which are the names
# used by the plots earlier in this notebook; keys that match no column are
# silently ignored by DataFrame.rename.
data = data.rename(columns={
    'Temperatura do Ar [K]': 'airtemp',
    'Temperatura do Processo [K]': 'processtemp',
    'Velocidade de Rotação [rpm]': 'rpm',
    'Torque [Nm]': 'torque',
    'Tempo de uso [min]': 'toolwear',
})

## Teste / Treino


In [None]:
from sklearn.model_selection import train_test_split

In [None]:
# Features are every column except the two target columns; the label is the
# binary failure flag. 30% of the rows are held out, with a fixed seed.
features = data.drop(columns=['Tipo de falha', 'Falha'])
target = data['Falha']
X_Treino, X_teste, Y_Treino, y_teste = train_test_split(
    features, target, test_size=0.3, random_state=42
)

## Predição

In [None]:
import time

import lightgbm as lgb
import xgboost as xgb
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import BaggingClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import StackingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import GridSearchCV
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.svm import SVC
from xgboost import XGBClassifier

# Candidate estimators, created with library defaults unless stated otherwise.
lgbm = lgb.LGBMClassifier()
mlp = MLPClassifier()
bc = BaggingClassifier()
gbc = GradientBoostingClassifier()
ada = AdaBoostClassifier()
# NOTE: this rebinds `xgb` from the xgboost module alias to the classifier.
xgb = XGBClassifier()
lr = LogisticRegression()
rfc = RandomForestClassifier()
knn = KNeighborsClassifier(n_neighbors=1)
svc = SVC()

# Grid search over C and gamma for an RBF-kernel SVM.
param_grid = {'C': [0.1,1, 10, 100, 1000,2000], 'gamma': [1,0.1,0.01,0.001,0.0001], 'kernel': ['rbf']}
grid = GridSearchCV(SVC(), param_grid, refit=True, verbose=3)

# Stacking: a small random forest and an SVM combined by logistic regression.
estimators = [
    ('rf', RandomForestClassifier(n_estimators=10, random_state=42)),
    ('svr', SVC(random_state=42)),
]
stc = StackingClassifier(estimators=estimators, final_estimator=LogisticRegression())

# (display name, variable name) for every estimator defined in this cell.
registry = [
    ('LightGBM', 'lgbm'),
    ('Multi Layer Perceptron', 'mlp'),
    ('Bagging', 'bc'),
    ('Gradient Boosting', 'gbc'),
    ('Ada Boost', 'ada'),
    ('XG Boost', 'xgb'),
    ('Logistic Regression', 'lr'),
    ('Random Forest', 'rfc'),
    ('k Nearest Neighbours', 'knn'),
    ('Support Vector Machine', 'svc'),
    ('SVM tuning grid', 'grid'),
    ('Stacked (RFR & SVM)', 'stc'),
]
classifier = [label for label, alias in registry]
imported_as = [alias for label, alias in registry]

classifiers = pd.DataFrame({'Classifier': classifier, 'Imported as': imported_as})
print('All Models Imported\nModels stored in dataframe called classifiers')

In [None]:
class Modelling:
    """Fit several classifiers on one train/test split and rank them.

    Each entry of ``models`` is either an estimator exposing ``fit`` and
    ``predict``, or the string ``'knn'``. The string asks :meth:`fit` to
    search for the ``n_neighbors`` value with the best test accuracy and to
    use a ``KNeighborsClassifier`` configured with it.

    Intended call order: ``fit()``, then ``results()``, then any of the
    ``best_model*`` helpers. The helpers read attributes that ``results()``
    sets, so calling them earlier raises ``AttributeError``.
    """

    # Largest n_neighbors (inclusive) tried when a model is given as 'knn'.
    _MAX_NEIGHBOURS = 199

    def __init__(self, X_Treino, Y_Treino, X_teste, Y_teste, models):
        """Store the train/test split and the models to compare.

        Args:
            X_Treino: Training features.
            Y_Treino: Training labels.
            X_teste: Test features.
            Y_teste: Test labels.
            models: Iterable of estimators (or the string ``'knn'``).
        """
        self.X_Treino = X_Treino
        self.X_teste = X_teste
        self.Y_Treino = Y_Treino
        self.Y_teste = Y_teste
        self.models = models

    def _tuned_knn(self):
        """Return an unfitted KNN using the k with the highest test accuracy.

        Ties are resolved in favour of the smallest k. The search never asks
        for more neighbours than there are training rows.

        NOTE(review): k is selected on the test split, so the accuracy later
        reported for this model is optimistic; a validation split or
        cross-validation would avoid that.
        """
        max_k = min(self._MAX_NEIGHBOURS, len(self.X_Treino))
        best_k = 1
        best_accuracy = float('-inf')
        for k in range(1, max_k + 1):
            candidate = KNeighborsClassifier(n_neighbors=k)
            candidate.fit(self.X_Treino, self.Y_Treino)
            accuracy = accuracy_score(self.Y_teste, candidate.predict(self.X_teste))
            if accuracy > best_accuracy:
                best_k, best_accuracy = k, accuracy
        return KNeighborsClassifier(n_neighbors=best_k)

    def fit(self):
        """Fit every model and record its test accuracy and wall-clock time.

        The outcome is stored in ``self.models_output``, a DataFrame with the
        columns ``Models`` (the fitted estimators), ``Accuracy`` and
        ``Runtime (s)``. For a ``'knn'`` entry the runtime includes the
        neighbour search.
        """
        fitted_models = []
        model_acc = []
        model_time = []
        for model in self.models:
            start = time.time()
            if isinstance(model, str) and model == 'knn':
                model = self._tuned_knn()
            model.fit(self.X_Treino, self.Y_Treino)
            model_acc.append(accuracy_score(self.Y_teste, model.predict(self.X_teste)))
            model_time.append(time.time() - start)
            # Keep the estimator that was actually fitted, so a 'knn'
            # placeholder never ends up as the "best model".
            fitted_models.append(model)
            print(model, 'has been fit')
        self.models_output = pd.DataFrame(
            {'Models': fitted_models, 'Accuracy': model_acc, 'Runtime (s)': model_time}
        )

    def results(self):
        """Return the ranking: highest accuracy first, ties by shortest runtime.

        Also stores the top-ranked estimator in ``self.best`` and the returned
        table in ``self.models_output_cleaned``. In the returned table the
        ``Models`` column holds the text before the first "(" of each model's
        string form, and ``Accuracy`` is a percentage.
        """
        models = self.models_output
        models = models.sort_values(
            by=['Accuracy', 'Runtime (s)'], ascending=[False, True]
        ).reset_index(drop=True)
        # Grab the estimator object before the column is replaced by names.
        self.best = models['Models'][0]
        models['Models'] = models['Models'].astype(str).str.split("(", n=2, expand=True)[0]
        models['Accuracy'] = models['Accuracy'].round(5) * 100
        self.models_output_cleaned = models
        return models

    def best_model(self, type):
        """Return the best model as an object or as a display name.

        Args:
            type: ``'model'`` for the estimator object, ``'name'`` for its
                name. Any other value yields ``None``. (The parameter name
                shadows the builtin but is kept because callers pass it by
                keyword.)
        """
        if type == 'model':
            return self.best
        elif type == 'name':
            return self.models_output_cleaned['Models'][0]

    def best_model_accuracy(self):
        """Return the accuracy (in percent) of the top-ranked model."""
        return self.models_output_cleaned['Accuracy'][0]

    def best_model_runtime(self):
        """Return the runtime in seconds of the top-ranked model, rounded to 3 decimals."""
        return round(self.models_output_cleaned['Runtime (s)'][0], 3)

    def best_model_predict(self, X_teste):
        """Return the top-ranked model's predictions for ``X_teste``."""
        return self.best.predict(X_teste)

    def best_model_clmatrix(self):
        """Return the classification report of the top-ranked model on the stored test split."""
        return classification_report(self.Y_teste, self.best.predict(self.X_teste))

In [None]:
display(classifiers)

In [None]:
# Estimators compared in this run. lgbm, xgb, svc and grid are also defined in
# this notebook but are not included here.
models_to_test = [bc,gbc,ada,rfc,mlp,lr,knn,stc]

In [None]:
X_Treino.info()

In [None]:
# Argument order follows Modelling.__init__: training features, training
# labels, test features, test labels, then the estimators to compare.
classification = Modelling(X_Treino,Y_Treino,X_teste,y_teste,models_to_test)
# Fits each estimator and stores accuracy and runtime in
# classification.models_output.
classification.fit()

In [None]:
classification.results()

In [None]:
# Summary of the top-ranked model; relies on classification.results() having
# been called already.
best_name = classification.best_model(type='name')
best_accuracy = classification.best_model_accuracy()
best_runtime = classification.best_model_runtime()
best_report = classification.best_model_clmatrix()

print('O Melhor modelo é:', best_name)
print('Acurácia do modelo:', best_accuracy)
print('Tempo de treino (s) ', best_runtime)
print('Matriz de Classificação:\n')
print(best_report)

In [None]:
sns.set_style("darkgrid", {"grid.color": "1", "grid.linestyle": " "})

# sklearn.metrics.plot_confusion_matrix was removed in scikit-learn 1.2; its
# replacement is ConfusionMatrixDisplay.from_estimator (available since 1.0).
# Resolve whichever one this installation provides so the cell runs on both.
try:
    from sklearn.metrics import ConfusionMatrixDisplay
    draw_confusion_matrix = ConfusionMatrixDisplay.from_estimator
except (ImportError, AttributeError):
    from sklearn.metrics import plot_confusion_matrix as draw_confusion_matrix

draw_confusion_matrix(classification.best_model(type='model'), X_teste, y_teste)