In [None]:
%matplotlib inline
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import seaborn as sns
import sklearn 
import imblearn
import warnings
warnings.filterwarnings('ignore')

from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.preprocessing import StandardScaler

from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report

from sklearn.model_selection import cross_val_score

from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.naive_bayes import BernoulliNB
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC 

from sklearn.metrics import accuracy_score, precision_score, recall_score
from sklearn.metrics import f1_score, balanced_accuracy_score, precision_recall_fscore_support, roc_auc_score

In [None]:
# Silence every warning category for the rest of the session
import warnings
warnings.simplefilter('ignore')

In [None]:
# Display and plotting defaults
pd.set_option('display.max_columns', None)  # no limit on the number of columns shown
np.set_printoptions(threshold=15, precision=3)  # array summarisation threshold and float precision
sns.set(style="darkgrid")  # seaborn theme; set before the rcParams tweaks below
plt.rcParams.update({
    'axes.labelsize': 14,
    'xtick.labelsize': 12,
    'ytick.labelsize': 12,
})

In [None]:
# Load the dataset; the path is relative to the notebook's working directory
DATA_FILE = 'Train_Test_IoT_Weather.csv'
df = pd.read_csv(DATA_FILE)
df

Unnamed: 0,ts,date,time,temperature,pressure,humidity,label,type
0,1556238796,25-Apr-19,17:33:16,40.881866,-0.101806,38.363631,1,ddos
1,1556238796,25-Apr-19,17:33:16,44.913806,2.661616,46.141423,1,ddos
2,1556238796,25-Apr-19,17:33:16,38.295822,-2.438871,50.850643,1,ddos
3,1556238801,25-Apr-19,17:33:21,41.306586,-0.101806,38.363631,1,ddos
4,1556238801,25-Apr-19,17:33:21,44.903178,3.080849,46.141423,1,ddos
...,...,...,...,...,...,...,...,...
59255,1554166618,1-Apr-19,17:56:58,32.799434,2.204924,37.024913,0,normal
59256,1554166618,1-Apr-19,17:56:58,29.453781,-2.030547,90.297894,0,normal
59257,1554166619,1-Apr-19,17:56:59,47.185992,0.872942,37.687701,0,normal
59258,1554166623,1-Apr-19,17:57:03,43.097037,3.168207,93.647950,0,normal


In [None]:
# Remove every blank from the 'time' strings (plain literal match, no regex)
df['time'] = df['time'].str.replace(' ', '', regex=False)

In [None]:
# Create empty-string placeholder columns for the time components
for time_part in ('hour', 'minute', 'second'):
    df[time_part] = ''

In [None]:
# Split the 'time' strings on ':' and store the pieces as hour / minute / second
time_components = df['time'].str.split(':', expand=True)
df[['hour', 'minute', 'second']] = time_components

In [None]:
# Cast the time components to integers
for time_part in ('hour', 'minute', 'second'):
    df[time_part] = df[time_part].astype(int)

In [None]:
# Preview the first rows, including the new hour / minute / second columns
df.head()

Unnamed: 0,ts,date,time,temperature,pressure,humidity,label,type,hour,minute,second
0,1556238796,25-Apr-19,17:33:16,40.881866,-0.101806,38.363631,1,ddos,17,33,16
1,1556238796,25-Apr-19,17:33:16,44.913806,2.661616,46.141423,1,ddos,17,33,16
2,1556238796,25-Apr-19,17:33:16,38.295822,-2.438871,50.850643,1,ddos,17,33,16
3,1556238801,25-Apr-19,17:33:21,41.306586,-0.101806,38.363631,1,ddos,17,33,21
4,1556238801,25-Apr-19,17:33:21,44.903178,3.080849,46.141423,1,ddos,17,33,21


## Ajuste de dados

In [None]:
# Experiment without the raw timestamp feature: drop 'ts', 'type' and 'time'
df.drop(columns=['ts', 'type', 'time'], inplace=True)

# Derive extra features from the date (day, month, year)
df['date'] = pd.to_datetime(df['date'])
date_accessor = df['date'].dt
df['day'] = date_accessor.day
df['month'] = date_accessor.month
df['year'] = date_accessor.year

# The parsed date column itself is dropped once its parts are extracted
df.drop(columns=['date'], inplace=True)
df.head()

Unnamed: 0,temperature,pressure,humidity,label,hour,minute,second,day,month,year
0,40.881866,-0.101806,38.363631,1,17,33,16,25,4,2019
1,44.913806,2.661616,46.141423,1,17,33,16,25,4,2019
2,38.295822,-2.438871,50.850643,1,17,33,16,25,4,2019
3,41.306586,-0.101806,38.363631,1,17,33,21,25,4,2019
4,44.903178,3.080849,46.141423,1,17,33,21,25,4,2019


### Ajustando dados X e y

In [None]:
# Model inputs: sensor readings plus the day and time-of-day components
feature_columns = ['temperature', 'pressure', 'humidity', 'day', 'hour', 'minute', 'second']
X = df[feature_columns]

# Target: the 'label' column
y = df['label']

In [None]:
# Show column dtypes, non-null counts and memory usage of the prepared frame
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 59260 entries, 0 to 59259
Data columns (total 10 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   temperature  59260 non-null  float64
 1   pressure     59260 non-null  float64
 2   humidity     59260 non-null  float64
 3   label        59260 non-null  int64  
 4   hour         59260 non-null  int32  
 5   minute       59260 non-null  int32  
 6   second       59260 non-null  int32  
 7   day          59260 non-null  int64  
 8   month        59260 non-null  int64  
 9   year         59260 non-null  int64  
dtypes: float64(3), int32(3), int64(4)
memory usage: 3.8 MB


In [None]:
from sklearn.preprocessing import OrdinalEncoder, LabelEncoder

# Encode the target labels as integer class ids
target_encoder = LabelEncoder()
y = target_encoder.fit_transform(y)

In [None]:
# Partition the feature columns by dtype: numerical vs. categorical
numeric_dtypes = ['int64', 'int32', 'float64']
categorical_dtypes = ['object', 'bool']
numerical_ix = X.select_dtypes(include=numeric_dtypes).columns
categorical_ix = X.select_dtypes(include=categorical_dtypes).columns

In [None]:
# Display the columns selected as numerical features
numerical_ix

Index(['temperature', 'pressure', 'humidity', 'day', 'hour', 'minute',
       'second'],
      dtype='object')

In [None]:
# Display the columns selected as categorical features
categorical_ix

Index([], dtype='object')

In [None]:
def classification_report_with_accuracy_score(y_true, y_pred):
    """Print the classification report and return the accuracy.

    Args:
        y_true: Ground-truth labels.
        y_pred: Predicted labels to compare against ``y_true``.

    Returns:
        The value of ``accuracy_score(y_true, y_pred)``.
    """
    report_text = classification_report(y_true, y_pred)
    print(report_text)
    accuracy = accuracy_score(y_true, y_pred)
    return accuracy

In [None]:
from sklearn.ensemble import VotingClassifier
from sklearn import tree

# Seed shared by every stochastic estimator so that re-running the notebook
# rebuilds the same models.
RANDOM_STATE = 42

# Logistic regression with an L1 penalty, using the 'saga' solver.
# The former multi_class='ovr' argument is omitted: it is deprecated in recent
# scikit-learn releases, and for a two-class target the default already
# behaves as one-vs-rest.
est_RL  = LogisticRegression(C = 0.1,
                             class_weight = None,
                             max_iter = 100,
                             penalty= 'l1',
                             solver = 'saga',
                             random_state = RANDOM_STATE)

# 1-nearest-neighbour classifier with Euclidean distance.
est_KNN = KNeighborsClassifier(metric = 'euclidean',
                               n_neighbors = 1)

# Gradient boosting with shallow trees.
# max_features='auto' is rejected by scikit-learn >= 1.3; for the classifier
# it meant sqrt(n_features), so 'sqrt' keeps the same behaviour.
est_GBM = GradientBoostingClassifier(learning_rate = 0.1,
                                     max_depth = 2,
                                     max_features = 'sqrt',
                                     min_samples_leaf = 1,
                                     min_samples_split = 2,
                                     n_estimators = 50,
                                     subsample = 0.8,
                                     random_state = RANDOM_STATE)

# Bernoulli naive Bayes with light smoothing.
est_NB  = BernoulliNB(alpha = 0.1)

# Linear discriminant analysis using the SVD solver.
est_LDA = LinearDiscriminantAnalysis(solver = 'svd')

# Depth-2 decision tree.
est_DTC = DecisionTreeClassifier(class_weight =  None,
                                 criterion = 'gini',
                                 max_depth = 2,
                                 max_features = 'sqrt',
                                 min_samples_leaf = 1,
                                 min_samples_split = 2,
                                 random_state = RANDOM_STATE)

# Random forest of 100 depth-3 trees.
est_RF  = RandomForestClassifier(bootstrap = True,
                                 max_depth = 3,
                                 max_features = 'sqrt',
                                 min_samples_leaf = 1,
                                 min_samples_split =2,
                                 n_estimators = 100,
                                 random_state = RANDOM_STATE)

# Soft-voting ensemble over all base estimators defined above.
est_Ensemble = VotingClassifier(estimators=[('RL', est_RL), ('KNN', est_KNN), ('GBM', est_GBM), ('NB', est_NB), ('LDA', est_LDA), ('DTC', est_DTC), ('RF', est_RF)],
                        voting='soft')

In [None]:
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import StratifiedKFold

# Single-step pipeline around the voting ensemble; the step name is the one
# make_pipeline derives from the estimator's class name.
clf = Pipeline(steps=[('votingclassifier', est_Ensemble)])

In [None]:
from sklearn.model_selection import cross_val_predict
from sklearn.metrics import classification_report, accuracy_score

# One seeded splitter shared by every evaluation below, so the ensemble and
# each base estimator are scored on exactly the same folds.
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# Out-of-fold predictions of the full pipeline (voting ensemble).
# NOTE(review): not consumed in this cell; kept in case later cells use it.
y_pred = cross_val_predict(clf, X, y, cv=cv)

# Report every base estimator and track the most accurate one.
# Scores come from out-of-fold predictions: each sample is predicted by a
# model that did not see it during training. Fitting and predicting on the
# same data would only measure memorisation (1-NN is then trivially perfect).
best_estimator_name = None
best_estimator_score = 0.0

for name, estimator in est_Ensemble.estimators:
    y_pred_estimator = cross_val_predict(estimator, X, y, cv=cv)

    report = classification_report(y, y_pred_estimator)
    print(f"Estimador: {name}")
    print(report)
    print("---")

    accuracy = accuracy_score(y, y_pred_estimator)
    if accuracy > best_estimator_score:
        best_estimator_score = accuracy
        best_estimator_name = name

    # cross_val_predict works on clones, so refit on the full data to leave
    # the estimator object fitted, as the previous version of this cell did.
    estimator.fit(X, y)

# Show the best estimator
print("Melhor estimador: ", best_estimator_name)
print("Pontuação de acurácia: ", best_estimator_score)


Estimador: RL
              precision    recall  f1-score   support

           0       0.73      0.97      0.83     35000
           1       0.92      0.49      0.63     24260

    accuracy                           0.77     59260
   macro avg       0.82      0.73      0.73     59260
weighted avg       0.81      0.77      0.75     59260

---
Estimador: KNN
              precision    recall  f1-score   support

           0       1.00      1.00      1.00     35000
           1       1.00      1.00      1.00     24260

    accuracy                           1.00     59260
   macro avg       1.00      1.00      1.00     59260
weighted avg       1.00      1.00      1.00     59260

---
Estimador: GBM
              precision    recall  f1-score   support

           0       1.00      1.00      1.00     35000
           1       1.00      1.00      1.00     24260

    accuracy                           1.00     59260
   macro avg       1.00      1.00      1.00     59260
weighted avg       1.0