# 1. Importando Bibliotecas e DataSet

In [65]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.tree import DecisionTreeClassifier
from sklearn.preprocessing import MinMaxScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression
from sklearn.compose import ColumnTransformer

from sklearn.metrics import accuracy_score, log_loss
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC, LinearSVC, NuSVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import (
                              RandomForestClassifier,
                              AdaBoostClassifier,
                              GradientBoostingClassifier
                              )
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis, QuadraticDiscriminantAnalysis

In [66]:
german = pd.read_csv('german_credit_data.csv')

# 2. Analisando DataSet e efetuando devidas alterações

In [67]:
german.head() #Verificando dados

Unnamed: 0.1,Unnamed: 0,Age,Sex,Job,Housing,Saving accounts,Checking account,Credit amount,Duration,Purpose,Risk
0,0,67,male,2,own,,little,1169,6,radio/TV,good
1,1,22,female,2,own,little,moderate,5951,48,radio/TV,bad
2,2,49,male,1,own,little,,2096,12,education,good
3,3,45,male,2,free,little,little,7882,42,furniture/equipment,good
4,4,53,male,2,free,little,little,4870,24,car,bad


In [68]:
#Eftuando a troca das informações de nossa coluna target (Risk): good = 0, bad = 1
german['Risk'] = np.where(german['Risk'] == 'bad', 1, 0)
german.head()

Unnamed: 0.1,Unnamed: 0,Age,Sex,Job,Housing,Saving accounts,Checking account,Credit amount,Duration,Purpose,Risk
0,0,67,male,2,own,,little,1169,6,radio/TV,0
1,1,22,female,2,own,little,moderate,5951,48,radio/TV,1
2,2,49,male,1,own,little,,2096,12,education,0
3,3,45,male,2,free,little,little,7882,42,furniture/equipment,0
4,4,53,male,2,free,little,little,4870,24,car,1


In [69]:
#Separando somente as colunas a serem utilizadas, gerando assim um novo DataFrame
german = german[['Age', 'Credit amount', 'Duration', 'Risk']].copy()

In [70]:
german.head()

Unnamed: 0,Age,Credit amount,Duration,Risk
0,67,1169,6,0
1,22,5951,48,1
2,49,2096,12,0
3,45,7882,42,0
4,53,4870,24,1


In [71]:
features = german[['Age', 'Credit amount', 'Duration']]
labels = german['Risk']

In [72]:
features

Unnamed: 0,Age,Credit amount,Duration
0,67,1169,6
1,22,5951,48
2,49,2096,12
3,45,7882,42
4,53,4870,24
...,...,...,...
995,31,1736,12
996,40,3857,30
997,38,804,12
998,23,1845,45


In [73]:
labels

0      0
1      1
2      0
3      0
4      1
      ..
995    0
996    0
997    0
998    1
999    0
Name: Risk, Length: 1000, dtype: int64

In [74]:
#Criando treino e teste
X_train, X_test, y_train, y_test = train_test_split(features, labels, test_size=0.2, random_state=42)

# 3. Gerando os Pipeline

In [75]:
pipe = Pipeline([('scaler', StandardScaler()),
                 ('imputer', SimpleImputer(strategy='median')),
                 ('clf', DecisionTreeClassifier())
                ])
pipe.fit(X_train, y_train)
pipe.score(X_test, y_test)

0.65

In [76]:
pipe.predict(X_test)

array([0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0,
       0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0,
       0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0,
       0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 1, 0, 1, 0,
       0, 0, 0, 0, 1, 0, 1, 0, 0, 1, 1, 1, 0, 0, 1, 0, 0, 0, 1, 0, 1, 1,
       1, 0, 1, 0, 1, 0, 0, 0, 1, 1, 0, 1, 0, 1, 0, 1, 0, 0, 1, 0, 0, 0,
       1, 0, 0, 0, 1, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0,
       0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0,
       1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0,
       0, 0])

# 4. Gerando Pipeline através do Make_Pipeline

In [77]:
make_pipe = make_pipeline(MinMaxScaler(),
                          SimpleImputer(strategy='mean'),
                          LogisticRegression()
                        )

In [78]:
make_pipe.fit(X_train, y_train)
make_pipe.score(X_test, y_test)

0.725

# 5. Utilizando Column Transformer

In [79]:
german = pd.read_csv("german_credit_data.csv")

In [80]:
german['Risk'] = np.where(german['Risk']=='bad', 1, 0)
german = german[['Age', 'Credit amount', 'Duration', 'Purpose','Risk']].copy()
german.head()

Unnamed: 0,Age,Credit amount,Duration,Purpose,Risk
0,67,1169,6,radio/TV,0
1,22,5951,48,radio/TV,1
2,49,2096,12,education,0
3,45,7882,42,furniture/equipment,0
4,53,4870,24,car,1


In [81]:
features = german[['Age', 'Credit amount', 'Duration', 'Purpose']]
labels = german['Risk']

X_train, X_test, y_train, y_test = train_test_split(features, labels, test_size=0.2, random_state=42)

In [82]:
numericas_continuas = ['Age', 'Credit amount', 'Duration']
string_categoricas =['Purpose']

In [83]:
column_transformer = [('num_continuas', StandardScaler(), numericas_continuas), 
     ('str_categoricas', OneHotEncoder(), string_categoricas)]

preprocessor = ColumnTransformer(transformers=column_transformer)

In [84]:
pipe_transformer = Pipeline(steps=[('preprocessor', preprocessor), ('clf', DecisionTreeClassifier())])

In [85]:
pipe_transformer.fit(X_train, y_train)

Pipeline(steps=[('preprocessor',
                 ColumnTransformer(transformers=[('num_continuas',
                                                  StandardScaler(),
                                                  ['Age', 'Credit amount',
                                                   'Duration']),
                                                 ('str_categoricas',
                                                  OneHotEncoder(),
                                                  ['Purpose'])])),
                ('clf', DecisionTreeClassifier())])

In [86]:
pipe_transformer.score(X_test, y_test)

0.61

# 6. Testando vários modelos através de um LOOPING

In [87]:
#Gerando lista de classificadores
classifiers_list = [
                    KNeighborsClassifier(3),
                    SVC(kernel="rbf", C=0.025, probability=True),
                    SVC(),
                    LogisticRegression(),
                    DecisionTreeClassifier(),
                    RandomForestClassifier(),
                    AdaBoostClassifier(),
                    GradientBoostingClassifier(),
                    ]

In [88]:
#Gerando Looping utilizando os classificadores e exibindo resultados para comparação dos resultados
for classifier in classifiers_list:
  pipe = Pipeline(steps=[("preprocessor", preprocessor), ("classifier", classifier)])
  pipe.fit(X_train, y_train)
  print(str(classifier) + str(" - Resultado Score do Modelo: %.3f" % pipe.score(X_test, y_test)))
KNeighborsClassifier(n_neighbors=3)

KNeighborsClassifier(n_neighbors=3) - Resultado Score do Modelo: 0.660
SVC(C=0.025, probability=True) - Resultado Score do Modelo: 0.705
SVC() - Resultado Score do Modelo: 0.720
LogisticRegression() - Resultado Score do Modelo: 0.715
DecisionTreeClassifier() - Resultado Score do Modelo: 0.640
RandomForestClassifier() - Resultado Score do Modelo: 0.705
AdaBoostClassifier() - Resultado Score do Modelo: 0.705
GradientBoostingClassifier() - Resultado Score do Modelo: 0.710


KNeighborsClassifier(n_neighbors=3)