# Classificação

In [None]:
import math
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn import metrics

%matplotlib inline

## Leitura dos dados

In [None]:
# Load the Pima Indians Diabetes dataset from a CSV file in the working directory.
# Source: https://www.kaggle.com/uciml/pima-indians-diabetes-database
diabetes = pd.read_csv('diabetes.csv')

In [None]:
# Preview the first rows of the raw table.
diabetes.head()

### Separando em X (features) e y (variável resposta - target)

In [None]:
# Features are every column except the label; the target is the 'Outcome' column.
X = diabetes.drop(columns='Outcome')
y = diabetes.loc[:, 'Outcome']

In [None]:
# Preview the feature matrix (the label column has been dropped).
X.head()

In [None]:
# Preview the target values.
y.head()

In [None]:
# Count how many rows belong to each class of the target.
y.value_counts()

In [None]:
# Histogram of the target values, to visualise the class balance.
y.hist()

### Separando os dados em treinamento e teste

In [None]:
# Hold out 25% of the rows for testing; random_state fixes the shuffle so the
# split is the same on every run.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=0) 
# Alternative to try: stratify=y keeps the class proportions of y similar in
# the train and test splits.
# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, stratify=y, random_state=0) 

In [None]:
# Relative class frequencies within the test split.
y_test.value_counts(normalize=True)

## Classificador

In [None]:
# Decision tree with a fixed seed: scikit-learn randomly permutes the features
# at each split, so without random_state the fitted tree (and its scores) can
# change from one run of the notebook to the next.
clf = DecisionTreeClassifier(random_state=0)
# Alternative to try: cap the depth to limit overfitting.
# clf = DecisionTreeClassifier(max_depth=5, random_state=0)

### Treinando o classificador

In [None]:
# Fit the classifier on the training split (fit returns the estimator itself).
clf = clf.fit(X_train,y_train)

### Predição no conjunto de teste

In [None]:
# Predicted labels for the held-out test rows.
y_pred = clf.predict(X_test)

### Métrica de avaliação

In [None]:
# Fraction of test rows whose predicted label matches the true label.
print("Acurácia:",metrics.accuracy_score(y_test, y_pred))

In [None]:
# Report accuracy on both splits; a large gap between them points to overfitting.
for template, features, labels in (
    ('Acurácia no conjunto de treinamento: {:.2f}', X_train, y_train),
    ('Acurácia no conjunto de teste: {:.2f}', X_test, y_test),
):
    print(template.format(clf.score(features, labels)))

In [None]:
# Per-class precision, recall and F1 on the test split.
print(metrics.classification_report(y_test, y_pred))

## Escolhendo um segundo classificador

In [None]:
# Second model: logistic regression on the same train/test split.
# The default iteration budget (max_iter=100) can stop before the solver
# converges on unscaled features and emit a ConvergenceWarning, so allow
# more iterations.
clf = LogisticRegression(max_iter=1000)
clf.fit(X_train, y_train)
print('Acurácia no conjunto de treinamento: {:.2f}'.format(clf.score(X_train, y_train)))
print('Acurácia no conjunto de teste: {:.2f}'.format(clf.score(X_test, y_test)))

In [None]:
# Predict the test rows with the current classifier and report its accuracy.
y_pred = clf.predict(X_test)
print("Acurácia:", metrics.accuracy_score(y_true=y_test, y_pred=y_pred))

In [None]:
# Per-class precision, recall and F1 on the test split.
print(metrics.classification_report(y_test, y_pred))

## Outro dataset para classificação 

In [None]:
# Column names; the last one ('play') is the label.
colunas = ['outlook', 'temp', 'humidity', 'wind', 'play']

# One row per observed day: outlook, temperature, humidity, wind, play?
valores = [
    ['Sunny', 'Hot', 'High', 'Weak', 'No'],
    ['Sunny', 'Hot', 'High', 'Strong', 'No'],
    ['Overcast', 'Hot', 'High', 'Weak', 'Yes'],
    ['Rain', 'Mild', 'High', 'Weak', 'Yes'],
    ['Rain', 'Cool', 'Normal', 'Weak', 'Yes'],
    ['Rain', 'Cool', 'Normal', 'Strong', 'No'],
    ['Overcast', 'Cool', 'Normal', 'Strong', 'Yes'],
    ['Sunny', 'Mild', 'High', 'Weak', 'No'],
    ['Sunny', 'Cool', 'Normal', 'Weak', 'Yes'],
    ['Rain', 'Mild', 'Normal', 'Weak', 'Yes'],
    ['Sunny', 'Mild', 'Normal', 'Strong', 'Yes'],
    ['Overcast', 'Mild', 'High', 'Strong', 'Yes'],
    ['Overcast', 'Hot', 'Normal', 'Weak', 'Yes'],
    ['Rain', 'Mild', 'High', 'Strong', 'No'],
]

### Separando em X (features) e y (variável resposta - target)

In [None]:
# Build the table from the raw rows and column names, then display it.
dataset = pd.DataFrame(valores, columns=colunas)
dataset

In [None]:
# Features are the four weather columns; the target is the 'play' column.
X = dataset.drop(columns=['play'])
y = dataset.loc[:, 'play']

## Meu classificador

In [None]:
class NaiveBayes:
    """Naive Bayes classifier for categorical features.

    There is no separate training step: every call to ``predict`` estimates
    the class priors and the per-feature conditional frequencies directly
    from the labelled data it is given.
    """

    def __init__(self, min_prob=1e-6):
        """Create the classifier.

        Args:
            min_prob: Probability substituted for a conditional probability
                that is estimated as zero (the value never occurs together
                with the class), so that its logarithm stays finite.

        Raises:
            ValueError: If ``min_prob`` is not in the interval ``(0, 1]``.
        """
        if not 0 < min_prob <= 1:
            raise ValueError("min_prob must be in the interval (0, 1]")
        self.min_prob = min_prob

    def predict(self, X, y, x):
        """Return the posterior probability of each class for one query row.

        Args:
            X: DataFrame of categorical features, one row per example.
                Assumed to share its index with ``y``.
            y: Series with the class label of each row of ``X``.
            x: Iterable with one value per column of ``X``, in column order.
                The string ``"?"`` marks an unknown value and is skipped.

        Returns:
            A NumPy array of probabilities that sums to 1, ordered like
            ``y.unique()``. The array is empty when ``y`` is empty.
        """
        classes = y.unique()
        log_scores = np.zeros(len(classes))
        if log_scores.size == 0:
            return log_scores

        total = len(y)
        for i, class_ in enumerate(classes):
            in_class = y == class_
            class_count = in_class.sum()
            # Work in log space so that many small factors do not underflow.
            log_scores[i] = math.log(class_count / total)
            for j, v in enumerate(x):
                if v == "?":
                    continue
                matches = (in_class & (X.iloc[:, j] == v)).sum()
                # A value never seen with this class would give log(0);
                # fall back to the configured floor instead.
                prob_v = matches / class_count if matches else self.min_prob
                log_scores[i] += math.log(prob_v)

        # Subtracting the maximum before exponentiating leaves the normalised
        # result unchanged and avoids every score underflowing to zero.
        scores = np.exp(log_scores - log_scores.max())
        return scores / scores.sum()

In [None]:
# Instantiate the hand-written classifier; it takes the data at predict time.
nb = NaiveBayes()

### Predição

In [None]:
# Query row in column order (outlook, temp, humidity, wind).
# '?' marks an unknown value, which NaiveBayes.predict skips.
x = pd.Series(['?', 'Hot', 'High', '?'])
x

In [None]:
# Distinct labels in order of first appearance; NaiveBayes.predict reports
# its probabilities in this same order.
y.unique()

In [None]:
# Class probabilities for the query row, estimated from the full table.
nb.predict(X, y, x)

## Classificador do scikit-learn

In [None]:
# Work on a copy so the encoding below does not alter the original table.
data = dataset.copy()

In [None]:
# Preview the copy before encoding.
data.head()

In [None]:
# Integer code for each category label, one mapping per feature column.
o = {'Sunny': 1, 'Overcast': 2, 'Rain': 3}
t = {'Hot': 1, 'Mild': 2, 'Cool': 3}
h = {'High': 1, 'Normal': 2}
w = {'Strong': 1, 'Weak': 2}

# Replace each column's labels with their codes; an unmapped label raises KeyError.
for column, codes in (('outlook', o), ('temp', t), ('humidity', h), ('wind', w)):
    data[column] = [codes[label] for label in data[column].astype(str)]

In [None]:
# Display the table with the feature columns encoded as integers.
data

### Separando em X (features) e y (variável resposta - target)

In [None]:
# Features are the encoded weather columns; the target is the 'play' column.
X = data.drop(columns='play')
y = data.loc[:, 'play']

In [None]:
# Number of rows per class of the target.
y.value_counts()

In [None]:
# Same counts expressed as proportions of the total.
y.value_counts(normalize=True)

### Separando os dados em treinamento e teste

In [None]:
# Hold out half of the rows for testing. stratify=y keeps the class ratio
# similar in both halves; random_state fixes the shuffle.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.5, stratify=y, random_state=5) 

In [None]:
# Fit scikit-learn's multinomial Naive Bayes on the encoded training rows.
# NOTE(review): MultinomialNB models features as counts, whereas these columns
# hold arbitrary integer category codes; CategoricalNB may be the better
# match for this data - confirm before relying on the scores.
clf = MultinomialNB()
clf.fit(X_train, y_train)

In [None]:
# Predicted labels for the held-out test rows.
y_pred = clf.predict(X_test)

In [None]:
# Fraction of test rows whose predicted label matches the true label.
print("Acurácia:",metrics.accuracy_score(y_test, y_pred))

In [None]:
# Per-class precision, recall and F1 on the test split.
print(metrics.classification_report(y_test, y_pred))