# Análise de propensão

## Base de Dados

### Escolha da base de dados

[Diabetic Data](https://www.kaggle.com/smit1212/diabetic-data-cleaning)



In [0]:
!wget -q -o /dev/null https://github.com/pgiaeinstein/aula-08/raw/master/diabetic-data.zip -O diabetic-data.zip 
!unzip diabetic-data.zip

In [0]:
import pandas as pd
from pandas.plotting import scatter_matrix
import matplotlib.pyplot as plt
import numpy as np
import matplotlib.pyplot as plt
from sklearn import model_selection
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn.linear_model import LogisticRegression

# Let pandas display up to 99 columns so wide summaries are not truncated.
pd.set_option('display.max_columns', 99)
# Read the CSV from the working directory into a DataFrame.
# NOTE(review): if this CSV encodes missing values as '?' (as the UCI diabetes
# dataset does), read_csv keeps them as ordinary strings and describe() will not
# report them as missing; consider na_values='?' -- confirm against the file.
diabet_data = pd.read_csv("./diabetic_data.csv")
# Summary statistics for every column, numeric or not.
diabet_data.describe(include='all')

### Selecionar colunas de interesse

In [0]:

# Drop the `encounter_id` column. Rebinding the result (instead of inplace=True)
# and ignoring an already-missing column keeps this cell safe to re-run: the
# in-place version raises KeyError the second time it is executed.
diabet_data = diabet_data.drop(columns=['encounter_id'], errors='ignore')

### Corrigir tipo das variáveis

In [0]:

  # Store admission_type_id as a pandas categorical instead of its parsed dtype.
  # NOTE(review): the leading indentation of this cell is tolerated by IPython but
  # would be an IndentationError if the notebook were exported to a plain .py file.
  diabet_data['admission_type_id'] = diabet_data['admission_type_id'].astype('category')

In [0]:

# Re-display the summary of every column (include='all' also covers non-numeric
# columns) to check the effect of the dtype change.
diabet_data.describe(include='all')

### Eliminar variáveis incompletas

In [0]:
# Drop the `weight` column. Rebinding the result (instead of inplace=True) and
# ignoring an already-missing column keeps this cell safe to re-run: the in-place
# version raises KeyError the second time it is executed.
diabet_data = diabet_data.drop(columns=['weight'], errors='ignore')

In [0]:
# Re-display the summary of every remaining column after dropping `weight`.
diabet_data.describe(include='all')

## Kolmogorov-Smirnov

### One-sample

In [0]:
import numpy as np
# Import the submodule explicitly: a bare `import scipy` does not guarantee that
# `scipy.stats` is loaded, so the cell would depend on some other import having
# pulled it in first.
import scipy.stats

# One-sample Kolmogorov-Smirnov test of a 30-value sample against a normal
# distribution whose mean and standard deviation are estimated from that sample.
x = np.array([42,42,48,51,52,54,56,55,58,59,62,65,66,67,68,69,71,74,75,76,80,81,82,83,84,85,88,89,91,94])
mean = x.mean()
std = np.std(x, ddof=1)  # ddof=1: sample (n - 1) standard deviation
# Caveat: estimating the parameters from the tested sample makes the plain KS
# p-value conservative (the Lilliefors correction addresses this).
ks_result = scipy.stats.kstest(x, 'norm', args=(mean, std))
print(ks_result)

In [0]:
import numpy as np
# Import the submodule explicitly: a bare `import scipy` does not guarantee that
# `scipy.stats` is loaded, so the cell would depend on some other import having
# pulled it in first.
import scipy.stats

# Same one-sample KS test, but on a sample where 19 of the 30 values are 42, so
# it departs strongly from the fitted normal distribution.
x = np.array([42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,76,80,81,82,83,84,85,88,89,91,94])
mean = x.mean()
sd = np.std(x, ddof=1)  # ddof=1: sample (n - 1) standard deviation
ks_result = scipy.stats.kstest(x, 'norm', args=(mean, sd))
print(ks_result)

### Two-sample

In [0]:
import numpy as np
# Import the submodule explicitly: a bare `import scipy` does not guarantee that
# `scipy.stats` is loaded, so the cell would depend on some other import having
# pulled it in first.
import scipy.stats

# Two-sample KS test: `y` is `x` with only four values changed (10, 20, 50, 100),
# so the two empirical distributions stay close.
x = np.array([42,42,48,51,52,54,56,55,58,59,62,65,66,67,68,69,71,74,75,76,80,81,82,83,84,85,88,89,91,94])
y = np.array([10,42,48,51,52,54,56,20,58,59,62,65,66,67,68,50,71,74,75,76,80,81,82,83,84,85,88,89,91,100])
ks_result = scipy.stats.ks_2samp(x, y)
print(ks_result)

In [0]:
import numpy as np
# Import the submodule explicitly: a bare `import scipy` does not guarantee that
# `scipy.stats` is loaded, so the cell would depend on some other import having
# pulled it in first.
import scipy.stats

# Two-sample KS test: here eleven values of `y` lie below the minimum of `x`,
# so the empirical distributions differ far more than in the previous cell.
x = np.array([42,42,48,51,52,54,56,55,58,59,62,65,66,67,68,69,71,74,75,76,80,81,82,83,84,85,88,89,91,94])
y = np.array([10,42,1,51,9,54,56,20,58,59,2,6,66,7,5,8,71,3,75,76,80,81,82,83,4,85,88,89,91,100])
ks_result = scipy.stats.ks_2samp(x, y)
print(ks_result)

### Análise das distribuições dos resultados

In [0]:
from sklearn.model_selection import train_test_split 
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.dummy import DummyClassifier
from sklearn import datasets
import numpy as np
import scipy

# Significance level: KS p-values below this threshold are reported further down.
alpha = 0.05

[Base de dados de Câncer de mama](http://scikit-learn.org/stable/modules/generated/sklearn.datasets.load_breast_cancer.html)

In [0]:
# Load scikit-learn's bundled breast cancer dataset and show its description.
dataset = datasets.load_breast_cancer()
print(dataset.DESCR)
# Feature matrix and target vector used by the cells that follow.
X, y = dataset.data, dataset.target

In [0]:
# Hold out 30% of the rows as a test set, with a fixed seed for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.3, random_state=123)
# Transpose so that iterating yields one feature (column) at a time.
ks_x_train = np.transpose(X_train)
ks_x_test = np.transpose(X_test)
# Run a two-sample KS test per feature, train vs. test, and report only the
# features whose p-value falls below alpha (distributions differ significantly).
# Note: no multiple-comparison correction is applied across the features.
for i, (train_feature, test_feature) in enumerate(zip(ks_x_train, ks_x_test)):
  ks = scipy.stats.ks_2samp(train_feature, test_feature)
  if ks.pvalue < alpha:
    print(i,ks)


#### Média dos scores
Teste com `seed=1` e depois com `seed=123`. Compare o resultado do KS entre as distribuições dos resultados RF e LR.

In [0]:
# Seed passed as random_state to the classifiers trained below; change it here
# (e.g. 1 vs. 123) to repeat the experiment.
seed = 123

In [0]:
# Train 30 random forests on the same split and collect their test accuracies.
# Each repetition derives its own seed from `seed`: reusing the exact same
# random_state would fit 30 identical forests and yield 30 identical scores,
# leaving no distribution for the KS comparison to work with.
rf_scores = []
for i in range(30):
  rf = RandomForestClassifier(random_state=seed + i)
  rf.fit(X_train, y_train)
  rf_scores.append(rf.score(X_test,y_test))
rf_scores = np.array(rf_scores)
# `rf` keeps the forest from the last repetition; the ROC cell reuses it.
print("RF Score:",rf_scores.mean())

In [0]:
# Fit logistic regression 30 times on the same split and collect test accuracies.
# Each repetition derives its own seed from `seed` instead of reusing one value.
# Per the scikit-learn docs, random_state only affects the 'sag', 'saga' and
# 'liblinear' solvers; with any other solver the 30 scores remain identical.
lr_scores = []
for i in range(30):
  # max_iter is raised above the default of 100 because the features are not
  # scaled and the default budget may stop the solver before it converges.
  lr = LogisticRegression(random_state=seed + i, max_iter=10000)
  lr.fit(X_train, y_train)
  lr_scores.append(lr.score(X_test,y_test))
lr_scores = np.array(lr_scores)
# `lr` keeps the model from the last repetition; the ROC cell reuses it.
print("LR Score:",lr_scores.mean())

#### KS entre scores

In [0]:
# Two-sample KS test comparing the random-forest test accuracies with the
# logistic-regression test accuracies collected above.
print("RF/LR:")
print(scipy.stats.ks_2samp(rf_scores, lr_scores))

## Curva ROC

In [0]:
from sklearn.metrics import roc_curve
from sklearn.metrics import auc

In [0]:
# Scores for the ROC curve: column 1 of predict_proba, i.e. the predicted
# probability of the second class, from the last random forest fitted above.
y_pred_rf = rf.predict_proba(X_test)[:, 1]
# False/true positive rates at each decision threshold on the test set.
fpr_rf, tpr_rf, thresholds_rf = roc_curve(y_test, y_pred_rf)
# Area under that ROC curve.
auc_rf = auc(fpr_rf, tpr_rf)

In [0]:
# Scores for the ROC curve: column 1 of predict_proba, i.e. the predicted
# probability of the second class, from the last logistic regression fitted above.
y_pred_lr = lr.predict_proba(X_test)[:, 1]
# False/true positive rates at each decision threshold on the test set.
fpr_lr, tpr_lr, thresholds_lr = roc_curve(y_test, y_pred_lr)
# Area under that ROC curve.
auc_lr = auc(fpr_lr, tpr_lr)

In [0]:
import matplotlib.pyplot as plt


def plot_roc_curves(curves, fig_num, title, out_path, xlim=None, ylim=None):
  """Draw ROC curves on one pyplot figure and save it to disk.

  Args:
    curves: iterable of (label, fpr, tpr, area) tuples, one per model.
    fig_num: number of the pyplot figure to draw on.
    title: figure title.
    out_path: file the figure is written to.
    xlim: optional (low, high) x-axis limits; left to matplotlib when None.
    ylim: optional (low, high) y-axis limits; left to matplotlib when None.
  """
  plt.figure(fig_num)
  # Limits are applied before plotting, matching the original zoomed figure.
  if xlim is not None:
    plt.xlim(*xlim)
  if ylim is not None:
    plt.ylim(*ylim)
  # Dashed diagonal: the reference line of a classifier with no skill.
  plt.plot([0, 1], [0, 1], 'k--')
  for label, fpr, tpr, area in curves:
    plt.plot(fpr, tpr, label='{} (area = {:.3f})'.format(label, area))
  plt.xlabel('False positive rate')
  plt.ylabel('True positive rate')
  plt.title(title)
  plt.legend(loc='best')
  plt.savefig(out_path)


roc_curves = [
  ('RF', fpr_rf, tpr_rf, auc_rf),
  ('LR', fpr_lr, tpr_lr, auc_lr),
]
# Full ROC curves.
plot_roc_curves(roc_curves, 1, 'ROC curve', 'roc.png')
# Zoomed-in view of the upper left corner.
plot_roc_curves(roc_curves, 2, 'ROC curve (zoomed in at top left)',
                'roc_upper_left.png', xlim=(0, 0.2), ylim=(0.8, 1))