# IIC-2433 Minería de Datos UC

- Versiones de librerías, python 3.8.10

- numpy 1.20.3
- sklearn 1.0.2


Dataset en: https://www.kaggle.com/datasets/mathchi/diabetes-data-set?resource=download

In [1]:
import pandas as pd

# Load the diabetes CSV and keep only the rows that have no missing values.
dframe = pd.read_csv("diabetes.csv", encoding="ISO-8859-1").dropna()

# Sanity check: select any row that still holds a null; its .size should be 0.
rows_with_nulls = dframe[dframe.isnull().any(axis=1)]
rows_with_nulls.size

0

### Tenemos varias características explicativas relacionadas con la diabetes.

In [2]:
# Display the cleaned frame; the rendered output elides the middle rows.
dframe

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1
...,...,...,...,...,...,...,...,...,...
763,10,101,76,48,180,32.9,0.171,63,0
764,2,122,70,27,0,36.8,0.340,27,0
765,5,121,72,23,112,26.2,0.245,30,0
766,1,126,60,0,0,30.1,0.349,47,1


### Nos quedamos con X (hacemos drop de la variable objetivo)

In [3]:
# Feature frame: every column except the target (`Outcome`).
x_df = dframe.drop(columns=["Outcome"])
x_df.head()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age
0,6,148,72,35,0,33.6,0.627,50
1,1,85,66,29,0,26.6,0.351,31
2,8,183,64,0,0,23.3,0.672,32
3,1,89,66,23,94,28.1,0.167,21
4,0,137,40,35,168,43.1,2.288,33


### Vectorizamos el dataset porque es data tabular

In [4]:
from sklearn.feature_extraction import DictVectorizer

# Turn each row into a {column: value} dict and let DictVectorizer build a
# dense feature matrix from those records.
# NOTE(review): the column order of X is the one assigned by the vectorizer
# (inspect vectorizer.feature_names_), which need not match x_df's column order.
records = x_df.to_dict("records")
vectorizer = DictVectorizer(sparse=False)
X = vectorizer.fit_transform(records)
X.shape

(768, 8)

### Ahora tenemos el y

In [5]:
import numpy as np

# Target vector: the Outcome column as an array, in the same row order as X.
y = dframe["Outcome"].values
classes = np.unique(y)  # distinct labels present in the target
y.shape

(768,)

In [6]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing. The split is stratified on y and uses
# a fixed seed so that it is reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=109,
)

### Vamos a usar Random Forest. Vamos a variar el número de árboles base (`n_estimators`) y el número mínimo de muestras que debe tener un nodo para poder dividirse (`min_samples_split`).

In [7]:
from sklearn.ensemble import RandomForestClassifier
from sklearn import metrics

# Hyper-parameter grid for the Random Forest sweep.
range_T = [50, 100, 200, 500]              # number of trees (n_estimators)
range_min_samples_split = [50, 20, 10, 5]  # minimum samples a node needs to be split

# Fit one forest per (min_samples_split, n_estimators) pair and report its
# accuracy. The outer loop is min_samples_split so the lines are printed in the
# same order as the original four copy-pasted loops.
# NOTE(review): accuracy is measured on X_test / y_test, and the best pair is
# later picked from these numbers, so the reported score is optimistic. A
# validation split or cross-validation on the training data would avoid that.
for min_split in range_min_samples_split:
    for t_ in range_T:
        clf = RandomForestClassifier(
            min_samples_split=min_split,
            n_estimators=t_,
            random_state=0,  # fixed seed so every configuration is reproducible
        )
        clf.fit(X_train, y_train)
        y_pred = clf.predict(X_test)
        acc = metrics.accuracy_score(y_test, y_pred)
        print(
            f"For min_samples_split = {min_split} and t =",
            t_,
            "The accuracy is :",
            acc,
        )

For min_samples_split = 50 and t = 50 The accuracy is : 0.7792207792207793
For min_samples_split = 50 and t = 100 The accuracy is : 0.7792207792207793
For min_samples_split = 50 and t = 200 The accuracy is : 0.7857142857142857
For min_samples_split = 50 and t = 500 The accuracy is : 0.7792207792207793
For min_samples_split = 20 and t = 50 The accuracy is : 0.7922077922077922
For min_samples_split = 20 and t = 100 The accuracy is : 0.7922077922077922
For min_samples_split = 20 and t = 200 The accuracy is : 0.7987012987012987
For min_samples_split = 20 and t = 500 The accuracy is : 0.7857142857142857
For min_samples_split = 10 and t = 50 The accuracy is : 0.8051948051948052
For min_samples_split = 10 and t = 100 The accuracy is : 0.7987012987012987
For min_samples_split = 10 and t = 200 The accuracy is : 0.7922077922077922
For min_samples_split = 10 and t = 500 The accuracy is : 0.7922077922077922
For min_samples_split = 5 and t = 50 The accuracy is : 0.7857142857142857
For min_samples_s

Ver parámetros en: https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.RandomForestClassifier.html

### El mejor fue t=50 y min samples = 10.

In [8]:
# Refit the forest with the configuration chosen from the sweep and score it
# on the held-out split.
best_rf_params = {"min_samples_split": 10, "n_estimators": 50, "random_state": 0}
clf = RandomForestClassifier(**best_rf_params).fit(X_train, y_train)
y_pred = clf.predict(X_test)
metrics.accuracy_score(y_test, y_pred)

0.8051948051948052

### Observamos que este tipo de clasificadores funcionan en data desbalanceada

In [9]:
from sklearn import metrics

# Per-class precision / recall / F1 for the predictions currently in y_pred.
report = metrics.classification_report(y_test, y_pred)
print(report)

              precision    recall  f1-score   support

           0       0.81      0.91      0.86       100
           1       0.79      0.61      0.69        54

    accuracy                           0.81       154
   macro avg       0.80      0.76      0.77       154
weighted avg       0.80      0.81      0.80       154



! python -m pip install lime

### Ahora vamos a usar stacking. Es heterogéneo (AdaBoost + RandomForest). Voy a variar el número de learners de AdaBoost. RF queda fijo en T = 50.

In [10]:
from sklearn.ensemble import StackingClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier


# Stack a fixed Random Forest with an AdaBoost ensemble of depth-3 trees,
# varying only the number of AdaBoost learners, and report the accuracy of
# each stack on the held-out split.
# NOTE(review): the forest here uses random_state=42, whereas the standalone
# forest selected earlier used random_state=0. Confirm this is intentional.
for n_ada in [50, 100, 200]:
    estimators = [
        ('rf', RandomForestClassifier(min_samples_split=10, n_estimators=50, random_state=42)),
        (
            f'ada-{n_ada}',
            AdaBoostClassifier(DecisionTreeClassifier(max_depth=3), n_estimators=n_ada, random_state=0),
        ),
    ]

    clf = StackingClassifier(estimators=estimators)
    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)
    acc = metrics.accuracy_score(y_test, y_pred)
    # Message text is kept exactly as before so the printed lines do not change.
    print(
            f"For rf and ada-{n_ada} he accuracy is :",
            acc,
        )
    

For rf and ada-50 he accuracy is : 0.7857142857142857
For rf and ada-100 he accuracy is : 0.8051948051948052
For rf and ada-200 he accuracy is : 0.7857142857142857


### El mejor fue ada-100

In [11]:
# Rebuild the stacking configuration reported as best (Random Forest plus
# AdaBoost with 100 learners) and fit it on the training split.
estimators = [('rf', RandomForestClassifier(min_samples_split=10, n_estimators=50, random_state=42)), 
              ('ada-100', AdaBoostClassifier(DecisionTreeClassifier(max_depth=3), n_estimators=100, random_state=0))]
    

clf = StackingClassifier(estimators=estimators)
clf.fit(X_train, y_train)