In [32]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder, MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.metrics import classification_report
from sklearn.semi_supervised import SelfTrainingClassifier
from sklearn.semi_supervised import LabelPropagation 
import pickle

---

In [2]:
data = pd.read_csv('data/dados_diabetes.csv')
data.head()

Unnamed: 0,glicemia,pressao_sanguinea,dobra_cutanea_triceps,insulina,imc,idade,diabetes
0,89,66,23,94,28.1,21,
1,137,40,35,168,43.1,33,
2,78,50,32,88,31.0,26,sim
3,197,70,45,543,30.5,53,
4,189,60,23,846,30.1,59,sim


In [3]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 394 entries, 0 to 393
Data columns (total 7 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   glicemia               394 non-null    int64  
 1   pressao_sanguinea      394 non-null    int64  
 2   dobra_cutanea_triceps  394 non-null    int64  
 3   insulina               394 non-null    int64  
 4   imc                    394 non-null    float64
 5   idade                  394 non-null    int64  
 6   diabetes               265 non-null    object 
dtypes: float64(1), int64(5), object(1)
memory usage: 21.7+ KB


In [4]:
data.describe()

Unnamed: 0,glicemia,pressao_sanguinea,dobra_cutanea_triceps,insulina,imc,idade
count,394.0,394.0,394.0,394.0,394.0,394.0
mean,122.304569,70.654822,29.106599,155.548223,32.988579,30.814721
std,31.396725,12.469919,10.504273,118.775855,7.21016,10.198971
min,0.0,24.0,7.0,14.0,0.0,21.0
25%,99.0,62.0,21.0,76.25,28.325,23.0
50%,119.0,70.0,29.0,125.0,33.2,27.0
75%,143.0,78.0,36.75,190.0,37.075,36.0
max,198.0,110.0,63.0,846.0,67.1,81.0


In [5]:
data['diabetes'].value_counts(dropna=False)

diabetes
nao    173
NaN    129
sim     92
Name: count, dtype: int64

---

In [6]:
labeled_df = data.dropna(subset='diabetes').copy()
unlabeled_df = data[data['diabetes'].isna()].copy()

In [7]:
labeled_df

Unnamed: 0,glicemia,pressao_sanguinea,dobra_cutanea_triceps,insulina,imc,idade,diabetes
2,78,50,32,88,31.0,26,sim
4,189,60,23,846,30.1,59,sim
5,166,72,19,175,25.8,51,sim
7,103,30,38,83,43.3,33,nao
8,115,70,30,96,34.6,32,sim
...,...,...,...,...,...,...,...
388,121,78,39,74,39.0,28,nao
390,128,88,39,110,36.5,37,sim
391,88,58,26,16,28.4,22,nao
392,101,76,48,180,32.9,63,nao


In [8]:
unlabeled_df

Unnamed: 0,glicemia,pressao_sanguinea,dobra_cutanea_triceps,insulina,imc,idade,diabetes
0,89,66,23,94,28.1,21,
1,137,40,35,168,43.1,33,
3,197,70,45,543,30.5,53,
6,118,84,47,230,45.8,31,
11,125,70,26,115,31.1,41,
...,...,...,...,...,...,...,...
373,149,68,29,127,29.3,42,
382,102,44,20,94,30.8,26,
384,153,88,37,140,40.6,39,
386,81,74,41,57,46.3,32,


In [9]:
X = labeled_df.drop(columns='diabetes')
y = labeled_df['diabetes']

In [10]:
le = LabelEncoder()
y = le.fit_transform(y)
le.inverse_transform([0, 1])

array(['nao', 'sim'], dtype=object)

In [11]:
scaler = MinMaxScaler()
X_norm = pd.DataFrame(scaler.fit_transform(X), columns=X.columns)
X_norm

Unnamed: 0,glicemia,pressao_sanguinea,dobra_cutanea_triceps,insulina,imc,idade
0,0.076923,0.263158,0.446429,0.088942,0.291771,0.119048
1,0.930769,0.394737,0.285714,1.000000,0.269327,0.904762
2,0.753846,0.552632,0.214286,0.193510,0.162095,0.714286
3,0.269231,0.000000,0.553571,0.082933,0.598504,0.285714
4,0.361538,0.526316,0.410714,0.098558,0.381546,0.261905
...,...,...,...,...,...,...
260,0.407692,0.631579,0.571429,0.072115,0.491272,0.166667
261,0.461538,0.763158,0.571429,0.115385,0.428928,0.380952
262,0.153846,0.368421,0.339286,0.002404,0.226933,0.023810
263,0.253846,0.605263,0.732143,0.199519,0.339152,1.000000


In [12]:
X_train, X_test, y_train, y_test = train_test_split(X_norm, y, stratify=y)

In [13]:
model = SVC(kernel='linear')
model.fit(X_train, y_train)
y_pred = model.predict(X_test)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.80      0.80      0.80        44
           1       0.61      0.61      0.61        23

    accuracy                           0.73        67
   macro avg       0.70      0.70      0.70        67
weighted avg       0.73      0.73      0.73        67



---

In [14]:
unlabeled_df.drop(columns='diabetes', inplace=True)
unlabeled_df_norm = pd.DataFrame(scaler.transform(unlabeled_df), columns=unlabeled_df.columns)

In [15]:
y_pred = model.predict(unlabeled_df_norm)
new_X_train = pd.concat([X_train, unlabeled_df_norm], ignore_index = True)
new_y_train = pd.concat([pd.Series(y_train), pd.Series(y_pred)], ignore_index = True)

In [16]:
pseudo_labeling = SVC(kernel = 'linear', random_state = 10)
pseudo_labeling.fit(new_X_train, new_y_train)
new_y_pred = pseudo_labeling.predict(X_test)
print(classification_report(y_test, new_y_pred))

              precision    recall  f1-score   support

           0       0.77      0.77      0.77        44
           1       0.57      0.57      0.57        23

    accuracy                           0.70        67
   macro avg       0.67      0.67      0.67        67
weighted avg       0.70      0.70      0.70        67



---

In [17]:
svm = SVC(kernel = 'linear', probability = True, random_state = 10)
svm.fit(X_train, y_train)

In [18]:
probabilidades = svm.predict_proba(unlabeled_df_norm)
y_previsto = svm.predict(unlabeled_df_norm)

In [19]:
tabela_resultados = pd.DataFrame(probabilidades, columns = ['Prob C0', 'Prob C1'])
tabela_resultados['Previsão'] = y_previsto
tabela_resultados['Probabilidade máxima'] = tabela_resultados[['Prob C0', 'Prob C1']].max(axis = 1)

In [20]:
filtro_confianca = tabela_resultados['Probabilidade máxima'] >= 0.75
novo_x_treino = pd.concat([X_train, unlabeled_df_norm[filtro_confianca]])
novo_y_treino = pd.concat([pd.Series(y_train), tabela_resultados.loc[filtro_confianca, 'Previsão']])

In [21]:
svm = SVC(kernel = 'linear', probability = True, random_state = 10)
svm.fit(novo_x_treino, novo_y_treino)

In [22]:
novo_y_previsto = svm.predict(X_test)
resultados_self_training1 = classification_report(y_test, novo_y_previsto)
print(resultados_self_training1)

              precision    recall  f1-score   support

           0       0.79      0.75      0.77        44
           1       0.56      0.61      0.58        23

    accuracy                           0.70        67
   macro avg       0.67      0.68      0.68        67
weighted avg       0.71      0.70      0.70        67



In [23]:
x_treino_self_training = pd.concat([X_train, unlabeled_df_norm], ignore_index = True)
y_treino_self_training = pd.concat([pd.Series(y_train), pd.Series([-1]*unlabeled_df_norm.shape[0])], ignore_index = True)

In [24]:
svm = SVC(kernel = 'linear', probability= True, random_state = 10)
self_training = SelfTrainingClassifier(svm, threshold = 0.80, max_iter = 15, verbose = True)
self_training.fit(x_treino_self_training, y_treino_self_training)

End of iteration 1, added 76 new labels.
End of iteration 2, added 11 new labels.


In [25]:
y_previsto = self_training.predict(X_test)
resultados_self_training = classification_report(y_test, y_previsto)
print(resultados_self_training)

              precision    recall  f1-score   support

           0       0.79      0.75      0.77        44
           1       0.56      0.61      0.58        23

    accuracy                           0.70        67
   macro avg       0.67      0.68      0.68        67
weighted avg       0.71      0.70      0.70        67



In [27]:
x_treino_self_training = pd.concat([X_train, unlabeled_df_norm], ignore_index = True)
y_treino_self_training = pd.concat([pd.Series(y_train), pd.Series([-1]*unlabeled_df_norm.shape[0])], ignore_index = True)

In [28]:
label_propagation = LabelPropagation()
label_propagation.fit(x_treino_self_training, y_treino_self_training)

In [31]:
y_previsto = label_propagation.predict(X_test)
resultados_label_propagation = classification_report(y_test, y_previsto)
print(resultados_label_propagation)

              precision    recall  f1-score   support

           0       0.76      0.89      0.82        44
           1       0.69      0.48      0.56        23

    accuracy                           0.75        67
   macro avg       0.73      0.68      0.69        67
weighted avg       0.74      0.75      0.73        67



In [35]:
with open('min_max_scaler_2.pkl', 'wb') as arquivo:
    pickle.dump(scaler, arquivo)

In [36]:
with open('modelo_self_training_2.pkl', 'wb') as arquivo:
    pickle.dump(self_training, arquivo)

In [38]:
modelo_min_max = pd.read_pickle('min_max_scaler_2.pkl')
modelo_self_training = pd.read_pickle('modelo_self_training_2.pkl') 

In [39]:
novo_dado = {
    'glicemia':[98],
    'pressao_sanguinea':[75],
    'dobra_cutanea_triceps':[29],
    'insulina':[124],
    'imc':[24.3],
    'idade':[34]
}
novo_dado = pd.DataFrame(novo_dado) 

In [40]:
novo_dado = modelo_min_max.transform(novo_dado)
modelo_self_training.predict(novo_dado)



array([0])

In [42]:
le.inverse_transform([0])

array(['nao'], dtype=object)