In [None]:
df_sub_one_hot = df_sub_one_hot.query("cat_periodo==2014")

In [6]:
from sklearn.metrics import f1_score, recall_score, precision_score
from sklearn.model_selection import train_test_split
import numpy as np

data_train, data_test, ytrain, ytest = train_test_split(df_sub_one_hot, df_sub_one_hot['Class'], test_size=0.3, stratify=df_sub_one_hot['Class'])
diferencia = ytrain.value_counts()[False]-ytrain.value_counts()[True]
data_train = data_train.reset_index(drop=True)

# oversampling sobre la clase True
idx = np.random.choice(data_train[data_train['Class'] == True].index, size=diferencia)
data_oversampled = pd.concat([data_train, data_train.iloc[idx]])
print("Data oversampled on class 'True'")
print(data_oversampled['Class'].value_counts())

# subsampling sobre la clase False
idx = np.random.choice(data_train.loc[data_train.Class == False].index, size=diferencia, replace=False)
data_subsampled = data_train.drop(data_train.iloc[idx].index)
print("Data subsampled on class 'False'")
print(data_subsampled['Class'].value_counts())

Data oversampled on class 'True'
True     138643
False    138643
Name: Class, dtype: int64
Data subsampled on class 'False'
True     16488
False    16488
Name: Class, dtype: int64


In [7]:
from sklearn.metrics import classification_report

# datos test (mismo para todos los conjuntos de entrenamiento)
X_test = data_test[data_train.columns[:-1]]
y_test = data_test[data_train.columns[-1]]

# datos entrenamiento "originales"
X_orig = data_train[data_train.columns[:-1]] 
y_orig = data_train[data_train.columns[-1]] 

# datos entrenamiento "oversampleados" 
X_over = data_oversampled[data_train.columns[:-1]]
y_over = data_oversampled[data_train.columns[-1]]

# datos entrenamiento "subsampleados"
X_subs = data_subsampled[data_train.columns[:-1]]
y_subs = data_subsampled[data_train.columns[-1]]

In [8]:
from sklearn.tree import DecisionTreeClassifier

print("ORIGINAL::::::::::")
clf_orig = DecisionTreeClassifier()
clf_orig.fit(X_orig, y_orig)
pred_orig = clf_orig.predict(X_test)
print(classification_report(y_test, pred_orig))

print("OVERSAMPLING::::::::::")
clf_over = DecisionTreeClassifier()
clf_over.fit(X_over, y_over)
pred_over = clf_over.predict(X_test)
print(classification_report(y_test, pred_over))

print("SUBSAMPLING::::::::::")
clf_subs = DecisionTreeClassifier()
clf_subs.fit(X_subs, y_subs)
pred_subs = clf_subs.predict(X_test)
print(classification_report(y_test, pred_subs))

ORIGINAL::::::::::
              precision    recall  f1-score   support

       False       0.98      0.99      0.99     59419
        True       0.95      0.86      0.91      7067

    accuracy                           0.98     66486
   macro avg       0.97      0.93      0.95     66486
weighted avg       0.98      0.98      0.98     66486

OVERSAMPLING::::::::::
              precision    recall  f1-score   support

       False       1.00      0.96      0.98     59419
        True       0.73      0.96      0.83      7067

    accuracy                           0.96     66486
   macro avg       0.86      0.96      0.90     66486
weighted avg       0.97      0.96      0.96     66486

SUBSAMPLING::::::::::
              precision    recall  f1-score   support

       False       0.99      0.96      0.98     59419
        True       0.73      0.96      0.83      7067

    accuracy                           0.96     66486
   macro avg       0.86      0.96      0.90     66486
weighted a

In [9]:
from sklearn.naive_bayes import GaussianNB  # Naive bayes

print("ORIGINAL::::::::::")
clf_orig = GaussianNB()
clf_orig.fit(X_orig, y_orig)
pred_orig = clf_orig.predict(X_test)
print(classification_report(y_test, pred_orig))

print("OVERSAMPLING::::::::::")
clf_over = GaussianNB()
clf_over.fit(X_over, y_over)
pred_over = clf_over.predict(X_test)
print(classification_report(y_test, pred_over))

print("SUBSAMPLING::::::::::")
clf_subs = GaussianNB()
clf_subs.fit(X_subs, y_subs)
pred_subs = clf_subs.predict(X_test)
print(classification_report(y_test, pred_subs))

ORIGINAL::::::::::
              precision    recall  f1-score   support

       False       1.00      0.81      0.89     59419
        True       0.38      0.97      0.55      7067

    accuracy                           0.83     66486
   macro avg       0.69      0.89      0.72     66486
weighted avg       0.93      0.83      0.86     66486

OVERSAMPLING::::::::::
              precision    recall  f1-score   support

       False       1.00      0.80      0.89     59419
        True       0.37      0.98      0.54      7067

    accuracy                           0.82     66486
   macro avg       0.69      0.89      0.72     66486
weighted avg       0.93      0.82      0.85     66486

SUBSAMPLING::::::::::
              precision    recall  f1-score   support

       False       1.00      0.80      0.89     59419
        True       0.37      0.98      0.54      7067

    accuracy                           0.82     66486
   macro avg       0.69      0.89      0.72     66486
weighted a

In [10]:
from sklearn.neighbors import KNeighborsClassifier

print("ORIGINAL::::::::::")
clf_orig = KNeighborsClassifier()
clf_orig.fit(X_orig, y_orig)
pred_orig = clf_orig.predict(X_test)
print(classification_report(y_test, pred_orig))

print("OVERSAMPLING::::::::::")
clf_over = KNeighborsClassifier()
clf_over.fit(X_over, y_over)
pred_over = clf_over.predict(X_test)
print(classification_report(y_test, pred_over))

print("SUBSAMPLING::::::::::")
clf_subs = KNeighborsClassifier()
clf_subs.fit(X_subs, y_subs)
pred_subs = clf_subs.predict(X_test)
print(classification_report(y_test, pred_subs))

ORIGINAL::::::::::
              precision    recall  f1-score   support

       False       0.98      0.99      0.99     59419
        True       0.93      0.84      0.88      7067

    accuracy                           0.98     66486
   macro avg       0.95      0.92      0.93     66486
weighted avg       0.98      0.98      0.98     66486

OVERSAMPLING::::::::::
              precision    recall  f1-score   support

       False       0.99      0.95      0.97     59419
        True       0.69      0.95      0.80      7067

    accuracy                           0.95     66486
   macro avg       0.84      0.95      0.89     66486
weighted avg       0.96      0.95      0.95     66486

SUBSAMPLING::::::::::
              precision    recall  f1-score   support

       False       0.99      0.92      0.96     59419
        True       0.59      0.95      0.73      7067

    accuracy                           0.93     66486
   macro avg       0.79      0.94      0.84     66486
weighted a

In [11]:
from sklearn.svm import SVC  # Support Vector Machine classifier

print("ORIGINAL::::::::::")
clf_orig = SVC()
clf_orig.fit(X_orig, y_orig)
pred_orig = clf_orig.predict(X_test)
print(classification_report(y_test, pred_orig))

print("OVERSAMPLING::::::::::")
clf_over = SVC()
clf_over.fit(X_over, y_over)
pred_over = clf_over.predict(X_test)
print(classification_report(y_test, pred_over))

print("SUBSAMPLING::::::::::")
clf_subs = SVC()
clf_subs.fit(X_subs, y_subs)
pred_subs = clf_subs.predict(X_test)
print(classification_report(y_test, pred_subs))

ORIGINAL::::::::::


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


              precision    recall  f1-score   support

       False       0.89      1.00      0.94     59419
        True       0.00      0.00      0.00      7067

    accuracy                           0.89     66486
   macro avg       0.45      0.50      0.47     66486
weighted avg       0.80      0.89      0.84     66486

OVERSAMPLING::::::::::


KeyboardInterrupt: 