In [1]:
%matplotlib inline
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn import metrics
from sklearn.ensemble  import RandomForestClassifier 
from sklearn.feature_selection import SelectFromModel
from sklearn.impute import SimpleImputer
from sklearn.metrics import classification_report
from sklearn.model_selection import cross_val_score, train_test_split
# NOTE: Imputer is deprecated since scikit-learn 0.20 and removed in 0.22;
# SimpleImputer (imported above) is its replacement.
from sklearn.preprocessing import Imputer
from sklearn.utils.class_weight import compute_sample_weight

In [2]:
# Load the raw workbook. header=None keeps every spreadsheet row as data, so
# rows and columns are addressed by plain positional integers from here on.
data = pd.read_excel(io='Sumita 300 FINAL PARA CALUCHIN2.xlsx', header=None)

In [3]:
# Work on the transposed sheet: each spreadsheet row becomes a column, so the
# header rows can be cleaned with ordinary column operations. Column 0
# (spreadsheet row 0) is discarded.
tabla = data.T.drop(columns=[0])

# Columns 1-3 (spreadsheet rows 1-3) hold the pieces of a multi-row header.
# Blank pieces are forward-filled from the previous entry (presumably merged
# header cells in the workbook -- TODO confirm).
# Series.ffill() is used instead of fillna(method='ffill'), which is deprecated
# in recent pandas; both produce the same result.
for header_part in (1, 2, 3):
    tabla[header_part] = tabla[header_part].str.strip().ffill()

# Fuse the available header pieces into one ';'-separated label per row.
fusion = tabla[[1, 2, 3]].apply(
    lambda parts: ';'.join(parts.dropna().astype(str)),
    axis=1,
)

# Replace the three header columns by the fused label, used as the index.
# The index name is set to '' so no axis title is shown when the frame is displayed.
tabla = tabla.drop(columns=[1, 2, 3])
tabla.index = pd.Index(fusion, name='')

# Transpose back: the fused labels are now the column names.
tabla = tabla.T
# Keep the student id together with the target before the descriptive columns are dropped.
label = tabla[['DNI', 'Moroso']].copy()

# Long format: one row per (student, fused header label) with its cell value.
# The previous unassigned melt(...).head(5) preview was removed: its result was
# discarded because it was not the last expression of the cell.
all_data = tabla.drop(columns=['Alumno', 'Carrera', 'CicoActual', 'Estado_Alumno', 'Moroso'])
all_data = all_data.melt(id_vars='DNI', var_name='atributo', value_name='Valor')

# Split the fused 'a;b;c' header label back into its three components.
all_data[['Ciclo', 'Mes', 'Categoria']] = all_data['atributo'].str.split(';', expand=True)
all_data = all_data.drop(columns=['atributo'])
all_data = all_data.sort_values(by=['DNI'])
all_data = all_data[['DNI', 'Ciclo', 'Mes', 'Categoria', 'Valor']]

# Wide format again: one row per (DNI, Ciclo, Mes), one column per Categoria.
# aggfunc='first' keeps the first value found for each cell.
all_data = pd.pivot_table(
    all_data,
    values='Valor',
    index=['DNI', 'Ciclo', 'Mes'],
    columns='Categoria',
    aggfunc='first',
).reset_index()
all_data.columns.name = ''  # suppress the 'Categoria' axis title

# Back-fill missing academic periods from the following row.
# Series.bfill() replaces the deprecated fillna(method='bfill') with identical results.
# NOTE(review): the fill is not grouped by DNI, so a student's trailing gaps can
# be filled from the next student's first row -- confirm this is intended.
all_data['Periodo Academico'] = all_data['Periodo Academico'].bfill()

# Drop 'Duracion Modulo(meses)' because its values are bad, and 'NroDocumento'
# because its value is meaningless. (The former used to be back-filled first;
# that step was removed since the column is discarded here anyway.)
all_data = all_data.drop(columns=['Duracion Modulo(meses)', 'NroDocumento'])

# Drop the rows that have no value for FechaPago.
all_data = all_data.dropna(subset=['FechaPago'])
# Collapse the monthly rows into one feature row per student (DNI):
# distinct-value counts for Ciclo and Periodo Academico, non-null counts for
# Mes, FechaMora and FechaPago.
data_grupo = (
    all_data
    .groupby('DNI')
    .agg({
        'Ciclo': ['nunique'],
        'Mes': ['count'],
        'FechaMora': ['count'],
        'FechaPago': ['count'],
        'Periodo Academico': ['nunique'],
    })
)
# The aggregation yields two-level column labels (column, function); keep only
# the column level.
data_grupo.columns = data_grupo.columns.get_level_values(0)
data_grupo = data_grupo.reset_index()

# Cast the merge key to int64 and the target to bool.
# NOTE(review): astype(bool) maps any non-empty string or NaN to True -- this
# assumes 'Moroso' already holds 0/1 or boolean values; confirm against the workbook.
label = label.astype({'DNI': np.int64, 'Moroso': np.bool_})

# Attach the target to the per-student features; the inner join keeps only
# students present on both sides.
all_data = pd.merge(data_grupo, label, on='DNI', how='inner').copy()

# Separate the target from the feature matrix; the id is not a feature.
target_number = all_data['Moroso']
all_data = all_data.drop(columns=['DNI', 'Moroso'])

In [6]:
def _print_split_report(split_name, y_true, y_pred, y_score):
    """Print weighted F1, ROC AUC and the per-class report for one data split.

    Parameters
    ----------
    split_name : str
        Heading printed before the metrics (e.g. 'TRAIN' or 'TEST').
    y_true : array-like
        Ground-truth labels of the split.
    y_pred : array-like
        Hard class predictions; used for F1 and the classification report.
    y_score : array-like
        Predicted probability of the positive class; used for ROC AUC.
    """
    print(split_name)
    print('f1_score:      ', metrics.f1_score(y_true, y_pred, average='weighted'),
          '\nroc_auc_score: ', metrics.roc_auc_score(y_true, y_score), '\n')
    print(classification_report(y_true, y_pred))


def run_classifier(_depth, n_estimators=1):
    """Train a random forest on the module-level features and print its metrics.

    Reads the module-level ``all_data`` (feature frame) and ``target_number``
    (boolean target), makes a stratified 80/20 split with a fixed seed, fits
    the model on the training part and prints F1, ROC AUC and a
    classification report for both parts.

    Parameters
    ----------
    _depth : int or None
        ``max_depth`` of every tree in the forest.
    n_estimators : int, default 1
        Number of trees. The default of 1 keeps the original single-tree setup.

    Returns
    -------
    None
        Results are only printed.
    """
    X_train, X_test, y_train, y_test = train_test_split(
        all_data.values, target_number,
        test_size=0.20, random_state=0, stratify=target_number,
    )

    # Replace the sentinel value -1 by the training-set column mean. The
    # statistics are learned on the training split only, so nothing leaks from test.
    # SimpleImputer replaces preprocessing.Imputer (deprecated in 0.20, removed in 0.22).
    # NOTE(review): only -1 is treated as missing; NaN values are not imputed.
    # Confirm that -1 is really the missing-value marker for these features.
    imputer = SimpleImputer(strategy='mean', missing_values=-1)
    X_train_imputed = imputer.fit_transform(X_train)
    X_test_imputed = imputer.transform(X_test)

    # Class imbalance is handled by class_weight alone. Passing balanced
    # sample_weight to fit() as well would multiply both sets of weights and
    # over-correct towards the minority class.
    classifier = RandomForestClassifier(
        n_estimators=n_estimators,
        max_depth=_depth,
        class_weight="balanced_subsample",
        random_state=0,
    )
    classifier.fit(X_train_imputed, y_train)

    splits = (
        ('TRAIN', X_train_imputed, y_train),
        ('TEST', X_test_imputed, y_test),
    )
    for split_name, X_split, y_split in splits:
        y_predicted = classifier.predict(X_split)
        # ROC AUC needs a ranking score, not hard labels: with hard labels it
        # degenerates to balanced accuracy. classes_ is sorted, so column 1 is
        # the probability of the True class for the boolean target.
        y_score = classifier.predict_proba(X_split)[:, 1]
        _print_split_report(split_name, y_split, y_predicted, y_score)
    

In [7]:
%%time
# Baseline run: trees limited to a single split (max_depth=1).
run_classifier(_depth=1)



TRAIN
f1_score:       0.9657224958949098 
roc_auc_score:  0.9411764705882353 

              precision    recall  f1-score   support

       False       1.00      0.88      0.94        68
        True       0.96      1.00      0.98       170

   micro avg       0.97      0.97      0.97       238
   macro avg       0.98      0.94      0.96       238
weighted avg       0.97      0.97      0.97       238

TEST
f1_score:       0.9484233417905037 
roc_auc_score:  0.9117647058823529 

              precision    recall  f1-score   support

       False       1.00      0.82      0.90        17
        True       0.93      1.00      0.97        43

   micro avg       0.95      0.95      0.95        60
   macro avg       0.97      0.91      0.93        60
weighted avg       0.95      0.95      0.95        60

CPU times: user 213 ms, sys: 53.6 ms, total: 267 ms
Wall time: 3.18 s
