# Imports

In [4]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

from scipy import stats
from sklearn.utils import resample
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler

pd.options.mode.chained_assignment = None
pd.options.display.max_columns = None

# ETL

In [5]:
df_train = pd.read_csv('./datasets/z_train.csv')
df_target = pd.read_csv('./datasets/z_test.csv')

In [None]:
# df_train.head(5)

In [6]:
df_train.columns

Index(['enrollee_id', 'city', 'city_development_index', 'gender',
       'relevent_experience', 'enrolled_university', 'education_level',
       'major_discipline', 'experience', 'company_size', 'company_type',
       'last_new_job', 'training_hours', 'target'],
      dtype='object')

In [None]:
# Elección de variables para el modelo
# model_cols = ['enrollee_id','city','city_development_index','gender','relevent_experience','enrolled_university','education_level','last_new_job','training_hours','target']
# model_cols_t = ['enrollee_id','city','city_development_index','gender','relevent_experience','enrolled_university','education_level','last_new_job','training_hours']
# df_train = df_train[model_cols]
# df_target = df_target[model_cols_t]

# Entranamiento de modelo

## idenficación de variables categoricas typadas como númericas

In [10]:
# idenficación de variables categoricas identificadas como númericas
df_train.enrollee_id = df_train.enrollee_id.astype('category')
df_train.target = df_train.target.astype('category')

# replicamos en el set objetivo
df_target.enrollee_id = df_target.enrollee_id.astype('category')

In [7]:
def heat_map(df):
    corr = df.corr()
    fig, ax = plt.subplots(figsize=(15,8))
    mask = np.triu(np.ones_like(corr, dtype=bool))
    sns.heatmap(ax=ax, data=corr, annot=True, cmap='Pastel1_r', mask=mask)
    plt.show()

In [12]:
# heat_map(df_train)

## Depuración en set de entranamiento de datos que no existen en el set objetivo

In [None]:
# depuración de variable city
df_train = df_train.copy()
# Eliminamos del set de entrenamiento las ciudades que no existen en el set objetivo
print(df_train.shape)
drop_list = df_target.city.value_counts().index.tolist()
df_train.drop(df_train[~df_train.city.isin(drop_list)].index, inplace = True)
print(df_train.shape)


(15326, 14)
(15242, 14)


## Balanceo de dataset de entrenamiento

In [None]:
# Dividimos el dataset 2 para cada tipo de datos
total_rows_0, total_rows_1 = df_train.target.value_counts()
df_class_0 = df_train[df_train.target == 0]
df_class_1 = df_train[df_train.target == 1]
print(df_class_0.target.count())
print(df_class_1.target.count())

11435
3807


In [None]:
# # creamos un dataset balanceado utilizando undersample (eliminamos registros al set de 0s)
# df_sample_0 = resample(df_class_0, replace=True, n_samples=total_rows_1, random_state=15)
# print('resample minority: ', df_sample_0.target.count())
# df_train = pd.concat([df_sample_0, df_class_1], axis=0)
# print('set balanced: ', df_train.target.count())

In [None]:
# creamos un dataset balanceado utilizando oversample (agregamos registros al set de 1s)
df_sample_1 = resample(df_class_1, replace=True, n_samples=total_rows_0, random_state=15)
print('resample minority: ', df_sample_1.target.count())
df_train = pd.concat([df_sample_1, df_class_0], axis=0)
print('set balanced: ', df_train.target.count())

resample minority:  11435
set balanced:  22870


## tratamiento de nullos y vacios

In [None]:
def transform_nulls(df):
    df = df.copy()
    df.gender = df.gender.fillna('Other')
    df.enrolled_university = df.enrolled_university.fillna('no_enrollment')
    df.education_level = df.education_level.fillna('Undefined')
    df.major_discipline = df.major_discipline.fillna('Other')
    df.experience = df.experience.fillna('Other')
    df.company_size = df.company_size.fillna('Other')
    df.company_type = df.company_type.fillna('Other')
    df.last_new_job = df.last_new_job.fillna('never')
    return df

df_train = transform_nulls(df_train)
df_target = transform_nulls(df_target)
    

## Eliminación de outliers

In [None]:
df_train.describe()

Unnamed: 0,city_development_index,training_hours
count,22870.0,22870.0
mean,0.806105,64.803061
std,0.133983,59.474954
min,0.448,1.0
25%,0.624,23.0
50%,0.893,47.0
75%,0.92,88.0
max,0.949,336.0


In [None]:
# identificamos las columnas númericas y las categoricas
n_cols = ['training_hours']

# Eliminamos los outliers de las columnas númericas
z_scores = stats.zscore(df_train[n_cols])
abs_z_scores = np.abs(z_scores)
filtered_entries = (abs_z_scores < 3).all(axis=1)
df_train = df_train[filtered_entries]
df_train.describe()

NameError: name 'df_sample' is not defined

## Escalamiento de variable númericas

In [None]:
# Escalamos las variables númericas
scale = StandardScaler()
df_train['training_hours'] = scale.fit_transform(df_train.training_hours.values.reshape(-1,1))

# Escalamos las variables del set objetivo 
df_target['training_hours'] = scale.fit_transform(df_target.training_hours.values.reshape(-1,1))


In [None]:
# Eliminamos del set de entrenamiento las columnas que no existen en el set objetivo
print(df_train.shape)
drop_list = df_target.city.value_counts().index.tolist()
df_train.drop(df_train[~df_train.city.isin(drop_list)].index, inplace = True)
print(df_train.shape)

(7614, 14)
(7614, 14)


In [None]:
#****************************************************************
#****************************************************************
# revisemos como se estan normalizando los datos
df_train.to_csv('./outputs/df_train_temp.csv', index=False)
df_target.to_csv('./outputs/df_target_temp.csv', index=False)
#****************************************************************
#****************************************************************

In [None]:
df_train.columns

Index(['enrollee_id', 'city', 'city_development_index', 'gender',
       'relevent_experience', 'enrolled_university', 'education_level',
       'major_discipline', 'experience', 'company_size', 'company_type',
       'last_new_job', 'training_hours', 'target'],
      dtype='object')

In [None]:
# Generación de variables dummy
dummy_cols   = ['city','gender','relevent_experience','enrolled_university','education_level','major_discipline','experience','company_size','company_type','last_new_job']
df_dummy = pd.get_dummies(data=df_train, columns=dummy_cols)
df_dummy_t = pd.get_dummies(data=df_target, columns=dummy_cols)
# df_dummy.head()

In [None]:
# igualamos las columnas del set de entrenamiento y objetivo
target_cols = df_dummy_t.columns.tolist()
df_dummy = df_dummy[target_cols]
df_dummy = pd.concat([df_dummy,df_train.target.reindex(df_dummy.index)], axis=1)

KeyError: "['city_city_8', 'city_city_121'] not in index"

## Carga de datos

In [None]:
# creamos sets de entrenamiento
X = df_dummy.drop(['enrollee_id','target'], axis=1).to_numpy()
y = df_dummy.target.to_numpy()
# creamos set objetivo
X_t = df_dummy_t.drop(['enrollee_id'], axis=1).to_numpy()

# generamos los sets de entreamiento y test
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=15, stratify=y)

## Modelo de Regresión Logistica

In [None]:
# Entrenamos el modelo de Regresión Logistica
logreg = LogisticRegression(solver='lbfgs', max_iter=1000)
logreg.fit(X_train, y_train)
logreg.score(X_train, y_train)

0.7747595102754701

In [None]:
# Hacemos las predicciones con el set de pruebas
y_pred = logreg.predict(X_test)
print(logreg.score(X_test, y_test))
confusion_matrix(y_test, y_pred)

0.7586357673808483


array([[1729,  558],
       [ 546, 1741]], dtype=int64)

In [None]:
# Revisamos el reporte de clasificación
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

         0.0       0.76      0.76      0.76      2287
         1.0       0.76      0.76      0.76      2287

    accuracy                           0.76      4574
   macro avg       0.76      0.76      0.76      4574
weighted avg       0.76      0.76      0.76      4574



# Predicciones del set objetivo

## Carga de datos

In [None]:
# generamos las predicciones del set objetivo
y_pred_t = logreg.predict(X_t)
# Agregamos las etiquetas de predicción al df
df_target['target'] = y_pred_t
# creamos el df de submission
df_submission = df_target[['enrollee_id','target']]
df_submission.head(10)

Unnamed: 0,enrollee_id,target
0,23603,1.0
1,22499,0.0
2,10465,1.0
3,8293,0.0
4,4246,0.0
5,29306,0.0
6,23118,1.0
7,32832,0.0
8,13380,0.0
9,16102,1.0


In [None]:
df_submission.target.value_counts()

0.0    2434
1.0    1398
Name: target, dtype: int64

In [None]:
# creamos el archivo csv para submission en el portal kaggle
df_submission.to_csv('./outputs/submission_21.csv', index=False)