In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
# Turn off pandas' chained-assignment check for the whole notebook
# (None means SettingWithCopyWarning is neither warned nor raised).
pd.options.mode.chained_assignment = None

# Extraction

In [2]:
# Read the labelled training file and the unlabelled prediction file.
df_train, df_pred = (
    pd.read_csv(csv_path)
    for csv_path in ('./datasets/z_train.csv', './datasets/z_test.csv')
)

# Analysis

In [3]:
def analysis_df(df):
    """Print a quick structural summary of ``df``.

    Three sections are written to stdout, each followed by a dashed rule:
    the ``DataFrame.info()`` report, the count of missing values per column,
    and the ``(rows, columns)`` shape.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to inspect. It is only read, never modified.

    Returns
    -------
    None
    """
    separator = '-------------------------------------'
    # info() writes its report itself and returns None; wrapping it in
    # print() would add a spurious "None" line to the output.
    df.info()
    print(separator)
    print(df.isna().sum())
    print(separator)
    print(df.shape)
    print(separator)
    

In [4]:
# Inspect the training set
analysis_df(df_train)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 15326 entries, 0 to 15325
Data columns (total 14 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   enrollee_id             15326 non-null  int64  
 1   city                    15326 non-null  object 
 2   city_development_index  15326 non-null  float64
 3   gender                  11725 non-null  object 
 4   relevent_experience     15326 non-null  object 
 5   enrolled_university     15024 non-null  object 
 6   education_level         14957 non-null  object 
 7   major_discipline        13089 non-null  object 
 8   experience              15276 non-null  object 
 9   company_size            10592 non-null  object 
 10  company_type            10435 non-null  object 
 11  last_new_job            14987 non-null  object 
 12  training_hours          15326 non-null  int64  
 13  target                  15326 non-null  float64
dtypes: float64(2), int64(2), object(10)
me

In [5]:
# Inspect the prediction set
analysis_df(df_pred)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3832 entries, 0 to 3831
Data columns (total 13 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   enrollee_id             3832 non-null   int64  
 1   city                    3832 non-null   object 
 2   city_development_index  3832 non-null   float64
 3   gender                  2925 non-null   object 
 4   relevent_experience     3832 non-null   object 
 5   enrolled_university     3748 non-null   object 
 6   education_level         3741 non-null   object 
 7   major_discipline        3256 non-null   object 
 8   experience              3817 non-null   object 
 9   company_size            2628 non-null   object 
 10  company_type            2583 non-null   object 
 11  last_new_job            3748 non-null   object 
 12  training_hours          3832 non-null   int64  
dtypes: float64(1), int64(2), object(10)
memory usage: 389.3+ KB
None
----------------------------

# Transformación

In [6]:
def transform_df(df):
    df = df.copy()
    df.dropna(inplace=True)
    df.enrolled_university = df.enrolled_university.fillna('no_enrollment')
    df.education_level = df.education_level.fillna('undefinded')
    df.last_new_job = df.last_new_job.fillna('undefinded')
    df.dropna(subset=['experience'], inplace=True)
    return df
    

# Carga set de entrenamiento

In [7]:
# Clean the training data and preview the first three rows
df_t1 = transform_df(df_train)
df_t1.head(3)

Unnamed: 0,enrollee_id,city,city_development_index,gender,relevent_experience,enrolled_university,education_level,major_discipline,experience,company_size,company_type,last_new_job,training_hours,target
1,29725,city_40,0.776,Male,No relevent experience,no_enrollment,Graduate,STEM,15,50-99,Pvt Ltd,>4,47,0.0
6,402,city_46,0.762,Male,Has relevent experience,no_enrollment,Graduate,STEM,13,<10,Pvt Ltd,>4,18,1.0
7,27107,city_103,0.92,Male,Has relevent experience,no_enrollment,Graduate,STEM,7,50-99,Pvt Ltd,1,46,1.0


In [8]:
# Keep only the identifier, the features used by the model, and the label.
train_columns = [
    'enrollee_id',
    'gender',
    'city_development_index',
    'relevent_experience',
    'enrolled_university',
    'education_level',
    'last_new_job',
    'training_hours',
    'target',
]
df_t2 = df_t1[train_columns]
df_t2.head()

Unnamed: 0,enrollee_id,gender,city_development_index,relevent_experience,enrolled_university,education_level,last_new_job,training_hours,target
1,29725,Male,0.776,No relevent experience,no_enrollment,Graduate,>4,47,0.0
6,402,Male,0.762,Has relevent experience,no_enrollment,Graduate,>4,18,1.0
7,27107,Male,0.92,Has relevent experience,no_enrollment,Graduate,1,46,1.0
10,23853,Male,0.92,Has relevent experience,no_enrollment,Graduate,1,108,0.0
11,25619,Male,0.913,Has relevent experience,no_enrollment,Graduate,3,23,0.0


In [9]:
# One-hot encode the categorical features; numeric columns pass through unchanged.
categorical_columns = [
    'gender',
    'relevent_experience',
    'enrolled_university',
    'education_level',
    'last_new_job',
]
df_t2_dummy = pd.get_dummies(df_t2, columns=categorical_columns)
df_t2_dummy.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 7182 entries, 1 to 15323
Data columns (total 21 columns):
 #   Column                                       Non-Null Count  Dtype  
---  ------                                       --------------  -----  
 0   enrollee_id                                  7182 non-null   int64  
 1   city_development_index                       7182 non-null   float64
 2   training_hours                               7182 non-null   int64  
 3   target                                       7182 non-null   float64
 4   gender_Female                                7182 non-null   uint8  
 5   gender_Male                                  7182 non-null   uint8  
 6   gender_Other                                 7182 non-null   uint8  
 7   relevent_experience_Has relevent experience  7182 non-null   uint8  
 8   relevent_experience_No relevent experience   7182 non-null   uint8  
 9   enrolled_university_Full time course         7182 non-null   uint8  
 10 

In [10]:
# Build the feature matrix (everything except the id and the label) and the label vector.
X = df_t2_dummy.drop(columns=['enrollee_id', 'target']).to_numpy()
y = df_t2_dummy['target'].to_numpy()

In [11]:
# Split into training and hold-out test sets (80/20).
# A fixed seed makes the split, and every score computed from it, reproducible
# across runs; stratifying on y keeps the class ratio the same in both parts.
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

## Modelo de Regresión Logística

In [12]:
# imports
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix

In [13]:
# Logistic regression fitted on the complete labelled data;
# the score below is measured on the same rows it was fitted on.
logreg = LogisticRegression(solver='lbfgs', max_iter=1000).fit(X, y)
logreg.score(X, y)

0.8430799220272904

In [14]:
# Logistic regression fitted on the training split only (replaces the previous `logreg`);
# the score below is measured on the training split.
logreg = LogisticRegression(solver='lbfgs', max_iter=1000).fit(X_train, y_train)
logreg.score(X_train, y_train)

0.842993907745866

In [15]:
# Predict on the hold-out split and tabulate true labels against predicted labels.
y_test_pred = logreg.predict(X_test)
confusion_matrix(y_true=y_test, y_pred=y_test_pred)

array([[1175,   34],
       [ 188,   40]], dtype=int64)

# Carga set de predicción

In [16]:
# Work on a copy: a plain assignment would only alias df_pred, and the
# 'target' column added to df_p1 later would then also appear on the raw frame.
df_p1 = df_pred.copy()
df_p1.head(3)

Unnamed: 0,enrollee_id,city,city_development_index,gender,relevent_experience,enrolled_university,education_level,major_discipline,experience,company_size,company_type,last_new_job,training_hours
0,23603,city_160,0.92,Male,No relevent experience,Full time course,Graduate,STEM,5,,,1,78
1,22499,city_45,0.89,,Has relevent experience,Full time course,High School,,6,50-99,Pvt Ltd,1,36
2,10465,city_149,0.689,Male,No relevent experience,no_enrollment,Graduate,STEM,5,,,never,34


In [17]:
# Keep only the identifier and the features used by the model.
pred_columns = [
    'enrollee_id',
    'gender',
    'city_development_index',
    'relevent_experience',
    'enrolled_university',
    'education_level',
    'last_new_job',
    'training_hours',
]
df_p2 = df_p1[pred_columns]
df_p2.head(3)

Unnamed: 0,enrollee_id,gender,city_development_index,relevent_experience,enrolled_university,education_level,last_new_job,training_hours
0,23603,Male,0.92,No relevent experience,Full time course,Graduate,1,78
1,22499,,0.89,Has relevent experience,Full time course,High School,1,36
2,10465,Male,0.689,No relevent experience,no_enrollment,Graduate,never,34


In [18]:
# One-hot encode the categorical features of the prediction set.
pred_categorical_columns = [
    'gender',
    'relevent_experience',
    'enrolled_university',
    'education_level',
    'last_new_job',
]
df_p2_dummy = pd.get_dummies(df_p2, columns=pred_categorical_columns)
df_p2_dummy.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3832 entries, 0 to 3831
Data columns (total 22 columns):
 #   Column                                       Non-Null Count  Dtype  
---  ------                                       --------------  -----  
 0   enrollee_id                                  3832 non-null   int64  
 1   city_development_index                       3832 non-null   float64
 2   training_hours                               3832 non-null   int64  
 3   gender_Female                                3832 non-null   uint8  
 4   gender_Male                                  3832 non-null   uint8  
 5   gender_Other                                 3832 non-null   uint8  
 6   relevent_experience_Has relevent experience  3832 non-null   uint8  
 7   relevent_experience_No relevent experience   3832 non-null   uint8  
 8   enrolled_university_Full time course         3832 non-null   uint8  
 9   enrolled_university_Part time course         3832 non-null   uint8  
 10  

In [19]:
# Align the prediction dummies with the training dummies: same columns, same order.
# Dummy columns that exist only in the prediction set are dropped, and any that
# exist only in the training set are added as all-zero columns, so the model
# always receives the feature layout it was fitted on.
feature_columns = df_t2_dummy.columns.drop('target')
df_p3 = df_p2_dummy.reindex(columns=feature_columns, fill_value=0)
df_p3.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3832 entries, 0 to 3831
Data columns (total 20 columns):
 #   Column                                       Non-Null Count  Dtype  
---  ------                                       --------------  -----  
 0   enrollee_id                                  3832 non-null   int64  
 1   city_development_index                       3832 non-null   float64
 2   training_hours                               3832 non-null   int64  
 3   gender_Female                                3832 non-null   uint8  
 4   gender_Male                                  3832 non-null   uint8  
 5   gender_Other                                 3832 non-null   uint8  
 6   relevent_experience_Has relevent experience  3832 non-null   uint8  
 7   relevent_experience_No relevent experience   3832 non-null   uint8  
 8   enrolled_university_Full time course         3832 non-null   uint8  
 9   enrolled_university_Part time course         3832 non-null   uint8  
 10  

In [20]:
# Feature matrix for the prediction set (identifier removed).
X_p = df_p3.drop(columns=['enrollee_id']).to_numpy()

## Modelo de Regresión Logística

In [21]:
# Predict labels for the submission dataset
target_predictions = logreg.predict(X_p)
target_predictions

array([0., 0., 0., ..., 0., 0., 0.])

In [22]:
# Store the predicted labels on df_p1 as a 'target' column,
# then list the distinct labels that were predicted.
df_p1['target'] = target_predictions
df_p1['target'].unique()

array([0., 1.])

In [23]:
# The submission holds only the identifier and its predicted label.
submission_columns = ['enrollee_id', 'target']
df_submission = df_p1[submission_columns]
df_submission.head(n=10)

Unnamed: 0,enrollee_id,target
0,23603,0.0
1,22499,0.0
2,10465,0.0
3,8293,0.0
4,4246,0.0
5,29306,0.0
6,23118,1.0
7,32832,0.0
8,13380,0.0
9,16102,1.0


In [None]:
# How many rows were assigned to each predicted label.
df_submission['target'].value_counts()

In [24]:
# Write the submission file. The output folder is created first so the cell
# also works on a fresh checkout where ./outputs does not exist yet.
from pathlib import Path

submission_path = Path('./outputs/submission_22.csv')
submission_path.parent.mkdir(parents=True, exist_ok=True)
df_submission.to_csv(submission_path, index=False)