In [1]:
# Set the seed values for the notebook so the results are reproducible.
# NumPy and TensorFlow keep separate random states: seeding NumPy alone leaves
# TensorFlow's generator (used by Keras, e.g. for weight initialisation)
# unseeded, so both are seeded here with the same value.
from numpy.random import seed
import tensorflow as tf

seed(1)
tf.random.set_seed(1)

In [2]:
# Dependencies
import numpy as np
import pandas as pd

In [3]:
import tensorflow

In [4]:
# Load the consolidated 2018 file; the first CSV column becomes the row index.
df_2018 = pd.read_csv(
    filepath_or_buffer='Resources/CONSOLIDADO_2018.csv',
    index_col=0,
)
df_2018.head()

Unnamed: 0,sem_x,codigo_x,dias_pag,dias_lab,sueldo,comp,asist,transp,vales_x,te_dob,...,t_per,t_ded,neto,antig,depto,puesto,label,tiempov,dist,delitos
0,1,7505,7.0,5.0,618.52,132.86,60.0,60.0,120.22,0.0,...,1029.86,21.63,1008.23,468,82,0003A,1,11.188,24.283333,0
1,1,6600,7.0,5.0,618.52,132.86,60.0,60.0,120.22,0.0,...,1029.86,22.6,1007.26,1187,80,0001,1,8.079,27.766667,46
2,1,7515,7.0,5.0,618.52,132.86,60.0,60.0,120.22,0.0,...,1029.86,21.53,1008.33,202,72,0001,0,8.4,21.9,0
3,1,5933,7.0,5.0,618.52,132.86,60.0,60.0,120.22,0.0,...,1029.86,21.45,1008.41,1582,51,0001,1,3.738,10.183333,21
4,1,7170,7.0,5.0,618.52,132.86,60.0,60.0,120.22,0.0,...,1491.6,50.12,1441.48,774,57,0001,1,8.113,20.516667,0


In [5]:
# Load the consolidated 2019 file; the first CSV column becomes the row index.
df_2019 = pd.read_csv(
    filepath_or_buffer='Resources/CONSOLIDADO_2019.csv',
    index_col=0,
)
df_2019.head()

Unnamed: 0,sem_x,codigo_x,dias_pag,dias_lab,sueldo,comp,asist,transp,vales_x,te_dob,...,t_per,t_ded,neto,antig,depto,puesto,label,tiempov,dist,delitos
0,1,7840,7.0,5.0,1143.52,462.91,60.0,60.0,257.02,367.56,...,3351.59,340.18,3011.41,39,87,0185,0,4.94,6.433333,0
1,1,7505,7.0,5.0,721.0,173.04,60.0,60.0,143.04,0.0,...,1863.08,86.05,1777.03,520,82,0003A,0,11.188,24.283333,0
2,1,6600,7.0,5.0,721.0,30.38,60.0,50.0,120.22,180.25,...,1194.98,21.07,1173.91,1503,80,0002A,1,8.079,27.766667,46
3,1,7685,7.0,5.0,721.0,30.38,60.0,60.0,120.22,0.0,...,1056.69,28.04,1028.65,522,5,0002A,1,9.926,22.383333,0
4,1,7331,7.0,5.0,721.0,30.38,60.0,60.0,120.22,103.0,...,1129.56,24.71,1104.85,338,45,0001,0,8.901,17.116667,59


In [6]:
def labeling(row):
    """Translate one value of the ``label`` column into its text label.

    Parameters
    ----------
    row : scalar
        A single cell value (despite the name, not a DataFrame row).

    Returns
    -------
    str
        ``"fue"`` when ``row`` equals 0, ``"sigue"`` for any other value.
        A value that is already ``"fue"`` or ``"sigue"`` is returned
        unchanged, so applying this function to an already-labelled column
        (for example when a notebook cell is re-run) does not turn every
        ``"fue"`` into ``"sigue"``.
    """
    fue = "fue"
    sigue = "sigue"
    # Pass already-translated labels through untouched; the isinstance guard
    # keeps the string comparison away from numeric inputs.
    if isinstance(row, str) and row in (fue, sigue):
        return row
    return fue if row == 0 else sigue

In [7]:
# Replace the numeric 2018 labels with their text form, value by value.
df_2018["label"] = df_2018["label"].apply(labeling)

In [8]:
# Replace the numeric 2019 labels with their text form, value by value.
df_2019["label"] = df_2019["label"].apply(labeling)

In [9]:
# Class balance of the 2018 labels.
df_2018.loc[:, "label"].value_counts()

sigue    7172
fue      2980
Name: label, dtype: int64

In [10]:
# Class balance of the 2019 labels.
df_2019.loc[:, "label"].value_counts()

sigue    5552
fue      2661
Name: label, dtype: int64

In [11]:
# True if any cell of the 2018 frame is missing.
df_2018.isnull().values.any()

False

In [12]:
# True if any cell of the 2019 frame is missing.
df_2019.isnull().values.any()

False

In [13]:
# Use codigo_x as the row index (presumably the employee code - confirm),
# which also removes it from the columns.
df_2018 = df_2018.set_index(keys="codigo_x")

In [14]:
# Use codigo_x as the row index (presumably the employee code - confirm),
# which also removes it from the columns.
df_2019 = df_2019.set_index(keys="codigo_x")

In [15]:
# Keep only the 31 model inputs plus the target column ("label", last).
df_2018 = df_2018.loc[:, [
    "dias_pag", "dias_lab", "sueldo", "comp",
    "asist", "transp", "vales_x", "te_dob",
    "te_trip", "desc_Lab", "fest_lab", "dominic",
    "perc_grav", "obj", "aguin", "vac",
    "prim_vac", "grat_esp", "util", "zapatos",
    "otras", "fonac", "alim", "infon",
    "falt", "enf", "ries", "antig",
    "tiempov", "dist", "delitos",
    "label",
]]

In [16]:
# Keep only the 31 model inputs plus the target column ("label", last).
df_2019 = df_2019.loc[:, [
    "dias_pag", "dias_lab", "sueldo", "comp",
    "asist", "transp", "vales_x", "te_dob",
    "te_trip", "desc_Lab", "fest_lab", "dominic",
    "perc_grav", "obj", "aguin", "vac",
    "prim_vac", "grat_esp", "util", "zapatos",
    "otras", "fonac", "alim", "infon",
    "falt", "enf", "ries", "antig",
    "tiempov", "dist", "delitos",
    "label",
]]

## Data Pre-Processing

In [17]:
# The 2018 data trains the model; the 2019 data is held out for testing.
# Targets are the "label" column, features are everything else.
y_train = df_2018["label"]
y_test = df_2019["label"]
X_train = df_2018.drop(columns="label")
X_test = df_2019.drop(columns="label")

In [18]:
from sklearn.preprocessing import LabelEncoder, MinMaxScaler
from tensorflow.keras.utils import to_categorical

In [19]:
# Learn the min/max scaling from the training features only, then apply the
# same fitted scaler to both the training and the test features.
X_scaler = MinMaxScaler()
X_scaler.fit(X_train)
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

  return self.partial_fit(X, y)


In [20]:
# Step 1: Label-encode the targets.
# The encoder learns its classes from the training labels and the same
# mapping is then reused for the test labels.
label_encoder = LabelEncoder()
encoded_y_train = label_encoder.fit_transform(y_train)
encoded_y_test = label_encoder.transform(y_test)

In [21]:
# Step 2: Convert encoded labels to one-hot-encoding.
# The number of classes is taken from the fitted encoder instead of being
# inferred from each array: to_categorical otherwise sizes its output from the
# largest code present, so a split missing the highest class would produce a
# narrower matrix than the model's output layer expects.
n_classes = len(label_encoder.classes_)
y_train_categorical = to_categorical(encoded_y_train, num_classes=n_classes)
y_test_categorical = to_categorical(encoded_y_test, num_classes=n_classes)

## Create a Deep Learning Model

In [22]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense

In [23]:
# Number of feature columns in the scaled training matrix.
X_train_scaled.shape[1]

31

In [24]:
# Number of feature columns in the scaled test matrix.
X_test_scaled.shape[1]

31

In [25]:
# Build the network: two 100-unit ReLU hidden layers followed by a 2-unit
# softmax output (one unit per one-hot label column). The input width is the
# number of scaled feature columns.
model = Sequential([
    Dense(units=100, activation='relu', input_dim=X_train_scaled.shape[1]),
    Dense(units=100, activation='relu'),
    Dense(units=2, activation='softmax'),
])

In [26]:
# Compile the model (training happens in a later cell).
# Categorical cross-entropy matches the one-hot targets and softmax output.
model.compile(
    loss='categorical_crossentropy',
    optimizer='adam',
    metrics=['accuracy'],
)

In [27]:
# Print the layer-by-layer architecture and parameter counts.
model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense (Dense)                (None, 100)               3200      
_________________________________________________________________
dense_1 (Dense)              (None, 100)               10100     
_________________________________________________________________
dense_2 (Dense)              (None, 2)                 202       
Total params: 13,502
Trainable params: 13,502
Non-trainable params: 0
_________________________________________________________________


In [28]:
# Train on the scaled 2018 features for 100 epochs, reshuffling each epoch and
# printing one summary line per epoch (verbose=2).
model.fit(x=X_train_scaled, y=y_train_categorical, epochs=100, shuffle=True, verbose=2)

Train on 10152 samples
Epoch 1/100
10152/10152 - 1s - loss: 0.5520 - accuracy: 0.7277
Epoch 2/100
10152/10152 - 0s - loss: 0.5061 - accuracy: 0.7506
Epoch 3/100
10152/10152 - 0s - loss: 0.4656 - accuracy: 0.7718
Epoch 4/100
10152/10152 - 0s - loss: 0.4440 - accuracy: 0.7883
Epoch 5/100
10152/10152 - 0s - loss: 0.4249 - accuracy: 0.8013
Epoch 6/100
10152/10152 - 0s - loss: 0.4078 - accuracy: 0.8085
Epoch 7/100
10152/10152 - 0s - loss: 0.3944 - accuracy: 0.8133
Epoch 8/100
10152/10152 - 0s - loss: 0.3789 - accuracy: 0.8191
Epoch 9/100
10152/10152 - 0s - loss: 0.3695 - accuracy: 0.8240
Epoch 10/100
10152/10152 - 0s - loss: 0.3568 - accuracy: 0.8296
Epoch 11/100
10152/10152 - 0s - loss: 0.3430 - accuracy: 0.8380
Epoch 12/100
10152/10152 - 0s - loss: 0.3326 - accuracy: 0.8403
Epoch 13/100
10152/10152 - 0s - loss: 0.3305 - accuracy: 0.8420
Epoch 14/100
10152/10152 - 0s - loss: 0.3198 - accuracy: 0.8484
Epoch 15/100
10152/10152 - 0s - loss: 0.3160 - accuracy: 0.8480
Epoch 16/100
10152/10152 -

<tensorflow.python.keras.callbacks.History at 0x279b31944a8>

## Quantify our Trained Model

In [29]:
# Loss and accuracy on the data the model was trained on.
model_loss, model_accuracy = model.evaluate(
    x=X_train_scaled,
    y=y_train_categorical,
    verbose=2,
)
print(f"TRAINING - Loss: {model_loss}, Accuracy: {model_accuracy}")

10152/1 - 0s - loss: 0.0916 - accuracy: 0.9465
TRAINING - Loss: 0.13106449904805378, Accuracy: 0.9465129971504211


In [30]:
# Loss and accuracy on the held-out 2019 data.
model_loss, model_accuracy = model.evaluate(
    x=X_test_scaled,
    y=y_test_categorical,
    verbose=2,
)
print(f"TESTING - Loss: {model_loss}, Accuracy: {model_accuracy}")

8213/1 - 0s - loss: 1.5317 - accuracy: 0.7185
TESTING - Loss: 1.8376408112436646, Accuracy: 0.7184950709342957


## Make Predictions

In [31]:
# Predict a class for every test row and map the class codes back to labels.
# Sequential.predict_classes is deprecated and absent from newer TensorFlow
# releases; for a softmax output it is equivalent to taking the argmax of the
# predicted probabilities along the last axis, which works on every version.
predicted_probabilities = model.predict(X_test_scaled)
encoded_predictions = np.argmax(predicted_probabilities, axis=-1)
prediction_labels = label_encoder.inverse_transform(encoded_predictions)

In [32]:
# Side-by-side table of predicted and true labels; the row index comes from
# y_test, and prediction_labels is aligned to it by position.
results = pd.DataFrame(
    data={"prediction": prediction_labels, "actual": y_test},
    columns=["prediction", "actual"],
)
results.head()

Unnamed: 0_level_0,prediction,actual
codigo_x,Unnamed: 1_level_1,Unnamed: 2_level_1
7840,fue,fue
7505,sigue,fue
6600,sigue,sigue
7685,sigue,sigue
7331,sigue,fue


In [33]:
# NOTE: despite its name, "diffs" is True where prediction and actual AGREE
# and False where they differ.
results["diffs"] = results["prediction"].eq(results["actual"])

In [34]:
# Preview the first five rows, now including the "diffs" column.
results.head(n=5)

Unnamed: 0_level_0,prediction,actual,diffs
codigo_x,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
7840,fue,fue,True
7505,sigue,fue,False
6600,sigue,sigue,True
7685,sigue,sigue,True
7331,sigue,fue,False


In [35]:
# Correct (True) versus incorrect (False) prediction counts.
results.loc[:, "diffs"].value_counts()

True     5901
False    2312
Name: diffs, dtype: int64

In [36]:
# Keep only the rows the model got wrong ("diffs" is False for mismatches).
diffs = results[~results["diffs"]]

In [37]:
# Misclassified rows whose true label is "sigue" (so the model said "fue").
se_nos_van = diffs[diffs["actual"].eq("sigue")]

In [38]:
# Misclassified rows the model labelled "sigue" (so the true label is "fue").
no_debieron_irse = diffs[diffs["prediction"].eq("sigue")]

In [None]:
# Export the se_nos_van rows to a CSV file in the working directory.
se_nos_van.to_csv(path_or_buf="se_nos_van.csv")

In [None]:
# Export the no_debieron_irse rows; note the file name differs from the
# variable name.
no_debieron_irse.to_csv(path_or_buf="se_fueron_porque_quisieron.csv")

In [39]:
# Move codigo_x from the index back into a regular column; reassigning
# instead of mutating in place leaves the frame it was filtered from alone.
se_nos_van = se_nos_van.reset_index()

In [40]:
# Move codigo_x from the index back into a regular column; reassigning
# instead of mutating in place leaves the frame it was filtered from alone.
no_debieron_irse = no_debieron_irse.reset_index()

In [41]:
# Collapse to one row per codigo_x; each cell holds how many misclassified
# rows that code had.
se_nos_van = se_nos_van.groupby(by="codigo_x").count()

In [42]:
# Collapse to one row per codigo_x; each cell holds how many misclassified
# rows that code had.
no_debieron_irse = no_debieron_irse.groupby(by="codigo_x").count()

In [43]:
# Non-null entries per column of the grouped frame, i.e. the number of
# distinct codigo_x values in se_nos_van.
se_nos_van.count(axis=0)

prediction    124
actual        124
diffs         124
dtype: int64

In [44]:
# Non-null entries per column of the grouped frame, i.e. the number of
# distinct codigo_x values in no_debieron_irse.
no_debieron_irse.count(axis=0)

prediction    108
actual        108
diffs         108
dtype: int64

In [None]:
# Pull the kernel matrix and bias vector out of the layer at position 1
# (the second Dense layer added to the model).
weights, biases = model.get_layer(index=1).get_weights()