In [39]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder

# Data

In [40]:
path1 = '/home/mcalmeida/machine-learning/projects/neoway-case/data/conexoes_espec.csv'
path2 = '/home/mcalmeida/machine-learning/projects/neoway-case/data/individuos_espec.csv'

In [41]:
df_conexoes = pd.read_csv(path1,sep=';')
df_individuos = pd.read_csv(path2,sep=';')

In [5]:
df_conexoes.head()

Unnamed: 0,V1,V2,grau,proximidade,prob_V1_V2
0,1,2,trabalho,visita_frequente,0.589462
1,1,3,trabalho,visita_rara,0.708465
2,2,4,trabalho,visita_casual,
3,2,5,trabalho,visita_rara,0.638842
4,3,6,amigos,mora_junto,


In [6]:
df_individuos.head()

Unnamed: 0,name,idade,estado_civil,qt_filhos,estuda,trabalha,pratica_esportes,transporte_mais_utilizado,IMC
0,1,44.0,divorciado,1.0,1.0,0.0,1.0,publico,22.200956
1,2,24.0,casado,0.0,0.0,0.0,1.0,publico,25.37872
2,3,35.0,solteiro,1.0,0.0,0.0,1.0,particular,19.952393
3,4,50.0,casado,1.0,1.0,1.0,0.0,publico,26.732053
4,5,30.0,solteiro,2.0,1.0,0.0,1.0,publico,15.295668


In [7]:
V1 = df_individuos.copy()
V1.columns = V1.columns + '_1'

In [8]:
V2 = df_individuos.copy()
V2.columns = V2.columns + '_2'

In [9]:
df_temp1 = df_conexoes.merge(V1, left_on='V1', right_on='name_1')

In [10]:
df_final = df_temp1.merge(V2, left_on='V2', right_on='name_2')

In [11]:
df_known = df_final[~df_final['prob_V1_V2'].isna()]

In [12]:
df_unknown = df_final[df_final['prob_V1_V2'].isna()]

In [13]:
(df_known.corr()).style.background_gradient(cmap='coolwarm')

Unnamed: 0,V1,V2,prob_V1_V2,name_1,idade_1,qt_filhos_1,estuda_1,trabalha_1,pratica_esportes_1,IMC_1,name_2,idade_2,qt_filhos_2,estuda_2,trabalha_2,pratica_esportes_2,IMC_2
V1,1.0,1.0,-0.001129,1.0,-0.002027,-4.3e-05,1.7e-05,0.002362,-0.001017,-0.002404,1.0,-0.000515,0.000614,-0.001129,6.7e-05,-0.001703,-0.000498
V2,1.0,1.0,-0.001129,1.0,-0.002027,-4.3e-05,1.7e-05,0.002362,-0.001017,-0.002404,1.0,-0.000515,0.000614,-0.001129,6.7e-05,-0.001703,-0.000498
prob_V1_V2,-0.001129,-0.001129,1.0,-0.001129,-0.046308,-0.023822,0.089574,-0.371803,0.350341,0.00179,-0.001129,-0.0197,-0.002663,0.002761,-0.002882,-0.002561,0.033103
name_1,1.0,1.0,-0.001129,1.0,-0.002027,-4.3e-05,1.7e-05,0.002362,-0.001017,-0.002404,1.0,-0.000515,0.000614,-0.001129,6.7e-05,-0.001703,-0.000498
idade_1,-0.002027,-0.002027,-0.046308,-0.002027,1.0,0.115732,-0.141664,0.139151,0.000489,-0.000468,-0.002027,0.00042,0.002158,0.001408,-0.000815,-0.001751,0.000263
qt_filhos_1,-4.3e-05,-4.3e-05,-0.023822,-4.3e-05,0.115732,1.0,-0.081414,0.08214,0.000538,-0.002991,-4.3e-05,-4.5e-05,0.000126,-0.001074,0.002128,0.000351,0.003115
estuda_1,1.7e-05,1.7e-05,0.089574,1.7e-05,-0.141664,-0.081414,1.0,-0.098126,-0.000666,0.000183,1.7e-05,0.002261,0.000604,-0.002774,-0.001203,0.001545,-0.002827
trabalha_1,0.002362,0.002362,-0.371803,0.002362,0.139151,0.08214,-0.098126,1.0,0.002481,-0.004831,0.002362,-0.00026,-0.001463,0.002003,-0.002042,0.000225,0.001751
pratica_esportes_1,-0.001017,-0.001017,0.350341,-0.001017,0.000489,0.000538,-0.000666,0.002481,1.0,0.000291,-0.001017,-0.002108,-0.001098,0.000177,-0.001132,0.000194,0.000111
IMC_1,-0.002404,-0.002404,0.00179,-0.002404,-0.000468,-0.002991,0.000183,-0.004831,0.000291,1.0,-0.002404,0.000608,-0.001869,0.000825,-0.000762,-0.000852,0.001191


In [15]:
df_unknown.drop('prob_V1_V2',axis=1,inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_unknown.drop('prob_V1_V2',axis=1,inplace=True)


# Input Missing Values

In [16]:
df_temp = df_known.drop(['V1','V2','name_1','name_2'], axis=1)

In [17]:
df_temp.head()

Unnamed: 0,grau,proximidade,prob_V1_V2,idade_1,estado_civil_1,qt_filhos_1,estuda_1,trabalha_1,pratica_esportes_1,transporte_mais_utilizado_1,IMC_1,idade_2,estado_civil_2,qt_filhos_2,estuda_2,trabalha_2,pratica_esportes_2,transporte_mais_utilizado_2,IMC_2
0,trabalho,visita_frequente,0.589462,44.0,divorciado,1.0,1.0,0.0,1.0,publico,22.200956,24.0,casado,0.0,0.0,0.0,1.0,publico,25.37872
1,trabalho,visita_rara,0.708465,44.0,divorciado,1.0,1.0,0.0,1.0,publico,22.200956,35.0,solteiro,1.0,0.0,0.0,1.0,particular,19.952393
3,trabalho,visita_rara,0.638842,24.0,casado,0.0,0.0,0.0,1.0,publico,25.37872,30.0,solteiro,2.0,1.0,0.0,1.0,publico,15.295668
5,familia,visita_casual,0.709608,35.0,solteiro,1.0,0.0,0.0,1.0,particular,19.952393,55.0,solteiro,1.0,1.0,1.0,1.0,particular,
7,amigos,visita_casual,0.465209,50.0,casado,1.0,1.0,1.0,0.0,publico,26.732053,42.0,divorciado,1.0,0.0,1.0,,publico,40.793339


In [18]:
# Shuffle the data
np.random.seed(42)
df_shuffled = df_temp.sample(frac=1)

In [19]:
df_shuffled.head()

Unnamed: 0,grau,proximidade,prob_V1_V2,idade_1,estado_civil_1,qt_filhos_1,estuda_1,trabalha_1,pratica_esportes_1,transporte_mais_utilizado_1,IMC_1,idade_2,estado_civil_2,qt_filhos_2,estuda_2,trabalha_2,pratica_esportes_2,transporte_mais_utilizado_2,IMC_2
208657,familia,visita_rara,0.32492,30.0,divorciado,0.0,0.0,0.0,0.0,publico,20.382146,19.0,solteiro,3.0,0.0,1.0,1.0,particular,22.024349
399816,amigos,visita_rara,0.663947,48.0,casado,1.0,0.0,0.0,1.0,particular,19.142755,36.0,divorciado,1.0,0.0,1.0,1.0,publico,
280646,familia,visita_frequente,0.394388,37.0,viuvo,2.0,0.0,0.0,,particular,23.301253,30.0,solteiro,4.0,0.0,1.0,1.0,publico,22.695043
265789,trabalho,visita_rara,0.511199,,divorciado,0.0,0.0,0.0,0.0,publico,24.696776,23.0,solteiro,0.0,1.0,1.0,1.0,taxi,29.660348
817534,trabalho,visita_casual,0.248089,37.0,casado,1.0,0.0,1.0,0.0,taxi,17.2658,27.0,solteiro,1.0,1.0,1.0,1.0,publico,20.440223


In [20]:
# Split into X & y
X_temp = df_shuffled.drop("prob_V1_V2", axis=1)
y_temp = df_shuffled["prob_V1_V2"]

In [21]:
np.random.seed(42)
# Split the data into train, validation & test sets
train_split = round(0.85 * len(df_shuffled)) # 85% of data
X, y = X_temp[:train_split], y_temp[:train_split]
X_test, y_test = X_temp[train_split:], y_temp[train_split:]

# Data cleaning

In [22]:
categorical_features = [
 'grau','proximidade',
 'estado_civil_1','transporte_mais_utilizado_1',
 'estado_civil_2','transporte_mais_utilizado_2'
]

categorical_imputer = SimpleImputer(strategy="most_frequent")

In [23]:
filhos_features = [
    'qt_filhos_1','qt_filhos_2',
]

filhos_imputer = SimpleImputer(strategy="most_frequent")

In [24]:
bool_features = [
 'estuda_1','trabalha_1','estuda_2', 'trabalha_2'
]

bool_imputer = SimpleImputer(strategy="most_frequent")

In [25]:
esporte_features = [
    'pratica_esportes_1','pratica_esportes_2',
]

# 0: não
# 1: sim
# 2: não informado
esporte_imputer = SimpleImputer(strategy="constant", fill_value=2.0)

In [26]:
X.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 424999 entries, 208657 to 656672
Data columns (total 18 columns):
 #   Column                       Non-Null Count   Dtype  
---  ------                       --------------   -----  
 0   grau                         424999 non-null  object 
 1   proximidade                  424999 non-null  object 
 2   idade_1                      384071 non-null  float64
 3   estado_civil_1               403786 non-null  object 
 4   qt_filhos_1                  412660 non-null  float64
 5   estuda_1                     407959 non-null  float64
 6   trabalha_1                   422322 non-null  float64
 7   pratica_esportes_1           361547 non-null  float64
 8   transporte_mais_utilizado_1  406505 non-null  object 
 9   IMC_1                        376479 non-null  float64
 10  idade_2                      384483 non-null  float64
 11  estado_civil_2               403687 non-null  object 
 12  qt_filhos_2                  412799 non-null  float64

In [27]:
numeric_features = ['idade_1','idade_2','IMC_1','IMC_2']
numeric_imputer = SimpleImputer(strategy="median")

In [28]:
# Create an imputer (something that fills missing data)
imputer = ColumnTransformer([
    ("categorical_imputer", categorical_imputer, categorical_features),
    ("esporte_imputer", esporte_imputer, esporte_features),
    ("numeric_imputer", numeric_imputer, numeric_features),
    ("filhos_imputer", filhos_imputer, filhos_features),
    ("bool_imputer", bool_imputer, bool_features)
])

In [29]:
columns = categorical_features + esporte_features + numeric_features + filhos_features + bool_features

In [30]:
# Fill train and test values separately
filled_X = imputer.fit_transform(X)
filled_X_test = imputer.transform(X_test)

# Check filled X
filled_X

array([['familia', 'visita_rara', 'divorciado', ..., 0.0, 0.0, 1.0],
       ['amigos', 'visita_rara', 'casado', ..., 0.0, 0.0, 1.0],
       ['familia', 'visita_frequente', 'viuvo', ..., 0.0, 0.0, 1.0],
       ...,
       ['trabalho', 'visita_casual', 'solteiro', ..., 1.0, 0.0, 0.0],
       ['trabalho', 'visita_frequente', 'solteiro', ..., 1.0, 1.0, 1.0],
       ['amigos', 'visita_rara', 'solteiro', ..., 0.0, 0.0, 0.0]],
      dtype=object)

In [31]:
filled_unkown = imputer.transform(df_unknown)

In [32]:
filled_unkown

array([['trabalho', 'visita_casual', 'casado', ..., 0.0, 1.0, 1.0],
       ['amigos', 'mora_junto', 'solteiro', ..., 0.0, 0.0, 1.0],
       ['familia', 'mora_junto', 'casado', ..., 1.0, 1.0, 0.0],
       ...,
       ['trabalho', 'visita_rara', 'casado', ..., 1.0, 0.0, 1.0],
       ['trabalho', 'visita_rara', 'casado', ..., 1.0, 0.0, 1.0],
       ['familia', 'visita_rara', 'solteiro', ..., 0.0, 0.0, 0.0]],
      dtype=object)

In [33]:
# Get our transformed data array's back into DataFrame's
X_filled = pd.DataFrame(filled_X, 
                                      columns=columns)

X_filled_test = pd.DataFrame(filled_X_test, 
                                     columns=columns)

unknown_filled = pd.DataFrame(filled_unkown, 
                                     columns=columns)                                    

# Check missing data in training set
unknown_filled.isna().sum()

grau                           0
proximidade                    0
estado_civil_1                 0
transporte_mais_utilizado_1    0
estado_civil_2                 0
transporte_mais_utilizado_2    0
pratica_esportes_1             0
pratica_esportes_2             0
idade_1                        0
idade_2                        0
IMC_1                          0
IMC_2                          0
qt_filhos_1                    0
qt_filhos_2                    0
estuda_1                       0
trabalha_1                     0
estuda_2                       0
trabalha_2                     0
dtype: int64

In [34]:
X_filled.head()

Unnamed: 0,grau,proximidade,estado_civil_1,transporte_mais_utilizado_1,estado_civil_2,transporte_mais_utilizado_2,pratica_esportes_1,pratica_esportes_2,idade_1,idade_2,IMC_1,IMC_2,qt_filhos_1,qt_filhos_2,estuda_1,trabalha_1,estuda_2,trabalha_2
0,familia,visita_rara,divorciado,publico,solteiro,particular,0.0,1.0,30.0,19.0,20.382146,22.024349,0.0,3.0,0.0,0.0,0.0,1.0
1,amigos,visita_rara,casado,particular,divorciado,publico,1.0,1.0,48.0,36.0,19.142755,21.423458,1.0,1.0,0.0,0.0,0.0,1.0
2,familia,visita_frequente,viuvo,particular,solteiro,publico,2.0,1.0,37.0,30.0,23.301253,22.695043,2.0,4.0,0.0,0.0,0.0,1.0
3,trabalho,visita_rara,divorciado,publico,solteiro,taxi,0.0,1.0,29.0,23.0,24.696776,29.660348,0.0,0.0,0.0,0.0,1.0,1.0
4,trabalho,visita_casual,casado,taxi,solteiro,publico,0.0,1.0,37.0,27.0,17.2658,20.440223,1.0,1.0,0.0,1.0,1.0,1.0


In [35]:
# Now let's one hot encode the features with the same code as before 
cat_features = categorical_features

one_hot = OneHotEncoder()
transformer = ColumnTransformer([("one_hot", 
                                 one_hot, 
                                 cat_features)],
                                 remainder="passthrough")

# Fill train and test values separately
transformed_X = transformer.fit_transform(X_filled)
transformed_X_test = transformer.transform(X_filled_test)
transformed_unknown = transformer.transform(unknown_filled)

In [36]:
# Load model
import pickle

# Load a saved pickle model
model = pickle.load(open("rf_base_final.pkl", "rb"))
df_unknown['prob_predicted'] = model.predict(transformed_unknown)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_unknown['prob_predicted'] = model.predict(transformed_unknown)


In [37]:
df_unknown.head()

Unnamed: 0,V1,V2,grau,proximidade,name_1,idade_1,estado_civil_1,qt_filhos_1,estuda_1,trabalha_1,...,name_2,idade_2,estado_civil_2,qt_filhos_2,estuda_2,trabalha_2,pratica_esportes_2,transporte_mais_utilizado_2,IMC_2,prob_predicted
2,2,4,trabalho,visita_casual,2,24.0,casado,0.0,0.0,0.0,...,4,50.0,casado,1.0,1.0,1.0,0.0,publico,26.732053,0.638676
4,3,6,amigos,mora_junto,3,35.0,solteiro,1.0,0.0,0.0,...,6,20.0,,1.0,0.0,1.0,0.0,publico,20.412942,0.319707
6,4,8,familia,mora_junto,4,50.0,casado,1.0,1.0,1.0,...,8,50.0,divorciado,0.0,1.0,0.0,,publico,21.445628,0.213376
9,5,11,trabalho,visita_casual,5,30.0,solteiro,2.0,1.0,0.0,...,11,21.0,solteiro,0.0,0.0,0.0,0.0,publico,24.37577,0.667086
10,6,12,familia,visita_casual,6,20.0,,1.0,0.0,1.0,...,12,42.0,viuvo,1.0,0.0,1.0,0.0,publico,,0.498252
