#### Etapa 3 - Análise exploratória dos dados (EDA)

In [81]:
import pandas as pd
import numpy as np
from scipy.stats import shapiro
import seaborn as sns
import matplotlib.pyplot as plt
import plotly.graph_objects as go
import warnings
from scipy.stats import mannwhitneyu

# Install a catch-all "ignore" filter so no warning is shown from this point on.
# NOTE(review): this blanket filter also hides deprecation and numerical warnings
# raised by pandas/scipy — consider restricting it to specific categories.
warnings.filterwarnings(action='ignore')

In [82]:
# Load the athlete-activity table (presumably the output of stage 2, judging by the
# "st2_" file-name prefix).
# NOTE(review): absolute, machine-specific path — the notebook only runs on the
# original author's machine unless this is made configurable.
csv_path = 'C:/Users/USER/Desktop/EstudosDados/Projetos/Corrida/physical_inactivity_prediction/scripts_prediction_model/final/st2_df_athletes_activities.csv'
df_athletes_activities = pd.read_csv(csv_path)

# Keep only the candidate feature columns: everything except the activity date,
# the athlete identifier and the `will_churn` target.
non_feature_columns = ['activity_date', 'athlete_id', 'will_churn']
is_feature_column = ~df_athletes_activities.columns.isin(non_feature_columns)
df_athletes_activities_features = df_athletes_activities.loc[:, is_feature_column]
df_athletes_activities_features

Unnamed: 0,total_distance (km),total_time (min),pace (min/km),speed (km/h),weekly_frequency,days_since_last_activity
0,3.61,23.0,6.37,9.42,3,0
1,25.66,127.0,4.95,12.12,2,4
2,25.54,149.0,5.83,10.28,3,2
3,5.84,41.0,7.02,8.55,3,2
4,4.28,32.0,7.48,8.02,4,1
...,...,...,...,...,...,...
885,10.06,45.0,4.47,13.26,3,3
886,5.20,33.0,6.35,9.27,3,2
887,12.10,62.0,5.12,11.71,3,3
888,4.08,22.0,5.39,11.13,2,5


In [83]:
# Shapiro-Wilk normality test, run independently on each feature column.
# The p-value is printed rounded to 5 decimals, so very small values show up as 0.0.
for column, series in df_athletes_activities_features.items():
    # shapiro returns (statistic, p-value); only the p-value is reported.
    p_value = shapiro(series.values)[1]
    print(f'Coluna: {column}, p-value: {round(p_value, 5)}')

Coluna: total_distance (km), p-value: 0.0
Coluna: total_time (min), p-value: 0.0
Coluna: pace (min/km), p-value: 0.0
Coluna: speed (km/h), p-value: 0.0
Coluna: weekly_frequency, p-value: 0.0
Coluna: days_since_last_activity, p-value: 0.0


In [84]:
# Pairwise correlation matrix of the feature columns (Pearson, pandas' default).
# NOTE(review): the Shapiro-Wilk cell rejects normality for every feature, so a
# rank-based coefficient (method='spearman') may be worth comparing — confirm.
corr = df_athletes_activities_features.corr(method='pearson')
corr

Unnamed: 0,total_distance (km),total_time (min),pace (min/km),speed (km/h),weekly_frequency,days_since_last_activity
total_distance (km),1.0,0.825072,-0.122735,0.621289,0.029524,-0.015398
total_time (min),0.825072,1.0,-0.089912,0.192218,0.074927,-0.054444
pace (min/km),-0.122735,-0.089912,1.0,-0.145799,-0.100773,0.013664
speed (km/h),0.621289,0.192218,-0.145799,1.0,0.05274,0.032086
weekly_frequency,0.029524,0.074927,-0.100773,0.05274,1.0,-0.263234
days_since_last_activity,-0.015398,-0.054444,0.013664,0.032086,-0.263234,1.0


In [85]:
# Reshape the correlation matrix into a long table with one row per unordered
# pair of features: columns `feature1`, `feature2`, `correlation`.
#
# Pairs are selected by POSITION (strict lower triangle of the matrix), which
# removes both the diagonal (a feature with itself) and the mirrored duplicates
# (A, B) / (B, A). De-duplicating on the correlation value instead would also
# discard distinct pairs that merely share the same coefficient.
feature_names = corr.columns.to_numpy()

# np.triu_indices yields (i, j) with i < j in row-major order. Using i for
# `feature2` and j for `feature1` reproduces the row order of a column-by-column
# melt of the matrix: all partners of the first feature, then of the second, etc.
feature2_pos, feature1_pos = np.triu_indices(len(feature_names), k=1)

df_corr = pd.DataFrame({
    'feature1': feature_names[feature1_pos],
    'feature2': feature_names[feature2_pos],
    # Value read at row = feature1, column = feature2 of the matrix.
    'correlation': corr.to_numpy()[feature1_pos, feature2_pos],
})

df_corr

Unnamed: 0,feature1,feature2,correlation
0,total_time (min),total_distance (km),0.825072
1,pace (min/km),total_distance (km),-0.122735
2,speed (km/h),total_distance (km),0.621289
3,weekly_frequency,total_distance (km),0.029524
4,days_since_last_activity,total_distance (km),-0.015398
5,pace (min/km),total_time (min),-0.089912
6,speed (km/h),total_time (min),0.192218
7,weekly_frequency,total_time (min),0.074927
8,days_since_last_activity,total_time (min),-0.054444
9,speed (km/h),pace (min/km),-0.145799


In [86]:
# For every feature, average the absolute correlation over all pairs in df_corr
# in which that feature takes part (as either feature1 or feature2).
abs_mean_per_feature = []
for feature in corr.columns:
    involves_feature = (df_corr['feature1'] == feature) | (df_corr['feature2'] == feature)
    pair_correlations = df_corr.loc[involves_feature, 'correlation']
    abs_mean_per_feature.append(np.abs(pair_correlations).mean())

# One row per feature (same index as the correlation matrix columns),
# single column `correlation` holding the mean absolute correlation.
df_corr_abs_mean = pd.DataFrame({'correlation': abs_mean_per_feature}, index=corr.columns)

df_corr_abs_mean

Unnamed: 0,correlation
total_distance (km),0.322804
total_time (min),0.247315
pace (min/km),0.094577
speed (km/h),0.208826
weekly_frequency,0.10424
days_since_last_activity,0.075765


In [106]:
# Keep only the strongly correlated pairs: absolute correlation above the cut-off.
# The cut-off applied is 0.5 (i.e. correlation > 0.5 or < -0.5).
STRONG_CORR_THRESHOLD = 0.5
df_strong_corr = df_corr[df_corr['correlation'].abs() > STRONG_CORR_THRESHOLD]
df_strong_corr

Unnamed: 0,feature1,feature2,correlation
0,total_time (min),total_distance (km),0.825072
2,speed (km/h),total_distance (km),0.621289


In [107]:
# For every strongly correlated pair, mark exactly one of its two features for
# removal: the one whose mean absolute correlation with the other features
# (df_corr_abs_mean) is higher, i.e. the more redundant of the two.
features_to_remove = set()

for feature1, feature2 in zip(df_strong_corr['feature1'], df_strong_corr['feature2']):
    mean_abs_corr_1 = df_corr_abs_mean.loc[feature1, 'correlation']
    mean_abs_corr_2 = df_corr_abs_mean.loc[feature2, 'correlation']

    if mean_abs_corr_1 > mean_abs_corr_2:
        features_to_remove.add(feature1)
    else:
        # Covers both "feature2 is more correlated" and an exact tie. On a tie
        # feature2 is dropped so the collinear pair is never left fully intact.
        features_to_remove.add(feature2)

# Sort so the resulting list has the same order on every run (set order is not stable).
features_to_remove = sorted(features_to_remove)

# Show the features selected for removal
print(features_to_remove)

total_time (min) tem menor correlação do que total_distance (km)
speed (km/h) tem menor correlação do que total_distance (km)
['total_distance (km)']


In [30]:
# Heatmap of the correlation matrix, labelled on both axes with the feature names.
# The colour range is pinned to [-1, 1] so colours are comparable across runs.
axis_labels = df_athletes_activities_features.columns

fig = go.Figure(
    go.Heatmap(
        z=corr,
        x=axis_labels,
        y=axis_labels,
        colorscale='Blues',
        zmin=-1,
        zmax=1,
    )
)

# Title shown above the plot
fig.update_layout(title='Mapa de calor da matriz de correlações')

# Render the figure
fig.show()

In [31]:
# Mann-Whitney U test per feature: does its distribution differ between
# activities with will_churn == 0 and activities with will_churn == 1?
SIGNIFICANCE_LEVEL = 0.05

mannwhitneyu_dict = {
    "Feature": [],
    "p-value": []
}
# Unrounded p-values, kept separately so the significance decision is not
# affected by the rounding applied for display.
raw_p_values = []

no_churn_mask = df_athletes_activities['will_churn'] == 0
churn_mask = df_athletes_activities['will_churn'] == 1

for feature in df_athletes_activities_features.columns:
    no_churn_values = df_athletes_activities.loc[no_churn_mask, feature]
    churn_values = df_athletes_activities.loc[churn_mask, feature]

    # One call is enough: the default two-sided p-value is the same whichever
    # sample is passed first, and the U statistic is not used.
    _, p_value = mannwhitneyu(churn_values, no_churn_values, method="auto")

    mannwhitneyu_dict["Feature"].append(feature)
    mannwhitneyu_dict["p-value"].append(round(p_value, 3))
    raw_p_values.append(p_value)

df_mannwhitneyu = pd.DataFrame(mannwhitneyu_dict)

# Decide significance on the unrounded p-value: e.g. 0.0496 rounds to 0.05 for
# display but is still below the 0.05 level.
df_mannwhitneyu['p-value Significante'] = [bool(p < SIGNIFICANCE_LEVEL) for p in raw_p_values]
df_mannwhitneyu

Unnamed: 0,Feature,p-value,p-value Significante
0,total_distance (km),0.365,False
1,total_time (min),0.051,False
2,pace (min/km),0.008,True
3,speed (km/h),0.287,False
4,weekly_frequency,0.0,True
5,days_since_last_activity,0.0,True
