In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import matplotlib.pyplot as plt
import statsmodels.api as sm

In [2]:
# Carregar o DataFrame
df = pd.read_excel('C:/Users/Patyc/OneDrive/Desktop/Dissertation/Data/Skin_clean and treated/Skin_clean.xlsx')

## Converter a data para datetime e ordenar de forma crescente
df['Screening_date'] = pd.to_datetime(df['Screening_date'])
df = df.sort_values(by='Screening_date')

In [3]:
# 2. Criar uma coluna para representar a semana e o ano
df['Week'] = df['Screening_date'].dt.strftime('%Y-%U')

# 3. Contar o número de screenings por semana
weekly_counts = df.groupby('Week').size().reset_index(name='total_screenings_per_week')

weekly_counts


Unnamed: 0,Week,total_screenings_per_week
0,2022-09,92
1,2022-11,62
2,2022-12,90
3,2022-14,139
4,2022-16,173
...,...,...
70,2024-11,168
71,2024-12,198
72,2024-13,242
73,2024-15,186


In [4]:
# Carregar o DataFrame
df = pd.read_excel('C:/Users/Patyc/OneDrive/Desktop/Dissertation/Data/Skin_clean and treated/Skin_clean.xlsx')

## Converter a data para datetime e ordenar de forma crescente
df['Screening_date'] = pd.to_datetime(df['Screening_date'])
df = df.sort_values(by='Screening_date')

# Iterar sobre cada data única no DataFrame
for date in df['Screening_date'].unique():
    # Filtrar o DataFrame para a data atual
    date_mask = df['Screening_date'] == date
    df_date = df[date_mask]
    
    # Preencher valores ausentes
    for column in df_date.columns:
        if column != 'Screening_date':  # Ignorar a coluna de data
            if pd.api.types.is_numeric_dtype(df_date[column]):
                # Verifica se a coluna tem valores não NaN antes de calcular a mediana
                if df_date[column].notna().any():
                    df.loc[date_mask, column] = df_date[column].fillna(df_date[column].median())
                else:
                    # Se todos os valores são NaN, preenche com a mediana global da coluna
                    df.loc[date_mask, column] = df_date[column].fillna(df[column].median())
            else:
                # Verifica se a coluna tem valores não NaN antes de calcular a moda
                if df_date[column].notna().any():
                    df.loc[date_mask, column] = df_date[column].fillna(df_date[column].mode().iloc[0])
                else:
                    # Se todos os valores são NaN, preenche com a moda global da coluna
                    if not df[column].mode().empty:
                        df.loc[date_mask, column] = df_date[column].fillna(df[column].mode().iloc[0])
                    else:
                        df.loc[date_mask, column] = df_date[column].fillna('Desconhecido')  # Ou outro valor padrão

print(df)

print(df.columns)

     Screening_date  Gender   Age  Personal_cancer_history  \
5901     2022-03-03     1.0  59.0                      0.0   
5895     2022-03-03     0.0  59.0                      0.0   
5896     2022-03-03     1.0  59.0                      0.0   
5897     2022-03-03     0.0  59.0                      0.0   
5898     2022-03-03     1.0  59.0                      0.0   
...             ...     ...   ...                      ...   
4481     2024-04-27     0.0  23.0                      0.0   
4480     2024-04-27     1.0  81.0                      1.0   
4479     2024-04-27     1.0  59.0                      1.0   
8049     2024-04-27     1.0  63.0                      0.0   
4436     2024-04-27     1.0  60.0                      0.0   

      Family_cancer_history  Sun_exposure  Body_signs  Premalignant  \
5901                    1.0           1.0         1.0             0   
5895                    0.0           1.0         0.0             1   
5896                    0.0           1.0 

In [5]:
df

Unnamed: 0,Screening_date,Gender,Age,Personal_cancer_history,Family_cancer_history,Sun_exposure,Body_signs,Premalignant,Malignant,Phototype_2,Phototype_3,Phototype_4,Phototype_5,Phototype_6
5901,2022-03-03,1.0,59.0,0.0,1.0,1.0,1.0,0,0,0,0,0,0,0
5895,2022-03-03,0.0,59.0,0.0,0.0,1.0,0.0,1,0,0,1,0,0,0
5896,2022-03-03,1.0,59.0,0.0,0.0,1.0,1.0,0,0,0,1,0,0,0
5897,2022-03-03,0.0,59.0,0.0,0.0,1.0,0.0,0,0,0,1,0,0,0
5898,2022-03-03,1.0,59.0,0.0,0.0,0.0,0.0,0,0,0,1,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4481,2024-04-27,0.0,23.0,0.0,0.0,1.0,0.0,0,0,0,1,0,0,0
4480,2024-04-27,1.0,81.0,1.0,0.0,1.0,1.0,0,1,1,0,0,0,0
4479,2024-04-27,1.0,59.0,1.0,1.0,0.0,1.0,0,1,1,0,0,0,0
8049,2024-04-27,1.0,63.0,0.0,0.0,0.0,0.0,0,0,1,0,0,0,0


In [6]:
# Criar uma coluna de semana no formato YYYY-WW
df['Week'] = df['Screening_date'].dt.strftime('%Y-%U')

# Função de agregação para numéricas (mediana)
def median_if_numeric(series):
    if pd.api.types.is_numeric_dtype(series):
        return series.median()
    else:
        return series

# Função de agregação para categóricas (moda)
def mode_if_not_numeric(series):
    if not pd.api.types.is_numeric_dtype(series):
        return series.mode().iloc[0]
    else:
        return series

# Agrupar por semana e aplicar as funções de agregação
grouped_df = df.groupby('Week').agg({
    column: median_if_numeric if pd.api.types.is_numeric_dtype(df[column]) else mode_if_not_numeric
    for column in df.columns if column != 'Screening_date'
})

# Exibir o resultado
grouped_df

Unnamed: 0_level_0,Gender,Age,Personal_cancer_history,Family_cancer_history,Sun_exposure,Body_signs,Premalignant,Malignant,Phototype_2,Phototype_3,Phototype_4,Phototype_5,Phototype_6,Week
Week,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
2022-09,1.0,59.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,2022-09
2022-11,1.0,59.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2022-11
2022-12,1.0,63.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,2022-12
2022-14,1.0,57.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2022-14
2022-16,1.0,59.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2022-16
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2024-11,1.0,61.5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2024-11
2024-12,1.0,58.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2024-12
2024-13,1.0,54.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2024-13
2024-15,1.0,61.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2024-15


In [7]:
# Copiar a coluna 'total_screenings_per_week' de weekly_counts para grouped_df
grouped_df['total_screenings_per_week'] = grouped_df['Week'].map(
    weekly_counts.set_index('Week')['total_screenings_per_week']
)

# Exibir o resultado
grouped_df

Unnamed: 0_level_0,Gender,Age,Personal_cancer_history,Family_cancer_history,Sun_exposure,Body_signs,Premalignant,Malignant,Phototype_2,Phototype_3,Phototype_4,Phototype_5,Phototype_6,Week,total_screenings_per_week
Week,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
2022-09,1.0,59.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,2022-09,92
2022-11,1.0,59.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2022-11,62
2022-12,1.0,63.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,2022-12,90
2022-14,1.0,57.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2022-14,139
2022-16,1.0,59.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2022-16,173
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2024-11,1.0,61.5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2024-11,168
2024-12,1.0,58.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2024-12,198
2024-13,1.0,54.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2024-13,242
2024-15,1.0,61.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2024-15,186


In [8]:
df = grouped_df

df

Unnamed: 0_level_0,Gender,Age,Personal_cancer_history,Family_cancer_history,Sun_exposure,Body_signs,Premalignant,Malignant,Phototype_2,Phototype_3,Phototype_4,Phototype_5,Phototype_6,Week,total_screenings_per_week
Week,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
2022-09,1.0,59.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,2022-09,92
2022-11,1.0,59.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2022-11,62
2022-12,1.0,63.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,2022-12,90
2022-14,1.0,57.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2022-14,139
2022-16,1.0,59.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2022-16,173
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2024-11,1.0,61.5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2024-11,168
2024-12,1.0,58.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2024-12,198
2024-13,1.0,54.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2024-13,242
2024-15,1.0,61.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2024-15,186


In [9]:
from statsmodels.tsa.stattools import adfuller

# Realizar o teste ADF na coluna 'total_screenings_per_week'
adf_result = adfuller(df['total_screenings_per_week'].dropna())

# Exibir os resultados do teste
print('ADF Statistic:', adf_result[0])
print('p-value:', adf_result[1])
print('Critical Values:')
for key, value in adf_result[4].items():
    print(f'   {key}: {value}')

ADF Statistic: -2.5089171433396316
p-value: 0.11331973560316377
Critical Values:
   1%: -3.524624466842421
   5%: -2.9026070739026064
   10%: -2.5886785262345677


In [10]:
df['total_screenings_diff'] = df['total_screenings_per_week'].diff().dropna()
adf_result_diff = adfuller(df['total_screenings_diff'].dropna())


In [11]:
# Realizar o teste ADF na coluna 'total_screenings_per_week'
adf_result = adfuller(df['total_screenings_diff'].dropna())

# Exibir os resultados do teste
print('ADF Statistic:', adf_result[0])
print('p-value:', adf_result[1])
print('Critical Values:')
for key, value in adf_result[4].items():
    print(f'   {key}: {value}')

ADF Statistic: -10.191121840419315
p-value: 6.318315042041935e-18
Critical Values:
   1%: -3.524624466842421
   5%: -2.9026070739026064
   10%: -2.5886785262345677


In [12]:
df['total_screenings_diff']

Week
2022-09      NaN
2022-11    -30.0
2022-12     28.0
2022-14     49.0
2022-16     34.0
           ...  
2024-11   -240.0
2024-12     30.0
2024-13     44.0
2024-15    -56.0
2024-16     -2.0
Name: total_screenings_diff, Length: 75, dtype: float64

In [13]:
df.rename(columns={'total_screenings_diff': 'Total_screenings'}, inplace=True)

df

Unnamed: 0_level_0,Gender,Age,Personal_cancer_history,Family_cancer_history,Sun_exposure,Body_signs,Premalignant,Malignant,Phototype_2,Phototype_3,Phototype_4,Phototype_5,Phototype_6,Week,total_screenings_per_week,Total_screenings
Week,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
2022-09,1.0,59.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,2022-09,92,
2022-11,1.0,59.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2022-11,62,-30.0
2022-12,1.0,63.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,2022-12,90,28.0
2022-14,1.0,57.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2022-14,139,49.0
2022-16,1.0,59.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2022-16,173,34.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2024-11,1.0,61.5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2024-11,168,-240.0
2024-12,1.0,58.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2024-12,198,30.0
2024-13,1.0,54.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2024-13,242,44.0
2024-15,1.0,61.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2024-15,186,-56.0


In [14]:
# Eliminar a coluna 'Week' e 'total_screenings_per_week'
df = df.drop(columns=['Week', 'total_screenings_per_week'])

df

Unnamed: 0_level_0,Gender,Age,Personal_cancer_history,Family_cancer_history,Sun_exposure,Body_signs,Premalignant,Malignant,Phototype_2,Phototype_3,Phototype_4,Phototype_5,Phototype_6,Total_screenings
Week,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
2022-09,1.0,59.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,
2022-11,1.0,59.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-30.0
2022-12,1.0,63.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,28.0
2022-14,1.0,57.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,49.0
2022-16,1.0,59.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,34.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2024-11,1.0,61.5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-240.0
2024-12,1.0,58.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,30.0
2024-13,1.0,54.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,44.0
2024-15,1.0,61.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-56.0


In [15]:
# Remover linhas onde Total_screenings contém NaN
df = df.dropna(subset=['Total_screenings'])

# Exibir o DataFrame para confirmar a remoção
df

Unnamed: 0_level_0,Gender,Age,Personal_cancer_history,Family_cancer_history,Sun_exposure,Body_signs,Premalignant,Malignant,Phototype_2,Phototype_3,Phototype_4,Phototype_5,Phototype_6,Total_screenings
Week,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
2022-11,1.0,59.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-30.0
2022-12,1.0,63.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,28.0
2022-14,1.0,57.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,49.0
2022-16,1.0,59.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,34.0
2022-19,1.0,59.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-170.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2024-11,1.0,61.5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-240.0
2024-12,1.0,58.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,30.0
2024-13,1.0,54.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,44.0
2024-15,1.0,61.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-56.0


In [16]:
# Resetar o índice para transformar 'Week' em uma coluna
df = df.reset_index()

# Verificar se 'Week' agora está como coluna
print(df.columns)

Index(['Week', 'Gender', 'Age', 'Personal_cancer_history',
       'Family_cancer_history', 'Sun_exposure', 'Body_signs', 'Premalignant',
       'Malignant', 'Phototype_2', 'Phototype_3', 'Phototype_4', 'Phototype_5',
       'Phototype_6', 'Total_screenings'],
      dtype='object')


# NORMALIZAÇÃO - X e y

GLM (Kernel), SVR (Gaussian Kernel), SVR (Linear Kernel), MLP NN, LSTM, e CNN

In [17]:
import pandas as pd
from sklearn.preprocessing import MinMaxScaler

# Supondo que você já tenha um DataFrame chamado 'df'
# Criando o scaler
scaler = MinMaxScaler()

df['Age'] = scaler.fit_transform(df[['Age']])

# Definir as features (X) e a variável target (y)
X = df.drop(columns=['Total_screenings', 'Week'])
y = df['Total_screenings']

In [18]:
df

Unnamed: 0,Week,Gender,Age,Personal_cancer_history,Family_cancer_history,Sun_exposure,Body_signs,Premalignant,Malignant,Phototype_2,Phototype_3,Phototype_4,Phototype_5,Phototype_6,Total_screenings
0,2022-11,1.0,0.569444,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-30.0
1,2022-12,1.0,0.680556,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,28.0
2,2022-14,1.0,0.513889,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,49.0
3,2022-16,1.0,0.569444,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,34.0
4,2022-19,1.0,0.569444,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-170.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
69,2024-11,1.0,0.638889,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-240.0
70,2024-12,1.0,0.541667,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,30.0
71,2024-13,1.0,0.430556,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,44.0
72,2024-15,1.0,0.625000,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-56.0


# SEM NORMALIZAÇÃO - X e y

MLR, GLM GAUSSIAN, XGB

In [19]:
# Definir as features (X) e a variável target (y)
X = df.drop(columns=['Total_screenings', 'Week'])
y = df['Total_screenings']

In [20]:
# # Dividir os dados em conjuntos de treinamento e teste
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, shuffle=False)

In [21]:
# Definindo a semana de corte
split_week = '2024-09'

# Dividindo os dados com base na coluna 'Week'
train = df[df['Week'] < split_week]
test = df[df['Week'] >= split_week]

# Separando X e y para cada conjunto
X_train = train.drop(columns=['Total_screenings', 'Week'])
y_train = train['Total_screenings']

X_test = test.drop(columns=['Total_screenings', 'Week'])
y_test = test['Total_screenings']

In [22]:
# Contar o número de elementos em y_train e y_test
num_y_train = len(y_train)
num_y_test = len(y_test)

print(f"Número de elementos em y_train: {num_y_train}")
print(f"Número de elementos em y_test: {num_y_test}")


Número de elementos em y_train: 68
Número de elementos em y_test: 6


# MLR MODEL

In [95]:
from sklearn.linear_model import LinearRegression

# Adicionar uma constante aos dados
X_train_sm = sm.add_constant(X_train)
X_test_sm = X_test.copy()  # Certifique-se de que não está sobrescrevendo X_test original
X_test_sm.insert(0, 'const', 1.0)

# print(X_train_sm.head())
# print(X_test_sm.head())

# Ajustar o modelo
model_sm = sm.OLS(y_train, X_train_sm).fit()

# Obter o resumo do modelo
print(model_sm.summary())

# Fazer previsões no conjunto de teste
y_pred = model_sm.predict(X_test_sm)

# Calcular e imprimir as métricas de avaliação
mae = mean_absolute_error(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
r2 = r2_score(y_test, y_pred)
me = np.mean(y_test - y_pred)
mav = np.mean(np.abs(y_test))
mpv = np.mean(np.abs((y_test - y_pred) / y_test))
rme = np.mean((y_test - y_pred) / y_test)
rmae = np.mean(np.abs(y_test - y_pred) / np.abs(y_test))


                            OLS Regression Results                            
Dep. Variable:       Total_screenings   R-squared:                       0.095
Model:                            OLS   Adj. R-squared:                 -0.064
Method:                 Least Squares   F-statistic:                    0.5972
Date:                Fri, 23 Aug 2024   Prob (F-statistic):              0.810
Time:                        17:42:05   Log-Likelihood:                -400.66
No. Observations:                  68   AIC:                             823.3
Df Residuals:                      57   BIC:                             847.7
Df Model:                          10                                         
Covariance Type:            nonrobust                                         
                              coef    std err          t      P>|t|      [0.025      0.975]
-------------------------------------------------------------------------------------------
const                     

  return np.sqrt(eigvals[0]/eigvals[-1])


In [96]:
print("Modelo: LR")
print(f'MAE: {mae}')
print(f'MSE: {mse}')
print(f'RMSE: {rmse}')
print(f'R2: {r2}')
print(f'ME: {me}')
print(f'MAV: {mav}')
print(f'MPV: {mpv}')
print(f'RME: {rme}')
print(f'RMAE: {rmae}')

# Modelo: LR shuffle
# MAE: 100.82002204390385
# MSE: 15017.547324429226
# RMSE: 122.54610285288237
# R2: 0.05693841956290402
# ME: -8.344664764002756
# MAV: 99.66666666666667
# MPV: 4.203501135043124
# RME: 4.203501135043124
# RMAE: 4.203501135043124

# Modelo: LR split date
# Modelo: LR
# MAE: 98.11117869284517
# MSE: 16105.554541966905
# RMSE: 126.90766147859989
# R2: 0.10228228484376078
# ME: -20.818932825708732
# MAV: 97.0
# MPV: 4.642402493902201
# RME: 4.642402493902201
# RMAE: 4.642402493902201

Modelo: LR
MAE: 98.11117869284517
MSE: 16105.554541966905
RMSE: 126.90766147859989
R2: 0.10228228484376078
ME: -20.818932825708732
MAV: 97.0
MPV: 4.642402493902201
RME: 4.642402493902201
RMAE: 4.642402493902201


# GLM MODEL

In [97]:
from statsmodels.genmod.generalized_linear_model import GLM
from statsmodels.genmod.families import Gaussian
from sklearn.preprocessing import PolynomialFeatures

Gaussian

In [98]:
# Ajustar o modelo GLM com família Gaussian
model_glm = GLM(y_train, X_train_sm, family=Gaussian()).fit()

# Obter o resumo do modelo
print(model_glm.summary())

# Fazer previsões no conjunto de teste
y_pred = model_glm.predict(X_test_sm)

# Calcular e imprimir as métricas de avaliação
mae = mean_absolute_error(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
r2 = r2_score(y_test, y_pred)
me = np.mean(y_test - y_pred)
mav = np.mean(np.abs(y_test))
mpv = np.mean(np.abs((y_test - y_pred) / y_test))
rme = np.mean((y_test - y_pred) / y_test)
rmae = np.mean(np.abs(y_test - y_pred) / np.abs(y_test))

                 Generalized Linear Model Regression Results                  
Dep. Variable:       Total_screenings   No. Observations:                   68
Model:                            GLM   Df Residuals:                       57
Model Family:                Gaussian   Df Model:                           10
Link Function:               Identity   Scale:                          9161.8
Method:                          IRLS   Log-Likelihood:                -400.66
Date:                Fri, 23 Aug 2024   Deviance:                   5.2222e+05
Time:                        17:42:22   Pearson chi2:                 5.22e+05
No. Iterations:                     3   Pseudo R-squ. (CS):            0.09743
Covariance Type:            nonrobust                                         
                              coef    std err          z      P>|z|      [0.025      0.975]
-------------------------------------------------------------------------------------------
const                     

In [99]:
print("Modelo: GLM com Gaussian")
print(f'MAE: {mae}')
print(f'MSE: {mse}')
print(f'RMSE: {rmse}')
print(f'R2: {r2}')
print(f'ME: {me}')
print(f'MAV: {mav}')
print(f'MPV: {mpv}')
print(f'RME: {rme}')
print(f'RMAE: {rmae}')

# Modelo: GLM com Gaussian shuffle
# Modelo: GLM com Gaussian
# MAE: 100.82002204390385
# MSE: 15017.547324429226
# RMSE: 122.54610285288237
# R2: 0.05693841956290402
# ME: -8.344664764002758
# MAV: 99.66666666666667
# MPV: 4.203501135043124
# RME: 4.203501135043124
# RMAE: 4.203501135043124

# Modelo: GLM com Gaussian split
# Modelo: GLM com Gaussian
# MAE: 98.11117869284517
# MSE: 16105.554541966905
# RMSE: 126.90766147859989
# R2: 0.10228228484376078
# ME: -20.818932825708732
# MAV: 97.0
# MPV: 4.642402493902201
# RME: 4.642402493902201
# RMAE: 4.642402493902201

Modelo: GLM com Gaussian
MAE: 98.11117869284517
MSE: 16105.554541966905
RMSE: 126.90766147859989
R2: 0.10228228484376078
ME: -20.818932825708732
MAV: 97.0
MPV: 4.642402493902201
RME: 4.642402493902201
RMAE: 4.642402493902201


Kernel

In [49]:
# Definir o grau do polinômio
degree = 2

# Criar o transformador polinomial
poly = PolynomialFeatures(degree)

# Ajustar e transformar os dados de treino
X_train_poly = poly.fit_transform(X_train)
X_test_poly = poly.transform(X_test)

# Adicionar uma constante aos dados transformados
X_train_poly_sm = sm.add_constant(X_train_poly)
X_test_poly_sm = sm.add_constant(X_test_poly)

# print(X_train_poly_sm)
# print(X_test_poly_sm)

# Ajustar o modelo GLM com família Gaussian
model_glm = GLM(y_train, X_train_poly_sm, family=Gaussian()).fit()

# Obter o resumo do modelo
print(model_glm.summary())

# Fazer previsões no conjunto de teste
y_pred = model_glm.predict(X_test_poly_sm)

# Calcular e imprimir as métricas de avaliação
mae = mean_absolute_error(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
r2 = r2_score(y_test, y_pred)
me = np.mean(y_test - y_pred)
mav = np.mean(np.abs(y_test))
mpv = np.mean(np.abs((y_test - y_pred) / y_test))
rme = np.mean((y_test - y_pred) / y_test)
rmae = np.mean(np.abs(y_test - y_pred) / np.abs(y_test))

                 Generalized Linear Model Regression Results                  
Dep. Variable:       Total_screenings   No. Observations:                   68
Model:                            GLM   Df Residuals:                       48
Model Family:                Gaussian   Df Model:                           19
Link Function:               Identity   Scale:                          8468.8
Method:                          IRLS   Log-Likelihood:                -392.15
Date:                Fri, 23 Aug 2024   Deviance:                   4.0650e+05
Time:                        17:37:20   Pearson chi2:                 4.07e+05
No. Iterations:                     3   Pseudo R-squ. (CS):             0.2954
Covariance Type:            nonrobust                                         
                 coef    std err          z      P>|z|      [0.025      0.975]
------------------------------------------------------------------------------
const         40.1605    124.713      0.322      0.7

In [50]:
print("Modelo: GLM com Kernel Polinomial")
print(f'MAE: {mae}')
print(f'MSE: {mse}')
print(f'RMSE: {rmse}')
print(f'R2: {r2}')
print(f'ME: {me}')
print(f'MAV: {mav}')
print(f'MPV: {mpv}')
print(f'RME: {rme}')
print(f'RMAE: {rmae}')

# Modelo: GLM com Kernel Polinomial shuffle
# MAE: 137.8731200669713
# MSE: 33516.98054198307
# RMSE: 183.0764336062484
# R2: -1.1047762300028783
# ME: -90.1664214332277
# MAV: 99.66666666666667
# MPV: 19.718075068707176
# RME: 19.347213966759394
# RMAE: 19.718075068707176

# Modelo: GLM com Kernel Polinomial split 
# MAE: 158.80629656746916
# MSE: 39451.31219659482
# RMSE: 198.62354391308907
# R2: -1.1990017017270214
# ME: 9.475984187381556
# MAV: 97.0
# MPV: 14.110326680822126
# RME: -12.44826405527359
# RMAE: 14.110326680822126

Modelo: GLM com Kernel Polinomial
MAE: 158.80629656746916
MSE: 39451.31219659482
RMSE: 198.62354391308907
R2: -1.1990017017270214
ME: 9.475984187381556
MAV: 97.0
MPV: 14.110326680822126
RME: -12.44826405527359
RMAE: 14.110326680822126


# SVR MODEL

In [51]:
from sklearn.svm import SVR
from sklearn.preprocessing import StandardScaler


Kernel gaussiano

In [52]:
# Definir o modelo SVR com kernel RBF
model_svr = SVR(kernel='rbf', C=1.0, gamma='scale')

# Ajustar o modelo
model_svr.fit(X_train, y_train)

# # "Summary" of the model
# print(f"Support Vectors: {model_svr.support_vectors_}")
# print(f"Number of Support Vectors: {model_svr.n_support_}")
# print(f"Dual Coefficients: {model_svr.dual_coef_}")

# Fazer previsões no conjunto de teste
y_pred = model_svr.predict(X_test)

# Calcular e imprimir as métricas de avaliação
mae = mean_absolute_error(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
r2 = r2_score(y_test, y_pred)
me = np.mean(y_test - y_pred)
mav = np.mean(np.abs(y_test))
mpv = np.mean(np.abs((y_test - y_pred) / y_test))
rme = np.mean((y_test - y_pred) / y_test)
rmae = np.mean(np.abs(y_test - y_pred) / np.abs(y_test))

In [53]:
print("Modelo: SVR kernel gaussiano")
print(f'MAE: {mae}')
print(f'MSE: {mse}')
print(f'RMSE: {rmse}')
print(f'R2: {r2}')
print(f'ME: {me}')
print(f'MAV: {mav}')
print(f'MPV: {mpv}')
print(f'RME: {rme}')
print(f'RMAE: {rmae}')

# Modelo: SVR kernel gaussiano shuffle
# MAE: 100.29749408291882
# MSE: 15885.24117922586
# RMSE: 126.0366660112281
# R2: 0.00244957925081446
# ME: 1.967698728014363
# MAV: 99.66666666666667
# MPV: 1.5174367332404006
# RME: 1.5174367332404006
# RMAE: 1.5174367332404006

# Modelo: SVR kernel gaussiano split date
# MAE: 97.69176826165402
# MSE: 18075.364596769014
# RMSE: 134.44465254062362
# R2: -0.007514206613948193
# ME: -5.3604889805623825
# MAV: 97.0
# MPV: 1.2878628044257077
# RME: 1.2878628044257077
# RMAE: 1.2878628044257077


Modelo: SVR kernel gaussiano
MAE: 97.69176826165402
MSE: 18075.364596769014
RMSE: 134.44465254062362
R2: -0.007514206613948193
ME: -5.3604889805623825
MAV: 97.0
MPV: 1.2878628044257077
RME: 1.2878628044257077
RMAE: 1.2878628044257077


Linear kernel

In [54]:
# Definir o modelo SVR com kernel RBF
model_svr_linear = SVR(kernel='linear', C=1.0)

# Ajustar o modelo
model_svr_linear.fit(X_train, y_train)

# "Summary" of the model
# print(f"Support Vectors: {model_svr_linear.support_vectors_}")
# print(f"Number of Support Vectors: {model_svr_linear.n_support_}")
# print(f"Dual Coefficients: {model_svr_linear.dual_coef_}")

# Fazer previsões no conjunto de teste
y_pred = model_svr_linear.predict(X_test)

# Calcular e imprimir as métricas de avaliação
mae = mean_absolute_error(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
r2 = r2_score(y_test, y_pred)
me = np.mean(y_test - y_pred)
mav = np.mean(np.abs(y_test))
mpv = np.mean(np.abs((y_test - y_pred) / y_test))
rme = np.mean((y_test - y_pred) / y_test)
rmae = np.mean(np.abs(y_test - y_pred) / np.abs(y_test))

In [55]:
print("Modelo: SVR linear")
print(f'MAE: {mae}')
print(f'MSE: {mse}')
print(f'RMSE: {rmse}')
print(f'R2: {r2}')
print(f'ME: {me}')
print(f'MAV: {mav}')
print(f'MPV: {mpv}')
print(f'RME: {rme}')
print(f'RMAE: {rmae}')

# Modelo: SVR linear shuffle
# Modelo: SVR linear
# MAE: 100.4077777113487
# MSE: 15931.70368046026
# RMSE: 126.2208527956465
# R2: -0.0004681408601050485
# ME: 1.4527781323804472
# MAV: 99.66666666666667
# MPV: 1.6721638812193804
# RME: 1.6721638812193804
# RMAE: 1.6721638812193804

# Modelo: SVR linear split date
# MAE: 97.8440292201284
# MSE: 18237.19780061764
# RMSE: 135.04516948272396
# R2: -0.016534730161699374
# ME: -6.563236190515018
# MAV: 97.0
# MPV: 1.3309046033581409
# RME: 1.3309046033581409
# RMAE: 1.3309046033581409

Modelo: SVR linear
MAE: 97.8440292201284
MSE: 18237.19780061764
RMSE: 135.04516948272396
R2: -0.016534730161699374
ME: -6.563236190515018
MAV: 97.0
MPV: 1.3309046033581409
RME: 1.3309046033581409
RMAE: 1.3309046033581409


# MLP NN MODEL

In [56]:
from sklearn.neural_network import MLPRegressor


In [57]:
# Definir o modelo MLP com uma camada oculta de 100 neurônios (você pode ajustar os parâmetros conforme necessário)
model_mlp = MLPRegressor(hidden_layer_sizes=(100,), max_iter=1000, random_state=42)

# Ajustar o modelo aos dados de treino
model_mlp.fit(X_train, y_train)

# "Summary" do modelo
# print(f"Coefs: {model_mlp.coefs_}")
# print(f"Intercepts: {model_mlp.intercepts_}")
# print(f"Número de iterações: {model_mlp.n_iter_}")

# Fazer previsões no conjunto de teste
y_pred = model_mlp.predict(X_test)

# Calcular e imprimir as métricas de avaliação
mae = mean_absolute_error(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
r2 = r2_score(y_test, y_pred)
me = np.mean(y_test - y_pred)
mav = np.mean(np.abs(y_test))
mpv = np.mean(np.abs((y_test - y_pred) / y_test))
rme = np.mean((y_test - y_pred) / y_test)
rmae = np.mean(np.abs(y_test - y_pred) / np.abs(y_test))



In [58]:
print("Modelo: MLP NN")
print(f'MAE: {mae}')
print(f'MSE: {mse}')
print(f'RMSE: {rmse}')
print(f'R2: {r2}')
print(f'ME: {me}')
print(f'MAV: {mav}')
print(f'MPV: {mpv}')
print(f'RME: {rme}')
print(f'RMAE: {rmae}')

# Modelo: MPL NN shuffle
# MAE: 104.74586804589926
# MSE: 14982.027550888033
# RMSE: 122.40109293175463
# R2: 0.05916896580649966
# ME: -19.107655431470075
# MAV: 99.66666666666667
# MPV: 6.290028347836006
# RME: 6.290028347836006
# RMAE: 6.290028347836006

# Modelo: MPL NN split date
# MAE: 102.09761434383269
# MSE: 18938.75159780517
# RMSE: 137.6181368781207
# R2: -0.055639082031688325
# ME: -27.02402616543922
# MAV: 97.0
# MPV: 4.239036765767595
# RME: 4.239036765767595
# RMAE: 4.239036765767595


Modelo: MLP NN
MAE: 102.09761434383269
MSE: 18938.75159780517
RMSE: 137.6181368781207
R2: -0.055639082031688325
ME: -27.02402616543922
MAV: 97.0
MPV: 4.239036765767595
RME: 4.239036765767595
RMAE: 4.239036765767595


# XGB MODEL

In [100]:
from xgboost import XGBRegressor

# Definir o modelo XGBoost
model_xgb = XGBRegressor(
    n_estimators=100,
    learning_rate=0.1,
    max_depth=6,
    subsample=0.8,
    colsample_bytree=0.8,
    random_state=42
)

# Ajustar o modelo aos dados de treino
model_xgb.fit(X_train, y_train)

# Fazer previsões no conjunto de teste
y_pred = model_xgb.predict(X_test)

# Calcular e imprimir as métricas de avaliação
mae = mean_absolute_error(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
r2 = r2_score(y_test, y_pred)
me = np.mean(y_test - y_pred)
mav = np.mean(np.abs(y_test))
mpv = np.mean(np.abs((y_test - y_pred) / y_test))
rme = np.mean((y_test - y_pred) / y_test)
rmae = np.mean(np.abs(y_test - y_pred) / np.abs(y_test))

In [101]:
print("Modelo: XGB")
print(f'MAE: {mae}')
print(f'MSE: {mse}')
print(f'RMSE: {rmse}')
print(f'R2: {r2}')
print(f'ME: {me}')
print(f'MAV: {mav}')
print(f'MPV: {mpv}')
print(f'RME: {rme}')
print(f'RMAE: {rmae}')

# Modelo: XGB SHUFFLE
# MAE: 107.6011969923973
# MSE: 18559.084525095186
# RMSE: 136.23173097738714
# R2: -0.16546059123986367
# ME: -35.90161940654119
# MAV: 99.66666666666667
# MPV: 4.213940509892852
# RME: -1.7596556229443086
# RMAE: 4.213940509892852

# Modelo: XGB split
# MAE: 109.30374717712402
# MSE: 21007.55619467312
# RMSE: 144.93983646559397
# R2: -0.1709534930298091
# ME: -39.19437472025553
# MAV: 97.0
# MPV: 7.15078942297513
# RME: -5.917764262685185
# RMAE: 7.15078942297513

Modelo: XGB
MAE: 109.30374717712402
MSE: 21007.55619467312
RMSE: 144.93983646559397
R2: -0.1709534930298091
ME: -39.19437472025553
MAV: 97.0
MPV: 7.15078942297513
RME: -5.917764262685185
RMAE: 7.15078942297513


# LSTM MODEL

In [59]:
from tensorflow.keras.models import Sequential # type: ignore
from tensorflow.keras.layers import LSTM, Dense # type: ignore
from tensorflow.keras.optimizers import Adam # type: ignore

In [60]:
# Certifique-se de que seus dados estejam no formato adequado para o LSTM
# LSTM espera os dados na forma de [samples, time_steps, features]
X_train = np.reshape(X_train, (X_train.shape[0], X_train.shape[1], 1))
X_test = np.reshape(X_test, (X_test.shape[0], X_test.shape[1], 1))

# Definir o modelo LSTM
model_lstm = Sequential()
model_lstm.add(LSTM(units=50, return_sequences=False, input_shape=(X_train.shape[1], 1)))
model_lstm.add(Dense(1))  # Camada de saída

# Compilar o modelo
model_lstm.compile(optimizer=Adam(learning_rate=0.001), loss='mean_squared_error')

# Ajustar o modelo aos dados de treino
model_lstm.fit(X_train, y_train, epochs=100, batch_size=32, verbose=1)

# "Summary" do modelo
model_lstm.summary()

# Fazer previsões no conjunto de teste
y_pred = model_lstm.predict(X_test)

print(f"Shape of y_test: {y_test.shape}")
print(f"Shape of y_pred: {y_pred.shape}")

# Flatten y_pred to make it a 1D array
y_pred = y_pred.flatten()

Epoch 1/100


  super().__init__(**kwargs)


[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 4ms/step - loss: 7293.5420
Epoch 2/100
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 5ms/step - loss: 8089.5801 
Epoch 3/100
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 5ms/step - loss: 8078.6460 
Epoch 4/100
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 5ms/step - loss: 8283.6348 
Epoch 5/100
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 5ms/step - loss: 8616.5527 
Epoch 6/100
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 5ms/step - loss: 8064.2485 
Epoch 7/100
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 5ms/step - loss: 8746.6270 
Epoch 8/100
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 5ms/step - loss: 8569.5498 
Epoch 9/100
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 5ms/step - loss: 9325.7910  
Epoch 10/100
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 5ms/step - loss: 83

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 165ms/step
Shape of y_test: (6,)
Shape of y_pred: (6, 1)


In [61]:
# Calcular e imprimir as métricas de avaliação
mae = mean_absolute_error(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
r2 = r2_score(y_test, y_pred)
me = np.mean(y_test - y_pred)
mav = np.mean(np.abs(y_test))
mpv = np.mean(np.abs((y_test - y_pred) / y_test))
rme = np.mean((y_test - y_pred) / y_test)
rmae = np.mean(np.abs(y_test - y_pred) / np.abs(y_test))

print("Modelo: LSTM")
print(f'MAE: {mae}')
print(f'MSE: {mse}')
print(f'RMSE: {rmse}')
print(f'R2: {r2}')
print(f'ME: {me}')
print(f'MAV: {mav}')
print(f'MPV: {mpv}')
print(f'RME: {rme}')
print(f'RMAE: {rmae}')

# Modelo: LSTM shuffle
# MAE: 99.47275692522526
# MSE: 15766.571441323033
# RMSE: 125.56500882540101
# R2: 0.009901719614284321
# ME: 6.375798971454302
# MAV: 99.66666666666667
# MPV: 1.1949665132207186
# RME: 1.1949665132207186
# RMAE: 1.1949665132207186

# Modelo: LSTM split
# MAE: 97.0014197776715
# MSE: 17916.47568740317
# RMSE: 133.85243997553115
# R2: 0.0013422030370139337
# ME: -2.869013632337252
# MAV: 97.0
# MPV: 1.0653718116544024
# RME: 1.0653718116544024
# RMAE: 1.0653718116544024



Modelo: LSTM
MAE: 97.0014197776715
MSE: 17916.47568740317
RMSE: 133.85243997553115
R2: 0.0013422030370139337
ME: -2.869013632337252
MAV: 97.0
MPV: 1.0653718116544024
RME: 1.0653718116544024
RMAE: 1.0653718116544024


# CNN MODEL

In [40]:
from tensorflow.keras.models import Sequential # type: ignore
from tensorflow.keras.layers import Conv1D, MaxPooling1D, Flatten, Dense, Dropout # type: ignore
from tensorflow.keras.optimizers import Adam # type: ignore

In [63]:
# Definindo a semana de corte
split_week = '2024-01'

# Dividindo os dados com base na coluna 'Week'
train = df[df['Week'] < split_week]
test = df[df['Week'] >= split_week]

# Separando X e y para cada conjunto
X_train = train.drop(columns=['Total_screenings', 'Week'])
y_train = train['Total_screenings']

X_test = test.drop(columns=['Total_screenings', 'Week'])
y_test = test['Total_screenings']

In [64]:
def create_sequences(data, labels, window_size):
    sequences = []
    sequence_labels = []
    
    for i in range(len(data) - window_size):
        seq = data.iloc[i:i+window_size].values
        label = labels.iloc[i+window_size]
        sequences.append(seq)
        sequence_labels.append(label)
    
    return np.array(sequences), np.array(sequence_labels)

# Tamanho da janela (número de passos de tempo a considerar)
window_size = 5

# Reformular X_train e y_train
X_train_seq, y_train_seq = create_sequences(X_train, y_train, window_size)
X_test_seq, y_test_seq = create_sequences(X_test, y_test, window_size)

print(f"Shape de X_train_seq: {X_train_seq.shape}")
print(f"Shape de X_test_seq: {X_test_seq.shape}")
print(f"Shape de y_train_seq: {y_train_seq.shape}")
print(f"Shape de y_test_seq: {y_test_seq.shape}")

model_cnn = Sequential()

# Camada convolucional 1D
model_cnn.add(Conv1D(filters=64, kernel_size=2, activation='relu', input_shape=(X_train_seq.shape[1], X_train_seq.shape[2])))

# Camada de pooling 1D
model_cnn.add(MaxPooling1D(pool_size=2))

# Achatar a saída e adicionar camadas densas
model_cnn.add(Flatten())
model_cnn.add(Dense(50, activation='relu'))
model_cnn.add(Dropout(0.5))

# Camada de saída
model_cnn.add(Dense(1, activation='linear'))

# Compilar o modelo
model_cnn.compile(optimizer=Adam(learning_rate=0.001), loss='mean_squared_error')

# Treinar o modelo
model_cnn.fit(X_train_seq, y_train_seq, epochs=100, batch_size=32, validation_data=(X_test_seq, y_test_seq), verbose=1)

# Fazer previsões no conjunto de teste
y_pred = model_cnn.predict(X_test_seq)

# Calcular e imprimir as métricas de avaliação
mae = mean_absolute_error(y_test_seq, y_pred)
mse = mean_squared_error(y_test_seq, y_pred)
rmse = np.sqrt(mse)
r2 = r2_score(y_test_seq, y_pred)
me = np.mean(y_test_seq - y_pred)
mav = np.mean(np.abs(y_test_seq))
mpv = np.mean(np.abs((y_test_seq - y_pred) / y_test_seq))
rme = np.mean((y_test_seq - y_pred) / y_test_seq)
rmae = np.mean(np.abs(y_test_seq - y_pred) / np.abs(y_test_seq))

Shape de X_train_seq: (55, 5, 13)
Shape de X_test_seq: (9, 5, 13)
Shape de y_train_seq: (55,)
Shape de y_test_seq: (9,)


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


Epoch 1/100
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 199ms/step - loss: 7726.3462 - val_loss: 14931.9219
Epoch 2/100
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 30ms/step - loss: 7139.2134 - val_loss: 14932.1875
Epoch 3/100
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 41ms/step - loss: 8251.4014 - val_loss: 14932.2881
Epoch 4/100
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 40ms/step - loss: 7655.4883 - val_loss: 14932.6182
Epoch 5/100
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 39ms/step - loss: 7231.5449 - val_loss: 14933.1006
Epoch 6/100
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 36ms/step - loss: 6976.0493 - val_loss: 14933.1836
Epoch 7/100
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 43ms/step - loss: 7086.2466 - val_loss: 14933.0820
Epoch 8/100
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 65ms/step - loss: 7521.5278 - val_loss: 14932.7988

In [65]:
print("Modelo: CNN com split própria")
print(f'MAE: {mae}')
print(f'MSE: {mse}')
print(f'RMSE: {rmse}')
print(f'R2: {r2}')
print(f'ME: {me}')
print(f'MAV: {mav}')
print(f'MPV: {mpv}')
print(f'RME: {rme}')
print(f'RMAE: {rmae}')

# shuffle
# MAE: 92.1615381911397
# MSE: 14453.156095361195
# RMSE: 120.22127971104447
# R2: -0.008338153836791928
# ME: -9.639781267940998
# MAV: 91.4
# MPV: 1.0733584119479516
# RME: 0.8880936934153892
# RMAE: 1.0733584119479516

# Modelo: CNN com split própria
# MAE: 90.7757566107644
# MSE: 14896.679295629301
# RMSE: 122.05195326429357
# R2: 0.002574901718234046
# ME: 0.2854777971903483
# MAV: 90.44444444444444
# MPV: 0.9787419252062746
# RME: 0.8814259367833628
# RMAE: 0.9787419252062746

Modelo: CNN com split própria
MAE: 90.7757566107644
MSE: 14896.679295629301
RMSE: 122.05195326429357
R2: 0.002574901718234046
ME: 0.2854777971903483
MAV: 90.44444444444444
MPV: 0.9787419252062746
RME: 0.8814259367833628
RMAE: 0.9787419252062746
