In [174]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import matplotlib.pyplot as plt
import statsmodels.api as sm
from fancyimpute import IterativeImputer

In [175]:
# Carregar o DataFrame original
df = pd.read_excel('C:/Users/Patyc/OneDrive/Desktop/Dissertation/Data/Merged_File_v7_skin_clean.xlsx')

# Converter a data para datetime
df['Screening_date'] = pd.to_datetime(df['Screening_date'])


In [176]:
# Create a new DataFrame df_date with only the 'Screening_date' column from df
df_date = df[['Screening_date']]

# Print the new df_date DataFrame
print(df_date.head())

  Screening_date
0     2022-07-23
1     2022-07-23
2     2022-07-23
3     2022-07-23
4     2022-07-23


In [177]:
df = df.drop('Screening_date', axis=1)


In [178]:
# Drop the last 4 columns
df = df.iloc[:, :-4]

# Print the updated DataFrame
print(df.head())

  Gender   Age Personal_cancer_history Family_cancer_history Sun_exposure  \
0      f  53.0                       n                     y            y   
1      f  35.0                       n                     y            y   
2      f  47.0                       n                     y            n   
3      f  54.0                       n                     y            n   
4      f  71.0                       y                     y            n   

  Body_signs  Phototype  Skin_diagnosis  
0          y        2.0             2.0  
1          y        3.0             2.0  
2          n        3.0             1.0  
3          y        4.0             1.0  
4          y        3.0             2.0  


In [179]:
df

Unnamed: 0,Gender,Age,Personal_cancer_history,Family_cancer_history,Sun_exposure,Body_signs,Phototype,Skin_diagnosis
0,f,53.0,n,y,y,y,2.0,2.0
1,f,35.0,n,y,y,y,3.0,2.0
2,f,47.0,n,y,n,n,3.0,1.0
3,f,54.0,n,y,n,y,4.0,1.0
4,f,71.0,y,y,n,y,3.0,2.0
...,...,...,...,...,...,...,...,...
8389,m,51.0,n,n,y,n,3.0,1.0
8390,m,74.0,n,n,y,n,3.0,1.0
8391,f,62.0,y,y,y,n,3.0,1.0
8392,f,78.0,n,n,n,y,3.0,1.0


In [180]:
# Assuming df is your DataFrame
columns_to_convert = ['Personal_cancer_history', 'Family_cancer_history', 'Sun_exposure', 'Body_signs']

# Replace 'y'/'e' with 1 and 'n' with 0
df[columns_to_convert] = df[columns_to_convert].replace({'y': 1, 'e': 1, 'n': 0})

# Print the modified DataFrame
print(df.head())

  Gender   Age  Personal_cancer_history  Family_cancer_history  Sun_exposure  \
0      f  53.0                      0.0                    1.0           1.0   
1      f  35.0                      0.0                    1.0           1.0   
2      f  47.0                      0.0                    1.0           0.0   
3      f  54.0                      0.0                    1.0           0.0   
4      f  71.0                      1.0                    1.0           0.0   

   Body_signs  Phototype  Skin_diagnosis  
0         1.0        2.0             2.0  
1         1.0        3.0             2.0  
2         0.0        3.0             1.0  
3         1.0        4.0             1.0  
4         1.0        3.0             2.0  


  df[columns_to_convert] = df[columns_to_convert].replace({'y': 1, 'e': 1, 'n': 0})


In [181]:
# Convert 'f' to 1 and 'm' to 0 in the 'Gender' column
df['Gender'] = df['Gender'].replace({'f': 1, 'm': 0})

# Print the modified DataFrame
print(df.head())


   Gender   Age  Personal_cancer_history  Family_cancer_history  Sun_exposure  \
0     1.0  53.0                      0.0                    1.0           1.0   
1     1.0  35.0                      0.0                    1.0           1.0   
2     1.0  47.0                      0.0                    1.0           0.0   
3     1.0  54.0                      0.0                    1.0           0.0   
4     1.0  71.0                      1.0                    1.0           0.0   

   Body_signs  Phototype  Skin_diagnosis  
0         1.0        2.0             2.0  
1         1.0        3.0             2.0  
2         0.0        3.0             1.0  
3         1.0        4.0             1.0  
4         1.0        3.0             2.0  


  df['Gender'] = df['Gender'].replace({'f': 1, 'm': 0})


In [182]:
# Print the total number of NaNs per column
print(df.isna().sum())

# Alternatively, print the locations of NaNs in the DataFrame
print(df[df.isna().any(axis=1)])

Gender                      806
Age                        2207
Personal_cancer_history    1806
Family_cancer_history      1950
Sun_exposure               2022
Body_signs                 1909
Phototype                  3283
Skin_diagnosis              483
dtype: int64
      Gender   Age  Personal_cancer_history  Family_cancer_history  \
13       0.0  40.0                      1.0                    0.0   
28       1.0  66.0                      NaN                    NaN   
29       1.0  67.0                      NaN                    NaN   
31       0.0  57.0                      1.0                    NaN   
63       1.0  50.0                      1.0                    NaN   
...      ...   ...                      ...                    ...   
8243     0.0  77.0                      1.0                    0.0   
8244     1.0  49.0                      0.0                    0.0   
8245     1.0   NaN                      0.0                    0.0   
8246     1.0  79.0             

In [183]:
# Impute using IterativeImputer with PMM in fancyimpute
imputer = IterativeImputer(imputation_order='random', random_state=0)
imputed_data = imputer.fit_transform(df)

# Convert the result back to a DataFrame
imputed_data = pd.DataFrame(imputed_data, columns=df.columns)

print("Dataset after PMM Imputation:")
print(imputed_data)

Dataset after PMM Imputation:
      Gender   Age  Personal_cancer_history  Family_cancer_history  \
0        1.0  53.0                      0.0                    1.0   
1        1.0  35.0                      0.0                    1.0   
2        1.0  47.0                      0.0                    1.0   
3        1.0  54.0                      0.0                    1.0   
4        1.0  71.0                      1.0                    1.0   
...      ...   ...                      ...                    ...   
8389     0.0  51.0                      0.0                    0.0   
8390     0.0  74.0                      0.0                    0.0   
8391     1.0  62.0                      1.0                    1.0   
8392     1.0  78.0                      0.0                    0.0   
8393     1.0  70.0                      0.0                    0.0   

      Sun_exposure  Body_signs  Phototype  Skin_diagnosis  
0              1.0         1.0        2.0             2.0  
1        

In [184]:
# Print the total number of NaNs per column
print(imputed_data.isna().sum())

# Alternatively, print the locations of NaNs in the DataFrame
print(imputed_data[imputed_data.isna().any(axis=1)])


Gender                     0
Age                        0
Personal_cancer_history    0
Family_cancer_history      0
Sun_exposure               0
Body_signs                 0
Phototype                  0
Skin_diagnosis             0
dtype: int64
Empty DataFrame
Columns: [Gender, Age, Personal_cancer_history, Family_cancer_history, Sun_exposure, Body_signs, Phototype, Skin_diagnosis]
Index: []


In [185]:
print(df.columns)  # Check if 'Screening_date' is in the DataFrame


Index(['Gender', 'Age', 'Personal_cancer_history', 'Family_cancer_history',
       'Sun_exposure', 'Body_signs', 'Phototype', 'Skin_diagnosis'],
      dtype='object')


In [186]:
# Create a new DataFrame with 'Screening_date' from df and all columns from imputed_data
new_df = pd.concat([df_date[['Screening_date']], imputed_data], axis=1)

# Print the new DataFrame
print(new_df.head())


  Screening_date  Gender   Age  Personal_cancer_history  \
0     2022-07-23     1.0  53.0                      0.0   
1     2022-07-23     1.0  35.0                      0.0   
2     2022-07-23     1.0  47.0                      0.0   
3     2022-07-23     1.0  54.0                      0.0   
4     2022-07-23     1.0  71.0                      1.0   

   Family_cancer_history  Sun_exposure  Body_signs  Phototype  Skin_diagnosis  
0                    1.0           1.0         1.0        2.0             2.0  
1                    1.0           1.0         1.0        3.0             2.0  
2                    1.0           0.0         0.0        3.0             1.0  
3                    1.0           0.0         1.0        4.0             1.0  
4                    1.0           0.0         1.0        3.0             2.0  


In [187]:
df = new_df

In [188]:
df

Unnamed: 0,Screening_date,Gender,Age,Personal_cancer_history,Family_cancer_history,Sun_exposure,Body_signs,Phototype,Skin_diagnosis
0,2022-07-23,1.0,53.0,0.0,1.0,1.0,1.0,2.0,2.0
1,2022-07-23,1.0,35.0,0.0,1.0,1.0,1.0,3.0,2.0
2,2022-07-23,1.0,47.0,0.0,1.0,0.0,0.0,3.0,1.0
3,2022-07-23,1.0,54.0,0.0,1.0,0.0,1.0,4.0,1.0
4,2022-07-23,1.0,71.0,1.0,1.0,0.0,1.0,3.0,2.0
...,...,...,...,...,...,...,...,...,...
8389,2024-03-16,0.0,51.0,0.0,0.0,1.0,0.0,3.0,1.0
8390,2024-03-16,0.0,74.0,0.0,0.0,1.0,0.0,3.0,1.0
8391,2024-03-16,1.0,62.0,1.0,1.0,1.0,0.0,3.0,1.0
8392,2024-03-16,1.0,78.0,0.0,0.0,0.0,1.0,3.0,1.0


In [189]:
# Criar um DataFrame com o total de rastreios por data
total_screenings_df = df.groupby('Screening_date').size().reset_index(name='Total_screenings')

total_screenings_df


Unnamed: 0,Screening_date,Total_screenings
0,2022-03-03,92
1,2022-03-19,62
2,2022-03-26,90
3,2022-04-04,69
4,2022-04-09,70
...,...,...
85,2024-03-23,168
86,2024-03-30,198
87,2024-04-06,242
88,2024-04-20,186


In [190]:
# Criar um intervalo de datas completo
min_date = df['Screening_date'].min()
max_date = df['Screening_date'].max()
date_range = pd.date_range(start=min_date, end=max_date)

# Criar um DataFrame com todas as datas do intervalo
all_dates_df = pd.DataFrame(date_range, columns=['Screening_date'])

# Fazer o merge de total_screenings_df com all_dates_df
complete_screenings_df = all_dates_df.merge(total_screenings_df, on='Screening_date', how='left')

# Substituir os valores ausentes por 0
complete_screenings_df['Total_screenings'] = complete_screenings_df['Total_screenings'].fillna(0).astype(int)

print(complete_screenings_df)

    Screening_date  Total_screenings
0       2022-03-03                92
1       2022-03-04                 0
2       2022-03-05                 0
3       2022-03-06                 0
4       2022-03-07                 0
..             ...               ...
782     2024-04-23                 0
783     2024-04-24                 0
784     2024-04-25                 0
785     2024-04-26                 0
786     2024-04-27               184

[787 rows x 2 columns]


In [191]:
# Agrupar dados por 'Screening_date' para remover duplicatas
aggregations = {col: (lambda x: x.mode()[0] if x.dtype == 'O' else x.median()) for col in df.columns if col != 'Screening_date'}
df = df.groupby('Screening_date').agg(aggregations).reset_index()

# Criar um intervalo de datas completo
min_date = df['Screening_date'].min()
max_date = df['Screening_date'].max()
date_range = pd.date_range(start=min_date, end=max_date)

# Reindexar o DataFrame para incluir todas as datas
df = df.set_index('Screening_date').reindex(date_range).reset_index().rename(columns={'index': 'Screening_date'})

# Preencher valores ausentes após a reindexação
for column in df.columns:
    if pd.api.types.is_numeric_dtype(df[column]):
        df[column] = df[column].fillna(df[column].median())
    else:
        df[column] = df[column].fillna(df[column].mode().iloc[0])

# Agrupar os dados por dias, usando mediana para numéricas e moda para categóricas
aggregations = {col: (lambda x: x.mode()[0] if x.dtype == 'O' else x.median()) for col in df.columns if col != 'Screening_date'}

df_grouped = df.groupby('Screening_date').agg(aggregations).reset_index()

df_grouped

Unnamed: 0,Screening_date,Gender,Age,Personal_cancer_history,Family_cancer_history,Sun_exposure,Body_signs,Phototype,Skin_diagnosis
0,2022-03-03,1.0,57.277371,0.0,0.0,1.000000,0.811951,3.000000,1.0
1,2022-03-04,1.0,58.042687,0.0,0.0,0.484602,0.588435,2.723301,1.0
2,2022-03-05,1.0,58.042687,0.0,0.0,0.484602,0.588435,2.723301,1.0
3,2022-03-06,1.0,58.042687,0.0,0.0,0.484602,0.588435,2.723301,1.0
4,2022-03-07,1.0,58.042687,0.0,0.0,0.484602,0.588435,2.723301,1.0
...,...,...,...,...,...,...,...,...,...
782,2024-04-23,1.0,58.042687,0.0,0.0,0.484602,0.588435,2.723301,1.0
783,2024-04-24,1.0,58.042687,0.0,0.0,0.484602,0.588435,2.723301,1.0
784,2024-04-25,1.0,58.042687,0.0,0.0,0.484602,0.588435,2.723301,1.0
785,2024-04-26,1.0,58.042687,0.0,0.0,0.484602,0.588435,2.723301,1.0


In [192]:
# Fazer o merge dos dois DataFrames na coluna 'Screening_date'
df = pd.merge(df_grouped, complete_screenings_df[['Screening_date', 'Total_screenings']], on='Screening_date', how='left')

# Verificar o resultado
df


Unnamed: 0,Screening_date,Gender,Age,Personal_cancer_history,Family_cancer_history,Sun_exposure,Body_signs,Phototype,Skin_diagnosis,Total_screenings
0,2022-03-03,1.0,57.277371,0.0,0.0,1.000000,0.811951,3.000000,1.0,92
1,2022-03-04,1.0,58.042687,0.0,0.0,0.484602,0.588435,2.723301,1.0,0
2,2022-03-05,1.0,58.042687,0.0,0.0,0.484602,0.588435,2.723301,1.0,0
3,2022-03-06,1.0,58.042687,0.0,0.0,0.484602,0.588435,2.723301,1.0,0
4,2022-03-07,1.0,58.042687,0.0,0.0,0.484602,0.588435,2.723301,1.0,0
...,...,...,...,...,...,...,...,...,...,...
782,2024-04-23,1.0,58.042687,0.0,0.0,0.484602,0.588435,2.723301,1.0,0
783,2024-04-24,1.0,58.042687,0.0,0.0,0.484602,0.588435,2.723301,1.0,0
784,2024-04-25,1.0,58.042687,0.0,0.0,0.484602,0.588435,2.723301,1.0,0
785,2024-04-26,1.0,58.042687,0.0,0.0,0.484602,0.588435,2.723301,1.0,0


In [193]:
import pandas as pd
from sklearn.preprocessing import MinMaxScaler

# Supondo que você já tenha um DataFrame chamado 'df'
# Criando o scaler
scaler = MinMaxScaler()

df['Age'] = scaler.fit_transform(df[['Age']])

# Definir as features (X) e a variável target (y)
X = df.drop(columns=['Screening_date', 'Total_screenings'])
y = df['Total_screenings']

In [194]:
# Definir as features (X) e a variável target (y)
X = df.drop(columns=['Screening_date', 'Total_screenings'])
y = df['Total_screenings']

In [195]:
# Dividir os dados em conjuntos de treinamento e teste
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, shuffle=False)

In [196]:
# Definindo as datas de corte
split_date = '2024-03-01'

# Dividindo os dados com base na data
train = df[df['Screening_date'] < split_date]
test = df[df['Screening_date'] >= split_date]

# Separando X e y para cada conjunto
X_train = train.drop(columns=['Total_screenings', 'Screening_date'])
y_train = train['Total_screenings']

X_test = test.drop(columns=['Total_screenings', 'Screening_date'])
y_test = test['Total_screenings']

In [197]:
from sklearn.linear_model import LinearRegression

# Adicionar uma constante aos dados
X_train_sm = sm.add_constant(X_train)
X_test_sm = X_test.copy()  # Certifique-se de que não está sobrescrevendo X_test original
X_test_sm.insert(0, 'const', 1.0)

# print(X_train_sm.head())
# print(X_test_sm.head())

# Ajustar o modelo
model_sm = sm.OLS(y_train, X_train_sm).fit()

# Obter o resumo do modelo
print(model_sm.summary())

# Fazer previsões no conjunto de teste
y_pred = model_sm.predict(X_test_sm)

# Calcular e imprimir as métricas de avaliação
mae = mean_absolute_error(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
r2 = r2_score(y_test, y_pred)
me = np.mean(y_test - y_pred)
mav = np.mean(np.abs(y_test))
mpv = np.mean(np.abs((y_test - y_pred) / y_test))
rme = np.mean((y_test - y_pred) / y_test)
rmae = np.mean(np.abs(y_test - y_pred) / np.abs(y_test))

                            OLS Regression Results                            
Dep. Variable:       Total_screenings   R-squared:                       0.321
Model:                            OLS   Adj. R-squared:                  0.313
Method:                 Least Squares   F-statistic:                     42.52
Date:                Thu, 05 Sep 2024   Prob (F-statistic):           8.60e-56
Time:                        12:07:30   Log-Likelihood:                -3405.0
No. Observations:                 729   AIC:                             6828.
Df Residuals:                     720   BIC:                             6869.
Df Model:                           8                                         
Covariance Type:            nonrobust                                         
                              coef    std err          t      P>|t|      [0.025      0.975]
-------------------------------------------------------------------------------------------
const                    -

In [198]:
print("Modelo: MLR")
print(f'MAE: {mae}')
print(f'MSE: {mse}')
print(f'RMSE: {rmse}')
print(f'R2: {r2}')
print(f'ME: {me}')
print(f'MAV: {mav}')
print(f'MPV: {mpv}')
print(f'RME: {rme}')
print(f'RMAE: {rmae}')


Modelo: MLR
MAE: 28.07803486797204
MSE: 5087.666312050839
RMSE: 71.32787892578077
R2: 0.17309051391191244
ME: 18.1240339801208
MAV: 27.310344827586206
MPV: inf
RME: -inf
RMAE: inf


In [199]:
from statsmodels.genmod.generalized_linear_model import GLM
from statsmodels.genmod.families import Gaussian
from sklearn.preprocessing import PolynomialFeatures

In [200]:
# Ajustar o modelo GLM com família Gaussian
model_glm = GLM(y_train, X_train_sm, family=Gaussian()).fit()

# Obter o resumo do modelo
print(model_glm.summary())

# Fazer previsões no conjunto de teste
y_pred = model_glm.predict(X_test_sm)

# Calcular e imprimir as métricas de avaliação
mae = mean_absolute_error(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
r2 = r2_score(y_test, y_pred)
me = np.mean(y_test - y_pred)
mav = np.mean(np.abs(y_test))
mpv = np.mean(np.abs((y_test - y_pred) / y_test))
rme = np.mean((y_test - y_pred) / y_test)
rmae = np.mean(np.abs(y_test - y_pred) / np.abs(y_test))


                 Generalized Linear Model Regression Results                  
Dep. Variable:       Total_screenings   No. Observations:                  729
Model:                            GLM   Df Residuals:                      720
Model Family:                Gaussian   Df Model:                            8
Link Function:               Identity   Scale:                          676.01
Method:                          IRLS   Log-Likelihood:                -3405.0
Date:                Thu, 05 Sep 2024   Deviance:                   4.8673e+05
Time:                        12:07:30   Pearson chi2:                 4.87e+05
No. Iterations:                     3   Pseudo R-squ. (CS):             0.3729
Covariance Type:            nonrobust                                         
                              coef    std err          z      P>|z|      [0.025      0.975]
-------------------------------------------------------------------------------------------
const                    -

In [201]:
print("Modelo: GLM com Gaussian")
print(f'MAE: {mae}')
print(f'MSE: {mse}')
print(f'RMSE: {rmse}')
print(f'R2: {r2}')
print(f'ME: {me}')
print(f'MAV: {mav}')
print(f'MPV: {mpv}')
print(f'RME: {rme}')
print(f'RMAE: {rmae}')

Modelo: GLM com Gaussian
MAE: 28.078034867972026
MSE: 5087.66631205084
RMSE: 71.32787892578077
R2: 0.17309051391191232
ME: 18.12403398012082
MAV: 27.310344827586206
MPV: inf
RME: -inf
RMAE: inf


In [202]:
# Definir o grau do polinômio
degree = 2

# Criar o transformador polinomial
poly = PolynomialFeatures(degree)

# Ajustar e transformar os dados de treino
X_train_poly = poly.fit_transform(X_train)
X_test_poly = poly.transform(X_test)

# Adicionar uma constante aos dados transformados
X_train_poly_sm = sm.add_constant(X_train_poly)
X_test_poly_sm = sm.add_constant(X_test_poly)

# Ajustar o modelo GLM com família Gaussian
model_glm = GLM(y_train, X_train_poly_sm, family=Gaussian()).fit()

# Obter o resumo do modelo
print(model_glm.summary())

# Fazer previsões no conjunto de teste
y_pred = model_glm.predict(X_test_poly_sm)

# Calcular e imprimir as métricas de avaliação
mae = mean_absolute_error(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
r2 = r2_score(y_test, y_pred)
me = np.mean(y_test - y_pred)
mav = np.mean(np.abs(y_test))
mpv = np.mean(np.abs((y_test - y_pred) / y_test))
rme = np.mean((y_test - y_pred) / y_test)
rmae = np.mean(np.abs(y_test - y_pred) / np.abs(y_test))

                 Generalized Linear Model Regression Results                  
Dep. Variable:       Total_screenings   No. Observations:                  729
Model:                            GLM   Df Residuals:                      684
Model Family:                Gaussian   Df Model:                           44
Link Function:               Identity   Scale:                          171.70
Method:                          IRLS   Log-Likelihood:                -2886.8
Date:                Thu, 05 Sep 2024   Deviance:                   1.1745e+05
Time:                        12:07:30   Pearson chi2:                 1.17e+05
No. Iterations:                     3   Pseudo R-squ. (CS):             0.9917
Covariance Type:            nonrobust                                         
                 coef    std err          z      P>|z|      [0.025      0.975]
------------------------------------------------------------------------------
const      -6.068e+04   5.47e+04     -1.110      0.2

In [203]:
print("Modelo: GLM com kernel polinomial")
print(f'MAE: {mae}')
print(f'MSE: {mse}')
print(f'RMSE: {rmse}')
print(f'R2: {r2}')
print(f'ME: {me}')
print(f'MAV: {mav}')
print(f'MPV: {mpv}')
print(f'RME: {rme}')
print(f'RMAE: {rmae}')

Modelo: GLM com kernel polinomial
MAE: 17.556079415761534
MSE: 3499.6578958250366
RMSE: 59.15790645235036
R2: 0.4311929803127621
ME: 16.879892808530837
MAV: 27.310344827586206
MPV: inf
RME: -inf
RMAE: inf


In [204]:
from sklearn.svm import SVR
from sklearn.preprocessing import StandardScaler

In [205]:
# Definir o modelo SVR com kernel RBF
model_svr = SVR(kernel='rbf', C=1.0, gamma='scale')

# Ajustar o modelo
model_svr.fit(X_train, y_train)

# "Summary" of the model
print(f"Support Vectors: {model_svr.support_vectors_}")
print(f"Number of Support Vectors: {model_svr.n_support_}")
print(f"Dual Coefficients: {model_svr.dual_coef_}")

# Fazer previsões no conjunto de teste
y_pred = model_svr.predict(X_test)

# Calcular e imprimir as métricas de avaliação
mae = mean_absolute_error(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
r2 = r2_score(y_test, y_pred)
me = np.mean(y_test - y_pred)
mav = np.mean(np.abs(y_test))
mpv = np.mean(np.abs((y_test - y_pred) / y_test))
rme = np.mean((y_test - y_pred) / y_test)
rmae = np.mean(np.abs(y_test - y_pred) / np.abs(y_test))

Support Vectors: [[1.         0.52159363 0.         ... 0.81195127 3.         1.        ]
 [0.69967708 0.50814959 0.14266    ... 0.55829417 2.72330097 1.        ]
 [1.         0.68055556 0.         ... 0.         3.         1.        ]
 ...
 [1.         0.54285241 0.         ... 0.58843533 2.72330097 1.        ]
 [1.         0.54285241 0.         ... 0.58843533 2.72330097 1.        ]
 [1.         0.54285241 0.         ... 0.58843533 2.72330097 1.        ]]
Number of Support Vectors: [162]
Dual Coefficients: [[ 1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.
   1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1. -1.
   1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1. -1.  1.  1.  1.  1.  1.  1.
   1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.
   1.  1.  1. -1. -1. -1. -1.  1. -1. -1. -1. -1. -1. -1. -1. -1. -1. -1.
  -1. -1. -1. -1. -1. -1. -1. -1. -1. -1. -1. -1. -1. -1. -1. -1. -1. -1.
  -1. -1. -1. -1. -1. -1.  1. -1. -1. -1. -

In [168]:
print("Modelo: SVR")
print(f'MAE: {mae}')
print(f'MSE: {mse}')
print(f'RMSE: {rmse}')
print(f'R2: {r2}')
print(f'ME: {me}')
print(f'MAV: {mav}')
print(f'MPV: {mpv}')
print(f'RME: {rme}')
print(f'RMAE: {rmae}')

Modelo: SVR
MAE: 27.38269255370162
MSE: 6891.675921646413
RMSE: 83.01611844483223
R2: -0.12011909687471212
ME: 27.206830457300686
MAV: 27.310344827586206
MPV: inf
RME: -inf
RMAE: inf


In [169]:
from tensorflow.keras.models import Sequential # type: ignore
from tensorflow.keras.layers import LSTM, Dense # type: ignore
from tensorflow.keras.optimizers import Adam # type: ignore

In [170]:
# Certifique-se de que seus dados estejam no formato adequado para o LSTM
# LSTM espera os dados na forma de [samples, time_steps, features]
X_train = np.reshape(X_train, (X_train.shape[0], X_train.shape[1], 1))
X_test = np.reshape(X_test, (X_test.shape[0], X_test.shape[1], 1))

# Definir o modelo LSTM
model_lstm = Sequential()
model_lstm.add(LSTM(units=50, return_sequences=False, input_shape=(X_train.shape[1], 1)))
model_lstm.add(Dense(1))  # Camada de saída

# Compilar o modelo
model_lstm.compile(optimizer=Adam(learning_rate=0.001), loss='mean_squared_error')

# Ajustar o modelo aos dados de treino
model_lstm.fit(X_train, y_train, epochs=100, batch_size=32, verbose=1)

# "Summary" do modelo
model_lstm.summary()

# Fazer previsões no conjunto de teste
y_pred = model_lstm.predict(X_test)

print(f"Shape of y_test: {y_test.shape}")
print(f"Shape of y_pred: {y_pred.shape}")

# Flatten y_pred to make it a 1D array
y_pred = y_pred.flatten()

# Calcular e imprimir as métricas de avaliação
mae = mean_absolute_error(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
r2 = r2_score(y_test, y_pred)
me = np.mean(y_test - y_pred)
mav = np.mean(np.abs(y_test))
mpv = np.mean(np.abs((y_test - y_pred) / y_test))
rme = np.mean((y_test - y_pred) / y_test)
rmae = np.mean(np.abs(y_test - y_pred) / np.abs(y_test))


Epoch 1/100


  super().__init__(**kwargs)


[1m23/23[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 4ms/step - loss: 1209.8391
Epoch 2/100
[1m23/23[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step - loss: 1384.9424
Epoch 3/100
[1m23/23[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - loss: 869.0142
Epoch 4/100
[1m23/23[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - loss: 815.1711
Epoch 5/100
[1m23/23[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 6ms/step - loss: 901.5303
Epoch 6/100
[1m23/23[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step - loss: 804.3683 
Epoch 7/100
[1m23/23[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step - loss: 1186.9628
Epoch 8/100
[1m23/23[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - loss: 807.6346
Epoch 9/100
[1m23/23[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - loss: 980.1408 
Epoch 10/100
[1m23/23[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 5ms/step - l

[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 544ms/step
Shape of y_test: (58,)
Shape of y_pred: (58, 1)


In [171]:
print("Modelo: LSTM")
print(f'MAE: {mae}')
print(f'MSE: {mse}')
print(f'RMSE: {rmse}')
print(f'R2: {r2}')
print(f'ME: {me}')
print(f'MAV: {mav}')
print(f'MPV: {mpv}')
print(f'RME: {rme}')
print(f'RMAE: {rmae}')


Modelo: LSTM
MAE: 34.40834980997546
MSE: 6470.041286937493
RMSE: 80.43656685200763
R2: -0.0515897274017334
ME: 17.938379402818352
MAV: 27.310344827586206
MPV: inf
RME: -inf
RMAE: inf


In [206]:
from xgboost import XGBRegressor

In [207]:
# Definir o modelo XGBoost
model_xgb = XGBRegressor(
    n_estimators=100,
    learning_rate=0.1,
    max_depth=6,
    subsample=0.8,
    colsample_bytree=0.8,
    random_state=42
)

# Ajustar o modelo aos dados de treino
model_xgb.fit(X_train, y_train)

# Fazer previsões no conjunto de teste
y_pred = model_xgb.predict(X_test)

# Calcular e imprimir as métricas de avaliação
mae = mean_absolute_error(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
r2 = r2_score(y_test, y_pred)
me = np.mean(y_test - y_pred)
mav = np.mean(np.abs(y_test))
mpv = np.mean(np.abs((y_test - y_pred) / y_test))
rme = np.mean((y_test - y_pred) / y_test)
rmae = np.mean(np.abs(y_test - y_pred) / np.abs(y_test))

In [208]:
print("Modelo: XGB")
print(f'MAE: {mae}')
print(f'MSE: {mse}')
print(f'RMSE: {rmse}')
print(f'R2: {r2}')
print(f'ME: {me}')
print(f'MAV: {mav}')
print(f'MPV: {mpv}')
print(f'RME: {rme}')
print(f'RMAE: {rmae}')

Modelo: XGB
MAE: 15.454612916869369
MSE: 2866.5059781784767
RMSE: 53.539760722088374
R2: 0.5341006517410278
ME: 15.454612916869369
MAV: 27.310344827586206
MPV: inf
RME: inf
RMAE: inf
