# Análisis y predicción de flujos de movilidad vacacional en España Península-Islas

## 0. Installs and Imports

In [None]:
# Installs
#%pip install -q -U matplotlib numpy pandas scikit-learn seaborn
#%pip install xgboost

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

## 1. Load Data

In [None]:
# Mobility CSVs, one file per year, located relative to the working directory.
path_2022 = "../data/movilidad_provincias_2022.csv"
path_2023 = "../data/movilidad_provincias_2023.csv"
path_2024 = "../data/movilidad_provincias_2024.csv"

# Raw frames exactly as read from disk; they are never modified below.
original_data_2022 = pd.read_csv(path_2022, sep=",")
original_data_2023 = pd.read_csv(path_2023, sep=",")
original_data_2024 = pd.read_csv(path_2024, sep=",")

# Working copies, so the raw frames stay available for comparison.
df_2022 = original_data_2022.copy()
df_2023 = original_data_2023.copy()
df_2024 = original_data_2024.copy()

# Stack the three years into a single frame. ignore_index=True gives the result
# a fresh 0..n-1 index; without it each yearly frame keeps its own labels, so
# the same label appears once per year and label-based lookups become ambiguous.
df = pd.concat([df_2022, df_2023, df_2024], ignore_index=True)

In [None]:
# Quick inspection of the 2022 frame, kept disabled inside a string literal
# (the calls below are text, not executed code).
""" df_2022.info()
df_2022.head() """

## 2. Data Preparation

### 2.1 Dataset Exploratory Data Analysis (EDA)

In [None]:
# TODO: EDA (move it here from the EDA notebook)

### 2.2 Data Wrangling
This whole process is known as **Data Wrangling**. In particular, data wrangling involves:
- Defining and applying a strategy for nulls and an encoding for categorical variables
- Analyzing the distribution of the variables and the correlation between them
- Removing outliers
- etc.

In [None]:
# Data cleaning

In [None]:
# DATA FILTERING
# Only rows whose destination province is one of the island provinces are kept.
insular_provinces = ['Balears, Illes', 'Palmas, Las', 'Santa Cruz de Tenerife']
is_insular_destination = df['provincia_destino_name'].isin(insular_provinces)
df = df[is_insular_destination]
df

## 3. Feature extraction

In [None]:
# Derive four calendar columns from the raw 'day' column: the parsed timestamp
# ('date') plus the weekday name, month number and year taken from it. The
# source 'day' column is dropped once it has been parsed.
# NOTE(review): assumes 'day' holds values pd.to_datetime parses as calendar
# dates - confirm against the CSV format.
features_df = (
    df.copy()
    .assign(date=lambda frame: pd.to_datetime(frame['day']))
    .assign(
        day_of_week=lambda frame: frame['date'].dt.day_name(),
        month=lambda frame: frame['date'].dt.month,
        year=lambda frame: frame['date'].dt.year,
    )
    .drop(columns=['day'])
)

features_df

In [None]:
# Add up 'viajes' over every row that shares the same date and destination
# province; weekday, month and year are derived from the date, so they are
# carried along as grouping keys rather than splitting groups further.
arrival_keys = ['date', 'provincia_destino_name', 'day_of_week', 'month', 'year']
trips_per_key = features_df.groupby(arrival_keys)['viajes'].sum()
total_llegadas_islas = trips_per_key.reset_index()
total_llegadas_islas

In [None]:
# Export the aggregated arrivals to csv for the web app (row index left out).
model_data_path = 'web/model_data.csv'
total_llegadas_islas.to_csv(model_data_path, index=False)

## 4. Model

In [None]:
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split

# Modelling frame: the aggregated arrivals without the 'date' and 'year'
# columns. drop() returns a new frame, so total_llegadas_islas is untouched.
df1 = total_llegadas_islas.drop(columns=['date', 'year'])

In [None]:
# Label encoding: each categorical column gets its own fitted encoder, kept
# under its own name so the same mapping can be applied to new samples later.
le_day_of_week = LabelEncoder().fit(df1['day_of_week'])
le_provincia_destino_name = LabelEncoder().fit(df1['provincia_destino_name'])

# Replace the text categories with their integer codes.
df1['day_of_week'] = le_day_of_week.transform(df1['day_of_week'])
df1['provincia_destino_name'] = le_provincia_destino_name.transform(df1['provincia_destino_name'])

# Correlation matrix (disabled: the plotting code is parked in a string literal)
""" plt.figure(figsize=(12, 10))
sns.heatmap(df1.corr(), annot=True, cmap='coolwarm')
plt.show() """

In [None]:
# Separate the target column ('viajes') from the predictor columns.
target = df1['viajes']
features = df1.drop(columns=['viajes'])

X, y = features, target

# Hold out 25% of the rows for testing; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.25, random_state=42
)
tuple(len(part) for part in (X_train, X_test, y_train, y_test))

### 4.1 Model Selection

In [None]:
# TODO: Train with best model
import xgboost as xgb
from xgboost import XGBRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import GradientBoostingRegressor

In [None]:
# XGBOOST
# NOTE: this rebinds `xgb` (bound to the xgboost module by the import cell) to
# the fitted regressor; the prediction cell further down relies on `xgb` being
# the model. fit() returns the estimator itself, so it can be chained.
xgb = XGBRegressor().fit(X_train, y_train)
y_pred_xgb = xgb.predict(X_test)

In [None]:
# RANDOM FOREST REGRESSOR
# The seed is fixed (same value as the train/test split) because the forest
# draws random bootstrap samples: without it, every run of this cell produces
# a different model and different evaluation metrics.
rfr = RandomForestRegressor(random_state=42)
rfr.fit(X_train, y_train)
y_pred_rfr = rfr.predict(X_test)

In [None]:
# GRADIENT BOOSTING REGRESSOR
# The seed is fixed (same value as the train/test split) so that repeated runs
# of this cell build the same ensemble and report the same metrics.
gbr = GradientBoostingRegressor(random_state=42)
gbr.fit(X_train, y_train)
y_pred_gbr = gbr.predict(X_test)

In [None]:
# Prediction on a hand-made ("fake") sample: a Friday in January with
# 'Balears, Illes' as the destination province.
fake_data = pd.DataFrame({'provincia_destino_name': ['Balears, Illes'], 'day_of_week': ['Friday'], 'month': [1]})

# Reuse the encoders fitted on the training data so each category maps to the
# same integer code the models saw during training.
fake_data['day_of_week'] = le_day_of_week.transform(fake_data['day_of_week'])
fake_data['provincia_destino_name'] = le_provincia_destino_name.transform(fake_data['provincia_destino_name'])

# Put the columns in exactly the order used for training instead of relying on
# the hand-typed dict order above; a missing column fails here with a KeyError.
fake_data = fake_data[list(X_train.columns)]

# A bare `fake_data` expression in the middle of a cell displays nothing, so
# the encoded sample is printed explicitly.
print(fake_data)

y_pred_xgb_fake = xgb.predict(fake_data)
y_pred_rfr_fake = rfr.predict(fake_data)
y_pred_gbr_fake = gbr.predict(fake_data)

print(f"XGBoost: {y_pred_xgb_fake}")
print(f"Random Forest Regressor: {y_pred_rfr_fake}")
print(f"Gradient Boosting Regressor: {y_pred_gbr_fake}")

### 4.2 Tuning

In [None]:
# TODO: Tuning

## 5. Evaluation and Conclusions

In [None]:
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import r2_score

# Evaluation metrics: one (MSE, MAE, R2) triple per model, each computed from
# that model's predictions on the held-out test split.
regression_metrics = (mean_squared_error, mean_absolute_error, r2_score)

mse_rf, mae_rf, r2_rf = (score(y_test, y_pred_rfr) for score in regression_metrics)
mse_gbr, mae_gbr, r2_gbr = (score(y_test, y_pred_gbr) for score in regression_metrics)
mse_xgb, mae_xgb, r2_xgb = (score(y_test, y_pred_xgb) for score in regression_metrics)

# Side-by-side report, framed by a separator line above and below.
separator = "----------------------"
print(separator)
print(f"Random Forest Regressor: MSE = {mse_rf}, MAE = {mae_rf}, R2 = {r2_rf}")
print(f"Gradient Boosting Regressor: MSE = {mse_gbr}, MAE = {mae_gbr}, R2 = {r2_gbr}")
print(f"XGBoost: MSE = {mse_xgb}, MAE = {mae_xgb}, R2 = {r2_xgb}")
print(separator)

In [None]:
# Feature importances


#print(feature_importances)

In [None]:
# EXPLAINABILITY OF THE MODEL (ELI5, LIME, SHAP) ...

In [None]:
# For classifier: Precision, recall, f1-score, accuracy, confusion matrix, DENSITY CHARTS, ...
# For regressor: R2, RMSE, MAE, ...


In [None]:
# Confusion matrices
# Disabled: the plotting code below is parked in a string literal and never runs.
# NOTE(review): it references y_pred_lr and y_pred_rf, which are not defined in
# the visible cells, and the models trained above are regressors, while the
# notes above list confusion matrices under classifier metrics - confirm
# whether this cell should be removed.
""" from sklearn.metrics import confusion_matrix

confusion_matrix_lr = confusion_matrix(y_test, y_pred_lr)
confusion_matrix_gbr = confusion_matrix(y_test, y_pred_gbr)
confusion_matrix_rf = confusion_matrix(y_test, y_pred_rf)
confusion_matrix_xgb = confusion_matrix(y_test, y_pred_xgb)

fig, ax = plt.subplots(1, 4, figsize=(20, 5))
sns.heatmap(confusion_matrix_lr, annot=True, cmap='coolwarm', ax=ax[0])
sns.heatmap(confusion_matrix_gbr, annot=True, cmap='coolwarm', ax=ax[1])
sns.heatmap(confusion_matrix_rf, annot=True, cmap='coolwarm', ax=ax[2])
sns.heatmap(confusion_matrix_xgb, annot=True, cmap='coolwarm', ax=ax[3])
ax[0].set_title("Linear Regression")
ax[1].set_title("Gradient Boosting Regressor")
ax[2].set_title("Random Forest")
ax[3].set_title("XGBoost")
plt.show() """