## S4

Data source: Spain's National Statistics Institute (INE), tourism activity statistics.
https://www.ine.es/dynt3/inebase/index.htm?padre=239

Business Question:  
Adjust business strategy (pricing, promotions, services) based on tourism trends from INE (Spain’s National Statistics Institute) to better match traveler demand.  

• Is it necessary to adjust our offerings to the traveler profile and the demand for overnight stays in the cities where we are present, considering the official figures on origin, months of visit, and average overnight stays per autonomous city?

https://www.ine.es/jaxiT3/Tabla.htm?t=2941&L=0

In [5]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Relative location of the INE hotel-occupancy CSV export loaded into `df`
file_path = r"../Data/INE_Encuesta_de_ocupación_hotelera_2074.csv"

# Parse the export: fields are separated by ";" and "utf-8-sig" tolerates a leading BOM
df = pd.read_csv(
    filepath_or_buffer=file_path,
    sep=";",
    encoding="utf-8-sig",
)

# Print the first five rows as a quick sanity check of the parse
print(df.head(5))

FileNotFoundError: [Errno 2] No such file or directory: '../Data/INE_Encuesta_de_ocupación_hotelera_2074.csv'

In [None]:
# Summarise the loaded frame: column names, non-null counts and dtypes
df.info()

In [None]:
# Render the loaded DataFrame as the cell output for visual inspection
df

In [None]:
# Frequency table of the values in every column of `df`.
# NOTE: no period filter is applied to `df` in this cell, so the counts cover
# every period present in the loaded frame (the header text says so explicitly).
print("Distinct value counts for each column (all periods):\n")
for col in df.columns:
    print(f"Column: {col}")
    # dropna=False keeps missing values as their own entry in the counts
    print(df[col].value_counts(dropna=False), "\n")

In [None]:
# Number of rows recorded for each (autonomous community, province) pair
ccaa_prov_keys = ['Comunidades y Ciudades Autónomas', 'Provincias']
rows_per_pair = df.groupby(ccaa_prov_keys).size()
group_ccaa_prov = rows_per_pair.reset_index(name='Count')

# Preview the first five pairs
print(group_ccaa_prov.head(5))

# Persist the lookup table next to the notebook (BOM-prefixed UTF-8, no index column)
group_ccaa_prov.to_csv(
    'group_ccaa_prov.csv',
    index=False,
    encoding='utf-8-sig',
)

In [None]:
# Convert 'Total' from Spanish-formatted text (e.g. "1.234.567") to float.
# The conversion is skipped when the column is already numeric, so re-running this
# cell cannot corrupt the data: a float such as 1234.0 would otherwise be rendered
# as "1234.0", lose its dot and come back as 12340.0.
if not pd.api.types.is_numeric_dtype(df['Total']):
    df['Total'] = (
        df['Total']
        .astype(str)
        .str.strip()
        .str.replace('.', '', regex=False)   # "." is treated as the thousands separator
        .str.replace(',', '.', regex=False)  # "," is treated as the decimal mark
        .replace('', np.nan)                 # empty / dot-only cells become missing values
        .astype(float)
    )

# Keep only rows that carry a second-level residence value (per the original note,
# the rows without one are the totals across traveler origins) and expose that
# column as 'Traveler Origin'. Guarded so a re-run, where the column has already
# been renamed, is a no-op instead of a KeyError.
if 'Residencia: Nivel 2' in df.columns:
    df = (
        df
        .dropna(subset=['Residencia: Nivel 2'])
        .rename(columns={'Residencia: Nivel 2': 'Traveler Origin'})
    )

# 'Residencia: Nivel 1' is removed (the original note says it holds the same value
# on every row); errors='ignore' keeps a re-run from raising once it is gone.
df = df.drop(columns=['Residencia: Nivel 1'], errors='ignore')



In [None]:
# Province labels (numeric code + name) to keep, matched verbatim against 'Provincias'
target_provinces = [
    "29 Málaga",
    "41 Sevilla",
    "07 Balears, Illes",
    "08 Barcelona",
    "17 Girona",
    "46 Valencia/València",
    "28 Madrid"
]

# Normalise the province labels to stripped strings so the exact match below is reliable
df['Provincias'] = df['Provincias'].astype(str).str.strip()

# Row selectors: province is one of the targets, and the period's year is 2022 or later
# (the year is read from the first four characters of 'Periodo')
in_target_province = df['Provincias'].isin(target_provinces)
period_year = df['Periodo'].str[:4].astype(int)
from_2022_onwards = period_year >= 2022

# Independent copy of the rows satisfying both conditions
df_filtered = df.loc[in_target_province & from_2022_onwards].copy()


In [None]:
# Render the province- and year-filtered frame as the cell output
df_filtered

In [None]:
# Each row is either a traveler count or an overnight-stay count; split on that marker
is_traveler_row = df_filtered['Viajeros y pernoctaciones'] == 'Viajero'
is_stay_row = df_filtered['Viajeros y pernoctaciones'] == 'Pernoctaciones'

# Independent copies, with 'Total' renamed to say what it measures in each frame
df_travelers = (
    df_filtered[is_traveler_row]
    .copy()
    .rename(columns={'Total': 'Total Travelers'})
)
df_stays = (
    df_filtered[is_stay_row]
    .copy()
    .rename(columns={'Total': 'Total Overnight Stays'})
)

# Derive numeric Year / Month from 'Periodo': first four characters are the year,
# everything from index 5 onward is the month (presumably a 'YYYYMmm' layout)
for frame in (df_travelers, df_stays):
    period = frame['Periodo']
    frame['Year'] = period.str[:4].astype(int)
    frame['Month'] = period.str[5:].astype(int)


In [None]:
# Summarise the travelers frame: column names, non-null counts and dtypes
df_travelers.info()

In [None]:
# Summarise the overnight-stays frame: column names, non-null counts and dtypes
df_stays.info()

In [None]:

# Travelers summed per province, year and month (all traveler origins combined)
trend_keys = ['Provincias', 'Year', 'Month']
travelers_by_month = df_travelers.groupby(trend_keys)['Total Travelers'].sum()
monthly_trend = travelers_by_month.reset_index()
print(monthly_trend.head(5))


In [None]:
# Pair each travelers row with the overnight-stays row for the same region,
# province, traveler origin, year and month (default inner join: rows without a
# counterpart in the other frame are dropped).
merge_keys = [
    'Comunidades y Ciudades Autónomas',
    'Provincias',
    'Traveler Origin',
    'Year',
    'Month',
]
df_merged = pd.merge(df_travelers, df_stays, on=merge_keys)

# Average stay = overnight stays per traveler. A zero traveler count is treated as
# missing so the ratio becomes NaN rather than +inf, which would otherwise turn
# any later mean over this column into inf.
travelers_nonzero = df_merged['Total Travelers'].replace(0, np.nan)
df_merged['Avg Stay Duration'] = df_merged['Total Overnight Stays'] / travelers_nonzero

# Final analysis frame: keys plus the three measures, as an independent copy
df_final = df_merged.loc[:, [
    'Comunidades y Ciudades Autónomas',
    'Provincias',
    'Traveler Origin',
    'Year',
    'Month',
    'Total Travelers',
    'Total Overnight Stays',
    'Avg Stay Duration'
]].copy()

# Replace month numbers with three-letter labels for readability in plots.
# The result is a column of plain strings (not a pandas Categorical).
month_map = {
    1: 'Jan', 2: 'Feb', 3: 'Mar', 4: 'Apr', 5: 'May', 6: 'Jun',
    7: 'Jul', 8: 'Aug', 9: 'Sep', 10: 'Oct', 11: 'Nov', 12: 'Dec'
}
df_final['Month'] = df_final['Month'].map(month_map)

In [None]:
# Travelers summed per origin category, kept in raw head counts
origin_dist = df_final.groupby('Traveler Origin')['Total Travelers'].sum().reset_index()

# The y-axis is labelled in millions, so plot a rescaled copy; `origin_dist`
# itself stays in raw units for any later use.
origin_dist_millions = origin_dist.assign(
    **{'Total Travelers': origin_dist['Total Travelers'] / 1e6}
)

plt.figure(figsize=(8, 5))
sns.barplot(data=origin_dist_millions, x='Traveler Origin', y='Total Travelers', palette='viridis')
plt.title("Traveler Distribution by Origin (Domestic vs. International)")
plt.ylabel("Total Travelers (Millions)")
plt.xlabel("Origin")
plt.show()

In [None]:
# Per-region totals of both measures, in raw counts
regional_totals = (
    df_final
    .groupby('Comunidades y Ciudades Autónomas')[['Total Overnight Stays', 'Total Travelers']]
    .sum()
)
regional_stays = regional_totals['Total Overnight Stays']

# Regional average stay = total overnight stays / total travelers. Averaging the
# per-row ratios instead would give a low-volume month or origin the same weight
# as a high-volume one. Zero traveler totals become NaN rather than inf.
regional_avg_stay = (
    regional_stays / regional_totals['Total Travelers'].replace(0, np.nan)
).rename('Avg Stay Duration')

fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(15, 6))

# Left panel: the axis is labelled in millions, so rescale the plotted values only
sns.barplot(y=regional_stays.index, x=regional_stays.values / 1e6, ax=ax1, palette='mako')
ax1.set_title("Regions by Total Overnight Stays")
ax1.set_xlabel("Total Overnight Stays (Millions)")

# Right panel: traveler-weighted average nights per stay
sns.barplot(y=regional_avg_stay.index, x=regional_avg_stay.values, ax=ax2, palette='rocket')
ax2.set_title("Regions by Average Stay Duration (Nights)")
ax2.set_xlabel("Avg Stay Duration (Nights)")
plt.tight_layout()
plt.show()

In [None]:
# Sum both measures per (region, traveler origin) cell first ...
cross_totals = df_final.pivot_table(
    index='Comunidades y Ciudades Autónomas',
    columns='Traveler Origin',
    values=['Total Overnight Stays', 'Total Travelers'],
    aggfunc='sum'
)

# ... then divide, so each cell is total stays / total travelers. Taking the mean
# of the per-row ratios would weight low- and high-volume months equally.
# Zero traveler totals become NaN rather than inf.
cross_analysis = (
    cross_totals['Total Overnight Stays']
    / cross_totals['Total Travelers'].replace(0, np.nan)
)

plt.figure(figsize=(12, 6))
sns.heatmap(cross_analysis, cmap='YlOrRd', annot=True, fmt=".1f")
plt.title("Average Stay Duration (Nights) by Region & Traveler Origin")
plt.show()