In [None]:
import requests
from pathlib import Path
import pandas as pd
from datetime import datetime, timedelta
import numpy as np

In [None]:
# Download the Excel workbook from the ISTAT website and save it, unchanged,
# under ../raw_data/.
url = "https://www.istat.it/wp-content/uploads/2024/10/Il_1522_dati_provinciali_settimanali_2013_III_trim_2024.xlsx"
output_path='../raw_data/dati_provinciali_settimanali.xlsx'
# A timeout keeps the notebook from hanging forever on a stalled connection.
response = requests.get(url, verify=True, timeout=60)
# Fail loudly on 4xx/5xx so an HTML error page is never saved as the .xlsx file.
response.raise_for_status()
# Create the destination folder if needed so the cell works on a fresh checkout.
Path(output_path).parent.mkdir(parents=True, exist_ok=True)
Path(output_path).write_bytes(response.content)

In [None]:
# Load sheet 'tav.2' of the downloaded workbook; its first row is used as the
# column header. Preview the first 20 rows to inspect the layout.
df = pd.read_excel(
    '../raw_data/dati_provinciali_settimanali.xlsx',
    sheet_name='tav.2',
    header=0,
)
df.head(20)

In [None]:
# Reshape the sheet into long format: one record per (year, week, provincia).
#
# The loop below treats the sheet as a sequence of blocks, recognised by the
# value in the first column ('Unnamed: 0'):
#   * an all-digit label          -> starts a new year
#   * 'Numero Settimana'          -> the row holding the week numbers
#   * a label containing 'Tavola' -> skipped
#   * an empty label with values  -> calls recorded with provincia 'missing'
#   * any other label             -> a province row
LABEL_COL = 'Unnamed: 0'


def _parse_year(label):
    """Return `label` as an int if it is an all-digit value, otherwise None.

    Floats holding a whole number (e.g. 2013.0, which can appear when the
    cell is read as a float) are accepted as well.
    """
    if pd.isna(label):
        return None
    if isinstance(label, float) and label.is_integer():
        label = int(label)
    text = str(label).strip()
    return int(text) if text.isdigit() else None


def _append_weekly_calls(records, row, year, weeks, provincia):
    """Append one record to `records` for every non-empty weekly cell of `row`.

    Weekly values are read positionally from the columns that follow the
    label column, one column per entry of `weeks`.

    Raises:
        ValueError: if no 'Numero Settimana' row has been seen yet
            (`weeks` is None), since the values cannot be mapped to weeks.
    """
    if weeks is None:
        raise ValueError(
            "Found a data row before any 'Numero Settimana' header row; "
            "cannot map values to week numbers."
        )
    weekly_calls = row.iloc[1:len(weeks) + 1].values
    for week, calls in zip(weeks, weekly_calls):
        if pd.notna(calls):
            records.append({
                'year': year,
                'week': week,
                'provincia': provincia,
                'calls': calls
            })


data = []
current_year = None
weeks = None

for idx, row in df.iterrows():
    label = row[LABEL_COL]

    # Year row: remember the year for the rows that follow.
    year = _parse_year(label)
    if year is not None:
        current_year = year
        print(f"Found year: {current_year}")
        continue

    # Row without a label: keep it only if one of the first four value
    # columns is filled, and record it under the placeholder 'missing'.
    if pd.isna(label):
        if row.iloc[1:5].notna().any():
            _append_weekly_calls(data, row, current_year, weeks, 'missing')
        continue

    # Week-number header row: defines the weeks for the rows that follow.
    if label == 'Numero Settimana':
        weeks = [col for col in row.iloc[1:] if pd.notna(col)]
        continue

    # Rows whose label contains 'Tavola' carry no data.
    if 'Tavola' in str(label):
        continue

    # Regular province data
    provincia = label
    if provincia != '':
        _append_weekly_calls(data, row, current_year, weeks, provincia)

# Create DataFrame
clean_df = pd.DataFrame(data)

In [None]:
# Save the long-format table as CSV, without the index column and with the
# 'UTF-8-sig' encoding.
clean_df.to_csv(
    '../raw_data/calls_raw.csv',
    index=False,
    encoding='UTF-8-sig',
)