In [None]:
from datetime import date, datetime
from io import StringIO

import pandas as pd
import requests
from astral import LocationInfo
from astral.sun import daylight
from bs4 import BeautifulSoup

In [None]:
# Hourly observations for Paris-Montsouris: the site serves one page per day.
HOURLY_URL = 'https://prevision-meteo.ch/climat/horaire/paris-montsouris/{day:%Y-%m-%d}'
FIRST_HOURLY_DAY = '2020-09-01'
LAST_HOURLY_DAY = '2021-10-21'
# Days that are deliberately not requested; a later cell back-fills them
# from the rows of 2020-11-08.
SKIPPED_DAYS = {date(2020, 11, 9), date(2020, 11, 10), date(2020, 11, 11)}


def fetch_hourly_table(day):
    """Download and parse the hourly observation table for one calendar day.

    Parameters
    ----------
    day : datetime.date
        Day whose page is requested.

    Returns
    -------
    pandas.DataFrame
        First table found in the page's ``table-responsive`` divs, with the
        top header level removed and the ``'Heure UTC1'`` column replaced by
        a ``date_datetime`` column (``day`` combined with the hour read from
        the first two characters of ``'Heure UTC1'``).

    Raises
    ------
    requests.HTTPError
        If the server answers with an error status.
    requests.Timeout
        If the server does not answer within 30 seconds.
    """
    response = requests.get(HOURLY_URL.format(day=day), timeout=30)
    response.raise_for_status()

    soup = BeautifulSoup(response.text, 'html.parser')
    tables = soup.find_all('div', {'class': 'table-responsive'})

    # read_html expects a path, URL or file-like object; passing literal HTML
    # as a plain string is deprecated, hence the StringIO wrapper.
    df = pd.read_html(StringIO(str(tables)))[0]
    df = df.droplevel(level=0, axis=1)

    df['date_datetime'] = df['Heure UTC1'].map(
        lambda x: datetime(day.year, day.month, day.day, int(x[:2]))
    )
    return df.drop(columns=['Heure UTC1'])


# Collect one frame per day and concatenate once at the end: calling
# pd.concat inside the loop re-copies every previously scraped row each time.
hourly_frames = [fetch_hourly_table(pd.Timestamp(FIRST_HOURLY_DAY).date())]

# The first day is fetched above without logging; the loop covers the rest.
for timestamp in pd.date_range(FIRST_HOURLY_DAY, LAST_HOURLY_DAY, freq='D')[1:]:
    current_day = timestamp.date()
    label = f'{current_day.day}-{current_day.month}-{current_day.year}'
    print(f'Getting data for {label}')

    if current_day in SKIPPED_DAYS:
        print(f'Skipped data for {label}')
        continue

    hourly_frames.append(fetch_hourly_table(current_day))
    print(f'Successfully added data for {label}')

# Indexes are intentionally not reset: each day keeps its own row numbers,
# exactly as when the frames were appended one by one.
df_all_days = pd.concat(hourly_frames)

In [None]:
# Persist the scraped hourly table, then read it back with `date_datetime`
# parsed as timestamps. The frame's index is written too, so the reloaded
# frame carries it as an extra 'Unnamed: 0' column.
hourly_csv_path = 'meteo_scraped_2.csv'
df_all_days.to_csv(hourly_csv_path)
df_reloaded = pd.read_csv(
    hourly_csv_path,
    parse_dates=['date_datetime'],
)

In [None]:
# Show the reloaded hourly table as this cell's output.
df_reloaded

In [None]:
# Daily summaries for Paris-Montsouris: the site serves one page per month.
DAILY_URL = 'https://prevision-meteo.ch/climat/journalier/paris-montsouris/{year}-{month:02d}'
# September 2020 through October 2021, in chronological order.
YEAR_MONTHS = [(2020, m) for m in range(9, 13)] + [(2021, m) for m in range(1, 11)]


def fetch_daily_table(year, month):
    """Download and parse the daily summary table for one month.

    Parameters
    ----------
    year, month : int
        Calendar month whose page is requested.

    Returns
    -------
    pandas.DataFrame
        First table found in the page's ``table-responsive`` divs, with the
        top header level removed, the ``'Total'`` row dropped and a
        ``date_datetime`` column of ``datetime.date`` built from the last
        two characters of ``Date``.

    Raises
    ------
    requests.HTTPError
        If the server answers with an error status.
    requests.Timeout
        If the server does not answer within 30 seconds.
    """
    response = requests.get(DAILY_URL.format(year=year, month=month), timeout=30)
    response.raise_for_status()

    soup = BeautifulSoup(response.text, 'html.parser')
    tables = soup.find_all('div', {'class': 'table-responsive'})

    # read_html expects a path, URL or file-like object; passing literal HTML
    # as a plain string is deprecated, hence the StringIO wrapper.
    df = pd.read_html(StringIO(str(tables)))[0]
    df = df.droplevel(level=0, axis=1)

    # .copy() so the column added below is written to an independent frame
    # rather than to a filtered slice (avoids SettingWithCopyWarning).
    df = df[df.Date != 'Total'].copy()
    df['date_datetime'] = df.Date.apply(lambda x: date(year, month, int(x[-2:])))
    return df


# Collect one frame per month and concatenate once at the end instead of
# growing the result inside the loop.
monthly_frames = []
for year, month in YEAR_MONTHS:
    print(f'Getting data for {month}-{year}')
    monthly_frames.append(fetch_daily_table(year, month))
    print (f'Successfully added data for {month}-{year}')

# Indexes are intentionally not reset here; a later cell does it.
df_all_months = pd.concat(monthly_frames)

        

In [None]:
# Show the concatenated daily table as this cell's output.
df_all_months

In [None]:
# Keep only the wind / sunshine / rain columns of the daily table.
df_all_months = df_all_months.drop(columns=['Date', 'Min1', 'Max.2', 'Moy.', 'Moy.3', 'Min'])

# Positional rename: relies on exactly five columns remaining, in this order.
df_all_months.columns = ['vent', 'soleil', 'pluie', 'to_drop', 'date_datetime']

# Every month restarted its own row numbering; replace it with one clean
# RangeIndex and discard the placeholder column.
df_all_months = df_all_months.reset_index(drop=True).drop(columns=['to_drop'])

# '--' marks a missing value on the site. Assign the result back instead of
# calling replace(inplace=True) on a column selection: that chained in-place
# form is a silent no-op when pandas Copy-on-Write is active.
df_all_months['soleil'] = df_all_months['soleil'].replace({'--' : '0h 0min'})
df_all_months['pluie'] = df_all_months['pluie'].replace({'--' : 0})

In [None]:
def minutes_soleil(x):
    """Convert a sunshine duration such as ``'5h 30min'`` into minutes.

    The string is split on whitespace; the first token is read as hours
    (its final character is discarded) and the second as minutes (its final
    three characters are discarded). Any further tokens are ignored.
    """
    tokens = x.split()
    hour_token, minute_token = tokens[0], tokens[1]
    return 60 * int(hour_token[:-1]) + int(minute_token[:-3])

In [None]:
# Daily sunshine duration expressed in minutes.
df_all_months['temps_soleil'] = df_all_months['soleil'].map(minutes_soleil)

In [None]:
# Start the cleaned hourly table from the reloaded CSV, without the saved
# index column and the measurements that are not used afterwards.
df_clean = df_reloaded.drop(columns = ['Unnamed: 0', 'Dir.2', 'Moy.', 'Pres.4 [hPa]', 'Nébu. [octa]'])

# Only the part of the precipitation cell before the first '/' is kept.
pluie_direct = df_clean['Préc.5 [mm]'].apply(lambda x: x.split("/")[0])

# '--' is treated as no rain and 'trace' as a nominal 0.05. The replacement is
# done on a standalone Series and assigned once: replace(inplace=True) on a
# column selection is a silent no-op when pandas Copy-on-Write is active.
pluie_direct = pluie_direct.replace({'--' : 0, 'trace' : 0.05})
df_clean['pluie_direct'] = pd.to_numeric(pluie_direct)

In [None]:
# Sum of `pluie_direct` over each row and the two rows that follow it
# (fewer near the end of the table, since min_periods=1).
# NOTE(review): the window runs forward in row order, while the column name
# says "last 3" -- this matches only if rows are in reverse chronological
# order; confirm against the scraped table.
indexer = pd.api.indexers.FixedForwardWindowIndexer(window_size=3)
forward_rain = df_clean['pluie_direct'].rolling(indexer, min_periods=1)
df_clean['pluie_last_3'] = forward_rain.sum()

In [None]:
# Boolean flag columns derived from the free-text 'Cond.' column: a flag is
# True when every listed keyword occurs in the text (plain substring test).
condition_keywords = {
    'pluie_intermittente': ('pluie', 'intermittente'),
    'pluie_continue': ('pluie', 'continue'),
    'pluie_forte': ('pluie', 'forte'),
    'pluie_faible': ('pluie', 'faible'),
    'pluie_modérée': ('pluie', 'modérée'),
    'neige': ('neige',),
    'bruine': ('bruine',),
    'brouillard': ('brouillard',),
    'verglas': ('verglaçante',),
}

for flag_name, keywords in condition_keywords.items():
    # `words=keywords` binds the current keywords to this lambda.
    df_clean[flag_name] = df_clean['Cond.'].apply(
        lambda text, words=keywords: all(word in text for word in words)
    )


In [None]:
# The raw text columns have been turned into numeric / boolean features above.
df_clean = df_clean.drop(columns=['Cond.', 'Préc.5 [mm]'])

In [None]:
# Calendar day (datetime.date) of each hourly timestamp.
df_clean['date'] = df_clean['date_datetime'].dt.date

In [None]:
# 3 and a half days were missing, so we simply replaced the missing weather
# data for these days by the weather data of the previous day (November 8).
# NOTE(review): only `date_datetime` is shifted; the `date` column of the
# filled rows still reads 2020-11-08 -- confirm this is intended.

november_8 = df_clean[df_clean.date == date(2020, 11, 8)]

# One copy of November 8 per missing day, moved forward by 1 to 4 days.
shifted_copies = []
for day_offset in (1, 2, 3, 4):
    filler = november_8.copy().reset_index()
    filler['date_datetime'] = filler['date_datetime'] + pd.Timedelta(days=day_offset)
    shifted_copies.append(filler)

november_9, november_10, november_11, november_12 = shifted_copies
# November 12 is only partly filled: keep the last 10 rows of the copy.
november_12 = november_12[-10:]

df_clean_full = pd.concat([df_clean, november_9, november_10, november_11, november_12])

# The filler frames already carry an 'index' column, so this reset stores the
# old index under 'level_0'; both helper columns are dropped further down.
df_clean_full.reset_index(inplace=True)

In [None]:
def is_daylight(x):
    """Tell whether a timestamp falls strictly between sunrise and sunset in Paris.

    Parameters
    ----------
    x : pandas.Timestamp
        Timezone-naive timestamp, interpreted as Europe/Paris wall-clock time.
        NOTE(review): the scraped hour column is named 'Heure UTC1', which
        suggests the values may be UTC rather than local time -- confirm.

    Returns
    -------
    bool
        True when ``sunrise < x < sunset`` on the calendar day of ``x``.
    """
    # LocationInfo falls back to Greenwich coordinates when latitude and
    # longitude are omitted, so the coordinates of Paris are passed explicitly.
    city = LocationInfo(
        'Paris',
        'France',
        'Europe/Paris',
        latitude=48.8566,
        longitude=2.3522,
    )
    sunrise, sunset = daylight(city.observer, date=x.to_pydatetime().date(), tzinfo='Europe/Paris')

    # Ambiguous autumn hours are taken as DST; spring-gap hours are shifted
    # forward to the next valid local time.
    local_time = x.tz_localize('Europe/Paris', ambiguous=True, nonexistent='shift_forward')
    return sunrise < local_time < sunset

In [None]:
# Flag every hourly row by whether it falls between sunrise and sunset.
df_clean_full['is_daylight'] = df_clean_full['date_datetime'].apply(is_daylight)

In [None]:
# The text column is superseded by `temps_soleil`; index the daily table by
# day so it can be looked up from the hourly rows.
df_all_months = df_all_months.drop(columns=['soleil']).set_index('date_datetime')

In [None]:
# Copy the daily aggregates onto every hourly row through the `date` column.
# Mapping: hourly column name -> source column in the daily table.
daily_sources = {
    'temps_soleil': 'temps_soleil',
    'pluie_cumul_day': 'pluie',
    'vent_max': 'vent',
}
for hourly_name, daily_name in daily_sources.items():
    # `column=daily_name` binds the current source column to this lambda.
    df_clean_full[hourly_name] = df_clean_full['date'].apply(
        lambda day, column=daily_name: df_all_months.loc[day, column]
    )

# Remove the bookkeeping columns left by the index resets and two
# measurements that are not kept.
df_clean_full.drop(columns=['level_0', 'index', 'Ros.', '2m.'], inplace=True)

In [None]:
# Save the enriched hourly table; the index is written as the first column.
df_clean_full.to_csv('meteo_scraped_v2.csv')

In [None]:
# Load the external dataset with `date` parsed as timestamps, and keep a copy
# indexed by that column.
ext_data = pd.read_csv('external_data_reworked.csv', parse_dates=['date'])
ext_data_index = ext_data.set_index('date')

In [None]:
# Index the hourly table by timestamp in ascending order (merge_asof needs
# sorted keys) and trim float noise from the rolling rain sum.
df_clean_index = df_clean_full.set_index('date_datetime').sort_index()
df_clean_index['pluie_last_3'] = df_clean_index['pluie_last_3'].round(3)

In [None]:
# merge_asof requires both sides to be sorted on the join key.
ext_data_index = ext_data_index.sort_index()

In [None]:
# For every hourly weather row, attach the most recent external-data row at
# or before its timestamp ('backward' is merge_asof's default direction).
merged_data = pd.merge_asof(
    left=df_clean_index,
    right=ext_data_index,
    left_index=True,
    right_index=True,
    direction='backward',
)

In [None]:
# Third column (position 2): treat the '--' placeholder as 0 and store the
# column as float. The column is addressed by label and assigned as a whole:
#  * replace(inplace=True) on an iloc selection is a silent no-op when pandas
#    Copy-on-Write is active;
#  * `df.iloc[:, i] = values` writes into the existing column, so an object
#    column stays object even after astype(float).
third_column = merged_data.columns[2]
merged_data[third_column] = merged_data[third_column].replace({"--" : 0}).astype(float)

In [None]:
# Fourth column (position 3; presumably 'Hum. [%]' -- the mean below is read
# from that label): replace the '--' placeholder with the mean of the valid
# humidity readings and store the column as float.
fourth_column = merged_data.columns[3]
has_reading = merged_data[fourth_column] != "--"
mean_hum = pd.to_numeric(merged_data.loc[has_reading, 'Hum. [%]']).mean()

# Assign the whole column by label: replace(inplace=True) on an iloc selection
# is a silent no-op under pandas Copy-on-Write, and `df.iloc[:, i] = values`
# would keep the original object dtype.
merged_data[fourth_column] = merged_data[fourth_column].replace({"--" : mean_hum}).astype(float)

In [None]:
# Drop the existing `date_datetime` column, turn the timestamp index back
# into a column, then swap the two names: the timestamps become `date` and
# the calendar days become `date_datetime`. A single rename applies both
# mappings at once, so no temporary names are needed.
merged_data = (
    merged_data
    .drop(columns=['date_datetime'])
    .reset_index()
    .rename(columns={'date' : 'date_datetime', 'date_datetime' : 'date'})
)

In [None]:
# Index the final table by the (renamed) timestamp column.
merged_data = merged_data.set_index('date')

In [None]:
# Save the merged table; the `date` index is written as the first column.
merged_data.to_csv('external_data_new.csv')

In [None]:
# Show the final merged table as this cell's output.
merged_data