In [1]:
import pandas as pd

# Load the sales dataframe created in the previous load_salesdata notebook.
# `parse_dates=True` only asks pandas to parse the *index*, which is a no-op
# for this file (no index_col); name the column explicitly so "date" is read
# as datetime64 in a single step.
sales = pd.read_csv("processed_data/sales_2021-2022.csv", parse_dates=["date"])
sales.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 695 entries, 0 to 694
Data columns (total 17 columns):
 #   Column                    Non-Null Count  Dtype         
---  ------                    --------------  -----         
 0   date                      695 non-null    datetime64[ns]
 1   day_of_week               695 non-null    object        
 2   month                     695 non-null    object        
 3   year                      695 non-null    int64         
 4   week_of_year              695 non-null    int64         
 5   Getränke_sales            695 non-null    float64       
 6   Getränke_count            695 non-null    float64       
 7   Speisen_sales             695 non-null    float64       
 8   Speisen_count             695 non-null    int64         
 9   Sonstiges_sales           695 non-null    float64       
 10  Sonstiges_count           695 non-null    int64         
 11  Milchmischgetränke_sales  695 non-null    float64       
 12  Milchmischgetränke_cou

#### Define the relevant weather variables and load them into the existing sales dataframe

In [4]:
# Readable column name -> element code. The code is used both in the raw file
# name and as the value / quality ("Q_<code>") column names inside each file.
# NOTE(review): file naming and column layout look like ECA&D station series;
# if so, values are stored in scaled units (e.g. 0.1 degC) and are merged
# unscaled here -- confirm before interpreting magnitudes.
weather_variables = {
    "precipitation": "RR",
    "temperature_mean": "TG",
    "temperature_max": "TX",
    "temperature_min": "TN",
    "sunshine_duration": "SS",
    "cloud_cover": "CC",
    "wind_speed": "FG",
    "humidity": "HU",
    "pressure": "PP",
    "snow_depth": "SD",
}

# Station identifier embedded in every raw weather file name.
STATION_ID = "002763"
# Weather observations before this date are discarded prior to merging.
WEATHER_START_DATE = "2020-01-01"

sales_and_weather = sales.copy()

for variable, code in weather_variables.items():
    # Use a forward slash: it resolves on every OS, whereas the former
    # "raw_weatherdata\{code}..." relied on an invalid "\{" escape sequence
    # and only resolved on Windows.
    # The first 19 lines of each file are skipped as header text.
    weather_var = pd.read_csv(
        f"raw_weatherdata/{code}_STAID{STATION_ID}.txt", sep=",", skiprows=19
    )

    # Header names carry padding whitespace; normalise them, drop the source
    # id column and convert the YYYYMMDD date to datetime.
    weather_var.columns = weather_var.columns.str.strip()
    weather_var = weather_var.drop(columns=["SOUID"]).rename(columns={"DATE": "date"})
    weather_var["date"] = pd.to_datetime(weather_var["date"], format="%Y%m%d")

    # Drop rows with missing values, then keep only recent rows whose quality
    # flag is 0, and reduce to the date plus the (renamed) value column.
    # Everything is built as a new frame to avoid in-place edits on a slice
    # (SettingWithCopyWarning).
    weather_var = weather_var.dropna()
    is_usable = (weather_var["date"] >= WEATHER_START_DATE) & (
        weather_var[f"Q_{code}"] == 0
    )
    weather_var = weather_var.loc[is_usable, ["date", code]].rename(
        columns={code: variable}
    )

    # Left merge keeps every sales row; days without a usable observation get
    # NaN. `validate` fails loudly if the weather file has duplicate dates,
    # which would otherwise silently duplicate sales rows.
    sales_and_weather = sales_and_weather.merge(
        weather_var, on="date", how="left", validate="many_to_one"
    )

sales_and_weather.to_csv("processed_data/sales_and_weather_2021-2022.csv", index=False)
sales_and_weather.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 695 entries, 0 to 694
Data columns (total 27 columns):
 #   Column                    Non-Null Count  Dtype         
---  ------                    --------------  -----         
 0   date                      695 non-null    datetime64[ns]
 1   day_of_week               695 non-null    object        
 2   month                     695 non-null    object        
 3   year                      695 non-null    int64         
 4   week_of_year              695 non-null    int64         
 5   Getränke_sales            695 non-null    float64       
 6   Getränke_count            695 non-null    float64       
 7   Speisen_sales             695 non-null    float64       
 8   Speisen_count             695 non-null    int64         
 9   Sonstiges_sales           695 non-null    float64       
 10  Sonstiges_count           695 non-null    int64         
 11  Milchmischgetränke_sales  695 non-null    float64       
 12  Milchmischgetränke_cou