#### Define the relevant weather variables

In [1]:
# Maps a descriptive column name (key) to the short element code (value).
# The code selects the raw weather file "<code>_STAID002763.txt" and names
# the value column ("<code>") and quality column ("Q_<code>") inside it.
weather_vaiables = dict(
    precipitation="RR",
    temperature_mean="TG",
    temperature_max="TX",
    temperature_min="TN",
    sunshine_duration="SS",
    cloud_cover="CC",
    wind_speed="FG",
    humidity="HU",
    pressure="PP",
    snow_depth="SD",
)

In [2]:
import pandas as pd

# Load the sales dataframe created in the previous load_salesdata notebook.
# `parse_dates=True` was removed from read_csv: it only affects an index
# column, and this file is read without one, so it parsed nothing.
sales = pd.read_csv("processed_data/sales_suedhang_2021_2022.csv")
# Convert the "date" column explicitly so it can be used as the merge key
# against the datetime "date" column of the weather data.
sales["date"] = pd.to_datetime(sales["date"])
sales.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 695 entries, 0 to 694
Data columns (total 17 columns):
 #   Column                    Non-Null Count  Dtype         
---  ------                    --------------  -----         
 0   date                      695 non-null    datetime64[ns]
 1   day_of_week               695 non-null    object        
 2   month                     695 non-null    object        
 3   year                      695 non-null    int64         
 4   week_of_year              695 non-null    int64         
 5   Getränke_sales            695 non-null    float64       
 6   Getränke_count            695 non-null    float64       
 7   Speisen_sales             695 non-null    float64       
 8   Speisen_count             695 non-null    int64         
 9   Sonstiges_sales           695 non-null    float64       
 10  Sonstiges_count           695 non-null    int64         
 11  Milchmischgetränke_sales  695 non-null    float64       
 12  Milchmischgetränke_cou

In [3]:
# Work on a copy so the `sales` frame itself is not modified.
sales_and_weather = sales.copy()

# Load each weather variable, clean it, and left-join it onto the sales data by date.
for variable, code in weather_vaiables.items():
    # Read the raw file for this variable from the raw_weatherdata folder.
    # A forward slash is used as separator: the former "\{" was an invalid
    # escape sequence and only resolved to a valid path on Windows.
    # `parse_dates=True` was dropped because it has no effect without an
    # index column; the date is converted explicitly below.
    weatherdata_variable = pd.read_csv(
        f"raw_weatherdata/{code}_STAID002763.txt", sep=",", skiprows=19
    )

    # Header names are stripped of surrounding whitespace before being used.
    weatherdata_variable.columns = weatherdata_variable.columns.str.strip()

    # Drop the unneeded SOUID column and rename DATE to match the sales frame.
    weatherdata_variable = weatherdata_variable.drop(columns=["SOUID"]).rename(
        columns={"DATE": "date"}
    )
    weatherdata_variable["date"] = pd.to_datetime(
        weatherdata_variable["date"], format="%Y%m%d"
    )

    # Keep rows from 2020-01-01 onwards whose quality column equals 0, drop
    # rows with missing values, then reduce to the date and value columns.
    # Non-mutating calls are used so no filtered frame is modified in place.
    is_recent = weatherdata_variable["date"] >= "2020-01-01"
    is_quality_zero = weatherdata_variable[f"Q_{code}"] == 0
    weatherdata_variable = weatherdata_variable[is_recent & is_quality_zero].dropna()
    weatherdata_variable = weatherdata_variable[["date", code]]

    # Left join keeps every sales day; days without a matching weather row
    # get NaN. The value column is then renamed from its code to its
    # descriptive name.
    sales_and_weather = sales_and_weather.merge(
        weatherdata_variable, on="date", how="left"
    ).rename(columns={code: variable})

# Persist the combined frame (still containing the NaN rows) for later use.
sales_and_weather.to_csv("processed_data/sales_and_weather.csv", index=False)

In [4]:
# Print column names, non-null counts and dtypes of the merged frame; the
# non-null counts show where the left join found no weather value for a day.
sales_and_weather.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 695 entries, 0 to 694
Data columns (total 27 columns):
 #   Column                    Non-Null Count  Dtype         
---  ------                    --------------  -----         
 0   date                      695 non-null    datetime64[ns]
 1   day_of_week               695 non-null    object        
 2   month                     695 non-null    object        
 3   year                      695 non-null    int64         
 4   week_of_year              695 non-null    int64         
 5   Getränke_sales            695 non-null    float64       
 6   Getränke_count            695 non-null    float64       
 7   Speisen_sales             695 non-null    float64       
 8   Speisen_count             695 non-null    int64         
 9   Sonstiges_sales           695 non-null    float64       
 10  Sonstiges_count           695 non-null    int64         
 11  Milchmischgetränke_sales  695 non-null    float64       
 12  Milchmischgetränke_cou

In [6]:
# Keep only the rows in which every column holds a value, i.e. discard each
# row that has at least one missing entry, then summarise the result.
has_all_values = sales_and_weather.notna().all(axis=1)
sales_and_weather = sales_and_weather[has_all_values]
sales_and_weather.info()


<class 'pandas.core.frame.DataFrame'>
Int64Index: 334 entries, 0 to 458
Data columns (total 37 columns):
 #   Column                    Non-Null Count  Dtype         
---  ------                    --------------  -----         
 0   date                      334 non-null    datetime64[ns]
 1   day_of_week               334 non-null    object        
 2   month                     334 non-null    object        
 3   year                      334 non-null    int64         
 4   week_of_year              334 non-null    int64         
 5   Getränke_sales            334 non-null    float64       
 6   Getränke_count            334 non-null    int64         
 7   Speisen_sales             334 non-null    float64       
 8   Speisen_count             334 non-null    int64         
 9   Sonstiges_sales           334 non-null    float64       
 10  Sonstiges_count           334 non-null    int64         
 11  Milchmischgetränke_sales  334 non-null    int64         
 12  Milchmischgetränke_cou