# 04: Load the combined dataframe, and do some plotting 

In [None]:
# import relevant packages
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression
import scipy.stats

# Render all text in every figure with Times New Roman.
plt.rcParams["font.family"] = "Times New Roman"

# Weather columns that must be non-null for a row to be kept.
required_weather_columns = ["temperature_mean", "temperature_min", "temperature_max", "precipitation"]

# Load the combined sales/weather table together with the ONLY_COLD and
# ONLY_WARM variants, discarding rows with missing weather values right away.
df = pd.read_csv(
    "processed_data/sales_and_weather_2021-2022.csv"
).dropna(subset=required_weather_columns)
df_cold = pd.read_csv(
    "processed_data/sales_and_weather_2021-2022_ONLY_COLD.csv"
).dropna(subset=required_weather_columns)
df_warm = pd.read_csv(
    "processed_data/sales_and_weather_2021-2022_ONLY_WARM.csv"
).dropna(subset=required_weather_columns)

# Show the cleaned combined table as the cell output.
df


In [None]:
# Shared styling constants used by all plots in this notebook.

# Default colour for plots that are not split by weekday.
general_color = "cornflowerblue"

# Weekdays (Monday first) mapped to the colour used for that weekday.
weekday_palette = {
    "Monday": "midnightblue",
    "Tuesday": "mediumblue",
    "Wednesday": "royalblue",
    "Thursday": "lightskyblue",
    "Friday": "aquamarine",
    "Saturday": "palegreen",
    "Sunday": "mediumseagreen",
}
days = list(weekday_palette.keys())
day_colors = list(weekday_palette.values())

# Font sizes (in points) for tick labels, titles, axis labels and legends.
axes_fontsize = 16
title_fontsize = 24
label_fontsize = 20
legend_fontsize = 16

# Width of a full-page figure, in inches.
page_width = 16


## 1. Plot the sales and weather data over the whole date range

In [None]:
import matplotlib.dates as mdates

# Convert the date column once. The unit is spelled out because pandas >= 2.0
# rejects the unit-less "datetime64" dtype in astype().
dates = df["date"].astype("datetime64[ns]")

fig, ax = plt.subplots(figsize=(page_width, 3))

# Daily coffee sales on the left axis and the daily maximum temperature on a
# twin right axis, both over the whole date range.
lns1 = ax.plot(dates, df["Coffee_sales"].astype("float"), color=general_color, label="Coffee Sales")
ax2 = ax.twinx()
lns2 = ax2.plot(dates, df["temperature_max"].astype("float"), color="darkorange", label="Max. Temperature")

# Axis ranges and tick positions.
ax.set_ylim(0, 2200)
ax2.set_ylim(-5, 38)
ax.set_yticks(np.arange(0, 2500, 1000))
ax2.set_yticks(np.arange(0, 40, 10))
ax.set_xlim(pd.Timestamp('2021-01-01'), pd.Timestamp('2022-10-20'))

# One labelled tick every three months, formatted like "Jan 2021".
ax.xaxis.set_major_locator(mdates.MonthLocator(interval=3))
ax.xaxis.set_major_formatter(mdates.DateFormatter('%b %Y'))

# Font sizes and axis labels.
ax.tick_params(axis="both", which="major", labelsize=axes_fontsize)
ax2.tick_params(axis="both", which="major", labelsize=axes_fontsize)
ax.set_xlabel("Date", fontsize=label_fontsize)
ax.set_ylabel("Coffee Sales (€)", fontsize=label_fontsize)
ax2.set_ylabel("Max. Temperature (°C)", fontsize=label_fontsize)

# Hide the top border on both overlaid axes.
ax.spines["top"].set_visible(False)
ax2.spines["top"].set_visible(False)

# The two lines live on different axes, so merge them into one legend by hand.
lns = lns1 + lns2
labs = [l.get_label() for l in lns]
ax.legend(lns, labs, loc="upper left", fontsize=legend_fontsize)

plt.tight_layout()
plt.savefig("results/coffee_sales.pdf", dpi=300, bbox_inches="tight")
plt.show()

## 2. Plot general correlations

In [None]:
sales_variable = "Coffee_sales"

weather_variable_1 = "temperature_max"
weather_variable_2 = "sunshine_duration"
weather_variable_3 = "precipitation"

# One entry per panel: (weather column, x-axis label, x-limits, x-tick positions).
panels = [
    (weather_variable_1, "Maximum Temperature (°C)", (-7, 37), np.arange(0, 40, 10)),
    (weather_variable_2, "Sunshine Duration (h)", (-1, 17), np.arange(0, 20, 5)),
    (weather_variable_3, "Precipitation (mm/m²/day)", (-2, 35), np.arange(0, 40, 10)),
]

fig, axes = plt.subplots(1, 3, figsize=(page_width, 4), sharey=True)

# The y-axis is shared between the panels, so it is configured on the first one only.
axes[0].set_ylabel("Coffee Sales (€)", fontsize=label_fontsize)
axes[0].set_ylim(0, 1800)
axes[0].set_yticks(np.arange(0, 2500, 1000))

# Sales as a column vector; the regression target is the same in every panel.
y = df[sales_variable].values.reshape(-1, 1)

for ax, (column, x_label, x_limits, x_ticks) in zip(axes, panels):
    # Scatter of sales against the weather variable.
    ax.scatter(df[column], df[sales_variable], s=20, marker="o", color=general_color)
    ax.set_xlabel(x_label, fontsize=label_fontsize)
    ax.set_xlim(*x_limits)
    ax.set_xticks(x_ticks)

    # Overlay the least-squares regression line.
    X = df[column].values.reshape(-1, 1)
    reg = LinearRegression().fit(X, y)
    ax.plot(X, reg.predict(X), color="black")

    # Common cosmetics: tick label size, no right/top border.
    ax.tick_params(axis="both", which="major", labelsize=axes_fontsize)
    ax.spines["right"].set_visible(False)
    ax.spines["top"].set_visible(False)

fig.suptitle("Correlation between Coffee Sales and Weather Variables", fontsize=title_fontsize)
plt.tight_layout()
plt.savefig("results/temp_sun_precip.pdf", bbox_inches="tight", dpi=300)
plt.show()

In [None]:
# Print the Pearson correlation (and its p-value) between the overall coffee
# sales and each weather variable.
for column, description in (
    ("temperature_max", "max temperature"),
    ("sunshine_duration", "sunshine duration"),
    ("precipitation", "precipitation"),
):
    # pearsonr returns (correlation coefficient, p-value); evaluate it once.
    corr, pval = scipy.stats.pearsonr(df["Coffee_sales"], df[column])
    print(f"Correlation between overall sales and {description} is about: {corr:.2f}. The correlation has a p-value of: {pval}")




## 3. Plot average coffee sales per weekday

In [None]:
weather_variable = "temperature_max"
sales_variable = "Coffee_sales"

# Bar plot of the average coffee sales per weekday, one bar per year
# (2021 translucent on the left, 2022 opaque on the right), with error bars
# of one standard deviation.
fig, ax = plt.subplots(1, 1, figsize=(page_width*0.3, 4), sharey=True, sharex=True)
fig.supylabel("average Coffee Sales (€)", fontsize=label_fontsize, x=0.050, y=0.58)
width = 0.4

# Convert the sales column and build the year masks once instead of per bar.
sales = df[sales_variable].astype("float")
in_2021 = df["year"] == 2021
in_2022 = df["year"] == 2022

for i, day in enumerate(days):
    on_day = df["day_of_week"] == day
    sales_2021 = sales[on_day & in_2021]
    sales_2022 = sales[on_day & in_2022]
    mean_2021, std_2021 = sales_2021.mean(), sales_2021.std()
    mean_2022, std_2022 = sales_2022.mean(), sales_2022.std()

    # Bar centres: 2021 left of the weekday position, 2022 right of it.
    x_2021 = i + 1 - 0.5 * width
    x_2022 = i + 1 + 0.5 * width

    # NOTE: the creation order of the labelled artists matters. The legend
    # below picks the first three labelled handles (2021 bar, 2022 bar, black error bar).
    ax.bar(x_2021, mean_2021, label=day, color=day_colors[i], width=width, alpha=0.4)
    ax.bar(x_2022, mean_2022, label=day, color=day_colors[i], width=width)

    # Error bars in the weekday colour, overlaid with translucent black ones.
    ax.errorbar(x_2021, mean_2021, yerr=std_2021, fmt="none", color=day_colors[i], capsize=3)
    ax.errorbar(x_2022, mean_2022, yerr=std_2022, fmt="none", color=day_colors[i], capsize=3)
    ax.errorbar(x_2021, mean_2021, yerr=std_2021, fmt="none", color="black", capsize=3, alpha=0.6, label="2021")
    ax.errorbar(x_2022, mean_2022, yerr=std_2022, fmt="none", color="black", capsize=3, alpha=0.6)

# Axis formatting does not depend on the weekday, so it is done once here.
ax.set_xticks(np.arange(1, 8))
ax.set_xticklabels(days, rotation=30, horizontalalignment="right")
ax.set_ylim(0, 1800)
ax.set_yticks(np.arange(0, 1600, 500))
ax.tick_params(axis="both", which="major", labelsize=axes_fontsize)

# Legend built from the first three labelled handles, relabelled and with the
# two bar swatches recoloured to neutral greys.
t = ["2021", "2022", "± 1 std. dev."]
handles, labels = ax.get_legend_handles_labels()
leg = ax.legend(handles[:3], t, fontsize=legend_fontsize)
# Matplotlib 3.7 renamed Legend.legendHandles to legend_handles and 3.9
# removed the old name, so support both.
legend_artists = leg.legend_handles if hasattr(leg, "legend_handles") else leg.legendHandles
legend_artists[0].set_color('grey')
legend_artists[1].set_color('dimgrey')

ax.spines["right"].set_visible(False)
ax.spines["top"].set_visible(False)
plt.tight_layout()
plt.savefig("results/mean_sales_by_day_of_week.pdf", dpi=300, format="pdf")
plt.show()


## 4. Plot the per day correlation between sales and maximum temperature / sunshine hours / precipitation

In [None]:
def plot_correlation(df, weather_variable, sales_variable, condition, condition_value, ax, s, marker, color=None, show_condition=False):
    """Scatter sales against a weather variable for one subset of ``df`` and add a linear fit.

    Only the rows where ``df[condition] == condition_value`` are used. The
    Pearson correlation and its p-value for that subset are printed.

    Parameters
    ----------
    df : pandas.DataFrame
        Table containing the ``weather_variable``, ``sales_variable`` and
        ``condition`` columns.
    weather_variable : str
        Column plotted on the x-axis.
    sales_variable : str
        Column plotted on the y-axis.
    condition : str
        Column used to select the subset.
    condition_value
        Value of ``condition`` that selects the rows; it may be of any type
        (e.g. a weekday name or a year).
    ax : matplotlib.axes.Axes
        Axes to draw on.
    s, marker, color
        Marker size, marker style and colour passed on to ``ax.scatter``.
    show_condition : bool
        If true, ``condition_value`` is shown as the panel title; otherwise
        the title is still set but drawn fully transparent.

    Returns
    -------
    None
    """
    subset = df[df[condition] == condition_value]

    # The title is always set and only hidden via alpha
    # (presumably so that all panels keep the same layout -- TODO confirm).
    ax.set_title(f"{condition_value}", fontsize=18, alpha=1.0 if show_condition else 0.0)
    ax.scatter(subset[weather_variable], subset[sales_variable], color=color, s=s, marker=marker)

    # Neither the regression nor the correlation is defined for fewer than
    # two points, so report that instead of failing inside the fit.
    if len(subset) < 2:
        print(f"Not enough data to correlate sales and {weather_variable} on {condition_value}.")
        return

    # Least-squares regression line through the subset.
    X = subset[weather_variable].values.reshape(-1, 1)
    y = subset[sales_variable].values.reshape(-1, 1)
    reg = LinearRegression().fit(X, y)
    ax.plot(X, reg.predict(X), color="black")

    # pearsonr returns (correlation coefficient, p-value); evaluate it once.
    corr, pval = scipy.stats.pearsonr(subset[sales_variable], subset[weather_variable])
    # The f-string also works when condition_value is not a string.
    print(f"Correlation between sales and {weather_variable} on {condition_value} is about: {corr:.2f}. The correlation has a p-value of: {pval}")

In [None]:
weather_variable = "temperature_max"
sales_variable = "Coffee_sales"

# One panel per weekday: coffee sales against the maximum temperature.
# This figure shows the weekday name as the title of each panel.
fig, axes = plt.subplots(nrows=1, ncols=7, figsize=(page_width, 3), sharex=True, sharey=True, frameon=True)
fig.supxlabel("Maximum Temperature (°C)", fontsize=label_fontsize, x=0.5, y=0.08)
fig.supylabel("Coffee Sales (€)", fontsize=label_fontsize, x=0.015, y=0.57)

for panel, weekday, weekday_color in zip(axes, days, day_colors):
    plot_correlation(
        df, weather_variable, sales_variable,
        condition="day_of_week", condition_value=weekday,
        ax=panel, s=20, marker="o", color=weekday_color, show_condition=True,
    )
    # Identical ranges and ticks on every panel.
    panel.set_ylim(0, 2200)
    panel.set_yticks(np.arange(0, 2500, 1000))
    panel.set_xlim(-7, 37)
    panel.set_xticks(np.arange(0, 40, 10))
    panel.tick_params(axis="both", which="major", labelsize=axes_fontsize)
    for side in ("right", "top"):
        panel.spines[side].set_visible(False)

plt.tight_layout()
plt.savefig("results/correlation_by_day_of_week.pdf", dpi=300, bbox_inches="tight")
plt.show()

In [None]:
weather_variable = "sunshine_duration"
sales_variable = "Coffee_sales"

# One panel per weekday: coffee sales against the sunshine duration.
# The panel titles are left transparent (show_condition keeps its default).
fig, axes = plt.subplots(nrows=1, ncols=7, figsize=(page_width, 3), sharex=True, sharey=True, frameon=True)
fig.supxlabel("Sunshine Duration (h)", fontsize=label_fontsize, x=0.5, y=0.08)
fig.supylabel("Coffee Sales (€)", fontsize=label_fontsize, x=0.015, y=0.57)

for panel, weekday, weekday_color in zip(axes, days, day_colors):
    plot_correlation(
        df, weather_variable, sales_variable,
        condition="day_of_week", condition_value=weekday,
        ax=panel, s=20, marker="o", color=weekday_color,
    )
    # Identical ranges and ticks on every panel.
    panel.set_ylim(0, 2200)
    panel.set_yticks(np.arange(0, 2500, 1000))
    panel.set_xlim(-2, 17)
    panel.set_xticks(np.arange(0, 20, 5))
    panel.tick_params(axis="both", which="major", labelsize=axes_fontsize)
    for side in ("right", "top"):
        panel.spines[side].set_visible(False)

plt.tight_layout()
plt.savefig("results/correlation_sunshine_by_day_of_week.pdf", dpi=300, bbox_inches="tight")
plt.show()

In [None]:
weather_variable = "precipitation"
sales_variable = "Coffee_sales"

# One panel per weekday: coffee sales against the precipitation.
# The panel titles are left transparent (show_condition keeps its default).
fig, axes = plt.subplots(nrows=1, ncols=7, figsize=(page_width, 3), sharex=True, sharey=True, frameon=True)
fig.supxlabel("Precipitation (mm/m²/day)", fontsize=label_fontsize, x=0.5, y=0.08)
fig.supylabel("Coffee Sales (€)", fontsize=label_fontsize, x=0.015, y=0.57)

for panel, weekday, weekday_color in zip(axes, days, day_colors):
    plot_correlation(
        df, weather_variable, sales_variable,
        condition="day_of_week", condition_value=weekday,
        ax=panel, s=20, marker="o", color=weekday_color,
    )
    # Identical ranges and ticks on every panel.
    panel.set_ylim(0, 2200)
    panel.set_yticks(np.arange(0, 2500, 1000))
    panel.set_xlim(-2, 35)
    panel.set_xticks(np.arange(0, 40, 10))
    panel.tick_params(axis="both", which="major", labelsize=axes_fontsize)
    for side in ("right", "top"):
        panel.spines[side].set_visible(False)

plt.tight_layout()
plt.savefig("results/correlation_precipitation_by_day_of_week.pdf", dpi=300, bbox_inches="tight")
plt.show()

## 5. Plot the correlation between sales data for cold / warm drinks and the maximum temperature

We found that the sales of cold drinks correlate more strongly with the temperature than the sales of warm drinks.



In [None]:
# Per-weekday colour ramps: reds for the warm-drinks figure, blues for the cold-drinks figure.
day_colors_warm = ["#C11718", "#C9311E", "#D04526", "#D95B2F", "#E2703B", "#EB8548", "#F49A56"]
day_colors_cold = ["#1C3E8F", "#27509D", "#3163AB", "#3E77BB", "#4E8DCA", "#60A2D8", "#74B9E7"]

sales_variable = "Coffee_sales"
weather_variable = "temperature_max"

# Draw the same seven-panel figure (one panel per weekday, sales against the
# maximum temperature) twice: first from df_warm, then from df_cold.
for drinks_df, drinks_colors, figure_title in (
    (df_warm, day_colors_warm, "Correlation between the maximum temperature and the sales of warm drinks"),
    (df_cold, day_colors_cold, "Correlation between the maximum temperature and the sales of cold drinks"),
):
    fig, axes = plt.subplots(nrows=1, ncols=7, figsize=(page_width, 3), sharex=True, sharey=True, frameon=True)
    fig.supxlabel("Maximum Temperature (°C)", fontsize=label_fontsize, x=0.5, y=0.08)
    fig.supylabel("Coffee Sales (€)", fontsize=label_fontsize, x=0.02, y=0.6)

    for panel, weekday, weekday_color in zip(axes, days, drinks_colors):
        plot_correlation(
            drinks_df, weather_variable, sales_variable,
            condition="day_of_week", condition_value=weekday,
            ax=panel, s=20, marker="o", color=weekday_color,
        )
        # Identical ranges and ticks on every panel.
        panel.set_ylim(0, 2200)
        panel.set_yticks(np.arange(0, 2500, 1000))
        panel.set_xlim(-7, 37)
        panel.set_xticks(np.arange(0, 40, 10))
        panel.tick_params(axis="both", which="major", labelsize=axes_fontsize)
        for side in ("right", "top"):
            panel.spines[side].set_visible(False)

    # Title for the whole figure, then tighten the layout so that the
    # labels do not overlap.
    fig.suptitle(figure_title, fontsize=title_fontsize, y=0.87)
    plt.tight_layout()
    plt.show()
