<h2><center> MACHINE LEARNING PROJECT - N. 6 </center></h2>

<h3><center> Subscript nr. 3 - Analysis </center></h3>

This notebook contains functions to create regression models and scatterplots of the features in the dataset.

### Linear Regression of Temperature and ln(Cases)

Both <i>statsmodels</i> and <i>sklearn</i> libraries are used because each has some interesting features.

Remember: $ {\ln(0.5)} \cong -0.69 $

In order to get the estimate of intercept with <i>statsmodels</i>, a column of ones has to be added to the matrix of regressors.

In [1]:
def linearreg_temperature(new_df, dict_font):
    """Fit and plot a simple linear regression of ln(Cases) on temperature.

    The model is fitted twice: with sklearn (R square, intercept and slope are
    printed) and with statsmodels (its coefficients draw the fitted line, and
    its F-statistic p-value and adjusted R square are written on the plot).

    Parameters
    ----------
    new_df : pandas.DataFrame
        Must contain the columns "TempCels" and "Cases".
    dict_font : dict
        Plot settings; the keys "fig_dpi", "fig_size", "font_title",
        "font_xy" and "font_txt" are read.

    Returns
    -------
    None
        The result of ``plt.show()``; the figure is shown as a side effect.
    """
    # -1 lets numpy infer the number of rows; a 2-D regressor matrix is needed.
    x = np.array(new_df["TempCels"]).reshape((-1, 1))

    # Cast to float before writing 0.5: assigning a float into an integer
    # Series relies on silent upcasting, which pandas has deprecated.
    y = new_df["Cases"].astype(float)
    y.loc[y == 0] = 0.5  # avoids ln(0)
    y = np.array(np.log(y))

    model = LinearRegression().fit(x, y)
    print(f"R square: {round(model.score(x, y), 5)}")
    print(f"Intercept: {round(model.intercept_, 5)}")
    print(f"Slope: {np.round(model.coef_[0], 5)}")

    # statsmodels only estimates an intercept if a column of ones is present.
    x2 = sm.add_constant(x)
    model2 = sm.OLS(y, x2).fit()
    b0, b1 = model2.params

    plt.rcParams['figure.dpi'] = dict_font["fig_dpi"]
    plt.rcParams['figure.figsize'] = dict_font["fig_size"]

    fig = plt.figure()
    ax = fig.add_subplot()
    plt.plot(x, y, 'o', color='dodgerblue', markersize=4)
    # Raw strings: "\l" is not a valid escape sequence in a normal literal.
    plt.title(r'$\ln (TotalCase)$ and Temperature', fontsize=dict_font["font_title"])
    plt.xlabel('Temperature (Celsius)', fontsize=dict_font["font_xy"])
    plt.ylabel(r'$\ln (TotalCase)$', fontsize=dict_font["font_xy"])
    plt.xticks(fontsize=dict_font["font_txt"])
    plt.yticks(fontsize=dict_font["font_txt"])
    # Annotation positions are in data coordinates.
    ax.text(-10, 10, f"F-Stat p-value: {round(model2.f_pvalue, 5)}",
            fontsize=dict_font["font_txt"])
    ax.text(-10, 11, f"Adj. R square: {round(model2.rsquared_adj, 5)}",
            fontsize=dict_font["font_txt"])

    # A straight line is fully determined by its two end points.
    x_vec = np.array([x.min(), x.max()])
    plt.plot(x_vec, b0 + b1 * x_vec, color='red', linestyle='-', linewidth=1.5)

    return plt.show()

### Linear Regression of Relative Humidity and ln(Cases)

In [2]:
def linearreg_hum(new_df, dict_font):
    """Fit and plot a simple linear regression of ln(Cases) on relative humidity.

    The model is fitted twice: with sklearn (R square, intercept and slope are
    printed) and with statsmodels (its coefficients draw the fitted line, and
    its F-statistic p-value and adjusted R square are written on the plot).

    Parameters
    ----------
    new_df : pandas.DataFrame
        Must contain the columns "RelHum" and "Cases".
    dict_font : dict
        Plot settings; the keys "fig_dpi", "fig_size", "font_title",
        "font_xy" and "font_txt" are read.

    Returns
    -------
    None
        The result of ``plt.show()``; the figure is shown as a side effect.
    """
    # -1 lets numpy infer the number of rows; a 2-D regressor matrix is needed.
    x = np.array(new_df["RelHum"]).reshape((-1, 1))

    # Cast to float before writing 0.5: assigning a float into an integer
    # Series relies on silent upcasting, which pandas has deprecated.
    y = new_df["Cases"].astype(float)
    y.loc[y == 0] = 0.5  # avoids ln(0)
    y = np.array(np.log(y))

    model = LinearRegression().fit(x, y)
    print(f"R square: {round(model.score(x, y), 5)}")
    print(f"Intercept: {round(model.intercept_, 5)}")
    print(f"Slope: {np.round(model.coef_[0], 5)}")

    # statsmodels only estimates an intercept if a column of ones is present.
    x2 = sm.add_constant(x)
    model2 = sm.OLS(y, x2).fit()
    b0, b1 = model2.params

    plt.rcParams['figure.dpi'] = dict_font["fig_dpi"]
    plt.rcParams['figure.figsize'] = dict_font["fig_size"]

    fig = plt.figure()
    ax = fig.add_subplot()
    plt.plot(x, y, 'o', color='dodgerblue', markersize=4)
    # Raw strings: "\l" is not a valid escape sequence in a normal literal.
    plt.title(r'$\ln (TotalCase)$ and Relative Humidity', fontsize=dict_font["font_title"])
    plt.xlabel('Relative Humidity (%)', fontsize=dict_font["font_xy"])
    plt.ylabel(r'$\ln (TotalCase)$', fontsize=dict_font["font_xy"])
    plt.xticks(fontsize=dict_font["font_txt"])
    plt.yticks(fontsize=dict_font["font_txt"])
    # Annotation positions are in data coordinates.
    ax.text(25, 10, f"F-Stat p-value: {round(model2.f_pvalue, 5)}",
            fontsize=dict_font["font_txt"])
    ax.text(25, 11, f"Adj. R square: {round(model2.rsquared_adj, 5)}",
            fontsize=dict_font["font_txt"])

    # A straight line is fully determined by its two end points.
    x_vec = np.array([x.min(), x.max()])
    plt.plot(x_vec, b0 + b1 * x_vec, color='red', linestyle='-', linewidth=1.5)

    return plt.show()

### Plots of Substantial and Temperature

Substantial is a dichotomous variable that takes the value 1 if the observation has registered more than 10 deaths due to Covid-19.

In [3]:
def plot_sub(new_df, dict_font, scale = 0.65):
    """Draw a box plot and a strip plot of temperature by Substantial group.

    A regression of "TempCels" on "Substantial" is fitted with sklearn (R
    square, intercept and slope are printed) and with statsmodels (its
    F-statistic p-value and R square are written on the box plot).

    Parameters
    ----------
    new_df : pandas.DataFrame
        Must contain the columns "Substantial" and "TempCels".
    dict_font : dict
        Plot settings; the keys "fig_dpi", "fig_size", "font_title",
        "font_xy" and "font_txt" are read.
    scale : float, optional
        Multiplier applied to the font sizes taken from ``dict_font``.

    Returns
    -------
    None
        The result of ``plt.show()``; the figure is shown as a side effect.
    """
    x = np.array(new_df["Substantial"]).reshape((-1, 1))
    y = np.array(new_df["TempCels"])

    model = LinearRegression().fit(x, y)
    print(f"R square: {round(model.score(x, y), 5)}")
    print(f"Intercept: {round(model.intercept_, 5)}")
    print(f"Slope: {np.round(model.coef_[0], 5)}")

    # statsmodels only estimates an intercept if a column of ones is present.
    model2 = sm.OLS(y, sm.add_constant(x)).fit()

    plt.rcParams['figure.dpi'] = dict_font["fig_dpi"] * 1.2
    plt.rcParams['figure.figsize'] = np.array(dict_font["fig_size"]) * [1.2, 0.8]

    title_size = dict_font["font_title"] * scale
    axis_size = dict_font["font_xy"] * scale
    text_size = dict_font["font_txt"] * scale

    # Left panel: box plot annotated with the regression statistics.
    plt.subplot(1, 2, 1)
    boxp = sns.boxplot(x="Substantial", y="TempCels", hue="Substantial",
                       data=new_df, palette="muted")
    plt.title("Temperature and Substantial Transmision", fontsize=title_size)
    plt.ylabel('Temperature', fontsize=axis_size)
    msg = ("F-Stat pvalue: " + str(round(model2.f_pvalue, 3)) + "\n"
           + "R Square: " + str(round(model2.rsquared, 3)))
    boxp.text(0.75, 27, msg, horizontalalignment='left', color='black',
              fontsize=text_size)

    # The tick labels already name the groups, so this panel has no legend.
    box_legend = boxp.get_legend()
    if box_legend is not None:
        box_legend.remove()
    plt.xticks(ticks=[0, 1], labels=["Non Substantial", "Substantial"],
               fontsize=axis_size)
    plt.xlabel("")

    # Right panel: strip plot with a legend.
    plt.subplot(1, 2, 2)
    sns.stripplot(x="Substantial", y="TempCels", hue="Substantial",
                  data=new_df, palette="muted")
    plt.title("Temperature and Substantial Transmision", fontsize=title_size)
    plt.xlabel('Substantial Transmission', fontsize=axis_size)
    plt.ylabel('Temperature', fontsize=axis_size)
    legend = plt.legend(title='Transmission', loc='upper right',
                        labels=['Non Substansial', 'Substantial'],
                        fontsize=text_size)

    # Matplotlib 3.7 renamed Legend.legendHandles to legend_handles and 3.9
    # removed the old name; support both.
    handles = getattr(legend, "legend_handles", None)
    if handles is None:
        handles = legend.legendHandles
    for handle in handles[:2]:
        handle._sizes = [100]  # enlarge the legend markers

    return plt.show()

### Mann-Whitney tests

In [None]:
def mw(var, sub_df, nonsub_df, opts, dict_font, scale = 0.8):
    """Run Shapiro and Mann-Whitney tests on ``var`` and plot both histograms.

    Each group's density histogram is overlaid with a fitted normal curve
    and coloured on a red-to-green scale by that group's Shapiro p-value
    (red for p close to 0, green for p close to 1).

    Parameters
    ----------
    var : str
        Column to test; must be one of the values in ``opts["Var"]``.
    sub_df, nonsub_df : pandas.DataFrame
        The two groups to compare; both must contain the column ``var``.
    opts : pandas.DataFrame
        Lookup table with the columns "Var", "Label" and "Unit".
    dict_font : dict
        Plot settings; the keys "font_title" and "font_xy" are read.
    scale : float, optional
        Multiplier applied to the panel title and tick label font sizes.

    Returns
    -------
    tuple or None
        ``(df_nonpar, None)``, where ``df_nonpar`` holds the three p-values
        and the second element is the result of ``plt.show()``. If ``var``
        is not in ``opts["Var"]`` a message is printed and None is returned.
    """
    valid_vars = opts["Var"].to_list()
    if var not in valid_vars:
        print("Please insert one var in: ", ', '.join(valid_vars))
        return None

    sub_values = sub_df[var]
    nonsub_values = nonsub_df[var]

    # The second element of each test result is the p-value.
    p_shapiro_sub = stats.shapiro(sub_values)[1]
    p_shapiro_nonsub = stats.shapiro(nonsub_values)[1]
    p_mannwhitney = stats.mannwhitneyu(x=sub_values, y=nonsub_values,
                                       alternative='two-sided')[1]
    df_nonpar = pd.DataFrame({
        "test": ["Shapiro Substantial", "Shapiro Non Substantial", "Mann-Whitney"],
        "pvalue": [p_shapiro_sub, p_shapiro_nonsub, p_mannwhitney]})

    colors = [str(col) for col in Color("red").range_to(Color("green"), 100)]
    # Clamp the index: a p-value of exactly 1.0 would otherwise point one
    # past the end of the 100-colour scale.
    last = len(colors) - 1
    col1 = colors[min(int(p_shapiro_sub * 100), last)]
    col2 = colors[min(int(p_shapiro_nonsub * 100), last)]

    fig, (ax1, ax2) = plt.subplots(1, 2)

    my_lab = opts["Label"].loc[opts["Var"] == var].to_list()[0]
    my_unit = opts["Unit"].loc[opts["Var"] == var].to_list()[0]
    my_lab2 = my_lab + " (" + my_unit + ")"
    fig.suptitle('Frequency histogram of ' + my_lab,
                 fontsize=dict_font["font_title"], color="black")

    # The substantial group gets one bin per observation, the other 20 bins.
    ax1.hist(sub_values, bins=len(sub_values), histtype='bar',
             edgecolor='black', color=col1, density=True)
    ax2.hist(nonsub_values, bins=20, histtype='bar',
             edgecolor='black', color=col2, density=True)

    panels = ((ax1, sub_values, 'Substantial Transmission'),
              (ax2, nonsub_values, 'Non-Substantial Transmission'))
    for ax, values, panel_title in panels:
        data = np.array(values)
        # Normal density fitted to the group, drawn over its observed range.
        mu, std = stats.norm.fit(data)
        x_vec = np.linspace(np.amin(data), np.amax(data), 250)
        ax.plot(x_vec, stats.norm.pdf(x_vec, mu, std), linewidth=2,
                c="blue", linestyle='dashed')
        ax.set_xlabel(my_lab2, fontsize=dict_font["font_xy"])
        ax.set_title(panel_title, fontsize=dict_font["font_title"] * scale)
        ax.tick_params(axis="both", labelsize=dict_font["font_xy"] * scale)

    return df_nonpar, plt.show()

The dataframe <i>df_nonpar</i> contains the results of the non-parametric tests: Shapiro and Mann-Whitney.

The color of each histogram reflects its Shapiro p-value, on a scale from red (p-value close to 0) to green (p-value close to 1).

### Scatter plot of Temperature and Relative Humidity 

Sizes are based on quartiles of Cases.

Color is based on Substantial's value.

In [7]:
def scatter_temperature_relhum(new_df, dict_font, scale = 0.9):
    """Scatter temperature against relative humidity, one series per group.

    Marker areas come from the "Size" column; substantial observations are
    drawn in coral and non-substantial ones in cyan. An explanatory message
    is printed before plotting.

    Parameters
    ----------
    new_df : pandas.DataFrame
        Must contain the columns "TempCels", "RelHum", "Size" and
        "Substantial".
    dict_font : dict
        Plot settings; the keys "fig_dpi", "fig_size", "font_title",
        "font_xy" and "font_txt" are read.
    scale : float, optional
        Multiplier applied to the legend and tick label font sizes.

    Returns
    -------
    None
        The result of ``plt.show()``; the figure is shown as a side effect.
    """
    x = np.array(new_df["TempCels"]).reshape((-1, 1))
    y = np.array(new_df["RelHum"]).reshape((-1, 1))

    msg = "The dimension of scatter point represent the quartile of cases in which lies the observation. \nNote that all countries with more than 10 deaths lies in fourth quartile."
    print(msg)

    plt.rcParams['figure.dpi'] = dict_font["fig_dpi"]
    plt.rcParams['figure.figsize'] = np.array(dict_font["fig_size"]) * [1.1, 0.8]

    fig, ax = plt.subplots()
    groups = ((1, "Substantial Transmission", "coral"),
              (0, "Non-Substantial Transmission", "tab:cyan"))
    for flag, group_label, edge in groups:
        in_group = new_df["Substantial"] == flag
        mask = in_group.to_numpy()
        plt.scatter(x[mask], y[mask], s=new_df["Size"].loc[in_group],
                    linewidth=2, label=group_label, facecolors='None',
                    edgecolor=edge)

    # Matplotlib ignores "fontsize" when "prop" is given, so only prop is passed.
    legend = ax.legend(loc='lower left', shadow=True,
                       prop={"size": dict_font["font_xy"] * scale})
    # Matplotlib 3.7 renamed Legend.legendHandles to legend_handles and 3.9
    # removed the old name; support both.
    handles = getattr(legend, "legend_handles", None)
    if handles is None:
        handles = legend.legendHandles
    for handle in handles[:2]:
        handle._sizes = [200]  # same legend marker size for both groups

    plt.title('Temperature vs Relative Humidity by groups', fontsize=dict_font["font_title"])
    plt.xlabel('Temperature (Celsius)', fontsize=dict_font["font_xy"])
    plt.ylabel('Relative Humidyty (%)', fontsize=dict_font["font_xy"])
    plt.xticks(fontsize=dict_font["font_txt"] * scale)
    plt.yticks(fontsize=dict_font["font_txt"] * scale)

    return plt.show()

### Scatter plot of Temperature and Specific Humidity

Sizes are based on orders of magnitude.

In [8]:
def scatter_temperature_spechum(new_df, dict_font, scale = 1, scale_dim = 5):
    """Scatter temperature against specific humidity, one series per Size class.

    Each distinct value of "Size" gets a colour on a green-to-red scale.
    Non-substantial observations are drawn as circles and substantial ones
    as squares; only the circle series are labelled, so the legend has one
    entry per Size class.

    Parameters
    ----------
    new_df : pandas.DataFrame
        Must contain the columns "TempCels", "SpecHum", "Size" and
        "Substantial".
    dict_font : dict
        Plot settings; the keys "fig_dpi", "fig_size", "font_title",
        "font_xy" and "font_txt" are read.
    scale : float, optional
        Multiplier applied to the legend font sizes.
    scale_dim : float, optional
        Multiplier applied to "Size" to obtain the marker areas.

    Returns
    -------
    None
        The result of ``plt.show()``; the figure is shown as a side effect.
    """
    x = np.array(new_df["TempCels"]).reshape((-1, 1))
    y = np.array(new_df["SpecHum"]).reshape((-1, 1))

    size_col = np.array(new_df["Size"])
    # unique() keeps order of appearance; sort ascending so that the colour
    # scale, the ascending labels and the growing legend markers all follow
    # the same order.
    # NOTE(review): assumes larger Size means more cases - confirm upstream.
    sizes = np.sort(new_df["Size"].unique())
    colors = [str(col) for col in Color("green").range_to(Color("red"), len(sizes))]
    labels = ["0", "1-9", "10-99", "100-999", "1000-9999", "10000-99999"]

    # Set before the figure is created; rcParams do not affect existing figures.
    plt.rcParams['figure.dpi'] = dict_font["fig_dpi"]
    plt.rcParams['figure.figsize'] = dict_font["fig_size"]
    fig, ax = plt.subplots()

    is_nonsub = np.array(new_df["Substantial"] == 0)
    is_sub = np.array(new_df["Substantial"] == 1)
    for i, (size, color) in enumerate(zip(sizes, colors)):
        in_class = size_col == size
        circles = in_class & is_nonsub
        squares = in_class & is_sub
        # Fall back to the raw size if there are more classes than labels.
        class_label = labels[i] if i < len(labels) else str(size)
        plt.scatter(x[circles], y[circles], s=scale_dim * size_col[circles],
                    linewidth=1, label=class_label, facecolors='None',
                    edgecolor=color, marker="o")
        plt.scatter(x[squares], y[squares], s=scale_dim * size_col[squares],
                    linewidth=2, facecolors='None', edgecolor=color, marker="s")

    # Matplotlib ignores "fontsize" when "prop" is given, so only prop is passed.
    legend = ax.legend(loc='upper left', shadow=False,
                       prop={"size": dict_font["font_xy"] * scale})
    legend.set_title(title='Covid-19 Cases', prop={'size': dict_font["font_title"] * scale})

    # Matplotlib 3.7 renamed Legend.legendHandles to legend_handles and 3.9
    # removed the old name; support both.
    handles = getattr(legend, "legend_handles", None)
    if handles is None:
        handles = legend.legendHandles
    for i, handle in enumerate(handles[:len(sizes)]):
        handle._sizes = [(i + 1) * 50]  # legend markers grow with the class

    plt.title('Temperature vs Specific Humidity scatter-plot', fontsize=dict_font["font_title"])
    plt.xlabel('Temperature (Celsius)', fontsize=dict_font["font_xy"])
    plt.ylabel('Mean Specific Humidity', fontsize=dict_font["font_xy"])
    plt.xticks(fontsize=dict_font["font_txt"])
    plt.yticks(fontsize=dict_font["font_txt"])
    # \u25a1 is the white square; the original literal held its mis-decoded bytes.
    plt.text(0.2, 0.17, "\u25a1: substantial transmission \nO: non-substantial",
             fontsize=dict_font["font_txt"], alpha=0.95, c="black")
    return plt.show()


### World Map Creation

The <i>folium</i> library is used. 

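For each city with substantial transmission, a marker and a circle are added to the map; the circle's radius depends on the number of deaths and its color on the city's temperature rank (blue for the coldest, red for the warmest).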

In [9]:
def create_world(geo_info, new_df):
    """Build a folium world map of the cities with substantial transmission.

    Each such city gets a circle whose radius is its "Death" value times
    2000 and whose colour is its position on a blue-to-red scale after
    sorting by ascending "TempCels", plus a marker. Both show the city name
    in a popup.

    Parameters
    ----------
    geo_info : pandas.DataFrame
        Must contain the columns "lat", "lon" and "City".
    new_df : pandas.DataFrame
        Must contain the columns "City", "TempCels", "Death" and
        "Substantial".

    Returns
    -------
    folium.Map
        The map; it carries no circles or markers when no city has
        ``Substantial == 1``.
    """
    data_world = pd.merge(geo_info[["lat", "lon", "City"]],
                          new_df[["City", "TempCels", "Death", "Substantial"]],
                          on=["City"], how="inner")
    data_world = data_world[data_world["Substantial"] == 1]
    data_world = data_world.rename(columns={"lat": "Lat", "lon": "Lon"})

    for column in ("Lat", "Lon", "Death"):
        data_world[column] = pd.to_numeric(data_world[column])
    data_world["Death"] = data_world["Death"] * 2000  # adjusts radius of the bubble

    # Coldest first, so the blue-to-red scale follows the temperature order.
    data_world = data_world.sort_values(by=["TempCels"], ascending=True)

    world_map = folium.Map(location=[30, 0], zoom_start=1.3)

    n_cities = data_world.shape[0]
    if n_cities == 0:
        # Color.range_to raises ValueError when asked for zero colours.
        return world_map

    colors = [str(col) for col in Color("blue").range_to(Color("red"), n_cities)]

    for color, row in zip(colors, data_world.itertuples(index=False)):
        folium.Circle(
            location=[float(row.Lat), float(row.Lon)],
            popup=row.City,
            radius=float(row.Death),  # folium needs a plain number
            color=color,
            fill=True,
            fill_color=color).add_to(world_map)
        folium.Marker(location=[float(row.Lat), float(row.Lon)],
                      popup=row.City).add_to(world_map)

    return world_map