<h2><center> MACHINE LEARNING PROJECT - N. 6 </center></h2>

<h3><center> Subscript nr. 2 - Exploratory Data Analysis </center></h3>

This notebook contains different types of analysis to explore data: correlations, histograms, variable relevance, time series.

The <i>matplotlib</i> and <i>seaborn</i> libraries are both used to create the data visualizations.

### Correlation matrix

In [1]:
def corr_mat(new_df, dict_font):
    """Draw the lower-triangle correlation heatmap of ``new_df``.

    Sets the global matplotlib ``figure.dpi`` / ``figure.figsize`` rcParams
    and the seaborn font scale and style before plotting.

    Parameters
    ----------
    new_df : object exposing ``.corr()`` (e.g. a pandas DataFrame).
    dict_font : mapping; the keys ``"fig_dpi"``, ``"fig_size"`` and
        ``"font_txt"`` are read.

    Returns
    -------
    The result of ``plt.show()``.
    """
    plt.rcParams['figure.dpi'] = dict_font["fig_dpi"]
    plt.rcParams['figure.figsize'] = dict_font["fig_size"]

    sns.set(font_scale = 1.2)
    sns.set_style("white")

    # Compute the correlation matrix once and reuse it for both the mask
    # and the heatmap instead of calling .corr() twice.
    corr = new_df.corr()

    # np.ones_like: array of True with the same shape as the matrix
    # np.triu: keeps the upper triangle (diagonal included)
    # mask: data will not be shown in cells where mask is True
    mask = np.triu(np.ones_like(corr, dtype=bool))
    sns.heatmap(corr, vmin = -1, vmax = 1, center = 0, cmap = "PiYG", square = True,
                mask = mask, annot = True, annot_kws = {"size" : dict_font["font_txt"]})

    return plt.show()

### Correlation plot

In [2]:
def corr_sub(new_df, dict_font):
    """Draw a corner pair plot of selected columns, coloured by "Substantial".

    Parameters
    ----------
    new_df : table indexable by a list of column names; the six columns
        listed below must be present.
    dict_font : mapping; the keys ``"fig_dpi"`` and ``"fig_size"`` are read
        and written to the global matplotlib rcParams.

    Returns
    -------
    The result of ``plt.show()``.
    """
    plt.rcParams['figure.dpi'] = dict_font["fig_dpi"]
    plt.rcParams['figure.figsize'] = dict_font["fig_size"]

    selected_columns = ["Latitude", "TempCels", "SpecHum", "RelHum", "AbsHum", "Substantial"]
    subset = new_df[selected_columns]

    grid = sns.pairplot(
        subset,
        kind = "scatter",
        diag_kind = "kde",
        hue = "Substantial",
        markers = ["o", "D"],
        palette = "Set2",
        corner = True,
    )

    # Swap the legend generated by pairplot for a manually placed one.
    grid._legend.remove()
    legend_labels = ['Substantial', 'Not substantial']
    plt.legend(legend_labels, bbox_to_anchor=(-2.5, 5))

    return plt.show()

### Log cases histogram

In [3]:
def hist_cases(my_var, new_df, dict_font):
    """Plot the density histogram of ``my_var`` with a fitted normal curve.

    The figure is annotated with the mean, standard deviation and skewness
    of ``my_var`` and with the skewness of ``new_df["Cases"]``.

    Parameters
    ----------
    my_var : array-like of values to plot (labelled ln(Cases) on the x axis).
    new_df : table with a ``"Cases"`` column, used only for the skewness
        annotation.
    dict_font : mapping; the keys ``"fig_dpi"``, ``"fig_size"``,
        ``"font_xy"``, ``"font_txt"`` and ``"font_title"`` are read.

    Returns
    -------
    The result of ``plt.show()``.
    """
    plt.rcParams['figure.dpi'] = dict_font["fig_dpi"]
    plt.rcParams['figure.figsize'] = dict_font["fig_size"]

    plt.hist(x = my_var, bins = 'auto', color = 'lightgreen', alpha = 0.8, rwidth = 0.85, density = True)
    plt.grid(axis='y', alpha = 0.2)
    plt.grid(axis='x', alpha = 0.2)
    # Raw string: "\l" is not a valid Python escape sequence, so the LaTeX
    # command must not go through escape processing.
    plt.xlabel(r"$ \ln(Cases) $", fontsize = dict_font["font_xy"])
    plt.ylabel('Density', fontsize = dict_font["font_xy"])
    plt.xticks(fontsize = dict_font["font_txt"])
    plt.yticks(fontsize = dict_font["font_txt"])
    plt.title('Histogram of Log of Cases', fontsize = dict_font["font_title"])

    # Summary statistics; text positions are in data coordinates.
    plt.text(7, 0.12, "mean: " + str(round( np.mean(my_var), 2 ) ), fontsize = dict_font["font_txt"])
    plt.text(7, 0.11, "std: " + str(round( np.std(my_var), 2 ) ), fontsize = dict_font["font_txt"])
    plt.text(7, 0.10, "skew of log: " + str(round( stats.skew(my_var), 2 )),
             fontsize = dict_font["font_txt"], color = "green")
    plt.text(7, 0.09, "skew of original data: " + str(round( stats.skew(new_df["Cases"]), 2 ) ),
             fontsize = dict_font["font_txt"], color = "red")

    # Normal density with the sample mean/std, drawn over the observed range.
    # The array conversion is done once instead of once per statistic.
    values = np.array(my_var)
    mu, std = np.mean(values), np.std(values)
    xmin, xmax = np.amin(values), np.amax(values)
    x_values = np.linspace(xmin, xmax, 250) # 250 evenly spaced numbers over the interval
    probs = stats.norm.pdf(x_values, mu, std)
    plt.plot(x_values, probs, linewidth = 2, c = "blue", linestyle='dashed')

    return plt.show()

### Log deaths histogram

In [4]:
def hist_death(my_var, new_df, dict_font):
    """Plot the density histogram of ``my_var`` with a fitted normal curve.

    The figure is annotated with the mean, standard deviation and skewness
    of ``my_var``, the skewness of ``new_df["Death"]`` and the share of rows
    in ``new_df`` whose ``"Death"`` value is 0.

    Parameters
    ----------
    my_var : array-like of values to plot (labelled ln(Death) on the x axis).
    new_df : table with a ``"Death"`` column.
    dict_font : mapping; the keys ``"font_xy"``, ``"font_txt"`` and
        ``"font_title"`` are read.

    Returns
    -------
    The result of ``plt.show()``.
    """
    plt.hist(x = my_var, bins = 'auto', color = 'sandybrown', alpha = 0.8, rwidth = 0.85, density = True)
    plt.grid(axis='y', alpha = 0.2)
    plt.grid(axis='x', alpha = 0.2)
    # Raw string: "\l" is not a valid Python escape sequence, so the LaTeX
    # command must not go through escape processing.
    plt.xlabel(r"$ \ln(Death) $", fontsize = dict_font["font_xy"])
    plt.ylabel('Density', fontsize = dict_font["font_xy"])
    plt.xticks(fontsize = dict_font["font_txt"])
    plt.yticks(fontsize = dict_font["font_txt"])
    plt.title('Histogram of Log of Death', fontsize = dict_font["font_title"])

    # Summary statistics; text positions are in data coordinates.
    plt.text(4.5, 1.7, "mean: " + str(round( np.mean(my_var), 2 ) ), fontsize = dict_font["font_txt"])
    plt.text(4.5, 1.6, "std: " + str(round( np.std(my_var), 2 ) ), fontsize = dict_font["font_txt"])
    plt.text(4.5, 1.5, "skew of log: " + str(round( stats.skew(my_var), 2 ) ),
             fontsize = dict_font["font_txt"], color = "darkorange")
    plt.text(4.5, 1.4, "skew of original: " + str(round( stats.skew(new_df["Death"]), 2 ) ),
             fontsize = dict_font["font_txt"], color = "red")

    # Share of rows with zero deaths. It is shown in the annotation below so
    # the label follows the data instead of a hard-coded percentage.
    prop = new_df["Death"][new_df["Death"]==0].count() / new_df["Death"].count()
    plt.arrow(x = 4.4, y = 1.3, dx= -4.5, dy = 0.45, width = .03, alpha = 0.3)
    plt.text(4.5, 1.2, f"{prop:.0%} of countries \nhave 0 deaths",
             fontsize = dict_font["font_txt"], color = "lightseagreen")

    # Normal density with the sample mean/std, drawn over the observed range.
    # The array conversion is done once instead of once per statistic.
    values = np.array(my_var)
    mu, std = np.mean(values), np.std(values)
    xmin, xmax = np.amin(values), np.amax(values)
    x_values = np.linspace(xmin, xmax, 250)
    probs = stats.norm.pdf(x_values, mu, std)
    plt.plot(x_values, probs, linewidth=2, c = "blue", linestyle='dashed')

    return plt.show()

### Regression Tree without hold-out

In [5]:
def tree1(new_df, clf, dict_font):
    """Fit ``clf`` on all rows, print in-sample errors and plot the tree.

    No hold-out is used: the estimator is fitted and evaluated on the same
    data.

    Parameters
    ----------
    new_df : table with the columns "Latitude", "TempCels", "SpecHum",
        "RelHum", "AbsHum" (features) and "Cases" (target).
    clf : estimator exposing ``fit`` and ``predict`` that ``tree.plot_tree``
        can draw.
    dict_font : mapping; the keys ``"fig_dpi"`` and ``"fig_size"`` are read.

    Returns
    -------
    None
    """
    x_reg = new_df[["Latitude", "TempCels", "SpecHum","RelHum", "AbsHum"]]
    y = np.array(new_df["Cases"])
    clf = clf.fit(x_reg, y)
    y_hat = clf.predict(x_reg)

    print("MSE: ", round( metrics.mean_squared_error(y, y_hat) , 3))
    print("MAPE: ", round( metrics.mean_absolute_percentage_error(y, y_hat) , 3))
    # This is the *median* absolute error; label it "MedAE" so it is not
    # read as the mean absolute error usually abbreviated "MAE".
    print("MedAE: ", round( metrics.median_absolute_error(y, y_hat) , 3))
    print("Mean and median are different (asymmetric distribution). \nThus also the error measures are significantly different.")
    print("Color of the leaf corresponds to the predicted value")

    # The tree needs a larger canvas than the other plots: raise the global
    # dpi by 1.5x and scale the base figure size by (2, 1.5).
    plt.rcParams['figure.dpi'] = dict_font["fig_dpi"] * 1.5
    plt.figure(figsize = np.array(dict_font["fig_size"]) * [2,1.5])
    tree.plot_tree(clf, filled = True, feature_names = x_reg.columns.to_list(),
                   rounded = True, proportion = True)

    return

### Regression Tree with hold-out

In [6]:
def tree2(new_df, clf, dict_font):
    """Fit ``clf`` on an 80% training split, print test errors and plot the tree.

    The split uses ``test_size=0.2`` and ``random_state=1``.

    Parameters
    ----------
    new_df : table with the columns "Latitude", "TempCels", "SpecHum",
        "RelHum", "AbsHum" (features) and "Cases" (target).
    clf : estimator exposing ``fit`` and ``predict`` that ``tree.plot_tree``
        can draw.
    dict_font : mapping; the keys ``"fig_dpi"`` and ``"fig_size"`` are read.

    Returns
    -------
    None
    """
    X_train, X_test, y_train, y_test = train_test_split(new_df[["Latitude", "TempCels", "SpecHum", "RelHum", "AbsHum"]],
                                                        new_df["Cases"], test_size=0.2, random_state=1)
    clf = clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)

    print("Test MSE: ", round( metrics.mean_squared_error(y_test, y_pred) , 3))
    print("Test MAPE: ", round( metrics.mean_absolute_percentage_error(y_test, y_pred) , 3))
    # This is the *median* absolute error; label it "MedAE" so it is not
    # read as the mean absolute error usually abbreviated "MAE".
    print("Test MedAE: ", round( metrics.median_absolute_error(y_test, y_pred) , 3))
    print("Unusual behavior: MSE is higher when using same train and test.")

    # The tree needs a larger canvas than the other plots: raise the global
    # dpi by 1.5x and scale the base figure size by (2, 1.5).
    plt.rcParams['figure.dpi'] = dict_font["fig_dpi"] * 1.5
    plt.figure(figsize = np.array(dict_font["fig_size"]) * [2,1.5])
    tree.plot_tree(clf, filled = True, feature_names = X_train.columns.to_list(),
                   rounded = True, proportion = False)

    return

### Time series

Before calling this function, it is necessary to specify a city name and a variable name.
If either name is not valid, the function prints a message listing the accepted values.

Moreover, since temperature variables in the original <i>csv</i> files are in Kelvin, they are transformed to Celsius.

Three vertical lines are added to the graph to mark relevant days: the date labelled as the first death, and the dates 20 and 30 days before it.

In [7]:
def my_plot_ts(city, var, sub_df, opts, scale = 1):
    """Plot the daily-mean time series of ``var`` for ``city``.

    The city's CSV file is read from ``path_raw + delimiter + city + ".csv"``,
    averaged per calendar day and restricted to the period from 2019-11-01
    to the latest date in ``sub_df["Collect"]``. Three vertical lines mark
    the city's ``"Collect"`` date and the dates 20 and 30 days before it.

    If ``var`` or ``city`` is not among the accepted values, a message
    listing the valid choices is printed and nothing is plotted.

    Besides its parameters, the function reads the module-level names
    ``path_raw``, ``delimiter`` and ``dict_font`` (keys ``"font_txt"``,
    ``"font_xy"``, ``"font_title"``).

    Parameters
    ----------
    city : name of the city; must appear in ``sub_df["City"]``.
    var : name of the variable; must appear in ``opts["Var"]``.
    sub_df : table with the columns ``"City"`` and ``"Collect"``
        (datetime-like values).
    opts : table with the columns ``"Var"``, ``"Label"`` and ``"Unit"``.
    scale : multiplier applied to every font size (default 1).

    Returns
    -------
    The result of ``plt.show()`` when a plot is drawn, otherwise None.
    """
    if var not in opts["Var"].to_list():
        print("Please insert one var in: ", ', '.join(opts["Var"].to_list()))
    elif city not in sub_df["City"].to_list():
        print("Please insert one city in: ", ", ".join(list(sub_df["City"])))
    else:
        df = pd.read_csv(path_raw + delimiter + city + ".csv")
        # Keep only the calendar-day part of the "date hour" timestamp.
        df[['date','hour']] = df.date.str.split(" ", expand=True)
        df = df.drop("hour", axis = 1) # axis 1 is column
        df['date'] = pd.to_datetime(df['date'])
        df = df.set_index('date')

        means = df.groupby('date').mean()
        series = means[var]
        # Temperature variables are stored in Kelvin; convert to Celsius.
        if var in ["t2m", "d2m"]:
            series = series - 273.15

        # Single source of truth for the start date, so the slice and the
        # title cannot disagree.
        first_day = "2019-11-01"
        # strftime converts datetime to string, strptime converts string to datetime
        last_day_collection = datetime.datetime.strftime(max(sub_df["Collect"]), format = "%Y-%m-%d")
        series = series[first_day:last_day_collection]

        # Extract a scalar date: boolean indexing yields a Series, while
        # axvline and the timedelta arithmetic below need a single value.
        first_death = sub_df["Collect"].loc[sub_df["City"]==city].iloc[0]

        plt.plot(series, color = "black")

        plt.axvline(x = first_death, color = "red", label = "First")
        plt.axvline(x = (first_death - datetime.timedelta(20)), color = "black",
                    linestyle = '--', label = "20 days before")
        plt.axvline(x = (first_death - datetime.timedelta(30)), color = "black",
                    linestyle = 'dotted', label = "30 days before")
        lgd = plt.legend(loc = 'best', fontsize = dict_font["font_txt"] * scale)
        lgd.set_title(title = 'Deaths', prop = {'size': dict_font["font_title"] * scale})

        plt.xlabel('Date', fontsize = dict_font["font_xy"] * scale)
        plt.xticks(rotation = 'vertical', fontsize = dict_font["font_txt"] * scale)
        plt.yticks(fontsize = dict_font["font_txt"] * scale)

        my_title = opts["Label"].loc[opts["Var"] == var].to_list()[0]
        plt.title(city + " " + my_title + " from " + first_day + " to " + last_day_collection,
                  fontsize = dict_font["font_title"] * scale)

        my_ylab = opts["Unit"].loc[opts["Var"] == var].to_list()[0]
        plt.ylabel(my_ylab, fontsize = dict_font["font_xy"] * scale)

        return plt.show()