In [None]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from pathlib import Path
import numpy as np
from sklearn.decomposition import PCA
from sklearn.linear_model import LinearRegression

In [None]:
# Directory that holds the merged input CSV and receives the files written below.
out_dir = Path("output")
# Displays whether the directory is present; it is only checked here, not created.
out_dir.exists()

In [None]:
data = pd.read_csv(out_dir / "data_merged.csv")
# Number of distinct values in the "Entity" column.
len(np.unique(data["Entity"]))

In [None]:
# One row per entity holding the column-wise maximum of every other column.
# Each maximum is taken independently, so the values in one row can originate
# from different source rows of the same entity.
data_max = data.groupby("Entity").max().reset_index()
data_max

In [None]:
data_max[["Population", "GDP"]]

In [None]:
# Pairwise view of the trade-related indicators: histograms on the diagonal,
# scatter plots everywhere else.
pair_vars_trade = ["Export", "Import", "Net Trade", "GDP", "Population"]
g1 = sns.PairGrid(data_max, vars = pair_vars_trade).map_diag(sns.histplot)
g1.map_offdiag(sns.scatterplot)

Export and import are correlated (see the Export/Import panels of the pair plot above).

In [None]:
# Pairwise view of the expenditure indicators against GDP: histograms on the
# diagonal, scatter plots everywhere else.
pair_vars_expenditure = ["Education Expenditure", "Health Expenditure",  "R&D", "GDP"]
g2 = sns.PairGrid(data_max, vars = pair_vars_expenditure).map_diag(sns.histplot)
g2.map_offdiag(sns.scatterplot)

In [None]:
# Columns used as model inputs throughout the rest of the notebook.
feature_names = [
    "GDP",
    "R&D",
    "Population",
    "Land",
    "Export",
    "Education Expenditure",
    "Health Expenditure",
    "Net Trade",
]
df_of_features = data_max[feature_names]

# Standardise every feature column: subtract its mean, divide by its sample
# standard deviation.
means, sds = df_of_features.mean(), df_of_features.std()
normalized_features = df_of_features.sub(means).div(sds)

# Persist the per-feature statistics used for the standardisation.
means.to_frame("mean").assign(sd = sds).to_csv(out_dir/"features_mean_sd.csv")

In [None]:
pip install -U scikit-learn

In [None]:
array_of_normalized_features = np.array(normalized_features)
pca = PCA(n_components=2)
pca.fit(array_of_normalized_features.T)
print(pca.explained_variance_ratio_)
print(pca.singular_values_)

In [None]:
array_of_normalized_features.T

In [None]:
pca.components_

In [None]:
# Scatter of the two fitted principal components, one point per entity,
# each labelled with the entity name slightly offset from its marker.
fig_pca, ax = plt.subplots()
ax.scatter(pca.components_[0], pca.components_[1])
for i, txt in enumerate(data_max["Entity"]):
    ax.annotate(txt, (pca.components_[0][i] + 0.01, pca.components_[1][i] + 0.01))
# Tighten the layout BEFORE saving; otherwise the PNG on disk is written with
# the un-adjusted layout and only the inline figure benefits.
fig_pca.tight_layout()
fig_pca.savefig(out_dir / "principal_components.png")
#TODO more beautiful annotations

In [None]:
# Load the coefficient table and discard the unnamed column that holds the
# index written out with the CSV.
df_coefficients = pd.read_csv(out_dir / "coefficients.csv").drop(columns = "Unnamed: 0")
# Variant without the row labelled 9.
# NOTE(review): presumably dropped so the rows line up with the entities in
# data_max for the 3-D plots below; confirm which entity this is.
df_coefficients1 = df_coefficients.drop(index = 9)
df_coefficients1

In [None]:
# 3-D scatter: the two principal components on the horizontal axes and the
# "Tan" coefficient on the vertical axis.
# NOTE(review): assumes the rows of df_coefficients1 are in the same order as
# the entities behind pca.components_; confirm.
fig1, ax = plt.subplots(subplot_kw = {"projection": "3d"})
first_pc, second_pc = pca.components_
ax.scatter(first_pc, second_pc, df_coefficients1["Tan"])
fig1.savefig(out_dir / "tangent_on_principal_components.png")

In [None]:
# 3-D scatter: the two principal components on the horizontal axes and the
# "Bias" coefficient on the vertical axis.
# NOTE(review): assumes the rows of df_coefficients1 are in the same order as
# the entities behind pca.components_; confirm.
fig2, ax = plt.subplots(subplot_kw = {"projection": "3d"})
first_pc, second_pc = pca.components_
ax.scatter(first_pc, second_pc, df_coefficients1["Bias"])
fig2.savefig(out_dir / "bias_on_principal_components.png")

# Linear regressor

In [None]:
# Attach the entity names to the raw (un-normalised) feature columns.
# assign() returns a new frame, so df_of_features (a column slice of data_max)
# is left untouched; the previous alias + .loc assignment wrote the "Entity"
# column into df_of_features itself and could raise SettingWithCopyWarning.
features_and_entities = df_of_features.assign(Entity = data_max["Entity"])

# Join coefficients and features on the entity name. pd.merge defaults to an
# inner join, so entities present in only one of the two frames are dropped.
merged_df_2 = pd.merge(df_coefficients, features_and_entities, on = ["Entity"])

merged_df_2

In [None]:
# Hand-picked train/test split by index label of merged_df_2.
# Alternative kept for reference: first six rows as training set.
#train_df = merged_df_2.loc[list(range(6))]
# .loc selects by label and raises KeyError if any of these labels is missing.
train_df = merged_df_2.loc[[0, 1, 3, 5, 6, 8]]
train_df

In [None]:
# Alternative kept for reference: rows 6-8 as test set.
#test_df = merged_df_2.loc[list(range(6, 9))]
# The three labels not used for training above.
test_df = merged_df_2.loc[[2, 4, 7]]
test_df

In [None]:
train_df[feature_names]

In [None]:
# Plain NumPy feature matrices (raw, un-normalised feature values) for scikit-learn.
train_array = np.array(train_df[feature_names])
test_array = np.array(test_df[feature_names])

In [None]:
# Ordinary least-squares model for the "Tan" target.
reg_tan = LinearRegression().fit(train_array, train_df["Tan"])
# R^2 on the training rows themselves.
# NOTE(review): the training set has 6 rows and feature_names has 8 columns, so
# the fit is underdetermined and this score is expected to be ~1 regardless of
# how well the model generalises.
reg_tan.score(train_array, train_df["Tan"])

In [None]:
reg_tan.coef_

In [None]:
reg_tan.intercept_

In [None]:
# Predictions for the held-out rows.
predicted_tans = reg_tan.predict(test_array)

In [None]:
# Actual "Tan" values of the held-out rows, for comparison with predicted_tans.
test_df[["Entity", "Tan"]]

In [None]:
test_df

In [None]:
# Same model type for the "Bias" target.
reg_bias = LinearRegression().fit(train_array, train_df["Bias"])

In [None]:
reg_bias.coef_

In [None]:
reg_bias.intercept_

In [None]:
# Predictions for the held-out rows.
predicted_biases = reg_bias.predict(test_array)
predicted_biases

In [None]:
predicted_tans

In [None]:
# Actual "Bias" values of the held-out rows, for comparison with predicted_biases.
test_df[["Entity", "Bias"]]

In [None]:
# Coefficients of both fitted models side by side: one row per feature,
# one column per target.
linear_regression_coefficients_df = pd.DataFrame(
    np.column_stack([reg_tan.coef_, reg_bias.coef_]),
    index = feature_names,
    columns = ['Tangent', 'Bias'],
)

# Transposed view (one row per target) with the intercepts appended as an
# extra column, written to disk.
linear_regression_coefficients_dfT = linear_regression_coefficients_df.T.assign(
    intercept = [reg_tan.intercept_, reg_bias.intercept_]
)
linear_regression_coefficients_dfT.to_csv(out_dir / "linear_regression_coefficients.csv")

In [None]:
metric_tan = sum((predicted_tans - test_df["Tan"]) ** 2)
metric_tan

In [None]:
metric_biases = sum((predicted_biases - test_df["Bias"]) ** 2)
metric_biases

# Leave-one-out cross validation

In [None]:
def leave_one_out(df : pd.DataFrame) -> pd.DataFrame:
    """Leave-one-out evaluation of the linear models for "Tan" and "Bias".

    For every row of ``df`` two ``LinearRegression`` models (one per target)
    are fitted on all remaining rows, using the columns listed in the
    notebook-level ``feature_names`` as inputs, and the squared error of the
    prediction for the held-out row is recorded.

    Rows are addressed by position rather than by index label, so ``df`` may
    carry any index, including one with gaps left behind by dropping rows.

    Parameters
    ----------
    df : pd.DataFrame
        Must contain the ``feature_names`` columns plus "Entity", "Tan" and
        "Bias".

    Returns
    -------
    pd.DataFrame
        Indexed like ``df`` with the columns "Entity", "metrics_loo_tan" and
        "metrics_loo_bias" (squared error of the held-out prediction).
    """
    feature_matrix = np.array(df[feature_names])
    tan_values = np.array(df["Tan"])
    bias_values = np.array(df["Bias"])
    num_of_rows = len(df)

    metrics_tan = []
    metrics_bias = []

    for i in range(num_of_rows):
        # Positional mask: every row except the i-th one is used for training.
        is_train = np.arange(num_of_rows) != i
        train_array = feature_matrix[is_train]
        # Keep the held-out row 2-D, as predict() expects (n_samples, n_features).
        test_row = feature_matrix[[i]]

        reg_tan = LinearRegression().fit(train_array, tan_values[is_train])
        reg_bias = LinearRegression().fit(train_array, bias_values[is_train])

        predicted_tan = reg_tan.predict(test_row)[0]
        predicted_bias = reg_bias.predict(test_row)[0]

        metrics_tan.append((predicted_tan - tan_values[i]) ** 2)
        metrics_bias.append((predicted_bias - bias_values[i]) ** 2)

    # Start from the Entity column so the result keeps the index of df.
    result = pd.DataFrame({"Entity": df["Entity"]})
    result["metrics_loo_tan"] = metrics_tan
    result["metrics_loo_bias"] = metrics_bias

    return result
    
df_loo_res = leave_one_out(merged_df_2)
df_loo_res

In [None]:
def build_boxplot(df : pd.DataFrame) -> None:
    """Show side-by-side boxplots of the leave-one-out errors.

    The left panel shows the "metrics_loo_bias" column of ``df`` and the right
    panel the "metrics_loo_tan" column. The figure is displayed, not returned.
    """
    fig, (ax_bias, ax_tan) = plt.subplots(nrows = 1, ncols = 2)
    panels = (("metrics_loo_tan", ax_tan), ("metrics_loo_bias", ax_bias))
    for column_name, panel in panels:
        df.boxplot(column = column_name, ax = panel)
    fig.suptitle("Boxplots for tan and bias")
    fig.tight_layout()
    plt.show()

In [None]:
build_boxplot(df_loo_res)

In [None]:
merged_df_no_US = merged_df_2.copy()
merged_df_no_US = merged_df_no_US.drop(merged_df_no_US[merged_df_no_US["Entity"] == "United States"].index)
df_loo_res_no_US = leave_one_out(merged_df_no_US)
df_loo_res_no_US

In [None]:
build_boxplot(df_loo_res_no_US)