# PROJECT ASSIGNMENT #2
### GROUP MEMBERS: Neil Chen, Will Wang

In [None]:
import pandas as pd
import numpy as np
# Country-level indicator table, read from the working directory.
indicators_csv = 'country_indicators.csv'
df_indicators = pd.read_csv(indicators_csv)
df_indicators

In [None]:
# Model predictions on the test set, read from the working directory.
predictions_csv = 'test_predictions.csv'
df_preds = pd.read_csv(predictions_csv)
df_preds

In [None]:
# Attach the country indicators to every prediction row via the shared ISO-3 code.
# Only rows whose iso3 appears in both tables survive the inner join.
df = df_preds.merge(df_indicators, on='iso3', how='inner')
df

In [None]:
from sklearn import metrics
import seaborn as sns

# The grouping column and the sklearn metric are both selected by name.
column_defining_groups = 'fsi_category'
chosen_metric = 'accuracy_score'
metric_function = getattr(metrics, chosen_metric)

model_names = ['ffnn', 'xgboost', 'transformer']

# Long-format (model, group, score) records that feed the styled table below.
chosen_metric_for_chosen_groups = []
# One Series per model, aligned with df.index; each row receives its group's score.
chosen_metric_back_into_original_data = {
    name: pd.Series(index=df.index, dtype=np.float64) for name in model_names
}

for g, rows in df.groupby(column_defining_groups):
    for model in model_names:
        y_true, y_pred = rows[f"y_true_{model}"], rows[f"y_pred_{model}"]
        chosen_metric_value = metric_function(y_true, y_pred)

        chosen_metric_back_into_original_data[model][rows.index] = chosen_metric_value
        chosen_metric_for_chosen_groups.append((model, g, chosen_metric_value))

# Darker teal = larger metric value.
cm_bigger_better = sns.light_palette("teal", as_cmap=True)

score_table = pd.DataFrame(
    chosen_metric_for_chosen_groups,
    columns=('model', column_defining_groups, chosen_metric),
)
styler = (
    score_table
    .sort_values(column_defining_groups)
    .style
    .background_gradient(cmap=cm_bigger_better)
    .format(precision=5)
    .hide(axis="index")
)
styler

In [None]:
import plotly.express as px

# World map shading each ISO-3 location by its fsi_category; hover shows the ISO-3 code.
fig = px.choropleth(
    data_frame=df,
    locations='iso3',
    hover_name='iso3',
    color='fsi_category',
    color_continuous_scale=px.colors.sequential.Plasma,
)
fig.show()

# ***HYPOTHESIS TEST*** $1$
### (between all three models)


We believe that the probability that all three models should accurately predict a true positive (correctly predicted escalation) or a true negative (correctly predicted no escalation) for the fsi category 'sustainable' is 0.5. This is because the model will either have to choose if a country has conflict escalation (positive) or no escalation (negative) and therefore, has a 0.5 chance that it is true and a 0.5 chance it is false. 

- Let t be 'correctly predicted escalation and no escalation' under **transformer** model for fsi category 'Sustainable'
- Let f be 'correctly predicted escalation and no escalation' under **ffnn** model for fsi category 'Sustainable' 
- Let x be 'correctly predicted escalation and no escalation' under **xgboost** model for fsi category 'Sustainable'
- Let c be 'correctly predicted escalation and no escalation' under **all three models** for fsi category 'Sustainable'

$H_0: p_c = p_t + p_f + p_x = 0.5$

$H_1$: Not $H_0$

In [None]:
# Number of rows in each fsi_category (used to read off the size of the 'sustainable' group).
df['fsi_category'].value_counts()

## To Calculate ($p_t$): The Proportion of 'Correctly Predicted Escalation and No Escalation' under the transformer model for fsi category 'Sustainable'

In [None]:
threshold = 0.5 # using default classification threshold

# Boolean views of the true labels and the thresholded scores. The cast is a
# no-op for boolean labels and keeps `~` / `&` logical if they are stored as 0/1.
is_escalation = df_preds.y_true_transformer.astype(bool)
TP_pos_pred_correct = is_escalation & (df_preds.y_pred_proba_transformer > threshold)
TN_neg_pred_correct = (~is_escalation) & (df_preds.y_pred_proba_transformer <= threshold)

# Start from an object-dtype Series so string labels are not written into a
# float column (pandas deprecates that silent upcast). Rows that are neither a
# true positive nor a true negative keep the placeholder label.
tmpt = pd.Series("incorrect prediction", index=df_preds.index, dtype=object)
tmpt.loc[TP_pos_pred_correct] = "correctly predicted escalation"
tmpt.loc[TN_neg_pred_correct] = "correctly predicted no escalation"

# Store the labels under the same (correctly spelled) column that is displayed
# below; previously they went to a misspelled column and the display showed 0.5.
df_preds['transformer_classification_performance_outcome'] = tmpt
df_preds[['y_true_transformer','y_pred_transformer','transformer_classification_performance_outcome']]

In [None]:
# Transformer true positives, grouped by the fsi_category of the index-aligned row in df.
pt_TP_pos_pred_correct = tmpt.loc[TP_pos_pred_correct].groupby(df.fsi_category)
# 364 is a hard-coded denominator — presumably the total number of prediction rows; TODO confirm.
pt_TP_pos_pred_correct.value_counts().div(364)

In [None]:
# Transformer true negatives, grouped by the fsi_category of the index-aligned row in df.
pt_TN_neg_pred_correct = tmpt.loc[TN_neg_pred_correct].groupby(df.fsi_category)
# 364 is a hard-coded denominator — presumably the total number of prediction rows; TODO confirm.
pt_TN_neg_pred_correct.value_counts().div(364)

### $p_t = 0.002747 + 0.087912 = 0.090659$

Let $p_t$ be the observed proportion of true positives under the transformer model for the sustainable fsi category plus the proportion of true negatives under the transformer model for the sustainable fsi category

## To Calculate ($p_f$): The Proportion of 'Correctly Predicted Escalation and No Escalation' under the ffnn model for fsi category 'Sustainable'

In [None]:
threshold = 0.5 # using default classification threshold

# Boolean views of the true labels and the thresholded scores. The cast is a
# no-op for boolean labels and keeps `~` / `&` logical if they are stored as 0/1.
is_escalation = df_preds.y_true_ffnn.astype(bool)
TP_pos_pred_correct = is_escalation & (df_preds.y_pred_proba_ffnn > threshold)
TN_neg_pred_correct = (~is_escalation) & (df_preds.y_pred_proba_ffnn <= threshold)

# Start from an object-dtype Series so string labels are not written into a
# float column (pandas deprecates that silent upcast). Rows that are neither a
# true positive nor a true negative keep the placeholder label.
tmpf = pd.Series("incorrect prediction", index=df_preds.index, dtype=object)
tmpf.loc[TP_pos_pred_correct] = "correctly predicted escalation"
tmpf.loc[TN_neg_pred_correct] = "correctly predicted no escalation"

# Store the labels under the same (correctly spelled) column that is displayed
# below; previously they went to a misspelled column and the display showed 0.5.
df_preds['ffnn_classification_performance_outcome'] = tmpf
df_preds[['y_true_ffnn','y_pred_ffnn','ffnn_classification_performance_outcome']]

In [None]:
# ffnn true positives, grouped by the fsi_category of the index-aligned row in df.
pf_TP_pos_pred_correct = tmpf.loc[TP_pos_pred_correct].groupby(df.fsi_category)
# 364 is a hard-coded denominator — presumably the total number of prediction rows; TODO confirm.
pf_TP_pos_pred_correct.value_counts().div(364)

In [None]:
# ffnn true negatives, grouped by the fsi_category of the index-aligned row in df.
pf_TN_neg_pred_correct = tmpf.loc[TN_neg_pred_correct].groupby(df.fsi_category)
# 364 is a hard-coded denominator — presumably the total number of prediction rows; TODO confirm.
pf_TN_neg_pred_correct.value_counts().div(364)

### $p_f = 0 + 0.090659 = 0.090659$

Let $p_f$ be the observed proportion of true positives under the ffnn model for sustainable fsi category plus the proportion of true negatives under the ffnn model for sustainable fsi category

## To Calculate ($p_x$): The Proportion of 'Correctly Predicted Escalation and No Escalation' under the xgboost model for fsi category 'Sustainable'

In [None]:
threshold = 0.5 # using default classification threshold

# Boolean views of the true labels and the thresholded scores. The cast is a
# no-op for boolean labels and keeps `~` / `&` logical if they are stored as 0/1.
is_escalation = df_preds.y_true_xgboost.astype(bool)
TP_pos_pred_correct = is_escalation & (df_preds.y_pred_proba_xgboost > threshold)
TN_neg_pred_correct = (~is_escalation) & (df_preds.y_pred_proba_xgboost <= threshold)

# Start from an object-dtype Series so string labels are not written into a
# float column (pandas deprecates that silent upcast). Rows that are neither a
# true positive nor a true negative keep the placeholder label.
tmpx = pd.Series("incorrect prediction", index=df_preds.index, dtype=object)
tmpx.loc[TP_pos_pred_correct] = "correctly predicted escalation"
tmpx.loc[TN_neg_pred_correct] = "correctly predicted no escalation"

# Store the labels under the same (correctly spelled) column that is displayed
# below; previously they went to a misspelled column and the display showed 0.5.
df_preds['xgboost_classification_performance_outcome'] = tmpx
df_preds[['y_true_xgboost','y_pred_xgboost','xgboost_classification_performance_outcome']]

In [None]:
# xgboost true positives, grouped by the fsi_category of the index-aligned row in df.
px_TP_pos_pred_correct = tmpx.loc[TP_pos_pred_correct].groupby(df.fsi_category)
# 364 is a hard-coded denominator — presumably the total number of prediction rows; TODO confirm.
px_TP_pos_pred_correct.value_counts().div(364)

In [None]:
# xgboost true negatives, grouped by the fsi_category of the index-aligned row in df.
px_TN_neg_pred_correct = tmpx.loc[TN_neg_pred_correct].groupby(df.fsi_category)
# 364 is a hard-coded denominator — presumably the total number of prediction rows; TODO confirm.
px_TN_neg_pred_correct.value_counts().div(364)

### $p_x = 0 + 0.079670 = 0.079670$

Let $p_x$ be the observed proportion of true positives under the xgboost model for sustainable fsi category plus the proportion of true negatives under the xgboost model for sustainable fsi category

### Therefore, the observed test statistic would be the total observed proportion of true positives and true negatives for the sustainable fsi_category for all three models which is given by the equation:

$p_c$ (observed) = $p_x$ (observed) + $p_f$ (observed) + $p_t$ (observed)

$p_c$ (observed) = 0.079670 + 0.090659 + 0.090659

$p_c$ (observed) = 0.260988

In [None]:
# n = 35: size of every simulated sample (the count of 'sustainable' fsi_category rows reported earlier).
simulated_samples, n = 10000, 35
p = 0.5  # proportion assumed under the null hypothesis
outcome_labels = ['True Positive and True Negative', 'False Positive and False Negative']

np.random.seed(100)
# One simulated proportion of the first label per repetition; the draws are made
# sequentially, one sample of size n at a time.
p_hats = [
    (np.random.choice(outcome_labels, size=n, replace=True, p=[p, 1-p]) == outcome_labels[0]).sum() / n
    for _ in range(simulated_samples)
]

observed_test_stat = 0.260988
# Two-sided p-value: share of simulated proportions at least as far from p as the observed one.
simulated_distances = np.abs(np.array(p_hats) - p)
num_more_extreme = (simulated_distances >= abs(observed_test_stat - p)).sum()
p_value = num_more_extreme / simulated_samples
p_value

Therefore, since we got a **p-value of 0.0051**, this shows that there is **very strong evidence against the null** which means that the probability that all three models (xgboost, ffnn and transformer) accurately predicted escalation or no escalation is **not equal to 0.5**. In reality, it can be seen that the probability of true positives and true negatives for the fsi category 'sustainable' is approximately 0.905 (the average of the accuracy score for all three models for the given category).

# This concludes the Hypothesis Test $1$.

In [None]:
# Visual aid: accuracy of each model within every 'wbi_income_group' category

import numpy as np
from sklearn import metrics
import seaborn as sns

# The grouping column and the sklearn metric are both selected by name.
column_defining_groups = 'wbi_income_group'
chosen_metric = 'accuracy_score'
metric_function = getattr(metrics, chosen_metric)

model_names = ['ffnn', 'xgboost', 'transformer']

# Long-format (model, group, score) records that feed the styled table below.
chosen_metric_for_chosen_groups = []
# One Series per model, aligned with df.index; each row receives its group's score.
chosen_metric_back_into_original_data = {
    name: pd.Series(index=df.index, dtype=np.float64) for name in model_names
}

for g, rows in df.groupby(column_defining_groups):
    for model in model_names:
        y_true, y_pred = rows[f"y_true_{model}"], rows[f"y_pred_{model}"]
        chosen_metric_value = metric_function(y_true, y_pred)

        chosen_metric_back_into_original_data[model][rows.index] = chosen_metric_value
        chosen_metric_for_chosen_groups.append((model, g, chosen_metric_value))

# Darker pink = larger metric value.
cm_bigger_better = sns.light_palette("pink", as_cmap=True)
score_table = pd.DataFrame(
    chosen_metric_for_chosen_groups,
    columns=('model', column_defining_groups, chosen_metric),
)
styler = (
    score_table
    .sort_values(column_defining_groups)
    .style
    .background_gradient(cmap=cm_bigger_better)
    .format(precision=3)
    .hide(axis="index")
)
styler

In [None]:
# Map of wbi_income_group, with each ISO-3 location coloured by its group
import plotly.express as px

fig = px.choropleth(
    data_frame=df,
    locations='iso3',
    hover_name='iso3',
    color='wbi_income_group',
    color_continuous_scale=px.colors.sequential.Plasma,
)
fig.show()

In [None]:
#confidence intervals
import numpy as np
from sklearn import metrics
import seaborn as sns
import pandas as pd

# Long-format table of the (model, group, score) records collected by the preceding metric loop.
df_visualization = pd.DataFrame(
    chosen_metric_for_chosen_groups,
    columns=('model', column_defining_groups, chosen_metric),
)

def calculate_confidence_interval(data):
    """Return the 2.5th and 97.5th percentiles of ``data`` as a ``(lower, upper)`` tuple.

    ``data`` is any array-like accepted by ``np.quantile``.
    """
    lower_bound, upper_bound = np.quantile(data, [0.025, 0.975])
    return lower_bound, upper_bound
# Apply the percentile helper to the per-model scores within each group.
scores_by_group = df_visualization.groupby(column_defining_groups)[chosen_metric]
conf_intervals = scores_by_group.apply(calculate_confidence_interval)
conf_intervals

## Confidence interval for low income
 ***(0.2620689655172414, 0.5405172413793103)***

Roughly: ***(0.26, 0.54)***

# ***HYPOTHESIS TEST*** $2$
The hypothesis assumes each model's (transformer, ffnn, and xgboost) error score of low income in countries is the same.

***Introducing variables:***

- Let t be 'wrongly predicted escalation or no escalation' under the **transformer** model for the wbi income group 'Low income'
- Let f be 'wrongly predicted escalation or no escalation' under the **ffnn** model for the wbi income group 'Low income'
- Let x be 'wrongly predicted escalation or no escalation' under the **xgboost** model for the wbi income group 'Low income'
- Let c be 'wrongly predicted escalation or no escalation' under **all three models** for the wbi income group 'Low income'

# $H_0: p_c = p_t + p_f + p_x = 0.5$

# Alternative Hypothesis
The alternative hypothesis assumes that at least one of the models has a different score.

# $H_A$ (also written $H_1$): $H_0 \text{ is false}$

In [None]:
# List every column of df_indicators whose dtype is 'object'
df_indicators.select_dtypes(include=['object']).columns.tolist()

In [None]:
# Count the rows in each wbi_income_group (used to read off the 'Low income' count)
df_indicators['wbi_income_group'].value_counts()

In [None]:
# Prediction classification: 'falsely predicted' outcomes (transformer model)

threshold = 0.5 # using default classification threshold

# Boolean views of the true labels and the thresholded scores. The cast is a
# no-op for boolean labels and keeps `~` / `&` logical if they are stored as 0/1.
is_escalation = df_preds.y_true_transformer.astype(bool)
FP_pos_pred_wrong = (~is_escalation) & (df_preds.y_pred_proba_transformer > threshold)
FN_neg_pred_wrong = is_escalation & (df_preds.y_pred_proba_transformer <= threshold)

# Start from an object-dtype Series so string labels are not written into a
# float column (pandas deprecates that silent upcast). Rows that are neither a
# false positive nor a false negative keep the placeholder label.
tmpt = pd.Series("correct prediction", index=df_preds.index, dtype=object)
tmpt.loc[FP_pos_pred_wrong] = "wrongly predicted escalation"
tmpt.loc[FN_neg_pred_wrong] = "wrongly predicted no escalation"

df_preds['transformer_classifcation_performance_outcome'] = tmpt
df_preds[['y_true_transformer','y_pred_transformer','transformer_classifcation_performance_outcome']]

In [None]:
# Transformer false positives, grouped by the wbi_income_group of the index-aligned row in df
wrong_predictions_t = tmpt.loc[FP_pos_pred_wrong].groupby(df.wbi_income_group)
# 364 is a hard-coded denominator — presumably the total number of prediction rows; TODO confirm.
wrong_predictions_t.value_counts().div(364)

In [None]:
# Transformer false negatives, grouped by the wbi_income_group of the index-aligned row in df
wrong_predictions_t = tmpt.loc[FN_neg_pred_wrong].groupby(df.wbi_income_group)
# 364 is a hard-coded denominator — presumably the total number of prediction rows; TODO confirm.
wrong_predictions_t.value_counts().div(364)

 #### Wrongly predicting (false positive and false negative) ###
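 '***low income***' in countries with ***transformer*** model with false positive = ***0.063187*** and false negative = ***0.008242***
 
 $p_t= 0.063187+0.008242$ = ***0.145607*** (observed proportions; note that the two terms shown sum to 0.071429, so the reported total should be rechecked against the cell outputs)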

In [None]:
# Prediction classification: 'falsely predicted' outcomes (ffnn model)

threshold = 0.5 # using default classification threshold

# Boolean views of the true labels and the thresholded scores. The cast is a
# no-op for boolean labels and keeps `~` / `&` logical if they are stored as 0/1.
is_escalation = df_preds.y_true_ffnn.astype(bool)
FP_pos_pred_wrong = (~is_escalation) & (df_preds.y_pred_proba_ffnn > threshold)
# A false negative is a real escalation scored at or below the threshold.
# (The comparison was previously `> threshold`, which selected true positives.)
FN_neg_pred_wrong = is_escalation & (df_preds.y_pred_proba_ffnn <= threshold)

# Start from an object-dtype Series so string labels are not written into a
# float column (pandas deprecates that silent upcast). Rows that are neither a
# false positive nor a false negative keep the placeholder label.
tmp = pd.Series("correct prediction", index=df_preds.index, dtype=object)
tmp.loc[FP_pos_pred_wrong] = "wrongly predicted escalation"
tmp.loc[FN_neg_pred_wrong] = "wrongly predicted no escalation"

df_preds['ffnn_classifcation_performance_outcome'] = tmp
df_preds[['y_true_ffnn','y_pred_proba_ffnn','ffnn_classifcation_performance_outcome']]

In [None]:
# ffnn false positives, grouped by the wbi_income_group of the index-aligned row in df
wrong_predictions_f = tmp.loc[FP_pos_pred_wrong].groupby(df.wbi_income_group)
# 364 is a hard-coded denominator — presumably the total number of prediction rows; TODO confirm.
wrong_predictions_f.value_counts().div(364)

In [None]:
# ffnn false negatives, grouped by the wbi_income_group of the index-aligned row in df
wrong_predictions_f = tmp.loc[FN_neg_pred_wrong].groupby(df.wbi_income_group)
# 364 is a hard-coded denominator — presumably the total number of prediction rows; TODO confirm.
wrong_predictions_f.value_counts().div(364)

In [None]:
# Prediction classification: 'falsely predicted' outcomes (xgboost model)

threshold = 0.5 # using default classification threshold

# Boolean views of the true labels and the thresholded scores. The cast is a
# no-op for boolean labels and keeps `~` / `&` logical if they are stored as 0/1.
is_escalation = df_preds.y_true_xgboost.astype(bool)
FP_pos_pred_wrong = (~is_escalation) & (df_preds.y_pred_proba_xgboost > threshold)
FN_neg_pred_wrong = is_escalation & (df_preds.y_pred_proba_xgboost <= threshold)

# Start from an object-dtype Series so string labels are not written into a
# float column (pandas deprecates that silent upcast). Rows that are neither a
# false positive nor a false negative keep the placeholder label.
tmpr = pd.Series("correct prediction", index=df_preds.index, dtype=object)
tmpr.loc[FP_pos_pred_wrong] = "wrongly predicted escalation"
tmpr.loc[FN_neg_pred_wrong] = "wrongly predicted no escalation"

# Store the xgboost labels (tmpr); the column previously received `tmp`,
# which holds the ffnn labels.
df_preds['xgboost_classifcation_performance_outcome'] = tmpr
df_preds[['y_true_xgboost','y_pred_proba_xgboost','xgboost_classifcation_performance_outcome']]

 #### Wrongly predicting (false positive and false negative) ###
 '***low income***' in countries with ***ffnn*** model with false positive= ***0.118132*** and false negative is= ***0.038462***
 
 $p_f= 0.118132+0.038462$ = ***0.156594*** (observed proportions)

In [None]:
# Prediction classification: 'falsely predicted' outcomes (xgboost model)

threshold = 0.5 # using default classification threshold

# Boolean views of the true labels and the thresholded scores. The cast is a
# no-op for boolean labels and keeps `~` / `&` logical if they are stored as 0/1.
is_escalation = df_preds.y_true_xgboost.astype(bool)
FP_pos_pred_wrong = (~is_escalation) & (df_preds.y_pred_proba_xgboost > threshold)
FN_neg_pred_wrong = is_escalation & (df_preds.y_pred_proba_xgboost <= threshold)

# Start from an object-dtype Series so string labels are not written into a
# float column (pandas deprecates that silent upcast). Rows that are neither a
# false positive nor a false negative keep the placeholder label.
tmpr = pd.Series("correct prediction", index=df_preds.index, dtype=object)
tmpr.loc[FP_pos_pred_wrong] = "wrongly predicted escalation"
tmpr.loc[FN_neg_pred_wrong] = "wrongly predicted no escalation"

# Store the xgboost labels (tmpr); the column previously received `tmp`,
# which holds the ffnn labels.
df_preds['xgboost_classifcation_performance_outcome'] = tmpr
df_preds[['y_true_xgboost','y_pred_proba_xgboost','xgboost_classifcation_performance_outcome']]

In [None]:
# xgboost false positives (taken from tmpr), grouped by the wbi_income_group of the index-aligned row in df
wrong_predictions_f = tmpr.loc[FP_pos_pred_wrong].groupby(df.wbi_income_group)
# 364 is a hard-coded denominator — presumably the total number of prediction rows; TODO confirm.
wrong_predictions_f.value_counts().div(364)

In [None]:
# xgboost false negatives (taken from tmpr), grouped by the wbi_income_group of the index-aligned row in df
wrong_predictions_f = tmpr.loc[FN_neg_pred_wrong].groupby(df.wbi_income_group)
# 364 is a hard-coded denominator — presumably the total number of prediction rows; TODO confirm.
wrong_predictions_f.value_counts().div(364)

 #### Wrongly predicting (false positive and false negative) ###
 '***low income***' in countries with ***xgboost*** model with false positive = ***0.107143*** and false negative = ***0***
 
 $p_x = 0.107143+0$ = ***0.107143*** (observed proportions)

## Total  of observed test statistic:

$p_c$ (observed) = $p_x$ (observed) + $p_f$ (observed) + $p_t$ (observed)

$p_c$ (observed) = 0.145607 + 0.156594 + 0.107143

$p_c$ (observed) = ***0.409344***

# Hypothesis test


In [None]:
# n = 35 is the sample size used for the 'sustainable' fsi_category test.
# NOTE(review): this test concerns the 'Low income' group — confirm that 35 is the intended size here.
simulated_samples, n = 10000, 35
p = 0.5  # proportion assumed under the null hypothesis
outcome_labels = ['True Positive and True Negative', 'False Positive and False Negative']

np.random.seed(100)
# One simulated proportion of the second (error) label per repetition; the draws
# are made sequentially, one sample of size n at a time.
p_hats = [
    (np.random.choice(outcome_labels, size=n, replace=True, p=[p, 1-p]) == outcome_labels[1]).sum() / n
    for _ in range(simulated_samples)
]

observed_test_stat = 0.409344
# Two-sided p-value: share of simulated proportions at least as far from p as the observed one.
simulated_distances = np.abs(np.array(p_hats) - p)
num_more_extreme = (simulated_distances >= abs(observed_test_stat - p)).sum()
p_value = num_more_extreme / simulated_samples
p_value

The hypothesis test concludes there is ***no evidence against*** the null hypothesis because the ***p-value*** is ***0.3139*** > 0.10. We therefore conclude that there is no evidence to contradict the assumption that the models produce the same error proportion of 0.5.

# This Concludes Hypothesis Test $2$.

In [None]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder

# Load the indicators and discard the redundant saved-index column.
df = pd.read_csv("country_indicators.csv").drop(columns=["Unnamed: 0"])

# Keep only columns whose share of missing values is at most the cutoff (50%).
threshold = 0.5
missing_percent = df.isnull().mean()
df = df.loc[:, missing_percent <= threshold]

# Split the label column away from the feature columns.
target = df["fsi_category"]
df = df.drop(columns=["fsi_category"])

# Rows without a label cannot be used; drop them from features and labels alike.
mask = target.notnull()
df, target = df[mask], target[mask]

# Impute remaining gaps in numeric columns with that column's mean.
df = df.fillna(df.mean(numeric_only=True))

# Turn the category names into integer codes.
label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(target)

# Final features and labels
X_final, y_final = df, y_encoded

# Report the resulting shape and the label classes.
print("Final dataset shape:", X_final.shape)
print("Target classes:", label_encoder.classes_)


In [None]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder

# Load the indicators and discard the redundant saved-index column.
df = pd.read_csv("country_indicators.csv").drop(columns=["Unnamed: 0"])

# Keep only columns whose share of missing values is at most the cutoff (50%).
threshold = 0.5
missing_percent = df.isnull().mean()
df = df.loc[:, missing_percent <= threshold]

# Split the label column away from the feature columns.
target = df["fsi_category"]
df = df.drop(columns=["fsi_category"])

# Rows without a label cannot be used; drop them from features and labels alike.
mask = target.notnull()
df, target = df[mask], target[mask]

# Impute remaining gaps in numeric columns with that column's mean.
df = df.fillna(df.mean(numeric_only=True))

# Turn the category names into integer codes.
label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(target)

# Final features and labels
X_final, y_final = df, y_encoded

# Report the resulting shape and the label classes.
print("Final dataset shape:", X_final.shape)
print("Target classes:", label_encoder.classes_)


In [None]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder

# Load the indicators and discard the redundant saved-index column.
df = pd.read_csv("country_indicators.csv").drop(columns=["Unnamed: 0"])

# Keep only columns whose share of missing values is at most the cutoff (50%).
threshold = 0.5
missing_percent = df.isnull().mean()
df = df.loc[:, missing_percent <= threshold]

# Split the label column away from the feature columns.
target = df["fsi_category"]
df = df.drop(columns=["fsi_category"])

# Rows without a label cannot be used; drop them from features and labels alike.
mask = target.notnull()
df, target = df[mask], target[mask]

# Impute remaining gaps in numeric columns with that column's mean.
df = df.fillna(df.mean(numeric_only=True))

# Turn the category names into integer codes.
label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(target)

# Final features and labels
X_final, y_final = df, y_encoded

# Report the resulting shape and the label classes.
print("Final dataset shape:", X_final.shape)
print("Target classes:", label_encoder.classes_)
