In [None]:
import sys
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
sys.path.append(os.path.abspath(os.path.join(os.getcwd(), '..')))
from src.mappings import PT_value_mapping
from src.TimeSeriesDataset import TimeSeriesDataset as TSD
from src.mappings import value_mapping, ideology_mapping, social_complexity_mapping


In [None]:
# Input files, relative to the notebook's working directory.
timeseries_path = '../datasets/100_yr_dataset.xlsx'
exchanges_path = '../datasets/exchanges.csv'

# Time-series dataset wrapper and the table of exchange polities.
dataset = TSD(file_path=timeseries_path)
exchanges_df = pd.read_csv(exchanges_path)

In [None]:
# Unique polity identifiers that appear in the exchanges table.
exchange_polities = exchanges_df.PolityID.unique()

# Add a 0/1 `exchange` indicator column to both the `scv` and `raw` frames:
# 1 where the row's PolityName is one of the exchange polities, else 0.
# NOTE(review): the identifiers come from `PolityID` but are matched against
# `PolityName` — confirm both columns use the same identifier scheme.
for frame in (dataset.scv, dataset.raw):
    frame['exchange'] = frame.PolityName.isin(exchange_polities).astype('int64')

In [None]:
from sklearn.impute import KNNImputer
from sklearn.preprocessing import StandardScaler

# Columns that are imputed here and reused by the PCA cell below.
pca_cols = ['Pop', 'Terr', 'Cap', 'Hierarchy', 'Government', 'Infrastructure', 'Information', 'Money']

scaler = StandardScaler()
imputer = KNNImputer(n_neighbors=10, weights='uniform')

# Standardise the columns, fill missing entries with the KNN imputer,
# then map the result back to the original scale.
features = dataset.scv[pca_cols]
filled_values = scaler.inverse_transform(
    imputer.fit_transform(scaler.fit_transform(features))
)

# Re-wrap the array with the original column labels and row index.
imputed_df = pd.DataFrame(filled_values, columns=features.columns, index=features.index)

# `scv_imputed` is a copy of `scv` whose `pca_cols` are replaced by the imputed values.
dataset.scv_imputed = dataset.scv.copy()
dataset.scv_imputed[pca_cols] = imputed_df[pca_cols]

In [None]:
# Run the dataset's PCA helper on the imputed columns; the meaning of each
# keyword is defined by TimeSeriesDataset.compute_PCA (not shown in this notebook).
dataset.compute_PCA(
    cols=pca_cols,
    col_name='PC',
    n_cols=1,
    n_PCA=len(pca_cols),
    contributions=True,
    rescale=True,
)

In [None]:
import seaborn as sns

sc_cols = ['Pop','Terr','Cap','Hierarchy','Government','Information','Infrastructure','Money']
plot_df = dataset.scv_imputed

# Pairwise scatter matrix of the imputed variables, coloured by the 0/1
# `exchange` flag, with histograms on the diagonal.
# Panel size is given through `height` only: seaborn's deprecated `size` alias
# overrides `height` (and warns), so passing both hid which value was in effect.
# height=5 is the value that was actually being applied.
g = sns.pairplot(
    data=plot_df[sc_cols + ['exchange']],
    hue='exchange',
    diag_kind='hist',
    plot_kws={'alpha': 0.3},  # same transparency for every point, so dense regions stay readable
    markers='o',
    height=5,
)

plt.show()

In [None]:
# Histogram of the R2 column of `dataset.imputation_fits`.
# NOTE(review): in the cells above `dataset` is imputed with KNNImputer rather than
# `impute_missing_values` — confirm `imputation_fits` is populated for it.
r2_ax = dataset.imputation_fits.R2.hist()
r2_ax.set_xlabel('R2')
r2_ax.set_ylabel('Frequency')
plt.show()

# Stacked histogram of rows per Year, split by the `exchange` flag.
year_ax = sns.histplot(data=dataset.scv, x='Year', hue='exchange', multiple='stack')
year_ax.set_xlabel('Year')
year_ax.set_ylabel('Frequency')
plt.show()

In [None]:
# Second dataset, loaded from the 25_yr workbook and restricted to exchange polities.
dataset_exchange = TSD(file_path='../datasets/25_yr_dataset.xlsx')
exchange_polities = exchanges_df.PolityID.unique()

# Membership mask, computed once and reused for both the flag and the filter.
in_exchange = dataset_exchange.scv.PolityName.isin(exchange_polities)

# 0/1 indicator column on the full frame ...
dataset_exchange.scv['exchange'] = in_exchange.astype('int64')

# ... then keep only the rows belonging to exchange polities.
dataset_exchange.scv = dataset_exchange.scv.loc[in_exchange]

In [None]:

# Columns handed to the dataset's own imputation routine.
pca_cols = [
    'Pop', 'Terr', 'Cap', 'Hierarchy',
    'Government', 'Infrastructure', 'Information', 'Money',
]
dataset_exchange.impute_missing_values(pca_cols)


In [None]:
# PCA on the exchange-only dataset; keyword semantics are defined by
# TimeSeriesDataset.compute_PCA (not shown in this notebook).
dataset_exchange.compute_PCA(
    cols=pca_cols,
    col_name='PCA',
    n_cols=2,
    n_PCA=len(pca_cols),
)

# Carry the exchange flag over to the imputed frame (aligned on the row index).
dataset_exchange.scv_imputed['exchange'] = dataset_exchange.scv['exchange']

In [None]:
import seaborn as sns

sc_cols = ['Pop','Terr','Cap','Hierarchy','Government','Information','Infrastructure','Money']

# Mark rows that had at least one missing value in `sc_cols` before imputation.
had_missing = dataset_exchange.scv[sc_cols].isna().any(axis=1)
dataset_exchange.scv_imputed['imputed'] = 0
dataset_exchange.scv_imputed.loc[had_missing, 'imputed'] = 1
plot_df = dataset_exchange.scv_imputed

# Pairwise scatter matrix coloured by the `imputed` flag, with histograms on
# the diagonal.
# Panel size is given through `height` only: seaborn's deprecated `size` alias
# overrides `height` (and warns), so passing both hid which value was in effect.
# height=3 is the value that was actually being applied.
g = sns.pairplot(
    data=plot_df[sc_cols + ['imputed']],
    hue='imputed',
    diag_kind='hist',
    plot_kws={'alpha': 0.3},  # same transparency for every point, so dense regions stay readable
    markers='o',
    height=3,
)

plt.show()

In [None]:
# Overlay the R2 distributions of both datasets on one axes. Each histogram is
# semi-transparent and labelled so the second does not hide the first and the
# two series can be told apart in the legend.
# NOTE(review): in the cells above only `dataset_exchange` calls
# `impute_missing_values` — confirm `dataset.imputation_fits` is populated.
ax = dataset.imputation_fits.R2.hist(alpha=0.5, label='100_yr_dataset')
dataset_exchange.imputation_fits.R2.hist(
    ax=ax, alpha=0.5, label='25_yr_dataset (exchange polities)'
)
ax.set_xlabel('R2')
ax.set_ylabel('Frequency')
ax.legend()
plt.show()