# WEO data cleaning and reshaping

This notebook loads WEO data, filters to selected indicators, cleans numeric values, pivots to a panel (Country × Year) with indicators as columns, adds a global recession flag, drops incomplete years, and writes a cleaned CSV.

In [None]:
import pandas as pd
import csv
from pathlib import Path
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.svm import LinearSVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import classification_report, accuracy_score

# Location of the WEO export to load; fail fast with a clear error if it is missing.
p = Path(r"data.csv")
if not p.exists():
    raise FileNotFoundError(p)

# Detect the encoding by decoding the whole file: try UTF-8 first and fall back
# to latin-1, which can decode any byte sequence. Only the first 8192 characters
# are kept as the sample used for delimiter sniffing.
encoding = "utf-8"
try:
    sample = p.read_text(encoding=encoding)[:8192]
except UnicodeDecodeError:
    encoding = "latin-1"
    sample = p.read_text(encoding=encoding)[:8192]

# Restrict the sniffer to plausible field separators so free-text columns cannot
# trick it into choosing an arbitrary character, and only fall back to a comma
# when the sniffer itself reports that it could not decide (csv.Error).
try:
    delim = csv.Sniffer().sniff(sample, delimiters=",\t;|").delimiter
except csv.Error:
    delim = ","

# low_memory=False reads the file in one pass so column dtypes are inferred
# from the whole column rather than chunk by chunk.
df = pd.read_csv(p, sep=delim, encoding=encoding, low_memory=False)
print("shape:", df.shape)
df.head()

In [None]:
# Quick structural overview of the loaded frame: how many columns, and their names.
column_names = df.columns.tolist()
print("Number of columns:", len(column_names))
print("Columns:", column_names)

# Non-null count per column, largest first, so sparsely populated columns stand out.
non_null_counts = df.count()
print(non_null_counts.sort_values(ascending=False))

Let's drop the unnecessary columns and keep only the selected WEO subject codes.

In [None]:
# Metadata columns that are not needed for the Country x Year panel.
unused_cols = [
    "WEO Country Code", "ISO", "Country/Series-specific Notes", "Subject Notes",
    "Units", "Scale", "Estimates Start After", "Subject Descriptor",
]
# Rebind instead of dropping in place, and ignore columns that are already gone,
# so re-running this cell does not raise KeyError.
df = df.drop(columns=unused_cols, errors="ignore")

# WEO subject codes (indicators) to keep.
codes = {"NGDPD","NGSD_NGDP","PCPI","TM_RPCH","TX_RPCH","LUR","LP","GGR_NGDP","GGX_NGDP","GGXCNL_NGDP","GGSB_NPGDP","GGXONLB_NGDP","GGXWDN_NGDP","BCA_NGDPD"}
col = "WEO Subject Code"

if col not in df.columns:
    raise KeyError(f"Column {col!r} not found in dataframe")

# Compare on the stripped string form so codes padded with whitespace still match.
df = df[df[col].astype(str).str.strip().isin(codes)].copy()
print("shape after filter:", df.shape)
df

Clean up the data

In [None]:
# --- Step 1: Identify year columns ---
# Select year columns by name (four-digit labels) rather than by position, so a
# leftover metadata column can never be mistaken for a year.
id_cols = ["WEO Subject Code", "Country"]
year_mask = df.columns.astype(str).str.strip().str.fullmatch(r"\d{4}")
year_cols = df.columns[year_mask]

# --- Step 2: Clean numeric formatting ---
# Strip thousands separators, then coerce to numbers; anything that still is
# not numeric becomes NaN.
df[year_cols] = (
    df[year_cols]
    .replace({',': ''}, regex=True)
    .apply(pd.to_numeric, errors="coerce")
)

# --- Step 3: Clean country names ---
# Literal (non-regex) replacements: spaces and hyphens become underscores,
# apostrophes are removed.
df["Country"] = (
    df["Country"]
    .str.replace(" ", "_", regex=False)
    .str.replace("'", "", regex=False)
    .str.replace("-", "_", regex=False)
)

# --- Step 4: Melt into long format ---
# Only the year columns are unpivoted, so every resulting row is a
# (subject code, country, year, value) observation.
df_long = df.melt(id_vars=id_cols, value_vars=list(year_cols),
                  var_name="Year", value_name="Value")

# --- Step 5: Normalise the Year labels to integers ---
df_long["Year"] = df_long["Year"].astype(str).str.strip().astype(int)

# --- Step 6: Ensure values are floats ---
# The values were already converted to numbers in Step 2; this only fixes the dtype.
df_long["Value"] = df_long["Value"].astype(float)

# --- Step 7: Pivot using WEO Subject Code as columns ---
df_pivot = df_long.pivot_table(
    index=["Country", "Year"],
    columns="WEO Subject Code",   # use NGDPD, PCPI, etc.
    values="Value",
    aggfunc="first"
).reset_index()

# --- Step 8: Finalize ---
# Drop the leftover columns-axis name and index the panel by Year.
df_pivot.columns.name = None
df_pivot = df_pivot.set_index("Year")

df_pivot.head()


# Add recession years and drop NA

In [None]:
# Make sure the Year index is integer-typed so it can be matched against the year list below.
df_pivot.index = df_pivot.index.astype(int)

# Years treated as global recession years.
global_recession_years = [
    1981, 1982, 1983,
    1990, 1991, 1992, 1993,
    2008, 2009, 2010,
    2020, 2021,
]

# Flag each row with 1 when its year is in the list above (0 otherwise), then
# keep only the rows in which every column is populated.
in_recession = df_pivot.index.isin(global_recession_years)
df_pivot = df_pivot.assign(Global_Recession=in_recession.astype(int)).dropna()
df_pivot


Let's take a look at which countries remain.

In [None]:
# Distinct country names still present in the panel.
remaining_countries = df_pivot["Country"].unique()
remaining_countries

Keep the years up to and including 2024 for modelling, and set aside 2025 and later as the prediction set.

In [None]:
# Rows after 2024 form the prediction set; the label and the country name are
# removed so that only the indicator columns remain.
is_future = df_pivot.index > 2024
df_predict = df_pivot[is_future].drop(columns=["Global_Recession", "Country"])
df_predict

In [None]:
# Rows up to and including 2024 are kept for exploration and model training.
up_to_2024 = df_pivot.index <= 2024
df_filtered = df_pivot[up_to_2024]
df_filtered

Let's make a heatmap to see how strongly the indicators are correlated with each other.

In [None]:
# --- 3. Exploratory Plots ---
# Pairwise correlations of every column except the country name
# (this includes the Global_Recession flag).
numeric_panel = df_filtered.drop(columns=["Country"])
corr = numeric_panel.corr()

# Annotated heatmap with a diverging palette centred on zero correlation.
heatmap_style = {"annot": True, "cmap": "coolwarm", "center": 0}
plt.figure(figsize=(10, 8))
sns.heatmap(corr, **heatmap_style)
plt.title("Correlation Heatmap of Economic Indicators")
plt.show()

Create the feature matrix X and the target y so that we can train our models.

In [None]:
# df_filtered is indexed by Year. The target is the recession flag; the
# features are every remaining column except the country name.
target_col = "Global_Recession"
y = df_filtered[target_col]
X = df_filtered.drop(columns=[target_col, "Country"])

# Modelling portion on full dataset

In [None]:
# Model imports (data/numeric/visual imports are at the top of the notebook)
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import classification_report

# Hold out 20% of the rows for testing, stratified on y so both splits keep the
# same class balance, with a fixed seed so the split is reproducible.
# NOTE(review): Global_Recession is derived from the Year index alone and the
# rows are Country x Year, so a random row split likely puts the same year in
# both train and test -- consider splitting by year; confirm intent.
split_options = {"test_size": 0.2, "random_state": 42, "stratify": y}
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, VotingClassifier
from sklearn.svm import LinearSVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import accuracy_score

# --- Train models (parameters specified) ---
# The linear models are wrapped in a pipeline with StandardScaler, so the
# scaler is fitted on the training rows as part of fit().
logit = make_pipeline(StandardScaler(), LogisticRegression(C=10, max_iter=10000, random_state=42))
logit.fit(X_train, y_train)

rf = RandomForestClassifier(n_estimators=200, random_state=42)
rf.fit(X_train, y_train)

gb = GradientBoostingClassifier(n_estimators=200, learning_rate=0.1, max_depth=3, random_state=42)
gb.fit(X_train, y_train)

svc = make_pipeline(StandardScaler(), LinearSVC(C=10, max_iter=10000, dual=False, random_state=42))
svc.fit(X_train, y_train)

# NOTE(review): KNN and MLP are trained on the unscaled features, unlike the
# two pipelines above -- confirm this is intended.
knn = KNeighborsClassifier(n_neighbors=5)
knn.fit(X_train, y_train)

nb = GaussianNB()
nb.fit(X_train, y_train)

mlp = MLPClassifier(hidden_layer_sizes=(50,), max_iter=1000, random_state=42)
mlp.fit(X_train, y_train)

# Soft-voting ensemble. The full logistic pipeline (scaler + classifier) is
# passed in, not just its final step: VotingClassifier refits clones of its
# estimators, so a bare LogisticRegression would be trained on unscaled
# features and would not match the standalone "Logistic Regression" model.
ensemble = VotingClassifier(
    estimators=[("logit", logit), ("rf", rf), ("gb", gb)],
    voting="soft"
)
ensemble.fit(X_train, y_train)

# --- Collect training & test accuracy ---
# One [train accuracy, test accuracy] pair per fitted model, in display order.
fitted_models = {
    "Logistic Regression": logit,
    "Random Forest": rf,
    "Gradient Boosting": gb,
    "Linear SVM (scaled)": svc,
    "KNN": knn,
    "Naive Bayes": nb,
    "MLP": mlp,
    "Ensemble (RF+Logit+GB)": ensemble,
}
results = {
    name: [
        accuracy_score(y_train, model.predict(X_train)),
        accuracy_score(y_test, model.predict(X_test)),
    ]
    for name, model in fitted_models.items()
}

# Models as rows, the two accuracy figures as columns.
summary_df = pd.DataFrame(results, index=["Training Accuracy", "Test Accuracy"]).T
print(summary_df)


In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

feature_names = X_train.columns

# --- Logistic Regression: signed coefficients of the classifier step in the pipeline ---
coef = logit.named_steps['logisticregression'].coef_[0]
logit_importance = pd.DataFrame(
    {"Feature": feature_names, "Coefficient": coef, "Abs_Importance": np.abs(coef)}
).sort_values(by="Abs_Importance", ascending=False)

# --- Random Forest: importances reported by the fitted forest ---
rf_importance = pd.DataFrame(
    {"Feature": feature_names, "Importance": rf.feature_importances_}
).sort_values(by="Importance", ascending=False)

# --- Gradient Boosting: importances reported by the fitted booster ---
gb_importance = pd.DataFrame(
    {"Feature": feature_names, "Importance": gb.feature_importances_}
).sort_values(by="Importance", ascending=False)

# --- Linear SVM: signed coefficients of the classifier step in the pipeline ---
coef_svm = svc.named_steps['linearsvc'].coef_[0]
svc_importance = pd.DataFrame(
    {"Feature": feature_names, "Coefficient": coef_svm, "Abs_Importance": np.abs(coef_svm)}
).sort_values(by="Abs_Importance", ascending=False)

# --- Plot all in one window ---
fig, axes = plt.subplots(2, 2, figsize=(16, 12))

# Re-sort ascending so the largest bar ends up at the top of each horizontal chart.
logit_sorted = logit_importance.sort_values(by="Abs_Importance", ascending=True)
rf_sorted = rf_importance.sort_values(by="Importance", ascending=True)
gb_sorted = gb_importance.sort_values(by="Importance", ascending=True)
svc_sorted = svc_importance.sort_values(by="Abs_Importance", ascending=True)

# (axes, data, column to plot, title, whether to draw a zero reference line)
panels = [
    (axes[0, 0], logit_sorted, "Coefficient", "Logistic Regression Feature Importance", True),
    (axes[0, 1], rf_sorted, "Importance", "Random Forest Feature Importance", False),
    (axes[1, 0], gb_sorted, "Importance", "Gradient Boosting Feature Importance", False),
    (axes[1, 1], svc_sorted, "Coefficient", "Linear SVM Feature Importance", True),
]
for ax, frame, value_col, title, is_signed in panels:
    ax.barh(frame["Feature"], frame[value_col], color="skyblue")
    ax.set_title(title)
    if is_signed:
        # Coefficients can be negative, so mark zero for reference.
        ax.axvline(0, color="black", linewidth=0.8)

plt.tight_layout()
plt.show()


In [None]:
# --- Collect models ---
# Display name -> fitted estimator, in the order the columns should appear.
models = {
    "Logistic Regression": logit,
    "Random Forest": rf,
    "Gradient Boosting": gb,
    "Linear SVM (scaled)": svc,
    "KNN": knn,
    "Naive Bayes": nb,
    "MLP": mlp,
    "Ensemble (RF+Logit+GB)": ensemble,
}

# --- Run predictions for all models in one go ---
# One column of predicted labels per model. The rows follow df_predict, whose
# index is Year only (the Country column was dropped), so rows for the same
# year are not distinguishable by index alone.
predicted_by_model = {}
for label, estimator in models.items():
    predicted_by_model[label] = estimator.predict(df_predict)
predictions = pd.DataFrame(predicted_by_model, index=df_predict.index)

# --- Show results ---
print(predictions)      # prints the whole frame (pandas truncates long output)

