# WEO data cleaning and reshaping

This notebook loads WEO data, filters to selected indicators, cleans numeric values, pivots to a panel (Country × Year) with indicators as columns, adds a global recession flag, drops incomplete years, and writes a cleaned CSV.

In [None]:
import pandas as pd
import csv
from pathlib import Path
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.svm import LinearSVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import classification_report, accuracy_score

# Location of the input file; stop immediately if it is not there.
p = Path(r"data.csv")
if not p.exists():
    raise FileNotFoundError(p)

# Decode the file as UTF-8 when possible, otherwise fall back to Latin-1.
# Only the first 8192 characters are kept, as a sample for delimiter sniffing.
try:
    encoding = "utf-8"
    sample = p.read_text(encoding=encoding)[:8192]
except UnicodeDecodeError:
    encoding = "latin-1"
    sample = p.read_text(encoding=encoding)[:8192]

# Let the csv module guess the delimiter from the sample; default to a comma
# whenever the guess fails.
try:
    sniffed_dialect = csv.Sniffer().sniff(sample)
except Exception:
    delim = ","
else:
    delim = sniffed_dialect.delimiter

read_options = dict(sep=delim, encoding=encoding, low_memory=False, parse_dates=True)
df = pd.read_csv(p, **read_options)
print("shape:", df.shape)
df.head()

## 1) Load CSV

Detect encoding and delimiter, then read the CSV into a DataFrame. This cell expects a local `data.csv` file.

In [None]:
# Collect the distinct country names and show them, so formatting problems and
# near-duplicate spellings can be spotted by eye. Without the trailing
# expression the cell would compute the array but display nothing.
Unique_Country = df['Country'].unique()
Unique_Country

## 2) Inspect countries

Quick check to list unique country names so you can verify formatting and spot duplicates or unexpected values.

In [None]:
# How many columns the frame has, and what they are called
column_names = df.columns.tolist()
print("Number of columns:", df.shape[1])
print("Columns:", column_names)

# Non-null tally per column, fullest first, so sparse columns end up at the bottom
non_null_counts = df.count()
print(non_null_counts.sort_values(ascending=False))

## 3) Column overview

Show number of columns, column names and non-null counts to locate missing-data-heavy columns that may need attention.

In [None]:
# Descriptive columns that are not needed for the panel; removed from `df` in place.
metadata_columns = [
    "WEO Country Code",
    "ISO",
    "Country/Series-specific Notes",
    "Subject Notes",
    "Units",
    "Scale",
    "Estimates Start After",
    "Subject Descriptor",
]
df.drop(columns=metadata_columns, inplace=True)

# Column holding the indicator code, and the set of codes to keep
col = "WEO Subject Code"
codes = {
    "NGDPD", "NGSD_NGDP", "PCPI", "TM_RPCH", "TX_RPCH", "LUR", "LP",
    "GGR_NGDP", "GGX_NGDP", "GGXCNL_NGDP", "GGSB_NPGDP", "GGXONLB_NGDP",
    "GGXWDN_NGDP", "BCA_NGDPD",
}

if col not in df.columns:
    raise KeyError(f"Column {col!r} not found in dataframe")

# Compare on the stripped string form so padded codes still match
wanted_rows = df[col].astype(str).str.strip().isin(codes)
df = df[wanted_rows].copy()
print("shape after filter:", df.shape)
df

## 4) Filter indicators and drop unused columns

We drop metadata columns and filter rows to the indicator codes of interest (NGDPD, PCPI, ...). Keep an eye on column names — mismatches will raise a KeyError.

### Cleaning and Reshaping

This next code cell identifies year columns, coerces values to numeric, standardizes country names, melts the table to long format, cleans the value column, and pivots to a panel with indicators as columns.
Run this section carefully — it performs destructive transformations on `df`.

In [None]:
# Reshape the wide WEO table (one row per Country x Indicator, one column per
# year) into a panel with one row per Country x Year and one column per indicator.
# Reads and updates `df`; produces `year_cols`, `df_long` and `df_pivot`.

# --- Step 1: Identify year columns by name ---
# Picking "everything after the first two columns" breaks silently when the
# column order changes or a metadata column is left in the frame, so select the
# columns whose label is a four-digit year instead.
id_cols = ["WEO Subject Code", "Country"]
missing_ids = [c for c in id_cols if c not in df.columns]
if missing_ids:
    raise KeyError(f"Required column(s) not found in dataframe: {missing_ids}")

is_year = np.asarray(
    df.columns.astype(str).str.strip().str.fullmatch(r"\d{4}"), dtype=bool
)
year_cols = df.columns[is_year]
if len(year_cols) == 0:
    raise ValueError("No four-digit year columns found in dataframe")

# --- Step 2: Clean numeric formatting ---
# Drop thousands separators, then coerce; anything non-numeric becomes NaN.
df[year_cols] = (
    df[year_cols]
    .replace({',': ''}, regex=True)
    .apply(pd.to_numeric, errors="coerce")
)

# --- Step 3: Clean identifier columns ---
# The indicator filter compares stripped codes, so strip them here as well;
# otherwise "NGDPD " and "NGDPD" would become two separate pivot columns.
df["WEO Subject Code"] = df["WEO Subject Code"].astype(str).str.strip()

# regex=False makes these literal replacements regardless of the pandas
# version's default for `regex`.
df["Country"] = (
    df["Country"]
    .str.replace(" ", "_", regex=False)
    .str.replace("'", "", regex=False)
    .str.replace("-", "_", regex=False)
)

# --- Step 4: Melt into long format (year columns only) ---
df_long = df.melt(
    id_vars=id_cols,
    value_vars=list(year_cols),
    var_name="Year",
    value_name="Value",
)

# --- Step 5: Year labels -> integers ---
# Every label already matched \d{4} after stripping, so the cast cannot fail.
df_long["Year"] = df_long["Year"].astype(str).str.strip().astype(int)

# --- Step 6: Values ---
# "Value" is already numeric: the year columns were coerced in Step 2, so no
# second string round-trip is needed.

# --- Step 7: Pivot using WEO Subject Code as columns ---
# aggfunc="first" silently collapses duplicate keys, so report them first.
key_cols = ["Country", "Year", "WEO Subject Code"]
n_duplicates = int(df_long.duplicated(subset=key_cols).sum())
if n_duplicates:
    print(
        f"Warning: {n_duplicates} duplicate (Country, Year, Indicator) rows; "
        "the pivot keeps the first non-null value of each."
    )

df_pivot = df_long.pivot_table(
    index=["Country", "Year"],
    columns="WEO Subject Code",   # use NGDPD, PCPI, etc.
    values="Value",
    aggfunc="first"
).reset_index()

# --- Step 8: Finalize ---
# Remove the leftover columns-axis label and index the panel by Year
# (the index is not unique: one row per country for each year).
df_pivot.columns.name = None
df_pivot = df_pivot.set_index("Year")

df_pivot.head()


## 5) Clean numeric values, standardize country names, and reshape

Steps:
- Identify year columns and coerce them to numeric (remove commas).
- Standardize country strings so each country is always spelled the same way (spaces and hyphens become underscores; apostrophes are removed).
- Melt to long format (WEO Subject Code, Country, Year, Value).
- Keep only rows where Year looks like YYYY and ensure Value is numeric.
- Pivot so each row is Country × Year and columns are indicators.

If duplicate (Country, Year, Indicator) rows exist, the code uses `aggfunc='first'`. Inspect duplicates if results look wrong.

In [None]:
# Make sure the Year index holds integers before comparing it with integer years
df_pivot.index = df_pivot.index.astype(int)

# Years treated as global recessions, written as consecutive spans
global_recession_years = [
    *range(1981, 1984),   # 1981-1983
    *range(1990, 1994),   # 1990-1993
    *range(2008, 2011),   # 2008-2010
    *range(2020, 2022),   # 2020-2021
]

# 1 where the row's year is in the list above, 0 otherwise
in_recession = df_pivot.index.isin(global_recession_years)
df_pivot["Global_Recession"] = in_recession.astype(int)

df_pivot


### Modeling pipeline

Prepare features `X` and target `y` and run a set of classification models (Logistic Regression, Random Forest, Gradient Boosting, SVM, KNN, Naive Bayes, MLP).
This section performs train/test splitting and model training — ensure `df_filtered` and `X`/`y` are built from the cleaned `df_pivot` before running.

## 6) Add global recession flag

Convert the Year index to integer and add a `Global_Recession` column marking selected years. Adjust the list of recession years if your definition differs.

In [None]:
# Keep only the rows in which every column has a value
df_pivot = df_pivot.dropna(axis=0, how="any")
df_pivot

## 7) Remove incomplete rows

Dropping every row that contains a NaN removes each Country × Year observation that is missing at least one indicator value. Consider imputing or relaxing this if you need more coverage.

In [None]:
# Distinct country labels still present in the panel
df_pivot.loc[:, "Country"].unique()

In [None]:
# Map countries to continents and create continent-specific DataFrames.
#
# `df_filtered` is only assigned in the modelling cell further down, so on a
# fresh kernel (Restart & Run All) this cell would stop with a NameError.
# Build it here from `df_pivot` with the same year cut-off used there.
df_filtered = df_pivot.loc[df_pivot.index <= 2024]

# Continent code (as returned by pycountry_convert) -> label used in this notebook.
# Built once instead of on every lookup.
continent_map = {
    'AF': 'Africa',
    'AS': 'Asia',
    'EU': 'Europe',
    'NA': 'North_America',
    'OC': 'Oceania',
    'SA': 'South_America',
    'AN': 'Antarctica'
}

# Static table keyed by the cleaned (underscore) country names. It is the only
# source when pycountry is unavailable, and a second chance for names that
# pycountry cannot resolve (extend as needed).
fallback = {
    'United_States': 'North_America', 'Canada': 'North_America', 'Mexico': 'North_America',
    'China': 'Asia', 'India': 'Asia', 'Japan': 'Asia', 'Afghanistan': 'Asia',
    'Korea': 'Asia', 'Indonesia': 'Asia', 'Thailand': 'Asia', 'Vietnam': 'Asia',
    'Germany': 'Europe', 'France': 'Europe', 'United_Kingdom': 'Europe', 'Italy': 'Europe',
    'Spain': 'Europe', 'Russia': 'Europe', 'Turkey': 'Europe', 'Poland': 'Europe',
    'Brazil': 'South_America', 'Argentina': 'South_America', 'Chile': 'South_America',
    'Colombia': 'South_America', 'Peru': 'South_America', 'Venezuela': 'South_America',
    'Australia': 'Oceania', 'New_Zealand': 'Oceania',
    'South_Africa': 'Africa', 'Nigeria': 'Africa', 'Egypt': 'Africa', 'Zimbabwe': 'Africa',
    'Kenya': 'Africa', 'Ethiopia': 'Africa', 'Morocco': 'Africa',
    # Additional countries from unmapped list
    'Albania': 'Europe', 'Algeria': 'Africa', 'Austria': 'Europe', 'Barbados': 'North_America',
    'Belgium': 'Europe', 'Bolivia': 'South_America', 'Bosnia_and_Herzegovina': 'Europe',
    'Bulgaria': 'Europe', 'Cabo_Verde': 'Africa', 'Costa_Rica': 'North_America',
    'Croatia': 'Europe', 'Cyprus': 'Europe', 'Czech_Republic': 'Europe', 'Denmark': 'Europe',
    'Dominican_Republic': 'North_America', 'Estonia': 'Europe', 'Finland': 'Europe',
    'Hungary': 'Europe', 'Iceland': 'Europe', 'Ireland': 'Europe',
    'Islamic_Republic_of_Iran': 'Asia', 'Israel': 'Asia', 'Jordan': 'Asia',
    'Kazakhstan': 'Asia', 'Latvia': 'Europe', 'Lebanon': 'Asia', 'Lithuania': 'Europe',
    'Luxembourg': 'Europe', 'Malta': 'Europe', 'Netherlands': 'Europe',
    'North_Macedonia': 'Europe', 'Norway': 'Europe', 'Pakistan': 'Asia',
    'Panama': 'North_America', 'Paraguay': 'South_America', 'Portugal': 'Europe',
    'Romania': 'Europe', 'Saudi_Arabia': 'Asia', 'Serbia': 'Europe', 'Seychelles': 'Africa',
    'Slovak_Republic': 'Europe', 'Slovenia': 'Europe', 'Sweden': 'Europe',
    'Switzerland': 'Europe', 'Syria': 'Asia', 'Taiwan_Province_of_China': 'Asia',
    'Trinidad_and_Tobago': 'North_America', 'Türkiye': 'Europe', 'Uruguay': 'South_America'
}

# pycountry + pycountry_convert are optional; without them only `fallback` is used.
try:
    import pycountry
    import pycountry_convert as pc
except ImportError:
    pycountry = None
    pc = None


def country_to_continent(name):
    """Return the continent label for a country name, or 'Unknown'.

    Parameters
    ----------
    name : str
        Country name; underscores and spaces are treated interchangeably.

    Returns
    -------
    str
        One of the values of `continent_map` / `fallback`, or 'Unknown' when
        neither pycountry nor the static table recognises the name.
    """
    if pycountry is not None:
        try:
            country = pycountry.countries.lookup(name.replace('_', ' '))
            cc = pc.country_alpha2_to_continent_code(country.alpha_2)
            continent = continent_map.get(cc, 'Unknown')
            if continent != 'Unknown':
                return continent
        except Exception:
            # Deliberate best effort: any lookup failure just means pycountry
            # does not know this spelling, so fall through to the static table.
            pass
    return fallback.get(name.replace(' ', '_'), 'Unknown')


# Add Continent column to a copy of df_filtered (df_filtered itself is left untouched)
df_filtered_copy = df_filtered.copy()
df_filtered_copy['Continent'] = df_filtered_copy['Country'].astype(str).apply(country_to_continent)

# Create continent-specific DataFrames. They are collected in `continent_frames`
# (keyed by variable name) and, as before, also published as globals named
# df_<Continent>.
continent_names = sorted(df_filtered_copy['Continent'].dropna().unique())
continent_frames = {}
created = []

for cont in continent_names:
    safe = str(cont).replace(' ', '_')
    df_name = f'df_{safe}'
    # Rows of this continent, without the helper Continent column
    subset = (
        df_filtered_copy[df_filtered_copy['Continent'] == cont]
        .drop(columns=['Continent'])
        .copy()
    )
    continent_frames[df_name] = subset
    globals()[df_name] = subset
    created.append((cont, df_name, len(subset)))

# Print summary
print('Created continent-specific DataFrames:')
for cont, var_name, rows in created:
    print(f' - {cont}: {var_name} (rows: {rows})')

In [None]:
# Restrict the panel to years up to and including 2024
within_range = df_pivot.index <= 2024
df_filtered = df_pivot.loc[within_range]

# Target is the recession flag; features are every column except the flag
# and the country label.
y = df_filtered["Global_Recession"]
X = df_filtered.drop(columns=["Global_Recession", "Country"])


In [None]:
# --- 3. Exploratory Plots ---
# Pairwise correlations of every column except the Country label
corr = df_filtered.drop(columns=["Country"]).corr()

# Heatmap drawn on an explicit Axes, colours centred on zero
fig, ax = plt.subplots(figsize=(10,8))
sns.heatmap(corr, annot=True, cmap="coolwarm", center=0, ax=ax)
ax.set_title("Correlation Heatmap of Economic Indicators")
plt.show()

In [None]:
# Model imports (data/numeric/visual imports are at the top of the notebook)
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import classification_report

# Hold out 20% of the rows for testing. The split is stratified on y and uses
# a fixed random_state so it is repeatable.
split_options = {"test_size": 0.2, "random_state": 42, "stratify": y}
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)


In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score

# Fit a logistic regression on the training split
model = LogisticRegression(max_iter=1000)
model.fit(X_train, y_train)

# Evaluate on the held-out split
y_pred = model.predict(X_test)
print("Classification Report:\n", classification_report(y_test, y_pred))

# Accuracy on both splits
train_acc = accuracy_score(y_train, model.predict(X_train))
test_acc = accuracy_score(y_test, y_pred)
for line in (f"Training Accuracy: {train_acc:.3f}", f"Test Accuracy: {test_acc:.3f}"):
    print(line)

# One coefficient per feature; rank the features by absolute coefficient.
# `features` is reused by the later model cells.
features = X_train.columns
coefficients = model.coef_[0]

logit_importance = (
    pd.DataFrame({"Feature": features, "Coefficient": coefficients})
    .assign(Abs_Importance=np.abs(coefficients))
    .sort_values(by="Abs_Importance", ascending=False)
)

print("Logistic Regression Feature Importance:")
print(logit_importance)

# Horizontal bar chart of the signed coefficients
fig, ax = plt.subplots(figsize=(8,6))
ax.barh(logit_importance["Feature"], logit_importance["Coefficient"])
ax.set_xlabel("Coefficient (Impact on Log-Odds)")
ax.set_title("Logistic Regression Feature Importance")
plt.show()


In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score

# Fit a 100-tree random forest with a fixed seed
rf = RandomForestClassifier(n_estimators=100, random_state=42)
rf.fit(X_train, y_train)

# Evaluate on the held-out split
y_pred = rf.predict(X_test)
print("Classification Report:\n", classification_report(y_test, y_pred))

# Accuracy on both splits
train_acc = accuracy_score(y_train, rf.predict(X_train))
test_acc = accuracy_score(y_test, y_pred)
for line in (f"Training Accuracy: {train_acc:.3f}", f"Test Accuracy: {test_acc:.3f}"):
    print(line)

# Importance per feature, largest first (`features` comes from the logistic
# regression cell)
rf_importance = (
    pd.DataFrame({"Feature": features, "Importance": rf.feature_importances_})
    .sort_values(by="Importance", ascending=False)
)

print("Random Forest Feature Importance:")
print(rf_importance)

# Horizontal bar chart of the importances
fig, ax = plt.subplots(figsize=(8,6))
ax.barh(rf_importance["Feature"], rf_importance["Importance"])
ax.set_xlabel("Importance (Mean Decrease in Impurity)")
ax.set_title("Random Forest Feature Importance")
plt.show()


In [None]:
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import classification_report, accuracy_score

# Fit a gradient boosting classifier with a fixed seed
gb = GradientBoostingClassifier(random_state=42)
gb.fit(X_train, y_train)

# Evaluate on the held-out split
y_pred = gb.predict(X_test)
print("Gradient Boosting:\n", classification_report(y_test, y_pred))

# Accuracy on both splits
train_acc = accuracy_score(y_train, gb.predict(X_train))
test_acc = accuracy_score(y_test, y_pred)
for line in (f"Training Accuracy: {train_acc:.3f}", f"Test Accuracy: {test_acc:.3f}"):
    print(line)

# Importance per feature, largest first (`features` comes from the logistic
# regression cell)
gb_importance = (
    pd.DataFrame({"Feature": features, "Importance": gb.feature_importances_})
    .sort_values(by="Importance", ascending=False)
)

print("Gradient Boosting Feature Importance:")
print(gb_importance)

# Horizontal bar chart of the importances
fig, ax = plt.subplots(figsize=(8,6))
ax.barh(gb_importance["Feature"], gb_importance["Importance"])
ax.set_xlabel("Importance (Mean Decrease in Impurity)")
ax.set_title("Gradient Boosting Feature Importance")
plt.show()


In [None]:
from sklearn.svm import LinearSVC
from sklearn.metrics import accuracy_score

# Fit a linear support vector classifier (fixed seed, up to 5000 iterations)
linear_svm = LinearSVC(random_state=42, max_iter=5000)
linear_svm.fit(X_train, y_train)

# One coefficient per feature; rank by absolute size (`features` comes from
# the logistic regression cell)
svm_coefficients = linear_svm.coef_[0]
svm_importance = (
    pd.DataFrame({"Feature": features, "Coefficient": svm_coefficients})
    .assign(Abs_Importance=np.abs(svm_coefficients))
    .sort_values(by="Abs_Importance", ascending=False)
)

print("Linear SVM Feature Importance:")
print(svm_importance)

# Accuracy on both splits
train_acc = accuracy_score(y_train, linear_svm.predict(X_train))
test_acc = accuracy_score(y_test, linear_svm.predict(X_test))
for line in (f"Training Accuracy: {train_acc:.3f}", f"Test Accuracy: {test_acc:.3f}"):
    print(line)


In [None]:
from sklearn.metrics import classification_report, accuracy_score

# The three remaining classifiers, configured up front
knn = KNeighborsClassifier(n_neighbors=5)
nb = GaussianNB()
mlp = MLPClassifier(hidden_layer_sizes=(50,), max_iter=1000, random_state=42)

# Fit and report each one in turn: KNN, then Naive Bayes, then MLP
labelled_models = [("KNN", knn), ("Naive Bayes", nb), ("MLP", mlp)]
last_position = len(labelled_models) - 1

for position, (label, clf) in enumerate(labelled_models):
    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)
    print(f"{label}:\n", classification_report(y_test, y_pred))

    train_acc = accuracy_score(y_train, clf.predict(X_train))
    test_acc = accuracy_score(y_test, y_pred)
    print(f"{label} Training Accuracy: {train_acc:.3f}")
    # A blank line separates one model's report from the next; none after the last
    separator = "\n" if position < last_position else ""
    print(f"{label} Test Accuracy: {test_acc:.3f}{separator}")


In [None]:
# Compare feature importance for models that support it:
# one panel per model, described as (x column, data frame, panel title)
panels = [
    ("Abs_Importance", logit_importance, "Logistic Regression Coefficients"),
    ("Importance", rf_importance, "Random Forest Importance"),
    ("Importance", gb_importance, "Gradient Boosting Importance"),
]

fig, axes = plt.subplots(1, 3, figsize=(18,6))
for ax, (x_column, frame, panel_title) in zip(axes, panels):
    sns.barplot(x=x_column, y="Feature", data=frame, ax=ax)
    ax.set_title(panel_title)

plt.tight_layout()
plt.show()


In [None]:
# --- Print accuracies for all models ---
from sklearn.metrics import accuracy_score

def _train_test_accuracy(estimator):
    """Return (training accuracy, test accuracy) of a fitted estimator on the notebook's split."""
    on_train = accuracy_score(y_train, estimator.predict(X_train))
    on_test = accuracy_score(y_test, estimator.predict(X_test))
    return on_train, on_test


# One (train, test) pair per fitted model
logit_train_acc, logit_test_acc = _train_test_accuracy(model)        # Logistic Regression
rf_train_acc, rf_test_acc = _train_test_accuracy(rf)                 # Random Forest
gb_train_acc, gb_test_acc = _train_test_accuracy(gb)                 # Gradient Boosting
svm_train_acc, svm_test_acc = _train_test_accuracy(linear_svm)       # Linear SVM
knn_train_acc, knn_test_acc = _train_test_accuracy(knn)              # KNN
nb_train_acc, nb_test_acc = _train_test_accuracy(nb)                 # Naive Bayes
mlp_train_acc, mlp_test_acc = _train_test_accuracy(mlp)              # MLP

# Aligned summary table
print("Model Accuracies:")
print(f"Logistic Regression - Train: {logit_train_acc:.3f}, Test: {logit_test_acc:.3f}")
print(f"Random Forest       - Train: {rf_train_acc:.3f}, Test: {rf_test_acc:.3f}")
print(f"Gradient Boosting   - Train: {gb_train_acc:.3f}, Test: {gb_test_acc:.3f}")
print(f"Linear SVM          - Train: {svm_train_acc:.3f}, Test: {svm_test_acc:.3f}")
print(f"KNN                 - Train: {knn_train_acc:.3f}, Test: {knn_test_acc:.3f}")
print(f"Naive Bayes         - Train: {nb_train_acc:.3f}, Test: {nb_test_acc:.3f}")
print(f"MLP                 - Train: {mlp_train_acc:.3f}, Test: {mlp_test_acc:.3f}")


## 8) Export

Write the cleaned panel to CSV for downstream analysis or modeling.

### Results and Export

After running the cleaning and modeling sections, inspect `df_pivot`/`df_filtered` outputs (head, tail, non-null counts) before exporting. The following export cell writes `weo_data_clean.csv`.

In [None]:
# Save the panel to disk; index=True writes the frame's index as the first CSV column
output_file = "weo_data_clean.csv"
df_pivot.to_csv(output_file, index=True)
