# Machine Learning Analysis
This notebook contains the machine learning analysis for the project.
It covers:
1. Data Preparation
2. Regression
3. Classification
4. Unsupervised Learning (Clustering)
5. Time Series Analysis
6. Dimensionality Reduction (PCA)

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Required Machine Learning Libraries (from slides)
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.preprocessing import PolynomialFeatures, StandardScaler
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
from sklearn.metrics import mean_squared_error, r2_score, accuracy_score, confusion_matrix
import statsmodels.api as sm
from statsmodels.tsa.arima.model import ARIMA

In [None]:
# 1. DATA PREPARATION
# =================
# Builds `data`: one row per (country, year) holding GDP per capita,
# CO2 per capita and population, with no missing values.
print("Loading and cleaning data...")

# Raw inputs. skiprows=4 skips the first four lines of the GDP file
# (presumably a metadata preamble before the header row - confirm against the file).
gdp_df = pd.read_csv('API_NY.GDP.PCAP.PP.KD_DS2_en_csv_v2_130128.csv', skiprows=4)
co2_df = pd.read_csv('owid-co2-data (1).csv')

# GDP: drop the non-data columns (ignored if absent), then reshape from
# one column per year (wide) to one row per country-year (long).
gdp_clean = gdp_df.drop(
    columns=['Indicator Name', 'Indicator Code', 'Unnamed: 69'],
    errors='ignore',
)
gdp_long = (
    gdp_clean
    .melt(
        id_vars=['Country Name', 'Country Code'],
        var_name='year',
        value_name='gdp_per_capita',
    )
    # Any column label that is not a number becomes NaN here and is dropped next.
    .assign(year=lambda frame: pd.to_numeric(frame['year'], errors='coerce'))
    .dropna(subset=['year', 'gdp_per_capita'])
    .astype({'year': int})
)

# CO2: keep only the columns used downstream and require both an ISO code
# (the join key) and a CO2 value.
co2_clean = (
    co2_df[['iso_code', 'country', 'year', 'co2_per_capita', 'population']]
    .dropna(subset=['co2_per_capita', 'iso_code'])
)

# Inner join on (country code, year); afterwards drop any row that still
# has a missing value in any column.
data = (
    gdp_long
    .merge(co2_clean, left_on=['Country Code', 'year'], right_on=['iso_code', 'year'])
    .dropna()
)
print(f"Data ready. Total Rows: {data.shape[0]}")

In [None]:
# 2. REGRESSION ANALYSIS (Predicting CO2) - (Week 8 & 9)
# ====================================================
print("\n--- 2. Regression Analysis (Week 8 & 9) ---")

# Target: CO2 per capita. Predictors: GDP per capita, population and year.
y_reg = data['co2_per_capita']
X_reg = data[['gdp_per_capita', 'population', 'year']]

# 80/20 train/test split; the fixed seed makes the split reproducible.
X_train_reg, X_test_reg, y_train_reg, y_test_reg = train_test_split(
    X_reg, y_reg, test_size=0.2, random_state=42
)

# Model A: Linear Regression (Week 8) - baseline on all three predictors.
lin_reg = LinearRegression().fit(X_train_reg, y_train_reg)
y_pred_lin = lin_reg.predict(X_test_reg)
print(f"Linear Regression R2 Score: {r2_score(y_test_reg, y_pred_lin):.4f}")

# Model B: Polynomial Regression (For EKC Hypothesis) (Week 8)
# Only GDP is used here: the degree-2 expansion adds a GDP^2 term so the
# fit can bend, which is what the EKC hypothesis is about.
gdp_only = ['gdp_per_capita']
poly = PolynomialFeatures(degree=2)
X_poly_train = poly.fit_transform(X_train_reg[gdp_only])
X_poly_test = poly.transform(X_test_reg[gdp_only])  # reuse the expansion fitted on train
poly_reg = LinearRegression().fit(X_poly_train, y_train_reg)
y_pred_poly = poly_reg.predict(X_poly_test)
print(f"Polynomial Regression (EKC) R2 Score: {r2_score(y_test_reg, y_pred_poly):.4f}")

# Model C: Random Forest Regressor (Week 9) - non-linear model on all predictors.
rf_reg = RandomForestRegressor(n_estimators=100, random_state=42).fit(X_train_reg, y_train_reg)
y_pred_rf = rf_reg.predict(X_test_reg)
print(f"Random Forest Regressor R2 Score: {r2_score(y_test_reg, y_pred_rf):.4f}")

# Actual vs predicted for the random forest; points on the dashed
# diagonal are perfect predictions.
diagonal_end = y_test_reg.max()
fig, ax = plt.subplots(figsize=(10, 5))
ax.scatter(y_test_reg, y_pred_rf, alpha=0.5, color='green', label='Predictions')
ax.plot([0, diagonal_end], [0, diagonal_end], 'r--', label='Perfect Prediction')
ax.set_xlabel('Actual CO2')
ax.set_ylabel('Predicted CO2 (Random Forest)')
ax.set_title('Regression Results: Actual vs Predicted CO2 Emissions')
ax.legend()
plt.show()

In [None]:
# 3. CLASSIFICATION ANALYSIS (Week 8 & 9)
# ================================================
print("\n--- 3. Classification Analysis (Week 8 & 9) ---")

# Binary target: 1 when a row's CO2 per capita is strictly above the median
# of the whole dataset, else 0. The label is stored as a new column on `data`.
threshold = data['co2_per_capita'].median()
data['is_high_emitter'] = data['co2_per_capita'].gt(threshold).astype(int)

y_clf = data['is_high_emitter']
X_clf = data[['gdp_per_capita', 'population']]

# 80/20 train/test split with a fixed seed.
X_train_clf, X_test_clf, y_train_clf, y_test_clf = train_test_split(
    X_clf, y_clf, test_size=0.2, random_state=42
)

# Standardise features for the scale-sensitive models (Logistic Regression, KNN).
# The scaler is fitted on the training rows only and then applied to both splits.
scaler = StandardScaler().fit(X_train_clf)
X_train_scaled = scaler.transform(X_train_clf)
X_test_scaled = scaler.transform(X_test_clf)

# Model A: Logistic Regression (Week 8)
log_reg = LogisticRegression().fit(X_train_scaled, y_train_clf)
print(f"Logistic Regression Accuracy: {log_reg.score(X_test_scaled, y_test_clf):.4f}")

# Model B: KNN (K-Nearest Neighbors) (Week 9)
knn = KNeighborsClassifier(n_neighbors=5).fit(X_train_scaled, y_train_clf)
print(f"KNN Accuracy: {knn.score(X_test_scaled, y_test_clf):.4f}")

# Model C: Random Forest Classifier (Week 9) - trained on the unscaled features.
rf_clf = RandomForestClassifier(n_estimators=100, random_state=42).fit(X_train_clf, y_train_clf)
print(f"Random Forest Classifier Accuracy: {rf_clf.score(X_test_clf, y_test_clf):.4f}")

# Confusion matrix of the random forest on the test split
# (rows: actual label, columns: predicted label).
rf_confusion = confusion_matrix(y_test_clf, rf_clf.predict(X_test_clf))
fig, ax = plt.subplots(figsize=(6, 5))
sns.heatmap(rf_confusion, annot=True, fmt='d', cmap='Blues', ax=ax)
ax.set_title('Confusion Matrix (Random Forest Classification)')
ax.set_ylabel('Actual Label')
ax.set_xlabel('Predicted Label')
plt.show()

In [None]:
# 4. UNSUPERVISED LEARNING (Clustering) - (Week 10)
# ===============================================
print("\n--- 4. Unsupervised Learning (Week 10) ---")

# Cross-section: one row per country for the year 2019. The copy lets us
# add a 'Cluster' column without touching `data`.
data_2019 = data.loc[data['year'] == 2019].copy()
X_cluster = data_2019[['gdp_per_capita', 'co2_per_capita']]

# K-Means is distance based, so both features are standardised first.
scaler_cluster = StandardScaler()
X_cluster_scaled = scaler_cluster.fit_transform(X_cluster)

# Three clusters; fixed seed and 10 initialisations for a reproducible result.
kmeans = KMeans(n_clusters=3, random_state=42, n_init=10).fit(X_cluster_scaled)
data_2019['Cluster'] = kmeans.labels_

# Scatter of the original (unscaled) values, coloured by cluster. Log axes
# are for readability only - the clustering itself ran on standardised data.
fig, ax = plt.subplots(figsize=(10, 6))
sns.scatterplot(
    data=data_2019,
    x='gdp_per_capita',
    y='co2_per_capita',
    hue='Cluster',
    palette='viridis',
    s=100,
    ax=ax,
)
ax.set_xscale('log')
ax.set_yscale('log')
ax.set_title('K-Means Clustering: Country Groups based on GDP & CO2')
ax.set_xlabel('GDP per Capita (Log Scale)')
ax.set_ylabel('CO2 per Capita (Log Scale)')
plt.show()

In [None]:
# 5. TIME SERIES FORECASTING (Week 13)
# =================================================
print("\n--- 5. Time Series Analysis (Week 13) ---")
# Forecast CO2 per capita for the USA for the 5 years after the last observed year.
usa_data = data[data['Country Code'] == 'USA'].sort_values('year')
if usa_data.empty:
    raise ValueError("No rows with Country Code 'USA' found in the merged data.")

# Index the series by annual periods instead of raw integer years.
# statsmodels only labels forecasts correctly for date/period indexes (or a
# 0-based integer range). With plain year integers it warns about an
# unsupported index and labels the forecast by row position (n, n+1, ...),
# which would draw the forecast far away from the historical curve.
# Years go through strings because integer input to a period index would be
# read as ordinals, not calendar years.
annual_index = pd.to_datetime(usa_data['year'].astype(str), format='%Y').dt.to_period('Y')
ts_data = pd.Series(
    usa_data['co2_per_capita'].to_numpy(),
    index=pd.PeriodIndex(annual_index),
    name='co2_per_capita',
)

# ARIMA Model (Autoregressive Integrated Moving Average)
model_arima = ARIMA(ts_data, order=(1, 1, 1))  # Simple (1,1,1) model
model_fit = model_arima.fit()

# Forecast next 5 years (the result is indexed by the forecast years)
forecast = model_fit.forecast(steps=5)
print("USA CO2 Forecast (Next 5 Years):")
print(forecast)

# Visualize Forecast. Both series are plotted against integer calendar years
# so they share one x axis; the legend range comes from the data rather than
# being hard-coded.
history_years = ts_data.index.year
forecast_years = forecast.index.year
plt.figure(figsize=(10, 5))
plt.plot(history_years, ts_data.values,
         label=f'Historical Data ({history_years.min()}-{history_years.max()})')
plt.plot(forecast_years, forecast.values, label='Forecast (Next 5 Years)', color='red', linestyle='--')
plt.title('Time Series Forecasting: USA CO2 Emissions')
plt.xlabel('Year')
plt.ylabel('CO2 per Capita')
plt.legend()
plt.grid(True, alpha=0.3)
plt.show()

In [None]:
# 6. DIMENSIONALITY REDUCTION (PCA)
# =================================
print("\n--- 6. Principal Component Analysis (PCA) ---")

# Reuses `data_2019` (the 2019 cross-section built in the clustering section),
# this time with three features.
X_pca = data_2019[['gdp_per_capita', 'population', 'co2_per_capita']]

# Standardise the features before projecting them.
scaler_pca = StandardScaler()
X_pca_scaled = scaler_pca.fit_transform(X_pca)

# Project the three standardised features onto the first two principal components.
pca = PCA(n_components=2)
principal_components = pca.fit_transform(X_pca_scaled)

# Plotting frame: one row per row of data_2019, in the same order.
pca_df = pd.DataFrame(principal_components, columns=['PC1', 'PC2'])

# Colour by K-Means cluster when that column exists on data_2019; otherwise
# plot without a hue. `.values` copies labels by position, since pca_df has
# a fresh 0..n-1 index.
hue_col = 'Cluster' if 'Cluster' in data_2019.columns else None
if hue_col is not None:
    pca_df['Cluster'] = data_2019['Cluster'].values

# 2D scatter; axis labels report each component's share of explained variance.
fig, ax = plt.subplots(figsize=(10, 6))
sns.scatterplot(data=pca_df, x='PC1', y='PC2', hue=hue_col, palette='viridis', s=100, ax=ax)
ax.set_title('PCA: 2D Projection of Country Data')
ax.set_xlabel(f'Principal Component 1 ({pca.explained_variance_ratio_[0]:.2f} variance)')
ax.set_ylabel(f'Principal Component 2 ({pca.explained_variance_ratio_[1]:.2f} variance)')
plt.show()