Import necessary libraries


In [2]:
import os
import joblib
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.preprocessing import MinMaxScaler
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import r2_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import PolynomialFeatures

LOAD DATA

In [3]:
#Step 1: Load Data
data = pd.read_excel('Refined/CleanData.xlsx')
print(data.head())

   Year      Total       Male     Female    Ratio  Median Age  Increase Rate  \
0  1960  45954.226  24795.178  21159.049  117.185      18.429          2.485   
1  1961  47060.915  25363.721  21697.194  116.899      18.360          2.277   
2  1962  48161.841  25930.189  22231.652  116.636      18.271          2.347   
3  1963  49325.050  26526.519  22798.530  116.352      18.183          2.425   
4  1964  50552.592  27153.709  23398.883  116.047      18.084          2.490   

   Life Expectancy  Inflation  Unemployment           GDP  Homicides  
0           43.355   6.947368           0.4  3.749265e+09   3.651252  
1           44.180   1.640420           0.4  4.118648e+09   3.651252  
2           45.009  -0.516462           0.4  4.310164e+09   3.651252  
3           46.318   1.456488           0.4  4.630827e+09   3.651252  
4           47.360   4.179587           0.4  5.204956e+09   3.651252  


population 

In [4]:
X = data['Year'].values.reshape(-1, 1)
y = data['Total'].values

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

population_model = LinearRegression()
population_model.fit(X_train, y_train)

y_pred = population_model.predict(X_test)

print("Accuracy: ", population_model.score(X_test, y_test).round(5) * 100, "%")

Accuracy:  98.433 %


gdp

In [5]:
X = data['Year'].values.reshape(-1, 1)
y = np.log(data['GDP'].values)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

poly_features = PolynomialFeatures(degree=2)
X_train_poly = poly_features.fit_transform(X_train)
X_test_poly = poly_features.transform(X_test)

gdp_model = LinearRegression()
gdp_model.fit(X_train_poly, y_train)

print("Accuracy:", gdp_model.score(X_test_poly, y_test).round(5) * 100, "%")

Accuracy: 99.612 %


unemployment

In [6]:
data = pd.read_excel('Refined/CleanData.xlsx')
data = data[data['Year'] >= 1990]

X = data['Year'].values.reshape(-1, 1)
y = np.log(data['Unemployment'].values)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

poly_features = PolynomialFeatures(degree=2)
X_train_poly = poly_features.fit_transform(X_train)
X_test_poly = poly_features.transform(X_test)

unemployment_model = LinearRegression()
unemployment_model.fit(X_train_poly, y_train)

print("Accuracy", unemployment_model.score(X_test_poly, y_test).round(5) * 100, "%")


Accuracy 93.377 %


inflation

In [18]:
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV, TimeSeriesSplit, cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.metrics import mean_squared_error
import numpy as np

# Load data
data = pd.read_excel('Refined/CleanData.xlsx')

# Filter data for relevant years
data = data[data['Year'] >= 1990]

# Prepare data
features = ['Year', 'Total', 'Male', 'Female', 'Ratio', 'Median Age', 'Increase Rate', 'Life Expectancy']
X = data[features].values
y = data['Inflation'].values

# Normalize features
scaler = StandardScaler()
X = scaler.fit_transform(X)

# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Gradient Boosting Regressor with Grid Search
param_grid = {
    'n_estimators': [100, 200, 300],
    'learning_rate': [0.01, 0.1, 0.2],
    'max_depth': [3, 5, 7],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}
gb_model = GradientBoostingRegressor(random_state=42)
grid_search = GridSearchCV(gb_model, param_grid, cv=TimeSeriesSplit(n_splits=5), scoring='r2', n_jobs=-1)
grid_search.fit(X_train, y_train)

inflation_model = grid_search.best_estimator_
gb_scores = cross_val_score(inflation_model, X_train, y_train, cv=TimeSeriesSplit(n_splits=5), scoring='neg_mean_squared_error')
inflation_model.fit(X_train, y_train)
y_pred_gb = inflation_model.predict(X_test)

# Calculate RMSE
rmse_gb = np.sqrt(mean_squared_error(y_test, y_pred_gb))

print(f"Gradient Boosting Regressor Test Set RMSE: {rmse_gb:.2f}")

# Predict inflation for the year 2025 using the best model
latest_data = data[features].iloc[-1].copy()
latest_data['Year'] = 2025
Year_2025 = scaler.transform([latest_data.values])
predicted_inflation_2025 = inflation_model.predict(Year_2025)
print(f"Predicted Inflation in 2025: {predicted_inflation_2025[0]:.2f}")


Gradient Boosting Regressor Test Set RMSE: 1.75
Predicted Inflation in 2025: 9.34


homicides

In [8]:
# Load and filter data
data = pd.read_excel('Refined/CleanData.xlsx')
data = data[data['Year'] >= 1990]

# Feature and target variables
X = data['Year'].values.reshape(-1, 1)
y = np.log(data['Homicides'].values)

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Polynomial features
poly_features = PolynomialFeatures(degree=2)
X_train_poly = poly_features.fit_transform(X_train)
X_test_poly = poly_features.transform(X_test)

# Model training
homicides_model = RandomForestRegressor(n_estimators=100, random_state=42)
homicides_model.fit(X_train_poly, y_train)

# Model evaluation
print("Accuracy:", homicides_model.score(X_test_poly, y_test).round(5) * 100, "%")


Accuracy: 95.883 %


final prediction

In [9]:
# Ask for the input year
Year = int(input("Enter the year for prediction: "))
features_for_prediction = [[Year]]

# Predict population
predictions = population_model.predict(features_for_prediction)
print(f"Predicted Population in {Year}: {int(predictions[0])}")

# Predict unemployment
predictions = np.exp(unemployment_model.predict(poly_features.transform(features_for_prediction)))
print(f"Predicted Unemployment in {Year}: {int(predictions[0])}")

# Predict GDP
predictions = np.exp(gdp_model.predict(poly_features.transform(features_for_prediction)))
print(f"Predicted GDP in {Year}: {int(predictions[0])}")

# Predict homicides
predictions = np.exp(homicides_model.predict(poly_features.transform(features_for_prediction)))
print(f"Predicted Homicides in {Year}: {predictions[0]:.2f}")


Predicted Population in 2025: 237210
Predicted Unemployment in 2025: 21
Predicted GDP in 2025: 526597842345
Predicted Homicides in 2025: 3.93
