In [2]:
#The goal of this example is to predict the GDP per capita of countries based on features such as Birth Rate,
#Co2-Emissions, Unemployment rate, Gross Tertiary education enrollment, and others.
#Countries data taken from https://www.kaggle.com/datasets/nelgiriyewithana/countries-of-the-world-2023
#HDI data taken from https://hdr.undp.org/data-center/human-development-index#/indicies/HDI
#For this, I use LinearRegression from sklearn.  This model will use features of type float.

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score
from sklearn.metrics import mean_squared_error

In [3]:
path = ''
file = 'world-data-2023.csv'

df = pd.read_csv(path + file)

#data cleaning
df = df.dropna()
df = df.drop(['Abbreviation', 'Calling Code', 'Capital/Major City', 
              'Largest city', 'Official language', 'Currency-Code'], axis=1)

#change index to country names
df = df.set_index('Country')

#create an array with the names of type object columns
object_columns = df.select_dtypes(include=object).columns.to_numpy()

#iterate trough array and remove , . $ % and convert to float
for i in object_columns:
    df[i] = df[i].str.replace(',', '').str.replace('$', '').str.replace('.', '').str.replace(' ', '').str.replace('%', '').astype(float)

#confirm that all remaining features are float    
print(df.dtypes)

print(df.shape)

  df[i] = df[i].str.replace(',', '').str.replace('$', '').str.replace('.', '').str.replace(' ', '').str.replace('%', '').astype(float)


Density\n(P/Km2)                             float64
Agricultural Land( %)                        float64
Land Area(Km2)                               float64
Armed Forces size                            float64
Birth Rate                                   float64
Co2-Emissions                                float64
CPI                                          float64
CPI Change (%)                               float64
Fertility Rate                               float64
Forested Area (%)                            float64
Gasoline Price                               float64
GDP                                          float64
Gross primary education enrollment (%)       float64
Gross tertiary education enrollment (%)      float64
Infant mortality                             float64
Life expectancy                              float64
Maternal mortality ratio                     float64
Minimum wage                                 float64
Out of pocket health expenditure             f

In [4]:
#Include a new column with GDP Per Capita by dividing GDP by Population
df['GDP Per Capita'] = df['GDP'] / df['Population']

print(df.keys())

Index(['Density\n(P/Km2)', 'Agricultural Land( %)', 'Land Area(Km2)',
       'Armed Forces size', 'Birth Rate', 'Co2-Emissions', 'CPI',
       'CPI Change (%)', 'Fertility Rate', 'Forested Area (%)',
       'Gasoline Price', 'GDP', 'Gross primary education enrollment (%)',
       'Gross tertiary education enrollment (%)', 'Infant mortality',
       'Life expectancy', 'Maternal mortality ratio', 'Minimum wage',
       'Out of pocket health expenditure', 'Physicians per thousand',
       'Population', 'Population: Labor force participation (%)',
       'Tax revenue (%)', 'Total tax rate', 'Unemployment rate',
       'Urban_population', 'Latitude', 'Longitude', 'GDP Per Capita'],
      dtype='object')


In [5]:
#Explore some correlations

c_absLat_GDPPC = abs(df['Latitude']).corr(df['GDP Per Capita'])
c_PhysPT_GDPPC = df['Physicians per thousand'].corr(df['GDP Per Capita'])

print('The correlation between the absolute value of Latitude and GDP Per Capita is: ' + str(c_absLat_GDPPC))
print('The correlation between Physicians per thousand and GDP Per Capita is: ' + str(c_PhysPT_GDPPC))

The correlation between the absolute value of Latitude and GDP Per Capita is: 0.4838009516574867
The correlation between Physicians per thousand and GDP Per Capita is: 0.5097265610156401


In [6]:
#regression with df to predict gdp per capita

X = df.drop(['GDP Per Capita'], axis=1)

# Select the dependent variable
y = df['GDP Per Capita'] 

# Split the data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [7]:
# Create an instance of the LinearRegression model
model = LinearRegression()

# Fit the model to the training data
model.fit(X_train, y_train)

# Make predictions on the test set
y_pred = model.predict(X_test)

# Calculate the R-squared score
r2 = r2_score(y_test, y_pred)

# Print the R-squared score
print("R-squared score (1 is perfect, 0 is worst possible fit): ", r2)

R-squared score (1 is perfect, 0 is worst possible fit):  0.6390303696603218


In [8]:
coefficients = model.coef_

X_columns = X.columns.to_numpy()

# Create a DataFrame to display the coefficients
coefficients_df = pd.DataFrame({'Variable': X_columns, 'Coefficient': coefficients})

# Sort the coefficients by magnitude
coefficients_df = coefficients_df.reindex(coefficients_df['Coefficient'].abs().sort_values(ascending=False).index)


# Print the coefficients to explore which features are more significant for this prediction
print(coefficients_df)

                                     Variable   Coefficient
8                              Fertility Rate  2.083457e+03
19                    Physicians per thousand  9.735764e+02
4                                  Birth Rate -3.743299e+02
15                            Life expectancy -1.361187e+02
14                           Infant mortality -4.648947e+01
17                               Minimum wage  2.963839e+01
10                             Gasoline Price -2.558868e+01
27                                  Longitude -9.874740e+00
0                            Density\n(P/Km2)  6.889264e+00
26                                   Latitude -5.916829e+00
22                            Tax revenue (%) -1.746001e+00
16                   Maternal mortality ratio  1.467526e+00
24                          Unemployment rate -1.019335e+00
13    Gross tertiary education enrollment (%)  7.212990e-01
18           Out of pocket health expenditure -6.099957e-01
1                       Agricultural Lan