In [99]:
#Countries data taken from https://www.kaggle.com/datasets/nelgiriyewithana/countries-of-the-world-2023
#HDI data taken from https://hdr.undp.org/data-center/human-development-index#/indicies/HDI

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score
from sklearn.metrics import mean_squared_error

In [100]:
path = ''
file = 'world-data-2023.csv'

df = pd.read_csv(path + file)

#print(df.columns) #want to know what columns to drop

df = df.drop(['Abbreviation', 'Calling Code', 'Capital/Major City', 
              'Largest city', 'Official language', 'Currency-Code'], axis=1)

#change index to country names
df = df.set_index('Country')

#create an array with the names of type object columns
object_columns = df.select_dtypes(include=object).columns.to_numpy()

#iterate trough array and remove , . $ % and convert to float
for i in object_columns:
    df[i] = df[i].str.replace(',', '').str.replace('$', '').str.replace('.', '').str.replace(' ', '').str.replace('%', '').astype(float)
    
print(df.dtypes)

  df[i] = df[i].str.replace(',', '').str.replace('$', '').str.replace('.', '').str.replace(' ', '').str.replace('%', '').astype(float)


Density\n(P/Km2)                             float64
Agricultural Land( %)                        float64
Land Area(Km2)                               float64
Armed Forces size                            float64
Birth Rate                                   float64
Co2-Emissions                                float64
CPI                                          float64
CPI Change (%)                               float64
Fertility Rate                               float64
Forested Area (%)                            float64
Gasoline Price                               float64
GDP                                          float64
Gross primary education enrollment (%)       float64
Gross tertiary education enrollment (%)      float64
Infant mortality                             float64
Life expectancy                              float64
Maternal mortality ratio                     float64
Minimum wage                                 float64
Out of pocket health expenditure             f

In [101]:
#Include a new column with GDP Per Capita by dividing GDP by Population
df['GDP Per Capita'] = df['GDP'] / df['Population']

print(df.dtypes)

Density\n(P/Km2)                             float64
Agricultural Land( %)                        float64
Land Area(Km2)                               float64
Armed Forces size                            float64
Birth Rate                                   float64
Co2-Emissions                                float64
CPI                                          float64
CPI Change (%)                               float64
Fertility Rate                               float64
Forested Area (%)                            float64
Gasoline Price                               float64
GDP                                          float64
Gross primary education enrollment (%)       float64
Gross tertiary education enrollment (%)      float64
Infant mortality                             float64
Life expectancy                              float64
Maternal mortality ratio                     float64
Minimum wage                                 float64
Out of pocket health expenditure             f

In [105]:
#Explore some correlations

c_absLat_GDPPC = abs(df['Latitude']).corr(df['GDP Per Capita'])
c_PhysPT_GDPPC = df['Physicians per thousand'].corr(df['GDP Per Capita'])

print('The correlation between the absolute value of Latitude and GDP Per Capita is: ' + str(c_absLat_GDPPC))
print('The correlation between Physicians per thousand and GDP Per Capita is: ' + str(c_PhysPT_GDPPC))

The correlation between the absolute value of Latitude and GDP Per Capita is: 0.5183809179889401
The correlation between Physicians per thousand and GDP Per Capita is: 0.5207015468100477


In [121]:
#summmon csv with HDI data

path2 = ''
file2 = 'HDR21-22_Statistical_Annex_HDI_Table.csv'

df2 = pd.read_csv(path2 + file2)

#change index to country names
df2 = df2.set_index('Country')

df2 = df2.dropna()

# Sort the DataFrame alphabetically by index name
df2 = df2.sort_index()

df2.head()

Unnamed: 0_level_0,Human Development Index 2021 (HDI),Life expectancy at birth 2021,Expected years of schooling 2021,Mean years of schooling 2021,Gross national income (GNI) per capita 2021 (2017 PPP $),GNI per capita rank minus HDI rank 2021,HDI rank 2020,HDI Category 2021
Country,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
Afghanistan,0.478,62.0,10.3,3.0,1824,-2.0,177.0,Low
Albania,0.796,76.5,14.4,11.3,14131,17.0,68.0,High
Algeria,0.745,76.4,14.6,8.1,10800,13.0,96.0,High
Andorra,0.858,80.4,13.3,10.6,51167,-19.0,45.0,Very High
Angola,0.586,61.6,12.2,5.4,5466,-14.0,149.0,Medium


In [137]:
#alternative 1: regression with df to predict gdp per capita

#print(df.loc['Angola']) #print row of df

df = df.dropna()

X = df.drop(['GDP Per Capita'], axis=1)

# Select the dependent variable
y = df['GDP Per Capita'] 

# Split the data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Create an instance of the LinearRegression model
model = LinearRegression()

# Fit the model to the training data
model.fit(X_train, y_train)

# Make predictions on the test set
y_pred = model.predict(X_test)

# Calculate the R-squared score
r2 = r2_score(y_test, y_pred)
rmse = mean_squared_error(y_test, y_pred, squared=False)

# Print the R-squared score
print("R-squared score (1 is perfect, 0 is worst possible fit): ", r2)
print("RMSE (0 is perfect):", rmse)

coefficients = model.coef_

X_columns = X.columns.to_numpy()

# Create a DataFrame to display the coefficients
coefficients_df = pd.DataFrame({'Variable': X_columns, 'Coefficient': coefficients})

# Sort the coefficients by magnitude
coefficients_df = coefficients_df.reindex(coefficients_df['Coefficient'].abs().sort_values(ascending=False).index)


# Print the coefficients
print(coefficients_df)

R-squared score (1 is perfect, 0 is worst possible fit):  0.7628351485878619
RMSE (0 is perfect): 11671.187447547336
                                     Variable   Coefficient
19                    Physicians per thousand  1.258954e+03
15                            Life expectancy -3.396516e+02
8                              Fertility Rate -3.065134e+02
14                           Infant mortality -7.178809e+01
17                               Minimum wage  4.225033e+01
26                                   Latitude  2.043454e+01
27                                  Longitude -1.984909e+01
4                                  Birth Rate -1.968624e+01
10                             Gasoline Price -1.416398e+01
0                            Density\n(P/Km2)  5.631897e+00
22                            Tax revenue (%) -2.398626e+00
7                              CPI Change (%) -1.225381e+00
21  Population: Labor force participation (%)  8.241524e-01
18           Out of pocket health expenditu

In [131]:
#alternative 2: regression with df and df2 to predict HDI

# Get the different indexes between the two DataFrames
diff_indexes = list(set(df.index) ^ set(df2.index))

print('There are ', len(diff_indexes), 'different indexes, but ', abs(df.shape[0] - df2.shape[0]), 'rows of difference. There are', len(diff_indexes) - abs(df.shape[0] - df2.shape[0]), 'different country names.')

There are  107 different indexes, but  71 rows of difference. There are 36 different country names.


In [135]:
# Create an empty file named 'index.txt'
with open('index.txt', 'w'):
    pass


# Save the indexes to a TXT file
df.index.to_series().to_csv('index.txt', index=True, header=False)