In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import data_cleaning
from scipy.stats import chi2_contingency
from scipy.stats.contingency import association

## World Data 2023 DataFrame

In [2]:
data = pd.read_csv('../dataset/world-data-2023.csv')
df = data.copy()

# I want all the columns to be displayed
pd.set_option('display.max_columns', None)
df.head(10)

Unnamed: 0,Country,Density\n(P/Km2),Abbreviation,Agricultural Land( %),Land Area(Km2),Armed Forces size,Birth Rate,Calling Code,Capital/Major City,Co2-Emissions,CPI,CPI Change (%),Currency-Code,Fertility Rate,Forested Area (%),Gasoline Price,GDP,Gross primary education enrollment (%),Gross tertiary education enrollment (%),Infant mortality,Largest city,Life expectancy,Maternal mortality ratio,Minimum wage,Official language,Out of pocket health expenditure,Physicians per thousand,Population,Population: Labor force participation (%),Tax revenue (%),Total tax rate,Unemployment rate,Urban_population,Latitude,Longitude
0,Afghanistan,60,AF,58.10%,652230,323000.0,32.49,93.0,Kabul,8672,149.9,2.30%,AFN,4.47,2.10%,$0.70,"$19,101,353,833",104.00%,9.70%,47.9,Kabul,64.5,638.0,$0.43,Pashto,78.40%,0.28,38041754,48.90%,9.30%,71.40%,11.12%,9797273,33.93911,67.709953
1,Albania,105,AL,43.10%,28748,9000.0,11.78,355.0,Tirana,4536,119.05,1.40%,ALL,1.62,28.10%,$1.36,"$15,278,077,447",107.00%,55.00%,7.8,Tirana,78.5,15.0,$1.12,Albanian,56.90%,1.2,2854191,55.70%,18.60%,36.60%,12.33%,1747593,41.153332,20.168331
2,Algeria,18,DZ,17.40%,2381741,317000.0,24.28,213.0,Algiers,150006,151.36,2.00%,DZD,3.02,0.80%,$0.28,"$169,988,236,398",109.90%,51.40%,20.1,Algiers,76.7,112.0,$0.95,Arabic,28.10%,1.72,43053054,41.20%,37.20%,66.10%,11.70%,31510100,28.033886,1.659626
3,Andorra,164,AD,40.00%,468,,7.2,376.0,Andorra la Vella,469,,,EUR,1.27,34.00%,$1.51,"$3,154,057,987",106.40%,,2.7,Andorra la Vella,,,$6.63,Catalan,36.40%,3.33,77142,,,,,67873,42.506285,1.521801
4,Angola,26,AO,47.50%,1246700,117000.0,40.73,244.0,Luanda,34693,261.73,17.10%,AOA,5.52,46.30%,$0.97,"$94,635,415,870",113.50%,9.30%,51.6,Luanda,60.8,241.0,$0.71,Portuguese,33.40%,0.21,31825295,77.50%,9.20%,49.10%,6.89%,21061025,-11.202692,17.873887
5,Antigua and Barbuda,223,AG,20.50%,443,0.0,15.33,1.0,"St. John's, Saint John",557,113.81,1.20%,XCD,1.99,22.30%,$0.99,"$1,727,759,259",105.00%,24.80%,5.0,"St. John's, Saint John",76.9,42.0,$3.04,English,24.30%,2.76,97118,,16.50%,43.00%,,23800,17.060816,-61.796428
6,Argentina,17,AR,54.30%,2780400,105000.0,17.02,54.0,Buenos Aires,201348,232.75,53.50%,ARS,2.26,9.80%,$1.10,"$449,663,446,954",109.70%,90.00%,8.8,Buenos Aires,76.5,39.0,$3.35,Spanish,17.60%,3.96,44938712,61.30%,10.10%,106.30%,9.79%,41339571,-38.416097,-63.616672
7,Armenia,104,AM,58.90%,29743,49000.0,13.99,374.0,Yerevan,5156,129.18,1.40%,AMD,1.76,11.70%,$0.77,"$13,672,802,158",92.70%,54.60%,11.0,Yerevan,74.9,26.0,$0.66,Armenian,81.60%,4.4,2957731,55.60%,20.90%,22.60%,16.99%,1869848,40.069099,45.038189
8,Australia,3,AU,48.20%,7741220,58000.0,12.6,61.0,Canberra,375908,119.8,1.60%,AUD,1.74,16.30%,$0.93,"$1,392,680,589,329",100.30%,113.10%,3.1,Sydney,82.7,6.0,$13.59,,19.60%,3.68,25766605,65.50%,23.00%,47.40%,5.27%,21844756,-25.274398,133.775136
9,Austria,109,AT,32.40%,83871,21000.0,9.7,43.0,Vienna,61448,118.06,1.50%,EUR,1.47,46.90%,$1.20,"$446,314,739,528",103.10%,85.10%,2.9,Vienna,81.6,5.0,,German,17.90%,5.17,8877067,60.70%,25.40%,51.40%,4.67%,5194416,47.516231,14.550072


## Socioeconomic and Environmental data

This project involves the analysis of various socioeconomic and environmental factors across differente countries. The goal is to explore and uncover meaningful insights regarding how factors like GDP, population density, health indicators, CO2 emissions, and more impact a country's development and overall well-being.

### Key Features:
- **Country Characteristics**: information about the country (`Country`, `Abbreviation`, `Capital/Major City`).
- **Country Details**: calling number, currency code and official language (`Calling Code`, `Capital/Major City`, `Official Language`).
- **Population Details**: population related details for each country, like density, largest city by population, etc (`Density (P/Km²)`, `Largest City`, `Population`, `Urban Population`, `Birth Rate`, etc.).
- **Health Indicators**: health factors (`Fertility Rate`, `Infant Mortality`, `Life Expectancy`, `Maternal Mortality Ratio`, etc.).
- **Environmental Factors**: environmental factors for each country (`Agricultural Land (%)`, `CO2 Emissions`, `Forested Area (%)`, etc.).
- **Economic Indicators**: Details about inflation, price of gasolina, unemployment rate, etc.(`CPI (Consumer Price Index)`, `CPI Change (%)`, `Gasoline Price`, `GDP`, `Minimum Wage`, `Tax Revenue (% of GDP)`, `Total Tax Rate`, `Unemployment Rate`, etc.).
- **Education Details**: details related to the level of education (`Gross Primary Education Enrollment (%)`, `Gross Tertiary Education Enrollment (%**)`, etc.).
- **Country's location**: latitude and longitude (`Latitude`, `Longitude`, `Land Area (Km²)`, etc.).

### Summary:
- **Total Features**: 34 variables.
- **Target Variable**: `C02 emissions` 

In [3]:
df.shape

(195, 35)

In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 195 entries, 0 to 194
Data columns (total 35 columns):
 #   Column                                     Non-Null Count  Dtype  
---  ------                                     --------------  -----  
 0   Country                                    195 non-null    object 
 1   Density
(P/Km2)                            195 non-null    object 
 2   Abbreviation                               188 non-null    object 
 3   Agricultural Land( %)                      188 non-null    object 
 4   Land Area(Km2)                             194 non-null    object 
 5   Armed Forces size                          171 non-null    object 
 6   Birth Rate                                 189 non-null    float64
 7   Calling Code                               194 non-null    float64
 8   Capital/Major City                         192 non-null    object 
 9   Co2-Emissions                              188 non-null    object 
 10  CPI                       

<h2 style="color: #008080;">Data Cleaning</h2>

In [5]:
# snake-case
df = data_cleaning.to_snake_case(df)
df = data_cleaning.replace_hyphen(df)

In [6]:
# Rename two columns
df = df.rename(columns = {"density\n(p/km2)":"density_(p/km2)", "agricultural_land(_%)": "agricultural_land(%)"})

In [7]:
df.head(1)

Unnamed: 0,country,density_(p/km2),abbreviation,agricultural_land(%),land_area(km2),armed_forces_size,birth_rate,calling_code,capital/major_city,co2_emissions,cpi,cpi_change_(%),currency_code,fertility_rate,forested_area_(%),gasoline_price,gdp,gross_primary_education_enrollment_(%),gross_tertiary_education_enrollment_(%),infant_mortality,largest_city,life_expectancy,maternal_mortality_ratio,minimum_wage,official_language,out_of_pocket_health_expenditure,physicians_per_thousand,population,population:_labor_force_participation_(%),tax_revenue_(%),total_tax_rate,unemployment_rate,urban_population,latitude,longitude
0,Afghanistan,60,AF,58.10%,652230,323000,32.49,93.0,Kabul,8672,149.9,2.30%,AFN,4.47,2.10%,$0.70,"$19,101,353,833",104.00%,9.70%,47.9,Kabul,64.5,638.0,$0.43,Pashto,78.40%,0.28,38041754,48.90%,9.30%,71.40%,11.12%,9797273,33.93911,67.709953


In [8]:
# NaNs 
df.isna().sum()

country                                       0
density_(p/km2)                               0
abbreviation                                  7
agricultural_land(%)                          7
land_area(km2)                                1
armed_forces_size                            24
birth_rate                                    6
calling_code                                  1
capital/major_city                            3
co2_emissions                                 7
cpi                                          17
cpi_change_(%)                               16
currency_code                                15
fertility_rate                                7
forested_area_(%)                             7
gasoline_price                               20
gdp                                           2
gross_primary_education_enrollment_(%)        7
gross_tertiary_education_enrollment_(%)      12
infant_mortality                              6
largest_city                            

In [9]:
# Duplicates
df.duplicated().sum()

0

In [10]:
# Checking for empty spaces
df.eq(" ").sum()

country                                      0
density_(p/km2)                              0
abbreviation                                 0
agricultural_land(%)                         0
land_area(km2)                               0
armed_forces_size                            0
birth_rate                                   0
calling_code                                 0
capital/major_city                           0
co2_emissions                                0
cpi                                          0
cpi_change_(%)                               0
currency_code                                0
fertility_rate                               0
forested_area_(%)                            0
gasoline_price                               0
gdp                                          0
gross_primary_education_enrollment_(%)       0
gross_tertiary_education_enrollment_(%)      0
infant_mortality                             0
largest_city                                 0
life_expectan

<h2 style="color: #008080;">Data Preprocessing/Formatting</h2>

In [11]:
# customer lifetime value
# I create a lambda function to replace % in Customer Lifetime Value column
replace_ = lambda x: (str(x)).replace("%", "")
# Then apply the function to the column with apply
df["customer_lifetime_value"] = df["customer_lifetime_value"].apply(replace_)
df.head(5)

KeyError: 'customer_lifetime_value'

In [None]:
df.nunique()

In [None]:
df["gender"].unique()

In [None]:
# # removing id column
# df = df.drop(columns=["Id"])

# # General Selection
# cat = df.select_dtypes(exclude="number")
# num = df.select_dtypes(include="number")

# # Being specific
# cat_from_num = num.loc[:, num.nunique() < 20] # how does loc work Vs iloc
# cat = pd.concat([cat, cat_from_num], axis=1)

# num = num.drop(columns=cat_from_num.columns)

<h2 style="color: #008080;">Exploratory Data Analysis (EDA)</h2>

<h2 style="color: #008080;">Statistical Analysis</h2>