In [1]:
import pandas as pd

# Load the data

In [2]:
df_apples = pd.read_csv("../data/raw/fruit-consumption-by-fruit-type.csv")
df_visits = pd.read_csv("../data/raw/doctor-visits.csv")
df_gdp = pd.read_csv("../data/raw/gdp-per-capita-worldbank.csv")
df_age = pd.read_csv("../data/raw/median-age.csv")

# Data Processing

## Doctors Visits

In [3]:
# For visibility, display how the dataframe looks at the beginning
df_visits.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2652 entries, 0 to 2651
Data columns (total 54 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   STRUCTURE             2652 non-null   object 
 1   STRUCTURE_ID          2652 non-null   object 
 2   STRUCTURE_NAME        2652 non-null   object 
 3   ACTION                2652 non-null   object 
 4   REF_AREA              2652 non-null   object 
 5   Reference area        2652 non-null   object 
 6   MEASURE               2652 non-null   object 
 7   Measure               2652 non-null   object 
 8   UNIT_MEASURE          2652 non-null   object 
 9   Unit of measure       2652 non-null   object 
 10  MEDICAL_PROCEDURE     2652 non-null   object 
 11  Medical procedure     2652 non-null   object 
 12  OCCUPATION            2652 non-null   object 
 13  Occupation            2652 non-null   object 
 14  DIAGNOSTIC_TYPE       2652 non-null   object 
 15  Diagnostic category  

In [4]:
# Select only in person visits
df_visits = df_visits[df_visits["Consultation type"] == "In-person"]

df_visits = df_visits.rename(
    columns={
        "OBS_VALUE": "doctor_visits_per_capita",
        "Reference area": "country",
        "TIME_PERIOD": "year",
    }
)

In [5]:
# Summing dentists and medical doctors (they both count as a visit)
df_visits = df_visits.groupby(["country", "year"])["doctor_visits_per_capita"].sum().reset_index()

In [6]:
# Rename some countries to ensure consistent and homogeneous naming.
# Assumption: 'Korea' refers to 'South Korea' due to data unavailability for North Korea.
df_visits["country"] = df_visits["country"].replace({
    "Korea": "South Korea",
    "Slovak Republic": "Slovakia",
    "Türkiye": "Turkey"
})

In [7]:
# For visibility, display how the dataframe looks at the end
df_visits.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1515 entries, 0 to 1514
Data columns (total 3 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   country                   1515 non-null   object 
 1   year                      1515 non-null   int64  
 2   doctor_visits_per_capita  1515 non-null   float64
dtypes: float64(1), int64(1), object(1)
memory usage: 35.6+ KB


## Apple Consumption

In [8]:
# For visibility, display how the dataframe looks at the beginning
df_apples.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 12592 entries, 0 to 12591
Data columns (total 14 columns):
 #   Column                                                                                                                  Non-Null Count  Dtype  
---  ------                                                                                                                  --------------  -----  
 0   Entity                                                                                                                  12592 non-null  object 
 1   Code                                                                                                                    9973 non-null   object 
 2   Year                                                                                                                    12592 non-null  int64  
 3   Dates | 00002619 || Food available for consumption | 0645pc || kilograms per year per capita                            11505 non-null  float

In [9]:
# Select only apples, that's what we are interested in
df_apples = df_apples.rename(
    columns={
        "Apples | 00002617 || Food available for consumption | 0645pc || kilograms per year per capita": 
        "apple_consumption_per_capita_kg",
        "Entity": "country",
        "Year": "year",
    }
)[["apple_consumption_per_capita_kg", "country", "year"]]

In [10]:
# For visibility, display how the dataframe looks at the end
df_apples.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 12592 entries, 0 to 12591
Data columns (total 3 columns):
 #   Column                           Non-Null Count  Dtype  
---  ------                           --------------  -----  
 0   apple_consumption_per_capita_kg  12592 non-null  float64
 1   country                          12592 non-null  object 
 2   year                             12592 non-null  int64  
dtypes: float64(1), int64(1), object(1)
memory usage: 295.3+ KB


## GDP

In [11]:
# For visibility, display how the dataframe looks at the beginning
df_gdp.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6562 entries, 0 to 6561
Data columns (total 4 columns):
 #   Column                                               Non-Null Count  Dtype  
---  ------                                               --------------  -----  
 0   Entity                                               6562 non-null   object 
 1   Code                                                 6133 non-null   object 
 2   Year                                                 6562 non-null   int64  
 3   GDP per capita, PPP (constant 2017 international $)  6562 non-null   float64
dtypes: float64(1), int64(1), object(2)
memory usage: 205.2+ KB


In [12]:
# Renaming to have homogenous column names
df_gdp = df_gdp.rename(
    columns={
        "GDP per capita, PPP (constant 2017 international $)": 
        "gdp_per_capita",
        "Entity": "country",
        "Year": "year",
    }
)[["gdp_per_capita", "country", "year"]]

In [13]:
df_gdp["gdp_per_capita_k"] = df_gdp["gdp_per_capita"] / 1000
df_gdp = df_gdp.drop(columns="gdp_per_capita")

In [14]:
# For visibility, display how the dataframe looks at the end
df_gdp.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6562 entries, 0 to 6561
Data columns (total 3 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   country           6562 non-null   object 
 1   year              6562 non-null   int64  
 2   gdp_per_capita_k  6562 non-null   float64
dtypes: float64(1), int64(1), object(1)
memory usage: 153.9+ KB


## Median Age

In [15]:
# For visibility, display how the dataframe looks at the beginning
df_age.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 38203 entries, 0 to 38202
Data columns (total 5 columns):
 #   Column                                                 Non-Null Count  Dtype  
---  ------                                                 --------------  -----  
 0   Entity                                                 38203 non-null  object 
 1   Code                                                   35938 non-null  object 
 2   Year                                                   38203 non-null  int64  
 3   Median age - Sex: all - Age: all - Variant: estimates  18722 non-null  float64
 4   Median age - Sex: all - Age: all - Variant: medium     19481 non-null  float64
dtypes: float64(2), int64(1), object(2)
memory usage: 1.5+ MB


In [16]:
# Renaming to have homogenous column names
df_age = df_age.rename(
    columns={
        "Median age - Sex: all - Age: all - Variant: estimates": 
        "median_age",
        "Entity": "country",
        "Year": "year",
    }
)[["median_age", "country", "year"]]

In [17]:
# For visibility, display how the dataframe looks at the end
df_age.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 38203 entries, 0 to 38202
Data columns (total 3 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   median_age  18722 non-null  float64
 1   country     38203 non-null  object 
 2   year        38203 non-null  int64  
dtypes: float64(1), int64(1), object(1)
memory usage: 895.5+ KB


# Save preprocessed data

In [18]:
df_apples.to_csv("../data/processed/apple-consumption.csv", index=False)
df_visits.to_csv("../data/processed/doctor-visits.csv", index=False)
df_gdp.to_csv("../data/processed/gdp.csv", index=False)
df_age.to_csv("../data/processed/median-age.csv", index=False)