In [1]:
import pandas as pd

# Load the data

In [2]:
df_apples = pd.read_csv("../data/fruit-consumption-by-fruit-type.csv")
df_visits = pd.read_csv("../data/doctor-visits-raw.csv")

# Data Processing

## Doctors visits

In [3]:
# For visibility, display how the dataframe looks at the beginning
df_visits.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2652 entries, 0 to 2651
Data columns (total 54 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   STRUCTURE             2652 non-null   object 
 1   STRUCTURE_ID          2652 non-null   object 
 2   STRUCTURE_NAME        2652 non-null   object 
 3   ACTION                2652 non-null   object 
 4   REF_AREA              2652 non-null   object 
 5   Reference area        2652 non-null   object 
 6   MEASURE               2652 non-null   object 
 7   Measure               2652 non-null   object 
 8   UNIT_MEASURE          2652 non-null   object 
 9   Unit of measure       2652 non-null   object 
 10  MEDICAL_PROCEDURE     2652 non-null   object 
 11  Medical procedure     2652 non-null   object 
 12  OCCUPATION            2652 non-null   object 
 13  Occupation            2652 non-null   object 
 14  DIAGNOSTIC_TYPE       2652 non-null   object 
 15  Diagnostic category  

In [4]:
# Select only in person visits
df_visits = df_visits[df_visits["Consultation type"] == "In-person"]

df_visits = df_visits.rename(
    columns={
        "OBS_VALUE": "visits_pro_capita",
        "Reference area": "country",
        "TIME_PERIOD": "year",
    }
)

In [5]:
# Summing dentists and medical doctors (they both count as a visit)
df_visits = df_visits.groupby(["country", "year"])["visits_pro_capita"].sum().reset_index()

In [6]:
# Rename some countries to ensure consistent and homogeneous naming.
# Assumption: 'Korea' refers to 'South Korea' due to data unavailability for North Korea.
df_visits["country"] = df_visits["country"].replace({
    "Korea": "South Korea",
    "Slovak Republic": "Slovakia",
    "Türkiye": "Turkey"
})

In [7]:
# For visibility, display how the dataframe looks at the end
df_visits.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1515 entries, 0 to 1514
Data columns (total 3 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   country            1515 non-null   object 
 1   year               1515 non-null   int64  
 2   visits_pro_capita  1515 non-null   float64
dtypes: float64(1), int64(1), object(1)
memory usage: 35.6+ KB


## Apple consumption

In [8]:
# For visibility, display how the dataframe looks at the beginning
df_apples.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 12592 entries, 0 to 12591
Data columns (total 14 columns):
 #   Column                                                                                                                  Non-Null Count  Dtype  
---  ------                                                                                                                  --------------  -----  
 0   Entity                                                                                                                  12592 non-null  object 
 1   Code                                                                                                                    9973 non-null   object 
 2   Year                                                                                                                    12592 non-null  int64  
 3   Dates | 00002619 || Food available for consumption | 0645pc || kilograms per year per capita                            11505 non-null  float

In [9]:
# Select only apples, that's what we are interested in
df_apples = df_apples.rename(
    columns={
        "Apples | 00002617 || Food available for consumption | 0645pc || kilograms per year per capita": 
        "consumption_pro_capita_kg",
        "Entity": "country",
        "Year": "year",
    }
)

In [10]:
# For visibility, display how the dataframe looks at the end
df_apples.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 12592 entries, 0 to 12591
Data columns (total 14 columns):
 #   Column                                                                                                                  Non-Null Count  Dtype  
---  ------                                                                                                                  --------------  -----  
 0   country                                                                                                                 12592 non-null  object 
 1   Code                                                                                                                    9973 non-null   object 
 2   year                                                                                                                    12592 non-null  int64  
 3   Dates | 00002619 || Food available for consumption | 0645pc || kilograms per year per capita                            11505 non-null  float

# Save preprocessed data

In [11]:
df_apples.to_csv("../data/apple-consumption.csv")
df_visits.to_csv("../data/doctor-visits.csv")