In [1]:
import os
from pyspark.sql import SparkSession, types
from pyspark.sql import functions as F
from pyspark.sql import Row

# Reuse the active SparkSession if one exists; otherwise create one with the builder's default settings.
spark = SparkSession.builder.getOrCreate()

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
23/12/05 23:40:54 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [2]:
import plotly.express as px
import plotly.graph_objs as go
from plotly.subplots import make_subplots
import pandas as pd

In [3]:
def save_pandas_csv(df, location, filename):
    """Write a pandas DataFrame to ``<location>/<filename>.csv`` without the index column.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to persist.
    location : str
        Target directory. It is created (including parents) when it does not
        exist yet. An empty string means the current working directory.
    filename : str
        File name without the ``.csv`` extension.

    Returns
    -------
    None
    """
    # DataFrame.to_csv does not create missing parent directories, so a fresh
    # checkout without the output folder would otherwise fail with an OSError.
    if location:
        os.makedirs(location, exist_ok=True)
    df.to_csv(os.path.join(location, filename + ".csv"), index=False)

#### Analysis bits

In [4]:
# MHDD schema
# Three string identifier columns, an integer year, then one float column per
# disorder (all of those names end in "_%").
mhdd_schema = types.StructType(
    [
        types.StructField(text_column, types.StringType())
        for text_column in ("country_name", "country_code", "un_region")
    ]
    + [types.StructField("year", types.IntegerType())]
    + [
        types.StructField(disorder_column, types.FloatType())
        for disorder_column in (
            "schizophrenia_%",
            "bipolar_disorder_%",
            "eating_disorders_%",
            "anxiety_disorders_%",
            "drug_use_disorders_%",
            "depression_%",
            "alcohol_use_disorders_%",
        )
    ]
)

In [5]:
# Load MHDD dataset
# The CSV directory is parsed with the explicit schema defined above; the
# "multiline" option lets a single record span several physical lines.
mhdd_df = spark.read.option("multiline", "true").csv("datasets_output/gmh_fact_dims/mhdd", schema=mhdd_schema)
# Mark the frame for caching because it is reused by the aggregations further down.
mhdd_df.cache()

DataFrame[country_name: string, country_code: string, un_region: string, year: int, schizophrenia_%: float, bipolar_disorder_%: float, eating_disorders_%: float, anxiety_disorders_%: float, drug_use_disorders_%: float, depression_%: float, alcohol_use_disorders_%: float]

In [6]:
# Number of rows in the MHDD frame (count() is an action, so this also triggers the actual read).
mhdd_df.count()

                                                                                

4424

In [7]:
# Earlier variant, kept for reference: collect a single disorder column only.
# mhdd_viz_df = mhdd_df.select(["year","country_name","schizophrenia_%"]).toPandas()
# Collect the complete MHDD frame to the driver as a pandas DataFrame.
mhdd_viz_df = mhdd_df.toPandas() # written to CSV in the following cell

In [8]:
# Persist the full MHDD frame as csv_out/gmh/mhdd_viz_df.csv.
save_pandas_csv(mhdd_viz_df, "csv_out/gmh", "mhdd_viz_df")

In [9]:
def generate_top_bottom_mean_for_problem_in_mhdd(problem):
    """Return the yearly series of ``problem`` for the extreme countries plus the yearly mean.

    Countries are ranked by their average of ``problem`` over all rows of the
    module-level ``mhdd_df``. The five highest-ranked and the five
    lowest-ranked countries are kept, together with a synthetic country called
    ``"Mean"`` that holds the per-year average across all countries.

    Parameters
    ----------
    problem : str
        Name of the column in ``mhdd_df`` to analyse (e.g. ``"depression_%"``).

    Returns
    -------
    pandas.DataFrame
        Columns ``year``, ``country_name`` and ``problem``.
    """
    # Per-year average over all countries, labelled as the pseudo-country "Mean".
    yearly_mean = (
        mhdd_df.groupBy("year")
        .agg(F.avg(problem).alias(problem))
        .withColumn("country_name", F.lit("Mean"))
        .select("year", "country_name", problem)
    )
    per_country = mhdd_df.select("year", "country_name", problem)
    with_mean = per_country.union(yearly_mean)

    # Rank the countries once and slice the collected list. Running separate
    # limit()/tail() jobs re-executes the sort and may order ties differently
    # between the two runs. The ranking has one row per country, so collecting
    # it on the driver is cheap.
    ranking_rows = (
        mhdd_df.groupBy("country_name")
        .agg(F.avg(problem).alias(problem))
        .sort(problem, ascending=False)
        .select("country_name")
        .collect()
    )
    ranked_names = [row["country_name"] for row in ranking_rows]
    countries_list = ranked_names[:5] + ranked_names[-5:] + ["Mean"]

    # Filter by column *name*: the unioned frame is not mhdd_df, so a predicate
    # built from mhdd_df["country_name"] only resolves through plan lineage.
    return with_mean.filter(F.col("country_name").isin(countries_list)).toPandas()

In [10]:
# Save these
# One top-5 / bottom-5 / mean frame per disorder column, computed in the order listed.
(
    sch_df,
    bpd_df,
    ed_df,
    ad_df,
    du_df,
    au_df,
    dep_df,
) = (
    generate_top_bottom_mean_for_problem_in_mhdd(disorder_column)
    for disorder_column in (
        "schizophrenia_%",
        "bipolar_disorder_%",
        "eating_disorders_%",
        "anxiety_disorders_%",
        "drug_use_disorders_%",
        "alcohol_use_disorders_%",
        "depression_%",
    )
)

                                                                                

In [11]:
# Write each top/bottom/mean frame to csv_out/gmh under the name of its variable.
for viz_frame, csv_stem in (
    (sch_df, "sch_df"),
    (bpd_df, "bpd_df"),
    (ed_df, "ed_df"),
    (ad_df, "ad_df"),
    (du_df, "du_df"),
    (au_df, "au_df"),
    (dep_df, "dep_df"),
):
    save_pandas_csv(viz_frame, "csv_out/gmh", csv_stem)

Which income group and which UN region had the highest average level of each problem?

Merge with atlas countries basic facts

Group by income_group/un_region

Compute the average of each problem over all years for every group, then pick the group with the maximum average

Plot the per-country average of that problem within the selected income_group/un_region

In [12]:
# Schema of the atlas "country facts" CSV: three string columns.
atlas_countries_facts_schema = (
    types.StructType()
    .add("country_name", types.StringType())
    .add("facts_country_code", types.StringType())
    .add("un_region", types.StringType())
)

# Read the facts directory with the explicit schema above.
atlas_countries_facts = spark.read.csv(
    "datasets_output/atlas_fact_dims/atlas_countries_facts/",
    schema=atlas_countries_facts_schema,
)

In [13]:
# Schema of the atlas "basic info" CSV: country code, population, income group,
# WHO region and expenditure.
atlas_countries_basic_info_schema = (
    types.StructType()
    .add("basic_country_code", types.StringType())
    .add("population", types.IntegerType())
    .add("income_group", types.StringType())
    .add("who_region", types.StringType())
    .add("expenditure_cad", types.FloatType())
)

# Read the basic-info directory with the explicit schema above.
atlas_countries_basic_info = spark.read.csv(
    "datasets_output/atlas_fact_dims/atlas_countries_basic_info_dims/",
    schema=atlas_countries_basic_info_schema,
)

In [14]:
# Inner-join basic info with country facts on the country code; the facts
# frame carries a broadcast hint.
atlas_basic_combined = atlas_countries_basic_info.join(
    F.broadcast(atlas_countries_facts),
    atlas_countries_basic_info["basic_country_code"] == atlas_countries_facts["facts_country_code"],
)

In [15]:
# Sanity check: number of joined atlas rows that have no income group.
atlas_basic_combined.filter(atlas_basic_combined["income_group"].isNull()).count()

0

In [16]:
# Keep only basic_country_code, population, income_group and expenditure_cad.
atlas_basic_combined = atlas_basic_combined.drop(
    "facts_country_code",
    "un_region",
    "who_region",
    "country_name",
)

In [17]:
# Show the columns that remain after the drop above.
atlas_basic_combined.printSchema()

root
 |-- basic_country_code: string (nullable = true)
 |-- population: integer (nullable = true)
 |-- income_group: string (nullable = true)
 |-- expenditure_cad: float (nullable = true)



In [18]:
# Enrich every MHDD row with population, income group and expenditure via an
# inner join on the country code; the atlas frame carries a broadcast hint.
# The duplicate key column from the atlas side is removed afterwards.
combined_mhdd_df = mhdd_df.join(
    F.broadcast(atlas_basic_combined),
    mhdd_df["country_code"] == atlas_basic_combined["basic_country_code"],
).drop("basic_country_code")

In [19]:
# Show the columns of the MHDD frame after it was joined with the atlas attributes.
combined_mhdd_df.printSchema()

root
 |-- country_name: string (nullable = true)
 |-- country_code: string (nullable = true)
 |-- un_region: string (nullable = true)
 |-- year: integer (nullable = true)
 |-- schizophrenia_%: float (nullable = true)
 |-- bipolar_disorder_%: float (nullable = true)
 |-- eating_disorders_%: float (nullable = true)
 |-- anxiety_disorders_%: float (nullable = true)
 |-- drug_use_disorders_%: float (nullable = true)
 |-- depression_%: float (nullable = true)
 |-- alcohol_use_disorders_%: float (nullable = true)
 |-- population: integer (nullable = true)
 |-- income_group: string (nullable = true)
 |-- expenditure_cad: float (nullable = true)



In [20]:
# Sanity check: number of combined rows that have no income group.
combined_mhdd_df.filter(combined_mhdd_df["income_group"].isNull()).count()

0

In [21]:
def problematic_region_df(group, problem):
    """Find the ``group`` value with the highest average ``problem`` and detail it by country.

    Reads the module-level ``combined_mhdd_df``.

    Parameters
    ----------
    group : str
        Grouping column, e.g. ``"un_region"`` or ``"income_group"``.
    problem : str
        Column whose average is compared between groups.

    Returns
    -------
    tuple
        ``(group_val, frame)``: the winning group value, and a pandas DataFrame
        with ``country_name`` and the average of ``problem`` for every country
        belonging to that group.
    """
    # Average of the problem per group, highest first; the first row is the winner.
    group_means = combined_mhdd_df.groupby(group).agg(F.avg(problem).alias(problem))
    worst_row = group_means.orderBy(F.desc(problem)).first()
    group_val = worst_row[group]

    # Per-country averages restricted to the winning group.
    rows_in_group = combined_mhdd_df.filter(combined_mhdd_df[group] == group_val)
    viz_df = rows_in_group.groupby("country_name").agg(F.avg(problem).alias(problem))

    return (group_val, viz_df.toPandas())

In [22]:
# Save the dfs here
# For every disorder: first the worst UN region, then the worst income group.
# Each result is a (group value, per-country pandas frame) pair.
(
    (sch_un_group, sch_df_un_region),
    (sch_inc_group, sch_df_income_group),
    (bpd_un_group, bpd_df_un_region),
    (bpd_inc_group, bpd_df_income_group),
    (ed_un_group, ed_df_un_region),
    (ed_inc_group, ed_df_income_group),
    (ad_un_group, ad_df_un_region),
    (ad_inc_group, ad_df_income_group),
    (du_un_group, du_df_un_region),
    (du_inc_group, du_df_income_group),
    (dep_un_group, dep_df_un_region),
    (dep_inc_group, dep_df_income_group),
    (au_un_group, aud_df_un_region),
    (au_inc_group, aud_df_income_group),
) = (
    problematic_region_df(grouping_column, disorder_column)
    for disorder_column in (
        "schizophrenia_%",
        "bipolar_disorder_%",
        "eating_disorders_%",
        "anxiety_disorders_%",
        "drug_use_disorders_%",
        "depression_%",
        "alcohol_use_disorders_%",
    )
    for grouping_column in ("un_region", "income_group")
)

In [23]:
# Print the worst UN region and the worst income group for each disorder, one value per line.
print(
    sch_un_group,
    sch_inc_group,
    bpd_un_group,
    bpd_inc_group,
    ed_un_group,
    ed_inc_group,
    ad_un_group,
    ad_inc_group,
    du_un_group,
    du_inc_group,
    dep_un_group,
    dep_inc_group,
    au_un_group,
    au_inc_group,
    sep="\n",
)

Australia and New Zealand
high
Australia and New Zealand
high
Australia and New Zealand
high
Australia and New Zealand
high
Northern America
high
Australia and New Zealand
low
Eastern Europe
upper-middle


In [24]:
# Write the per-country frames of the worst UN regions first, then those of the
# worst income groups. The alcohol-use frames are stored under an "au_" file
# name although their variables are prefixed "aud_".
for viz_frame, csv_stem in (
    (sch_df_un_region, "sch_df_un_region"),
    (bpd_df_un_region, "bpd_df_un_region"),
    (ed_df_un_region, "ed_df_un_region"),
    (ad_df_un_region, "ad_df_un_region"),
    (du_df_un_region, "du_df_un_region"),
    (aud_df_un_region, "au_df_un_region"),
    (dep_df_un_region, "dep_df_un_region"),
    (sch_df_income_group, "sch_df_income_group"),
    (bpd_df_income_group, "bpd_df_income_group"),
    (ed_df_income_group, "ed_df_income_group"),
    (ad_df_income_group, "ad_df_income_group"),
    (du_df_income_group, "du_df_income_group"),
    (aud_df_income_group, "au_df_income_group"),
    (dep_df_income_group, "dep_df_income_group"),
):
    save_pandas_csv(viz_frame, "csv_out/gmh", csv_stem)

How much has each problem grown in each region between 1990 and 2017?

In [25]:
def generate_problem_growth_df(combined_mhdd_df, problem, start_year=1990, end_year=2017):
    """Compute, per UN region, the change of the regional average of ``problem`` between two years.

    For every ``un_region`` the values of ``problem`` are averaged within
    ``start_year`` and within ``end_year``; the reported growth is the
    end-year average minus the start-year average. Regions without rows for
    one of the two years are omitted because the two slices are inner-joined.

    Parameters
    ----------
    combined_mhdd_df : pyspark.sql.DataFrame
        Frame providing the columns ``un_region``, ``year`` and ``problem``.
    problem : str
        Name of the column to analyse.
    start_year : int, optional
        Baseline year. Defaults to 1990.
    end_year : int, optional
        Comparison year. Defaults to 2017.

    Returns
    -------
    pandas.DataFrame
        Columns ``un_region`` and ``problem.capitalize() + " growth"``.
    """
    # Average of the problem per (region, year).
    regional_yearly = (
        combined_mhdd_df.select("un_region", "year", problem)
        .groupby("un_region", "year")
        .agg(F.avg(problem).alias(problem))
    )

    def _regional_average_in(year, value_column):
        # One row per region for the given year; the average gets a distinct
        # name so that both slices can be joined without a column clash.
        return regional_yearly.filter(F.col("year") == year).select(
            "un_region", F.col(problem).alias(value_column)
        )

    start_df = _regional_average_in(start_year, "_start_avg")
    end_df = _regional_average_in(end_year, "_end_avg")

    # Joining on the column name keeps a single un_region column and avoids
    # the rename/drop round trip around the join.
    growth_df = start_df.join(end_df, on="un_region", how="inner").select(
        "un_region",
        (F.col("_end_avg") - F.col("_start_avg")).alias(problem.capitalize() + " growth"),
    )

    return growth_df.toPandas()

In [26]:
# Save these
# Regional growth frame for every disorder column, computed in the order listed.
(
    sch_problem_growth_viz_df,
    ad_problem_growth_viz_df,
    au_problem_growth_viz_df,
    ed_problem_growth_viz_df,
    du_problem_growth_viz_df,
    bpd_problem_growth_viz_df,
    dep_problem_growth_viz_df,
) = (
    generate_problem_growth_df(combined_mhdd_df, disorder_column)
    for disorder_column in (
        "schizophrenia_%",
        "anxiety_disorders_%",
        "alcohol_use_disorders_%",
        "eating_disorders_%",
        "drug_use_disorders_%",
        "bipolar_disorder_%",
        "depression_%",
    )
)

In [27]:
# Write each growth frame to csv_out/gmh under the name of its variable.
for viz_frame, csv_stem in (
    (sch_problem_growth_viz_df, "sch_problem_growth_viz_df"),
    (ad_problem_growth_viz_df, "ad_problem_growth_viz_df"),
    (au_problem_growth_viz_df, "au_problem_growth_viz_df"),
    (ed_problem_growth_viz_df, "ed_problem_growth_viz_df"),
    (du_problem_growth_viz_df, "du_problem_growth_viz_df"),
    (bpd_problem_growth_viz_df, "bpd_problem_growth_viz_df"),
    (dep_problem_growth_viz_df, "dep_problem_growth_viz_df"),
):
    save_pandas_csv(viz_frame, "csv_out/gmh", csv_stem)

In [28]:
def generate_correlation_heatmap_df(combined_mhdd_df):
    """Average every disorder column per (un_region, year) and return the averages as pandas.

    Parameters
    ----------
    combined_mhdd_df : pyspark.sql.DataFrame
        Frame providing ``un_region``, ``year`` and the seven ``*_%`` disorder columns.

    Returns
    -------
    pandas.DataFrame
        One row per (un_region, year) group. The grouping columns themselves
        are not included; the columns are the human-readable disorder labels
        in the order given by ``output_order`` below.
    """
    # Human-readable label -> source column.
    label_to_column = {
        "Schizophrenia": "schizophrenia_%",
        "Depression": "depression_%",
        "Bipolar disorder": "bipolar_disorder_%",
        "Eating disorders": "eating_disorders_%",
        "Anxiety disorders": "anxiety_disorders_%",
        "Drug use disorders": "drug_use_disorders_%",
        "Alcohol use disorders": "alcohol_use_disorders_%",
    }
    averages = [
        F.avg(F.col(source_column)).alias(label)
        for label, source_column in label_to_column.items()
    ]
    averaged_df = combined_mhdd_df.groupby("un_region", "year").agg(*averages)

    output_order = [
        "Schizophrenia",
        "Depression",
        "Drug use disorders",
        "Alcohol use disorders",
        "Eating disorders",
        "Anxiety disorders",
        "Bipolar disorder",
    ]
    return averaged_df.select(output_order).toPandas()

In [29]:
# Per-(region, year) disorder averages; written to CSV in the following cell.
correlation_viz_df = generate_correlation_heatmap_df(combined_mhdd_df)

In [30]:
# Persist the averages as csv_out/gmh/correlation_viz_df.csv.
save_pandas_csv(correlation_viz_df, "csv_out/gmh", "correlation_viz_df")

In [31]:
# Load ad_mf_df dataset (anxiety disorders, male/female columns)
ad_schema = types.StructType(
    [
        types.StructField(text_column, types.StringType())
        for text_column in ("country_name", "country_code", "un_region")
    ]
    + [types.StructField("year", types.IntegerType())]
    + [
        types.StructField(value_column, types.FloatType())
        for value_column in ("male_anxiety_disorders_%", "female_anxiety_disorders_%")
    ]
)
ad_mf_df = spark.read.option("multiline", "true").csv(
    "datasets_output/gmh_fact_dims/ad_mf",
    schema=ad_schema,
)

In [32]:
# Load bpd_mf_df dataset (bipolar disorder, male/female columns)
bpd_schema = types.StructType(
    [
        types.StructField(text_column, types.StringType())
        for text_column in ("country_name", "country_code", "un_region")
    ]
    + [types.StructField("year", types.IntegerType())]
    + [
        types.StructField(value_column, types.FloatType())
        for value_column in ("male_bipolar_disorder_%", "female_bipolar_disorder_%")
    ]
)
bpd_mf_df = spark.read.option("multiline", "true").csv(
    "datasets_output/gmh_fact_dims/bpd_mf",
    schema=bpd_schema,
)

In [33]:
# Load dep_mf_df dataset (depression, male/female columns)
dep_schema = types.StructType(
    [
        types.StructField(text_column, types.StringType())
        for text_column in ("country_name", "country_code", "un_region")
    ]
    + [types.StructField("year", types.IntegerType())]
    + [
        types.StructField(value_column, types.FloatType())
        for value_column in ("male_depression_%", "female_depression_%")
    ]
)
dep_mf_df = spark.read.option("multiline", "true").csv(
    "datasets_output/gmh_fact_dims/dep_mf",
    schema=dep_schema,
)

In [34]:
# Load sch_mf_df dataset (schizophrenia, male/female columns)
sch_schema = types.StructType(
    [
        types.StructField(text_column, types.StringType())
        for text_column in ("country_name", "country_code", "un_region")
    ]
    + [types.StructField("year", types.IntegerType())]
    + [
        types.StructField(value_column, types.FloatType())
        for value_column in ("male_schizophrenia_%", "female_schizophrenia_%")
    ]
)
sch_mf_df = spark.read.option("multiline", "true").csv(
    "datasets_output/gmh_fact_dims/sch_mf",
    schema=sch_schema,
)

In [35]:
# Load ed_mf_df dataset (eating disorders, male/female columns)
ed_schema = types.StructType(
    [
        types.StructField(text_column, types.StringType())
        for text_column in ("country_name", "country_code", "un_region")
    ]
    + [types.StructField("year", types.IntegerType())]
    + [
        types.StructField(value_column, types.FloatType())
        for value_column in ("male_eating_disorders_%", "female_eating_disorders_%")
    ]
)
ed_mf_df = spark.read.option("multiline", "true").csv(
    "datasets_output/gmh_fact_dims/ed_mf",
    schema=ed_schema,
)

In [36]:
# Save these
# Collect every male/female frame to the driver as pandas, in the order listed.
(
    ad_mf_df_pd,
    bpd_mf_df_pd,
    dep_mf_df_pd,
    sch_mf_df_pd,
    ed_mf_df_pd,
) = (
    spark_frame.toPandas()
    for spark_frame in (ad_mf_df, bpd_mf_df, dep_mf_df, sch_mf_df, ed_mf_df)
)

In [37]:
# Write each male/female pandas frame to csv_out/gmh under the name of its variable.
for viz_frame, csv_stem in (
    (ad_mf_df_pd, "ad_mf_df_pd"),
    (bpd_mf_df_pd, "bpd_mf_df_pd"),
    (dep_mf_df_pd, "dep_mf_df_pd"),
    (sch_mf_df_pd, "sch_mf_df_pd"),
    (ed_mf_df_pd, "ed_mf_df_pd"),
):
    save_pandas_csv(viz_frame, "csv_out/gmh", csv_stem)

In [38]:
# Load sui_mf_df dataset (suicide, male/female columns)
# NOTE(review): in the preview printed by the next cell every female value is
# larger than the male value of the same row. Please confirm that the column
# order of the CSV really matches the two field names below.
sui_schema = types.StructType(
    [
        types.StructField(text_column, types.StringType())
        for text_column in ("country_name", "country_code", "un_region")
    ]
    + [types.StructField("year", types.IntegerType())]
    + [
        types.StructField(value_column, types.FloatType())
        for value_column in ("male_suicide_%", "female_suicide_%")
    ]
)
sui_mf_df = spark.read.option("multiline", "true").csv(
    "datasets_output/gmh_fact_dims/sui_mf",
    schema=sui_schema,
)

In [39]:
# Preview the first rows of the suicide frame.
sui_mf_df.show() # This has only 2019 data

+--------------------+------------+---------------+----+--------------+----------------+
|        country_name|country_code|      un_region|year|male_suicide_%|female_suicide_%|
+--------------------+------------+---------------+----+--------------+----------------+
|Co-operative Repu...|         GUY|  South America|2019|          40.3|            63.0|
| Kingdom of Eswatini|         SWZ|Southern Africa|2019|          29.4|            55.1|
|   Republic of Korea|         KOR|   Eastern Asia|2019|          28.6|            40.2|
|Republic of Kiribati|         KIR|     Micronesia|2019|          28.3|            48.6|
|Federated States ...|         FSM|     Micronesia|2019|          28.2|            43.2|
|Republic of Lithu...|         LTU|Northern Europe|2019|          26.1|            45.4|
|Republic of Suriname|         SUR|  South America|2019|          25.4|            38.8|
|  Russian Federation|         RUS| Eastern Europe|2019|          25.1|            43.6|
|Republic of South...

In [40]:
# Find average suicide % by region
# Both value columns are averaged per (un_region, year) and keep their original names.
region_avg_sui_mf_df = (
    sui_mf_df.select(["un_region", "year", "male_suicide_%", "female_suicide_%"])
    .groupby("un_region", "year")
    .agg(
        *[
            F.avg(value_column).alias(value_column)
            for value_column in ("male_suicide_%", "female_suicide_%")
        ]
    )
)

In [41]:
# Preview the regional averages.
region_avg_sui_mf_df.show()

+--------------------+----+------------------+------------------+
|           un_region|year|    male_suicide_%|  female_suicide_%|
+--------------------+----+------------------+------------------+
|Australia and New...|2019|             11.75|17.550000190734863|
|       Middle Africa|2019| 7.679999923706054|11.480000114440918|
|          Micronesia|2019|             28.25| 45.89999961853027|
|      Western Europe|2019|14.724999904632568| 21.02500009536743|
|           Polynesia|2019| 3.799999952316284|               5.0|
|      Eastern Africa|2019| 7.507142918450492|11.421428510120936|
|       Southern Asia|2019| 6.524999916553497|10.024999856948853|
|      Western Africa|2019| 6.321428571428571| 9.299999986376081|
|       South America|2019|11.649999916553497| 18.19166648387909|
|     Northern Africa|2019| 4.049999952316284| 5.399999936421712|
|           Melanesia|2019|11.174999952316284| 17.90000009536743|
|     Central America|2019| 5.237500071525574|  8.72500005364418|
|  South-e

In [42]:
# Find top 5 and bottom 5 by male and female
# "Top" sorts the regional averages in descending order, "bottom" in ascending
# order; each selection keeps five rows.
top_male_region_sui_df = region_avg_sui_mf_df.sort(F.desc("male_suicide_%")).limit(5)
top_female_region_sui_df = region_avg_sui_mf_df.sort(F.desc("female_suicide_%")).limit(5)
bottom_male_region_sui_df = region_avg_sui_mf_df.sort("male_suicide_%", ascending=True).limit(5)
bottom_female_region_sui_df = region_avg_sui_mf_df.sort("female_suicide_%", ascending=True).limit(5)

# Union and then drop duplicates
# A region may appear in more than one selection, so identical rows are collapsed.
combined_sui_df = (
    top_male_region_sui_df
    .union(top_female_region_sui_df)
    .union(bottom_male_region_sui_df)
    .union(bottom_female_region_sui_df)
    .drop_duplicates()
)

In [43]:
# Preview the de-duplicated union of the top and bottom regions.
combined_sui_df.show()

+------------------+----+------------------+-----------------+
|         un_region|year|    male_suicide_%| female_suicide_%|
+------------------+----+------------------+-----------------+
|        Micronesia|2019|             28.25|45.89999961853027|
|   Southern Africa|2019|19.674999952316284|33.92499923706055|
|      Eastern Asia|2019|17.475000143051147|25.62500023841858|
|    Eastern Europe|2019| 17.17142881665911|29.55714293888637|
|    Western Europe|2019|14.724999904632568|21.02500009536743|
|   Northern Europe|2019|14.300000047683715|             22.6|
|      Western Asia|2019| 4.500000009169946|6.307692252672636|
|         Caribbean|2019| 5.350000050663948|8.280000007152557|
|   Northern Africa|2019| 4.049999952316284|5.399999936421712|
|   Central America|2019| 5.237500071525574| 8.72500005364418|
|         Polynesia|2019| 3.799999952316284|              5.0|
|South-eastern Asia|2019|  5.37000002861023|7.999999928474426|
+------------------+----+------------------+-----------

In [44]:
# Collect the top/bottom region rows to pandas; written to CSV in the following cell.
combined_sui_df_pd = combined_sui_df.toPandas()

In [45]:
# Persist the top/bottom region rows as csv_out/gmh/combined_sui_df_pd.csv.
save_pandas_csv(combined_sui_df_pd, "csv_out/gmh", "combined_sui_df_pd")