# Import Dependencies

In [100]:
import pandas as pd
from sqlalchemy import create_engine
import psycopg2
from sqlalchemy.ext.declarative import declarative_base
from sqlalchemy import create_engine
import sqlalchemy
from sqlalchemy import Column, Integer, String, Float
from sqlalchemy import Table
from sqlalchemy.ext.declarative import declarative_base

In [83]:
adhd_df = pd.read_csv('raw-data/adhd_raw.csv')
asperger_df = pd.read_csv('raw-data/asperger_raw.csv')
autism_df = pd.read_csv('raw-data/autism_raw.csv')
idd_df = pd.read_csv('raw-data/idd_raw.csv')

# Cleaning ADHD Data

In [84]:
adhd_df.head()

Unnamed: 0,Entity,Code,Year,Prevalence - Attention-deficit/hyperactivity disorder - Sex: Male - Age: Age-standardized (Percent),Prevalence - Attention-deficit/hyperactivity disorder - Sex: Female - Age: Age-standardized (Percent),Population (historical estimates),Continent
0,Abkhazia,OWID_ABK,2015,,,,Asia
1,Afghanistan,AFG,1990,1.559695,0.683613,12412311.0,
2,Afghanistan,AFG,1991,1.560023,0.683845,13299016.0,
3,Afghanistan,AFG,1992,1.560413,0.684081,14485543.0,
4,Afghanistan,AFG,1993,1.560768,0.68433,15816601.0,


In [85]:
# we're only focusing on years 1990+, no other values so we're filtering the rows with numbers other than 1990+
adhd_df_filtered = adhd_df[adhd_df['Year'] >= 1990]
adhd_df_filtered

Unnamed: 0,Entity,Code,Year,Prevalence - Attention-deficit/hyperactivity disorder - Sex: Male - Age: Age-standardized (Percent),Prevalence - Attention-deficit/hyperactivity disorder - Sex: Female - Age: Age-standardized (Percent),Population (historical estimates),Continent
0,Abkhazia,OWID_ABK,2015,,,,Asia
1,Afghanistan,AFG,1990,1.559695,0.683613,12412311.0,
2,Afghanistan,AFG,1991,1.560023,0.683845,13299016.0,
3,Afghanistan,AFG,1992,1.560413,0.684081,14485543.0,
4,Afghanistan,AFG,1993,1.560768,0.684330,15816601.0,
...,...,...,...,...,...,...,...
56624,Zimbabwe,ZWE,2018,,,14438812.0,
56625,Zimbabwe,ZWE,2019,,,14645473.0,
56626,Zimbabwe,ZWE,2020,,,14862927.0,
56627,Zimbabwe,ZWE,2021,,,15092171.0,


In [86]:
# dropping all NA values
adhd_df_filtered = adhd_df_filtered.dropna()

In [87]:
# setting Year as index since we'll be graphing and portraying each condition over the years
adhd_df_filtered = adhd_df_filtered.set_index('Year')
adhd_df_filtered

Unnamed: 0_level_0,Entity,Code,Prevalence - Attention-deficit/hyperactivity disorder - Sex: Male - Age: Age-standardized (Percent),Prevalence - Attention-deficit/hyperactivity disorder - Sex: Female - Age: Age-standardized (Percent),Population (historical estimates),Continent
Year,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2015,Afghanistan,AFG,1.581956,0.692681,34413603.0,Asia
2015,Albania,ALB,1.429911,0.594459,2890524.0,Europe
2015,Algeria,DZA,1.654917,0.714541,39728020.0,Africa
2015,American Samoa,ASM,1.266399,0.527042,55806.0,Oceania
2015,Andorra,AND,1.510707,0.540360,77993.0,Europe
...,...,...,...,...,...,...
2015,Venezuela,VEN,1.911358,0.816272,30081827.0,South America
2015,Vietnam,VNM,0.814008,0.378249,92677082.0,Asia
2015,Yemen,YEM,1.182115,0.399807,26497881.0,Asia
2015,Zambia,ZMB,1.903781,0.831736,15879370.0,Africa


In [101]:
# checking data types of values
adhd_df_filtered.dtypes

Entity                                                                                                    object
Code                                                                                                      object
Prevalence - Attention-deficit/hyperactivity disorder - Sex: Male - Age: Age-standardized (Percent)      float64
Prevalence - Attention-deficit/hyperactivity disorder - Sex: Female - Age: Age-standardized (Percent)    float64
Population (historical estimates)                                                                        float64
Continent                                                                                                 object
dtype: object

# Cleaning Asperger's Data

In [88]:
asperger_df.head()

Unnamed: 0,Entity,Code,Year,Prevalence - Asperger syndrome and other autistic spectrum disorders - Sex: Male - Age: Age-standardized (Percent),Prevalence - Asperger syndrome and other autistic spectrum disorders - Sex: Female - Age: Age-standardized (Percent),Population (historical estimates),Continent
0,Abkhazia,OWID_ABK,2015,,,,Asia
1,Afghanistan,AFG,1990,1.045411,0.236704,12412311.0,
2,Afghanistan,AFG,1991,1.045548,0.236713,13299016.0,
3,Afghanistan,AFG,1992,1.045715,0.236731,14485543.0,
4,Afghanistan,AFG,1993,1.04577,0.236749,15816601.0,


In [89]:
# we're only focusing on years 1990+, no other values so we're filtering the rows with numbers other than 1990+
asperger_df_filtered = asperger_df[asperger_df['Year'] >= 1990]
asperger_df_filtered

Unnamed: 0,Entity,Code,Year,Prevalence - Asperger syndrome and other autistic spectrum disorders - Sex: Male - Age: Age-standardized (Percent),Prevalence - Asperger syndrome and other autistic spectrum disorders - Sex: Female - Age: Age-standardized (Percent),Population (historical estimates),Continent
0,Abkhazia,OWID_ABK,2015,,,,Asia
1,Afghanistan,AFG,1990,1.045411,0.236704,12412311.0,
2,Afghanistan,AFG,1991,1.045548,0.236713,13299016.0,
3,Afghanistan,AFG,1992,1.045715,0.236731,14485543.0,
4,Afghanistan,AFG,1993,1.045770,0.236749,15816601.0,
...,...,...,...,...,...,...,...
56510,Zimbabwe,ZWE,2018,,,14438812.0,
56511,Zimbabwe,ZWE,2019,,,14645473.0,
56512,Zimbabwe,ZWE,2020,,,14862927.0,
56513,Zimbabwe,ZWE,2021,,,15092171.0,


In [90]:
# dropping all NA values
asperger_df_filtered = asperger_df_filtered.dropna()

In [91]:
# setting Year as index since we'll be graphing and portraying each condition over the years
asperger_df_filtered = asperger_df_filtered.set_index('Year')
asperger_df_filtered

Unnamed: 0_level_0,Entity,Code,Prevalence - Asperger syndrome and other autistic spectrum disorders - Sex: Male - Age: Age-standardized (Percent),Prevalence - Asperger syndrome and other autistic spectrum disorders - Sex: Female - Age: Age-standardized (Percent),Population (historical estimates),Continent
Year,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2015,Afghanistan,AFG,1.049301,0.237394,34413603.0,Asia
2015,Albania,ALB,1.028942,0.224858,2890524.0,Europe
2015,Algeria,DZA,1.100229,0.244128,39728020.0,Africa
2015,American Samoa,ASM,0.991599,0.220655,55806.0,Oceania
2015,Andorra,AND,0.847528,0.184133,77993.0,Europe
...,...,...,...,...,...,...
2015,Venezuela,VEN,1.022479,0.225504,30081827.0,South America
2015,Vietnam,VNM,0.977824,0.219046,92677082.0,Asia
2015,Yemen,YEM,1.055902,0.238245,26497881.0,Asia
2015,Zambia,ZMB,0.961192,0.216278,15879370.0,Africa


# Cleaning Autism Data

In [92]:
autism_df.head()

Unnamed: 0,Entity,Code,Year,Prevalence - Autism - Sex: Male - Age: Age-standardized (Percent),Prevalence - Autism - Sex: Female - Age: Age-standardized (Percent),Population (historical estimates),Continent
0,Abkhazia,OWID_ABK,2015,,,,Asia
1,Afghanistan,AFG,1990,0.313685,0.129473,12412311.0,
2,Afghanistan,AFG,1991,0.312901,0.128978,13299016.0,
3,Afghanistan,AFG,1992,0.312296,0.128577,14485543.0,
4,Afghanistan,AFG,1993,0.311659,0.12819,15816601.0,


In [93]:
# we're only focusing on years 1990+, no other values so we're filtering the rows with numbers other than 1990+
autism_df_filtered = autism_df[autism_df['Year'] >= 1990]
autism_df_filtered

Unnamed: 0,Entity,Code,Year,Prevalence - Autism - Sex: Male - Age: Age-standardized (Percent),Prevalence - Autism - Sex: Female - Age: Age-standardized (Percent),Population (historical estimates),Continent
0,Abkhazia,OWID_ABK,2015,,,,Asia
1,Afghanistan,AFG,1990,0.313685,0.129473,12412311.0,
2,Afghanistan,AFG,1991,0.312901,0.128978,13299016.0,
3,Afghanistan,AFG,1992,0.312296,0.128577,14485543.0,
4,Afghanistan,AFG,1993,0.311659,0.128190,15816601.0,
...,...,...,...,...,...,...,...
56510,Zimbabwe,ZWE,2018,,,14438812.0,
56511,Zimbabwe,ZWE,2019,,,14645473.0,
56512,Zimbabwe,ZWE,2020,,,14862927.0,
56513,Zimbabwe,ZWE,2021,,,15092171.0,


In [94]:
# dropping all NA values
autism_df_filtered = autism_df_filtered.dropna()

In [95]:
# setting Year as index since we'll be graphing and portraying each condition over the years
autism_df_filtered = autism_df_filtered.set_index('Year')
autism_df_filtered

Unnamed: 0_level_0,Entity,Code,Prevalence - Autism - Sex: Male - Age: Age-standardized (Percent),Prevalence - Autism - Sex: Female - Age: Age-standardized (Percent),Population (historical estimates),Continent
Year,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2015,Afghanistan,AFG,0.314801,0.129313,34413603.0,Asia
2015,Albania,ALB,0.417295,0.163435,2890524.0,Europe
2015,Algeria,DZA,0.386518,0.146015,39728020.0,Africa
2015,American Samoa,ASM,0.353715,0.140178,55806.0,Oceania
2015,Andorra,AND,0.504641,0.166592,77993.0,Europe
...,...,...,...,...,...,...
2015,Venezuela,VEN,0.393789,0.160352,30081827.0,South America
2015,Vietnam,VNM,0.359103,0.146298,92677082.0,Asia
2015,Yemen,YEM,0.342212,0.132953,26497881.0,Asia
2015,Zambia,ZMB,0.289946,0.130360,15879370.0,Africa


# Cleaning Intellectual Disability Data

In [96]:
idd_df.head()

Unnamed: 0,Entity,Code,Year,Prevalence - Idiopathic developmental intellectual disability - Sex: Male - Age: Age-standardized (Percent),Prevalence - Idiopathic developmental intellectual disability - Sex: Female - Age: Age-standardized (Percent),Population (historical estimates),Continent
0,Abkhazia,OWID_ABK,2015,,,,Asia
1,Afghanistan,AFG,1990,5.716282,4.635575,12412311.0,
2,Afghanistan,AFG,1991,6.082564,4.917924,13299016.0,
3,Afghanistan,AFG,1992,6.409605,5.170105,14485543.0,
4,Afghanistan,AFG,1993,6.673168,5.373784,15816601.0,


In [97]:
# we're only focusing on years 1990+, no other values so we're filtering the rows with numbers other than 1990+
idd_df_filtered = idd_df[idd_df['Year'] >= 1990]
idd_df_filtered

Unnamed: 0,Entity,Code,Year,Prevalence - Idiopathic developmental intellectual disability - Sex: Male - Age: Age-standardized (Percent),Prevalence - Idiopathic developmental intellectual disability - Sex: Female - Age: Age-standardized (Percent),Population (historical estimates),Continent
0,Abkhazia,OWID_ABK,2015,,,,Asia
1,Afghanistan,AFG,1990,5.716282,4.635575,12412311.0,
2,Afghanistan,AFG,1991,6.082564,4.917924,13299016.0,
3,Afghanistan,AFG,1992,6.409605,5.170105,14485543.0,
4,Afghanistan,AFG,1993,6.673168,5.373784,15816601.0,
...,...,...,...,...,...,...,...
56624,Zimbabwe,ZWE,2018,,,14438812.0,
56625,Zimbabwe,ZWE,2019,,,14645473.0,
56626,Zimbabwe,ZWE,2020,,,14862927.0,
56627,Zimbabwe,ZWE,2021,,,15092171.0,


In [98]:
# dropping all NA values
idd_df_filtered = idd_df_filtered.dropna()

In [99]:
# setting Year as index since we'll be graphing and portraying each condition over the years
idd_df_filtered = idd_df_filtered.set_index('Year')
idd_df_filtered

Unnamed: 0_level_0,Entity,Code,Prevalence - Idiopathic developmental intellectual disability - Sex: Male - Age: Age-standardized (Percent),Prevalence - Idiopathic developmental intellectual disability - Sex: Female - Age: Age-standardized (Percent),Population (historical estimates),Continent
Year,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2015,Afghanistan,AFG,4.654696,3.754670,34413603.0,Asia
2015,Albania,ALB,0.553948,0.502217,2890524.0,Europe
2015,Algeria,DZA,1.489152,1.130623,39728020.0,Africa
2015,American Samoa,ASM,0.375397,0.351941,55806.0,Oceania
2015,Andorra,AND,0.254538,0.323534,77993.0,Europe
...,...,...,...,...,...,...
2015,Venezuela,VEN,0.277440,0.272688,30081827.0,South America
2015,Vietnam,VNM,0.266380,1.076825,92677082.0,Asia
2015,Yemen,YEM,3.811625,3.016476,26497881.0,Asia
2015,Zambia,ZMB,0.472706,0.446366,15879370.0,Africa
