In [None]:
# Import dependencies
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
import warnings
warnings.filterwarnings('ignore')
import scipy.stats as stats
import os

In [None]:
# Load the Autism csv, discarding columns that are entirely null and then
# every row that still contains a null value
df = (
    pd.read_csv("../Resources/Autism.csv")
    .dropna(axis='columns', how='all')
    .dropna()
)

# Show every column when a dataframe is displayed
pd.set_option('display.max_columns', None)

print(df.shape)
df.head()

# Create bar chart for Ethnic Breakdown

In [None]:
# General ethnicity breakdown

# Number of rows recorded under each ethnicity
ethnicities = df['Ethnicity'].value_counts()

# Present the counts as a one-column dataframe
ethnic_df = ethnicities.to_frame(name="Total Count")
ethnic_df

In [None]:
# Plot the ethnicity breakdown

plt.figure(figsize=(20,10))
sns.set(style="white")

# One horizontal bar per ethnicity; bar length is the number of rows in df
ax = sns.countplot(y="Ethnicity", data=df,
                   palette="tab10", alpha=1,
                   linewidth=2,
                   edgecolor=sns.color_palette("twilight", 1))


ax.set_title("Participant Ethnicity Breakdown", fontsize=25)
ax.set_ylabel("Ethnicity", fontsize=20)
ax.set_xlabel("Total", fontsize=20)

# Save figure to Images folder. Create the folder first: savefig raises
# FileNotFoundError if the target directory does not exist.
images_dir = os.path.join("..", "Images")
os.makedirs(images_dir, exist_ok=True)
plt.savefig(os.path.join(images_dir, "EthnicityBreakdown.png"))

# Autism Rates by Ethnicity

In [None]:
# Autism counts within each ethnicity

ethnicity_list = ["white", "middle eastern", "black", "asian", "others", "south asians",
                  "latino", "hispanic", "aboriginal"]

for this_ethnicity in ethnicity_list:
    ethnicity_name = this_ethnicity.capitalize()

    # Rows belonging to the current ethnicity, and the subset of those rows
    # whose autism classification is 'YES'
    current_pop = df[df['Ethnicity'] == this_ethnicity]
    this_group_yes = current_pop[current_pop['Class'] == 'YES']

    # Sizes of both groups, counted on the 'Case No' column
    total = current_pop['Case No'].count()
    this_group_total = this_group_yes['Case No'].count()

    # Report, ending with a separator line between ethnicities
    print(ethnicity_name,
          f"{ethnicity_name} ethnicity total count: {total}",
          f"{ethnicity_name} sample population with autism: {this_group_total}",
          "============",
          sep="\n")


In [None]:
# Percentages

# Fraction of each ethnicity's participants whose autism classification is
# 'YES'. Everything is computed from df inside this cell, so the cell does not
# depend on variables left behind by earlier (or deleted) cells.

# Display label -> value as it is spelled in df['Ethnicity']
ethnicity_labels = {"White": "white",
                    "Middle Eastern": "middle eastern",
                    "Asian": "asian",
                    "Black": "black",
                    "Other": "others",
                    "South Asian": "south asians",
                    "Latino": "latino",
                    "Hispanic": "hispanic",
                    "Aboriginal": "aboriginal"}

# The mean of the boolean 'YES' flag within a group is yes-count / group-size
yes_rate_by_ethnicity = (df['Class'] == 'YES').groupby(df['Ethnicity']).mean()

# Place into a dataframe: a single row (index 0) with one column per label.
# An ethnicity with no rows in df gets NaN instead of raising.
ethnic_yes_breakdown = pd.DataFrame([{
    label: yes_rate_by_ethnicity.get(raw_value, np.nan)
    for label, raw_value in ethnicity_labels.items()
}])

# Preview
ethnic_yes_breakdown

In [None]:
# Format the dataframe

# NOTE: this cell overwrites ethnic_yes_breakdown with formatted strings, so
# re-run the cell that builds the numeric frame before re-running this one.
ethnic_yes_breakdown = (
    ethnic_yes_breakdown
    # Percentage format, applied value by value. Series.map is used per column
    # because DataFrame.applymap is deprecated in recent pandas releases.
    .apply(lambda column: column.map("{:,.2%}".format))
    # Rename the index
    .rename(index={0: 'Percentage with Autism'})
    # Transpose for easier reading
    .T
)
ethnic_yes_breakdown

# Observations
As found in 'Other_Stats', ~32% of participants have autism and ~68% do not.  What can we really tell from the ethnic data that we have?  Could the 'white' designation be an indicator?  We looked at that because it was the first ethnicity to appear in the feature selection, and it appeared higher than even sex or family history, which was surprising to us.  White might be more of a factor than other ethnicities because that group is larger than all the others.

This could be something to explore in more depth.  Given the accuracy with machine learning, how might ethnicity play a role?  It could be helpful to analyze a more evenly distributed ethnic classification.  Because autism is a global medical diagnosis, further ethnic analysis could help to hone ML detection globally.  Or, upon further analysis, we might find that ethnicity is of very small significance.