In [None]:
# Import dependencies
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
import warnings
warnings.filterwarnings('ignore')
import scipy.stats as stats
import os

In [None]:
# Show every column whenever a dataframe is displayed
pd.set_option('display.max_columns', None)

# Load the csv, discard columns that are entirely null, then discard
# every row that still contains a null value
df = (
    pd.read_csv("../Resources/Autism.csv")
    .dropna(axis='columns', how='all')
    .dropna()
)

# Report the cleaned dimensions and preview the first rows
print(df.shape)
df.head()

# Create bar chart for Ethnic Breakdown

In [None]:
# General ethnicity breakdown

# Tally how many rows fall under each value of the 'Ethnicity' column
ethnicities = df['Ethnicity'].value_counts()

# Present the tally as a one-column dataframe
ethnic_df = ethnicities.to_frame(name="Total Count")
ethnic_df

In [None]:
# Plot the ethnicity breakdown as a horizontal count plot and save it to disk

# The figure is created first and the seaborn style is applied afterwards;
# keep this order so the rendered output does not change.
plt.figure(figsize=(20,10))
sns.set(style="white")

# One bar per value of the 'Ethnicity' column, counted directly from df
ax = sns.countplot(y="Ethnicity", data=df,
                   palette="tab10", alpha=1,
                   linewidth=2,
                   edgecolor=sns.color_palette("twilight", 1))


ax.set_title("Participant Ethnicity Breakdown", fontsize=25)
ax.set_ylabel("Ethnicity", fontsize=20)
ax.set_xlabel("Total", fontsize=20)

# Save figure to Images folder. Create the folder first so that a checkout
# without it does not make savefig fail with FileNotFoundError.
images_dir = os.path.join("..", "Images")
os.makedirs(images_dir, exist_ok=True)
plt.savefig(os.path.join(images_dir, "EthnicityBreakdown.png"))

# Autism Rates by Ethnicity

In [None]:
# Look at the autism rate among the ethnicities

def count_autism_by_ethnicity(data, ethnicity, label):
    """Count the rows of one ethnicity and how many of them are classed 'YES'.

    Prints two summary lines (total count, then count with Class == 'YES')
    and returns the intermediate frames together with both counts.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame with 'Ethnicity', 'Class' and 'Case No' columns.
    ethnicity : str
        Exact value to match in the 'Ethnicity' column (no normalisation
        is applied, so case and whitespace must match the data).
    label : str
        Display name used at the start of both printed lines.

    Returns
    -------
    tuple
        (pop, total, yes, yes_total): the rows for that ethnicity, their
        non-null 'Case No' count, the subset whose 'Class' is 'YES', and
        that subset's non-null 'Case No' count.
    """
    # Locate the rows for this ethnicity and get a count
    pop = data.loc[data['Ethnicity'] == ethnicity]
    total = pop['Case No'].count()
    print(f"{label} ethnicity total count: {total}")

    # Within that ethnicity, locate where autism classification is yes and get a count
    yes = pop.loc[pop['Class'] == 'YES']
    yes_total = yes['Case No'].count()
    print(f"{label} sample population with autism: {yes_total}")

    return pop, total, yes, yes_total


# One call per ethnicity; the unpacked names are the ones later cells rely on.

# White
white_pop, total_whites, white_yes, whites_yes_total = count_autism_by_ethnicity(
    df, 'white', "White")

# Middle Eastern
me_pop, total_me, me_yes, me_yes_total = count_autism_by_ethnicity(
    df, 'middle eastern', "Middle Eastern")

# Asian
asian_pop, total_asian, asian_yes, asian_yes_total = count_autism_by_ethnicity(
    df, 'asian', "Asian")

# Black
black_pop, total_black, black_yes, black_yes_total = count_autism_by_ethnicity(
    df, 'black', "Black")

# Others
# NOTE(review): the matched value 'others ' ends with a space - confirm this is
# how the category is spelled in the csv, otherwise this group will be empty.
other_pop, total_other, other_yes, other_yes_total = count_autism_by_ethnicity(
    df, 'others ', "Other")

# South Asians
sa_pop, total_sa, sa_yes, sa_yes_total = count_autism_by_ethnicity(
    df, 'south asians', "South Asian")

# Latino
latino_pop, total_latino, latino_yes, latino_yes_total = count_autism_by_ethnicity(
    df, 'latino', "Latino")

# Hispanic
hisp_pop, total_hisp, hisp_yes, hisp_yes_total = count_autism_by_ethnicity(
    df, 'hispanic', "Hispanic")

# Aboriginal
ab_pop, total_ab, ab_yes, ab_yes_total = count_autism_by_ethnicity(
    df, 'aboriginal', "Aboriginal")

In [None]:
# Percentages

# Share of each ethnicity classed 'YES': yes-count divided by group total.
# The values are fractions here; they are rendered as percentages later.
wp = whites_yes_total / total_whites
mep = me_yes_total / total_me
ap = asian_yes_total / total_asian
bp = black_yes_total / total_black
op = other_yes_total / total_other
sap = sa_yes_total / total_sa
lp = latino_yes_total / total_latino
hp = hisp_yes_total / total_hisp
abp = ab_yes_total / total_ab

# Column labels and their shares, kept in matching order
group_labels = ["White", "Middle Eastern", "Asian", "Black", "Other",
                "South Asian", "Latino", "Hispanic", "Aboriginal"]
group_shares = [wp, mep, ap, bp, op, sap, lp, hp, abp]

# Single-row dataframe: one column per ethnicity
ethnic_yes_breakdown = pd.DataFrame([dict(zip(group_labels, group_shares))])

# Preview
ethnic_yes_breakdown

In [None]:
# Format the dataframe

# NOTE: this cell rebinds ethnic_yes_breakdown to its formatted (string) form,
# so re-run the cell that builds the numeric frame before running this again.
ethnic_yes_breakdown = (
    ethnic_yes_breakdown
    # Percentage format, applied element-wise one column at a time.
    # Series.map is used instead of DataFrame.applymap, which is deprecated
    # in pandas 2.1+; this form works on both older and newer versions.
    .apply(lambda column: column.map("{:,.2%}".format))
    # Rename the index
    .rename(index={0: 'Percentage with Autism'})
    # Transpose for easier reading
    .T
)
ethnic_yes_breakdown

# Observations
As found in 'Other_Stats', the percentage of participants with autism is ~32%, without ~68%.  What can we really tell from the ethnic data that we have?  Could the 'white' designation be an indicator?  We looked at that because it was the first ethnicity to appear in the feature selection, and it ranked higher than even sex or family history, which was surprising to us.  White might be more of a factor than other ethnicities because that group is larger than all the others in the sample.

This could be something to explore more in-depth.  Given the accuracy achieved with machine learning, how might ethnicity play a role?  It could be helpful to analyze a sample with more evenly distributed ethnic classifications.  Because autism is a global medical diagnosis, further ethnic analysis could help to hone ML detection globally.  Or, upon further analysis, we might find that ethnicity is of very small significance.