# Note: the first part of the Workhours section below was used in our group's presentation

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import scipy.stats as stats
from scipy.stats import linregress


In [None]:
# Load adult.csv into a DataFrame; the trailing bare expression makes the
# notebook echo its (rows, columns) shape.
raw_df = pd.read_csv(filepath_or_buffer="adult.csv")
raw_df.shape

In [None]:
# Preview the first five rows of the raw data.
raw_df.head(n=5)

In [None]:
# Print every column label on its own line (iterating a DataFrame yields
# its column labels).
for column_name in raw_df:
    print(column_name)

In [None]:
# Swap the "?" placeholder for the literal string "NA" in every column.
# NOTE(review): "NA" is an ordinary string, not a missing value, so it still
# shows up as its own category in value_counts() — confirm that is intended.
replaced_df = raw_df.replace({"?": "NA"})
replaced_df.head(n=5)

In [None]:
# Show the distinct values of every non-numeric column, separated by a
# blank line.
non_numeric = replaced_df.select_dtypes(exclude=np.number)
for column_name in non_numeric.columns:
    distinct_values = replaced_df[column_name].unique()
    print(column_name, ':', distinct_values, end='\n\n')


# Over and under 50k dataframes

In [None]:
# Rows whose income label is ">50K"; the shape is echoed to show the group size.
is_over_50k = replaced_df["income"].eq(">50K")
over_50k_df = replaced_df.loc[is_over_50k]
over_50k_df.shape

In [None]:
# Rows whose income label is "<=50K"; the shape is echoed to show the group size.
is_under_50k = replaced_df["income"].eq("<=50K")
under_50k_df = replaced_df.loc[is_under_50k]
under_50k_df.shape

# Gender

In [None]:
# Count of each "sex" value among the ">50K" rows.
gender_over_50k = replaced_df.loc[replaced_df["income"] == ">50K", "sex"].value_counts()

In [None]:
# Pie chart of the sex split among ">50K" earners, with percentages shown
# to one decimal place and the y-axis label cleared.
pie_axes = gender_over_50k.plot.pie(autopct="%1.1f%%")
pie_axes.set_title("Over 50k Income by Gender")
pie_axes.set_ylabel("")
plt.show()

In [None]:
# Count of each "sex" value among the "<=50K" rows.
gender_under_50k = under_50k_df["sex"].value_counts()

In [None]:
# Pie chart of the sex split among "<=50K" earners, with percentages shown
# to one decimal place and the y-axis label cleared.
pie_axes = gender_under_50k.plot.pie(autopct="%1.1f%%")
pie_axes.set_title("Under 50k Income by Gender")
pie_axes.set_ylabel("")
plt.show()

# Education

In [None]:
# The "education.num" column for each income group, taken from the
# per-group frames built earlier.
education_over_50k = over_50k_df["education.num"]
education_under_50k = under_50k_df["education.num"]

## With outliers

In [None]:
# Side-by-side box plots of education.num for the two income groups
# (outliers still included).
graph = [education_over_50k, education_under_50k]
labels = ["Over 50k income", "Under 50k income"]
fig1, ax1 = plt.subplots()
ax1.set_title("Educational attainment for over and under 50k income")
ax1.set_ylabel("Educational attainment")
# boxplot's ``labels=`` keyword is deprecated since Matplotlib 3.9 (renamed
# ``tick_labels``). Labelling the ticks after drawing works on both old and
# new releases without a deprecation warning.
ax1.boxplot(graph)
ax1.set_xticklabels(labels)
plt.show()

In [None]:
# Welch's t-test (unequal variances) on education.num between the two
# income groups.
stats.ttest_ind(a=education_over_50k, b=education_under_50k, equal_var=False)

## Outliers removed

In [None]:
# 1.5 * IQR fences for education.num in the ">50K" group.
quartiles = education_over_50k.quantile(q=[0.25, 0.5, 0.75])
lowerq, upperq = quartiles.loc[0.25], quartiles.loc[0.75]
iqr = upperq - lowerq
lower_bound = lowerq - 1.5 * iqr
upper_bound = upperq + 1.5 * iqr
print(
    f"Education values below {round(lower_bound, 2)} and above "
    f"{round(upper_bound, 2)} could be outliers."
)

In [None]:
# Index labels of ">50K" rows whose education.num is below the lower fence.
below_fence = over_50k_df["education.num"] < lower_bound
outliers_below = over_50k_df.loc[below_fence].index.tolist()
print(outliers_below)

In [None]:
# Remove the low-side outlier rows from the ">50K" frame and echo the new shape.
over_50k_clean_df = over_50k_df.drop(index=outliers_below)
over_50k_clean_df.shape

In [None]:
# Index labels of the remaining ">50K" rows whose education.num is above
# the upper fence.
outliers_above = over_50k_clean_df[over_50k_clean_df["education.num"] > upper_bound].index.tolist()
print(outliers_above)
# Drop them as well so the ">50K" frame is cleaned on both sides, exactly
# like the "<=50K" frame. Previously these rows were listed but never
# removed, so any high-side outlier would have stayed in the "clean" data.
over_50k_clean_df = over_50k_clean_df.drop(outliers_above)
over_50k_clean_df.shape

In [None]:
# 1.5 * IQR fences for education.num in the "<=50K" group.
quartiles1 = education_under_50k.quantile(q=[0.25, 0.5, 0.75])
lowerq1, upperq1 = quartiles1.loc[0.25], quartiles1.loc[0.75]
iqr1 = upperq1 - lowerq1
lower_bound1 = lowerq1 - 1.5 * iqr1
upper_bound1 = upperq1 + 1.5 * iqr1
print(
    f"Education values below {round(lower_bound1, 2)} and above "
    f"{round(upper_bound1, 2)} could be outliers."
)

In [None]:
# Index labels of "<=50K" rows whose education.num is below the lower fence.
below_fence1 = under_50k_df["education.num"] < lower_bound1
outliers_below1 = under_50k_df.loc[below_fence1].index.tolist()
print(outliers_below1)

In [None]:
# Remove the low-side outlier rows from the "<=50K" frame and echo the new shape.
under_50k_clean_df = under_50k_df.drop(index=outliers_below1)
under_50k_clean_df.shape

In [None]:
# Index labels of the remaining "<=50K" rows whose education.num is above
# the upper fence.
above_fence1 = under_50k_clean_df["education.num"] > upper_bound1
outliers_above1 = under_50k_clean_df.loc[above_fence1].index.tolist()
print(outliers_above1)

In [None]:
# Remove the high-side outlier rows as well and echo the final shape.
under_50k_clean_df = under_50k_clean_df.drop(index=outliers_above1)
under_50k_clean_df.shape

In [None]:
# The "education.num" column of each outlier-filtered frame.
education_over_50k_clean, education_under_50k_clean = (
    over_50k_clean_df["education.num"],
    under_50k_clean_df["education.num"],
)

In [None]:
# Side-by-side box plots of education.num for the two income groups after
# outlier removal.
graph = [education_over_50k_clean, education_under_50k_clean]
labels = ["Over 50k income", "Under 50k income"]
fig1, ax1 = plt.subplots()
ax1.set_title("Educational attainment for over and under 50k income")
ax1.set_ylabel("Educational attainment")
# boxplot's ``labels=`` keyword is deprecated since Matplotlib 3.9 (renamed
# ``tick_labels``). Labelling the ticks after drawing works on both old and
# new releases without a deprecation warning.
ax1.boxplot(graph)
ax1.set_xticklabels(labels)
plt.show()

In [None]:
# Welch's t-test (unequal variances) on the outlier-filtered education.num
# series.
stats.ttest_ind(
    a=education_over_50k_clean,
    b=education_under_50k_clean,
    equal_var=False,
)

# Here is the box plot I used in the group presentation

# Workhours

In [None]:
# The "hours.per.week" column for each income group, selected with one
# .loc call per group.
workhours_over_50k = replaced_df.loc[replaced_df["income"] == ">50K", "hours.per.week"]
workhours_under_50k = replaced_df.loc[replaced_df["income"] == "<=50K", "hours.per.week"]

In [None]:
# Side-by-side box plots of hours.per.week for the two income groups.
graph = [workhours_over_50k, workhours_under_50k]
labels = ["Over 50k income", "Under 50k income"]
fig1, ax1 = plt.subplots()
ax1.set_title("Hours worked per week for over and under 50k income")
ax1.set_ylabel("Hours worked per week")
# boxplot's ``labels=`` keyword is deprecated since Matplotlib 3.9 (renamed
# ``tick_labels``). Labelling the ticks after drawing works on both old and
# new releases without a deprecation warning.
ax1.boxplot(graph)
ax1.set_xticklabels(labels)
plt.show()

In [None]:
# Welch's t-test (unequal variances) on hours.per.week between the two
# income groups.
stats.ttest_ind(a=workhours_over_50k, b=workhours_under_50k, equal_var=False)

# Age

In [None]:
# Bucket ages using the edges 0, 20, 30, ..., 100 and store the result in a
# new "age bins" column.
# NOTE(review): no cell visible here reads "age bins", and the per-income
# frames were sliced before this column was added — confirm it is still needed.
age_bin_edges = [0, *range(20, 101, 10)]
replaced_df["age bins"] = pd.cut(replaced_df["age"], bins=age_bin_edges)

In [None]:
# The "age" column for each income group.
age_over_50k = replaced_df.loc[replaced_df["income"] == ">50K", "age"]
age_under_50k = replaced_df.loc[replaced_df["income"] == "<=50K", "age"]

In [None]:
# Side-by-side box plots of age; the "<=50K" group is drawn first here.
graph = [age_under_50k, age_over_50k]
labels = ["Under 50k income", "Over 50k income"]
fig1, ax1 = plt.subplots()
ax1.set_title("Age for under and over 50k income")
ax1.set_ylabel("Age")
# boxplot's ``labels=`` keyword is deprecated since Matplotlib 3.9 (renamed
# ``tick_labels``). Labelling the ticks after drawing works on both old and
# new releases without a deprecation warning.
ax1.boxplot(graph)
ax1.set_xticklabels(labels)
plt.show()

In [None]:
# Welch's t-test (unequal variances) on age between the two income groups.
stats.ttest_ind(a=age_over_50k, b=age_under_50k, equal_var=False)

# Age and Education correlation

In [None]:
# Age of every row (both income groups together).
age = replaced_df.loc[:, "age"]

In [None]:
# education.num of every row (both income groups together).
education = replaced_df.loc[:, "education.num"]

In [None]:
# Least-squares fit of education.num on age, drawn over a scatter plot of
# the raw points.
x_values, y_values = age, education
slope, intercept, rvalue, pvalue, stderr = linregress(x_values, y_values)

# Fitted value for every observed age, plus the equation text for the plot.
regress_values = slope * x_values + intercept
line_eq = f"y = {round(slope, 2)!s}x + {round(intercept, 2)!s}"

r_squared = rvalue**2
print(f"The r-squared is: {r_squared}")

plt.scatter(x_values, y_values)
plt.plot(x_values, regress_values, "r-")
# The equation is placed at data coordinates (70, 2).
plt.annotate(line_eq, (70, 2), fontsize=15, color="red")
plt.title("Age vs. Educational Attainment")
plt.xlabel("Age")
plt.ylabel("Educational Attainment")
plt.show()

# Employment by workclass

In [None]:
# Count of each "workclass" value among the ">50K" rows.
workclass_over_50k = replaced_df.loc[replaced_df["income"] == ">50K", "workclass"].value_counts()

In [None]:
# Pie chart of the workclass split among ">50K" earners, with percentages
# shown to one decimal place and the y-axis label cleared.
pie_axes = workclass_over_50k.plot.pie(autopct="%1.1f%%")
pie_axes.set_title("Over 50k Income by Workclass")
pie_axes.set_ylabel("")
plt.show()

In [None]:
# Count of each "workclass" value among the "<=50K" rows.
workclass_under_50k = replaced_df.loc[replaced_df["income"] == "<=50K", "workclass"].value_counts()

In [None]:
# Pie chart of the workclass split among "<=50K" earners, with percentages
# shown to one decimal place and the y-axis label cleared.
pie_axes = workclass_under_50k.plot.pie(autopct="%1.1f%%")
pie_axes.set_title("Under 50k Income by Workclass")
pie_axes.set_ylabel("")
plt.show()

In [None]:
# Count of each income label among rows whose workclass is "Private".
is_private = replaced_df["workclass"] == "Private"
private_income = replaced_df.loc[is_private, "income"].value_counts()

# Income by workclass

## Private sector income

In [None]:
# Pie chart of the income split within the "Private" workclass, with
# percentages shown to one decimal place and the y-axis label cleared.
pie_axes = private_income.plot.pie(autopct="%1.1f%%")
pie_axes.set_title("Private workclass income")
pie_axes.set_ylabel("")
plt.show()

## Local gov income

In [None]:
# Count of each income label among rows whose workclass is "Local-gov".
is_local_gov = replaced_df["workclass"] == "Local-gov"
local_gov_income = replaced_df.loc[is_local_gov, "income"].value_counts()

In [None]:
# Pie chart of the income split within the "Local-gov" workclass, with
# percentages shown to one decimal place and the y-axis label cleared.
pie_axes = local_gov_income.plot.pie(autopct="%1.1f%%")
pie_axes.set_title("Local Gov workclass income")
pie_axes.set_ylabel("")
plt.show()