In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import matplotlib.patches as mpatches
#import squarify

### load and clean recent_grads and grad_students datasets

In [None]:
# recent_grads: load the CSV, drop the columns this notebook does not use,
# discard rows with no graduate count, and give the median a clearer name.
recent_grads = pd.read_csv('data/recent-grads.csv')

to_drop = ['Rank', 'Full_time_year_round', 'Unemployment_rate', 'ShareWomen', 'P25th', 'P75th',
           'Sample_size']
recent_grads = (
    recent_grads
    .drop(columns=to_drop)
    .dropna(subset=['Total'])
    .rename(columns={'Median': 'Median_income'})
)

# Major codes are identifiers, not quantities: keep them as strings.
recent_grads['Major_code'] = recent_grads['Major_code'].astype(str)

# Head counts are whole numbers. astype raises if any NaN is left in these
# columns, which makes incomplete rows visible instead of silently wrong.
recent_grads = recent_grads.astype({'Total': 'int64', 'Men': 'int64', 'Women': 'int64'})

# Convert major name to title case
recent_grads['Major'] = recent_grads['Major'].str.title()

# Get short names of majors
major_list = pd.read_csv('data/major_short_names.csv')
# No `on=` is given, so pandas joins on every column name the two frames share
# (presumably only 'Major' -- TODO confirm against major_short_names.csv).
# how='right' keeps every recent_grads row even when it has no short name.
recent_grads = pd.merge(major_list, recent_grads, how='right')
recent_grads = recent_grads.rename(columns={'Major': 'Major_long',
                                            'Major_short': 'Major',
                                            'Low_wage_jobs': 'income: low-wage'})

# Employed graduates who are not counted as holding a low-wage job.
recent_grads['income: other'] = recent_grads['Employed'] - recent_grads['income: low-wage']

In [None]:
# Preview the first rows of the cleaned recent_grads frame
recent_grads.head()

In [None]:
# grad_students: load the grad-students CSV
grad_students = pd.read_csv('data/grad-students.csv')

In [None]:
# List the available columns (the next cell drops a subset of them)
grad_students.columns

In [None]:
# Columns of grad_students that are removed before the analysis.
to_drop = [
    'Major_code',
    'Grad_sample_size',
    'Grad_P25', 'Grad_P75',
    'Nongrad_P25', 'Nongrad_P75',
    'Grad_share', 'Grad_premium',
    'Nongrad_full_time_year_round',
]
grad_students.drop(columns=to_drop, inplace=True)

# Displayed as the cell output: number of missing values left in each column.
grad_students.isna().sum()

In [None]:
# Rename columns to the names used in the rest of the notebook.
# NOTE(review): 'Nongrad_employed' becomes 'Grad_non_college_jobs'; the source
# name suggests employed people *without* a graduate degree, which the new name
# does not convey -- confirm this mapping is intended.
column_renames = {
    'Grad_full_time_year_round': 'Grad_full_time',
    'Nongrad_employed': 'Grad_non_college_jobs',
    'Grad_median': 'Grad_median_income',
}
grad_students.rename(columns=column_renames, inplace=True)

# Grad_full_time holds the full-time *year-round* count, so this difference is
# every employed graduate who is not in that group.
grad_students['Grad_part_time'] = grad_students['Grad_employed'].sub(grad_students['Grad_full_time'])

# Store the median income as a 64-bit integer.
grad_students['Grad_median_income'] = grad_students['Grad_median_income'].astype('int64')
grad_students.head()

### pivot tables for visualizations

#### recent_grads

In [None]:
# Per-major ratios. These are stored as plain fractions (not multiplied by 100).
employed = recent_grads['Employed']
labour_force = employed + recent_grads['Unemployed']

college = recent_grads['College_jobs']
non_college = recent_grads['Non_college_jobs']
low_wage = recent_grads['income: low-wage']
classified_jobs = college + non_college

recent_grads['Percent_employed'] = employed / labour_force
recent_grads['Percent_college_jobs'] = college / classified_jobs
recent_grads['Percent_non_college_jobs'] = non_college / classified_jobs

# Median income weighted by the number of full-time workers in the major.
recent_grads['Total_income'] = recent_grads['Median_income'] * recent_grads['Full_time']

# The denominator is college + non-college + low-wage jobs.
# NOTE(review): if low-wage jobs are already counted inside the other two
# groups, this denominator double-counts them -- confirm against the data source.
recent_grads['Percent_low_wage_jobs'] = low_wage / (classified_jobs + low_wage)
recent_grads.head()

In [None]:
# One row per major category. Head counts are summed; Median_income is the
# unweighted mean of the per-major medians within the category.
category_aggregations = {
    'Median_income': 'mean',
    'Total': 'sum', 'Men': 'sum', 'Women': 'sum',
    'Employed': 'sum', 'Unemployed': 'sum', 'Full_time': 'sum',
    'College_jobs': 'sum', 'Non_college_jobs': 'sum',
    'income: low-wage': 'sum',
}
df_cat = recent_grads.groupby('Major_category').agg(category_aggregations).reset_index()

# See note above about the approximation of median income by category
#df_cat['Median_income'] = round(df_cat['Median_income'] / df_cat['Full_time'])

# Shared denominators for the percentage columns below.
labour_force = df_cat['Employed'] + df_cat['Unemployed']
men_and_women = df_cat['Men'] + df_cat['Women']
classified_jobs = df_cat['College_jobs'] + df_cat['Non_college_jobs']

# Every Percent_* column in df_cat is on a 0-100 scale.
df_cat['Percent_employed'] = (df_cat['Employed'] / labour_force) * 100
df_cat['Percent_unemployed'] = (df_cat['Unemployed'] / labour_force) * 100
df_cat['Percent_men'] = (df_cat['Men'] / men_and_women) * 100
df_cat['Percent_women'] = (df_cat['Women'] / men_and_women) * 100
df_cat['Percent_college_jobs'] = (df_cat['College_jobs'] / classified_jobs) * 100
df_cat['Percent_non_college_jobs'] = (df_cat['Non_college_jobs'] / classified_jobs) * 100
# Scaled by 100 so it is on the same 0-100 scale as the other Percent_* columns.
df_cat['Percent_low_wage_jobs'] = (df_cat['income: low-wage']
                                   / (classified_jobs + df_cat['income: low-wage'])) * 100
df_cat.head()

### visualizations

<ol>
<li>gendered violin plot by income</li>
<li>layered bar chart income vs major category, legend: undergraduates, graduates</li> 
<li>dual-histograms for categories with large income change</li> 
<li>total students vs income (popularity/income correlation)</li>
<li>stand-out individual majors (outliers within categories)</li>
<li>income vs employment type (display risk/reward tradeoff)</li>
<li>category/major gender charts (distribution for categories)</li>
</ol>

In [None]:
# Income vs Popularity
# Scatter of mean median income against total graduates, one point per major category.
sns.set(style='whitegrid')
bar1 = sns.lmplot(y='Median_income',
                  x='Total',
                  data=df_cat,
                  fit_reg=False,  # scatter only, no regression line
                  height=5,
                  aspect=2
                  )


def label_point(variable, value, x_offset=0, y_offset=0, x_col='Total'):
    """Write `value` as a text label next to the matching point(s) of df_cat.

    Parameters
    ----------
    variable : str
        Column of df_cat used to find the row(s) to label.
    value : str
        Value to look for in `variable`; also the text that is drawn.
    x_offset, y_offset : number
        Shift of the label from the data point, in data units.
    x_col : str
        Column of df_cat plotted on the x axis (y is always 'Median_income').

    Raises
    ------
    ValueError
        If no row of df_cat has `variable == value`.
    """
    selected = df_cat.loc[df_cat[variable] == value, [x_col, 'Median_income']]
    if selected.empty:
        raise ValueError(f"label_point: no row in df_cat where {variable} == {value!r}")
    # Pass plain scalars to matplotlib rather than one-element Series.
    for x, y in selected.itertuples(index=False, name=None):
        plt.text(x + x_offset,
                 y + y_offset,
                 value,
                 fontdict=dict(color='black', alpha=0.5, size=16))


label_point('Major_category', 'Business')
label_point('Major_category', 'Engineering')

label_point('Major_category', 'Education', y_offset=+1000)
label_point('Major_category', 'Psychology & Social Work', y_offset=-3000)
label_point('Major_category', 'Humanities & Liberal Arts')

#plt.title('Income by Popularity', fontsize=16)
plt.ylabel("Income (USD)", fontsize=16)
plt.ylim(0, 65000)

plt.xlabel("Number of graduates", fontsize=16)
plt.xlim(0, 1400000)
plt.xticks([2e5, 4e5, 6e5, 8e5, 10e5, 12e5], ['200k', '400k', '600k', '800k', '1000k', '1200k'])
plt.tick_params(labelsize=16)
plt.show()

In [None]:

# Transpose to create two rows (one per gender) for each major category
df_cat_gender = pd.melt(df_cat.drop(columns='Total').rename(columns={'Major_category': 'Cat'}),
                        id_vars=['Cat', 'Median_income', 'Percent_women'], value_vars=['Men', 'Women'],
                        var_name='Gender', value_name='Total')
# Order categories from the lowest to the highest share of women.
df_cat_gender = df_cat_gender.sort_values(['Percent_women', 'Cat', 'Gender'],
                                          ascending=[True, True, True])

# Abbreviated category name
df_cat_gender['C'] = df_cat_gender['Cat'].str.upper().str[:8]

# Abbreviate 'Men' and 'Women'
# Vectorized and independent of the index labels; any other value is left unchanged.
df_cat_gender['Gender'] = df_cat_gender['Gender'].replace({'Men': 'M', 'Women': 'W'})

df_cat_gender.head()

In [None]:
# Array of pie charts, one chart per category
# Each pie shows percent of female vs male graduates
def pie(v, l, color=None):
    """Draw one pie on the current axes.

    `v` supplies the wedge sizes and `l` the wedge labels (its `.values` are
    used). `color` is accepted so the function can be used with
    FacetGrid.map, but it is not used.
    """
    wedge_labels = l.values
    plt.pie(x=v, labels=wedge_labels, startangle=90)


sns.set(font_scale=1.5)
# One facet per abbreviated category name, six facets per row.
g = sns.FacetGrid(data=df_cat_gender, col='C', col_wrap=6)
g.map(pie, 'Total', 'Gender')
g.set_axis_labels('', '')
plt.show()

In [None]:
# Income vs Percent of women
# Scatter of mean median income against the share of women, one point per major category.
sns.set(style='whitegrid')
bar1 = sns.lmplot(y='Median_income',
                  x='Percent_women',
                  data=df_cat,
                  fit_reg=False,  # scatter only, no regression line
                  height=5,
                  aspect=2
                  )


# NOTE: this redefines label_point from the income-vs-popularity cell; this
# version places labels on the Percent_women axis by default.
def label_point(variable, value, x_offset=0, y_offset=0, x_col='Percent_women'):
    """Write `value` as a text label next to the matching point(s) of df_cat.

    Parameters
    ----------
    variable : str
        Column of df_cat used to find the row(s) to label.
    value : str
        Value to look for in `variable`; also the text that is drawn.
    x_offset, y_offset : number
        Shift of the label from the data point, in data units.
    x_col : str
        Column of df_cat plotted on the x axis (y is always 'Median_income').

    Raises
    ------
    ValueError
        If no row of df_cat has `variable == value`.
    """
    selected = df_cat.loc[df_cat[variable] == value, [x_col, 'Median_income']]
    if selected.empty:
        raise ValueError(f"label_point: no row in df_cat where {variable} == {value!r}")
    # Pass plain scalars to matplotlib rather than one-element Series.
    for x, y in selected.itertuples(index=False, name=None):
        plt.text(x + x_offset,
                 y + y_offset,
                 value,
                 fontdict=dict(color='black', alpha=0.5, size=16))


label_point('Major_category', 'Engineering', x_offset=+0.5, y_offset=-500)
label_point('Major_category', 'Computers & Mathematics', x_offset=-10, y_offset=+500)
label_point('Major_category', 'Education', x_offset=+0.5, y_offset=-500)
label_point('Major_category', 'Psychology & Social Work', x_offset=-10, y_offset=-3000)


#plt.title('Income by % Women', fontsize=16)
plt.tick_params(labelsize=16)

plt.ylabel("Income (USD)", fontsize=16)
plt.ylim(0, 65000)

plt.xlabel("% Women", fontsize=16)
plt.xlim(20, 90)

plt.show()


In [None]:
# Income vs % with job requiring college
# Bubble chart: one bubble per major category, bubble area scaled by Total.
sns.set_theme(style="white")
# No `hue` variable is mapped, so no `palette` is passed (it would be ignored).
sns.relplot(x="Percent_college_jobs", y="Median_income", size="Total",
            sizes=(40, 400), alpha=.5,
            height=6, data=df_cat)