In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import matplotlib.patches as mpatches
#import squarify

### load and clean recent_grads and grad_students datasets

In [None]:
# recent_grads: load the raw table and keep only the columns this notebook uses
recent_grads = pd.read_csv('data/recent-grads.csv')

to_drop = ['Rank', 'Full_time_year_round', 'Unemployment_rate', 'ShareWomen', 'P25th', 'P75th',
           'Sample_size']
recent_grads = recent_grads.drop(columns=to_drop)
recent_grads['Major_code'] = recent_grads['Major_code'].astype(str)

# Rows without a Total cannot be cast to integers, so they are removed before the cast.
recent_grads = recent_grads.dropna(subset=['Total'])
recent_grads = recent_grads.astype({'Total': 'int64', 'Men': 'int64', 'Women': 'int64'})

recent_grads = recent_grads.rename(columns={'Median': 'Median_income'})

# Convert major name to title case
recent_grads['Major'] = recent_grads['Major'].str.title()

# Get short names of majors.
# No `on=` is given, so the join uses whichever columns the two frames have in common;
# how='right' keeps every recent_grads row even when no short name matches.
# NOTE(review): presumably major_short_names.csv holds 'Major' and 'Major_short' — confirm.
major_list = pd.read_csv('data/major_short_names.csv')
recent_grads = pd.merge(major_list, recent_grads, how='right')
recent_grads = recent_grads.rename(columns={'Major': 'Major_long',
                                            'Major_short': 'Major',
                                            'Low_wage_jobs': 'income: low-wage'})

# Employed graduates that are not counted in the low-wage column
recent_grads['income: other'] = recent_grads['Employed'] - recent_grads['income: low-wage']

In [None]:
# Preview the first rows of the cleaned recent_grads frame
recent_grads.head()

In [None]:
# grad_students: load the raw table from data/grad-students.csv
grad_students = pd.read_csv('data/grad-students.csv')

In [None]:
# List the column names of grad_students
grad_students.columns

In [None]:
# Remove the grad_students columns listed below, then count the missing values left in each column.
# The drop happens in place, so running this cell a second time without reloading
# grad_students fails because the labels are already gone.
to_drop = [
    'Major_code',
    'Grad_sample_size',
    'Grad_P25',
    'Grad_P75',
    'Nongrad_P25',
    'Nongrad_P75',
    'Grad_share',
    'Grad_premium',
    'Nongrad_full_time_year_round',
]
grad_students.drop(columns=to_drop, inplace=True)
grad_students.isna().sum()

In [None]:
# Rename the graduate columns to the names the later cells use.
# NOTE(review): 'Nongrad_employed' becomes 'Grad_non_college_jobs' — confirm the new name
# describes what the source column actually measures.
grad_column_names = {
    'Grad_full_time_year_round': 'Grad_full_time',
    'Nongrad_employed': 'Grad_non_college_jobs',
    'Grad_median': 'Grad_median_income',
}
grad_students.rename(columns=grad_column_names, inplace=True)

# Employed graduates that are not in the full-time column
grad_students['Grad_part_time'] = grad_students['Grad_employed'] - grad_students['Grad_full_time']

# Store the median income as 64-bit integers
grad_students['Grad_median_income'] = grad_students['Grad_median_income'].astype(np.int64)
grad_students.head()

### pivot tables for visualizations

#### recent_grads

In [None]:
# Per-major shares, stored as fractions (not multiplied by 100)
employed_or_unemployed = recent_grads['Employed'] + recent_grads['Unemployed']
classified_jobs = recent_grads['College_jobs'] + recent_grads['Non_college_jobs']

recent_grads['Percent_employed'] = recent_grads['Employed'] / employed_or_unemployed
recent_grads['Percent_college_jobs'] = recent_grads['College_jobs'] / classified_jobs
recent_grads['Percent_non_college_jobs'] = recent_grads['Non_college_jobs'] / classified_jobs

# Median income multiplied by the full-time head count of the major
recent_grads['Total_income'] = recent_grads['Median_income'] * recent_grads['Full_time']

# The low-wage count is added to the denominator on top of the college / non-college counts
recent_grads['Percent_low_wage_jobs'] = (
    recent_grads['income: low-wage'] / (classified_jobs + recent_grads['income: low-wage'])
)
recent_grads.head()

In [None]:
# Collapse the per-major rows into one row per major category.
# Head counts are summed; Median_income is the plain (unweighted) mean of the per-major medians.
category_aggregations = {
    'Median_income': 'mean',
    'Total': 'sum',
    'Men': 'sum',
    'Women': 'sum',
    'Employed': 'sum',
    'Unemployed': 'sum',
    'Full_time': 'sum',
    'College_jobs': 'sum',
    'Non_college_jobs': 'sum',
    'income: low-wage': 'sum',
}
df_cat = recent_grads.groupby('Major_category').agg(category_aggregations).reset_index()

# Denominators shared by the percentage columns below
employed_or_unemployed = df_cat['Employed'] + df_cat['Unemployed']
men_and_women = df_cat['Men'] + df_cat['Women']
classified_jobs = df_cat['College_jobs'] + df_cat['Non_college_jobs']

# Every Percent_* column of df_cat is on a 0-100 scale
df_cat['Percent_employed'] = df_cat['Employed'] / employed_or_unemployed * 100
df_cat['Percent_unemployed'] = df_cat['Unemployed'] / employed_or_unemployed * 100
df_cat['Percent_men'] = df_cat['Men'] / men_and_women * 100
df_cat['Percent_women'] = df_cat['Women'] / men_and_women * 100
df_cat['Percent_college_jobs'] = df_cat['College_jobs'] / classified_jobs * 100
df_cat['Percent_non_college_jobs'] = df_cat['Non_college_jobs'] / classified_jobs * 100
# Scaled by 100 like the other df_cat percentages (it was previously left as a fraction)
df_cat['Percent_low_wage_jobs'] = (
    df_cat['income: low-wage'] / (classified_jobs + df_cat['income: low-wage']) * 100
)
df_cat.head()

### visualizations

<ol>
<li>gendered violin plot by income</li>
<li>layered bar chart income vs major category, legend: undergraduates, graduates</li> 
<li>dual-histograms for categories with large income change</li> 
<li>total students vs income (popularity/income correlation)</li>
<li>stand-out individual majors (outliers within categories)</li>
<li>income vs employment type (display risk/reward tradeoff)</li>
<li>category/major gender charts (distribution for categories)</li>
</ol>

In [None]:
# Undergraduates: category income (mean of the per-major medians) against the number of
# graduates, one point per major category
sns.set(style='dark')
fig, ax = plt.subplots(figsize=(10, 6))
sns.scatterplot(data=df_cat, x='Total', y='Median_income', color='darkblue', ax=ax)
ax.set_title('Income by Popularity', fontsize=16)
ax.set_ylabel("Income (USD)")
ax.set_xlabel("Number of graduates")
plt.show()


In [None]:
# Category income against the share of women, one point per major category
sns.set(style='dark')
fig, ax = plt.subplots(figsize=(10, 6))
sns.scatterplot(data=df_cat, x='Percent_women', y='Median_income', color='darkblue', ax=ax)
ax.set_title('Income by % Women', fontsize=16)
ax.set_ylabel("Income (USD)")
ax.set_xlabel("% Women")
plt.show()

In [None]:
# Bubble chart per major category: share of women (x) against income (y), marker size scaled by Total.
# NOTE(review): no `hue` is passed, so palette="muted" likely has no effect — confirm.
sns.set_theme(style="white")
sns.relplot(x="Percent_women", y="Median_income", size="Total",
            sizes=(40, 400), alpha=.5, palette="muted",
            height=6, data=df_cat)

In [None]:
# List the column names of df_cat
df_cat.columns

In [None]:
# Income vs % with job requiring college
# Bubble chart per major category: share of jobs requiring a college degree (x) against income (y),
# marker size scaled by Total.
# NOTE(review): no `hue` is passed, so palette="muted" likely has no effect — confirm.
sns.set_theme(style="white")
sns.relplot(x="Percent_college_jobs", y="Median_income", size="Total",
            sizes=(40, 400), alpha=.5, palette="muted",
            height=6, data=df_cat)

In [None]:
# `incomes` is not created by any earlier cell, so it is built here to keep the notebook
# runnable from a fresh kernel: one row per major category with the undergraduate income,
# the graduate income, and the percent change between the two.
# NOTE(review): reconstructed from how the later cells use it (Major_category, Median_income,
# Grad_median_income, income_percent_change) — confirm the aggregation and the formula.
grad_cat = grad_students.groupby('Major_category', as_index=False)['Grad_median_income'].mean()
incomes = pd.merge(df_cat[['Major_category', 'Median_income']], grad_cat,
                   on='Major_category', how='inner')
incomes['income_percent_change'] = (
    (incomes['Grad_median_income'] - incomes['Median_income']) / incomes['Median_income'] * 100
)

# undergraduates
sorted_categories1 = incomes.sort_values(by=['Median_income'], ascending=False)
# graduates
sorted_categories2 = incomes.sort_values(by=['Grad_median_income'], ascending=False)
# percent income change
sorted_categories3 = incomes.sort_values(by=['income_percent_change'], ascending=False)


In [None]:
# Horizontal bars of undergraduate income per major category, in the order of sorted_categories1
sns.set(style='dark')
fig, ax = plt.subplots(figsize=(10, 6))
sns.barplot(data=sorted_categories1, x='Median_income', y='Major_category',
            color='darkblue', ax=ax)

# Gray reference line at the mean of Median_income over the rows of `incomes`
overall_mean = incomes.Median_income.mean()
ax.axvline(x=overall_mean, color='gray', lw=4)

ax.set_title('Income by Major Category', fontsize=16)
ax.set_ylabel(" ")
ax.set_xlabel("Mean Income (USD)")
plt.show()

In [None]:
# overlaid bar chart: graduate vs undergraduate income

def plot_incomes(sort):
    """Draw graduate and undergraduate income bars per major category.

    The light-blue graduate bars are drawn first and the dark-blue undergraduate
    bars are drawn over them on the same axes. Two vertical lines mark the means
    of the notebook-level `incomes` frame: gray for Median_income, red for
    Grad_median_income.

    Parameters
    ----------
    sort : DataFrame
        Frame with 'Major_category', 'Median_income' and 'Grad_median_income'
        columns; its row order is the order in which the bars are drawn.
    """
    sns.set(style='dark')
    fig, ax = plt.subplots(figsize=(10, 6))

    # Graduate bars first so the undergraduate bars end up on top
    layers = (('Grad_median_income', 'lightblue'), ('Median_income', 'darkblue'))
    for column, colour in layers:
        sns.barplot(data=sort, x=column, y='Major_category', color=colour, ax=ax)

    # Reference lines come from the global `incomes`, not from the `sort` argument
    ax.axvline(x=incomes.Median_income.mean(), color='gray', lw=4)
    ax.axvline(x=incomes.Grad_median_income.mean(), color='red', lw=4)

    # add legend
    legend_handles = [
        mpatches.Patch(color='darkblue', label='Undergraduates'),
        mpatches.Patch(color='lightblue', label='Graduates'),
    ]
    ax.legend(handles=legend_handles)

    ax.set_title('Income by Major Category', fontsize=16)
    ax.set_ylabel(" ")
    ax.set_xlabel("Mean Income (USD)")
    plt.show()

In [None]:
# One chart per ordering: by undergraduate income, by graduate income, by percent change
for ordering in (sorted_categories1, sorted_categories2, sorted_categories3):
    plot_incomes(ordering)

In [None]:
# Histogram of income_percent_change over the rows of `incomes`, with a KDE curve overlaid
sns.set(style='darkgrid')
# plt.figure(figsize=(12,12))

sns.histplot(x='income_percent_change', data=incomes, kde=True)
plt.show()

In [None]:
# Circular bar chart of income_percent_change, one bar per major category,
# in the order of sorted_categories3.

# initialize the figure
plt.figure(figsize=(6, 6))
ax = plt.subplot(111, polar=True)
plt.axis('off')

# Set the coordinates limits
upperLimit = 130
lowerLimit = 50

# Largest value in the dataset (named so that it does not shadow the builtin max())
max_change = sorted_categories3['income_percent_change'].max()

# Compute heights: a linear conversion of each value into the new coordinates.
# 0 in the dataset is converted to lowerLimit and the maximum to upperLimit.
slope = (upperLimit - lowerLimit) / max_change
heights = slope * sorted_categories3['income_percent_change'] + lowerLimit

# Compute the width of each bar. In total we have 2*pi = 360°
n_categories = len(sorted_categories3.Major_category)
width = 2 * np.pi / n_categories

# Compute the angle each bar is centered on
angles = [position * width for position in range(1, n_categories + 1)]

# One shade of blue per bar; the palette is reversed so the first bar gets the darkest shade
pal = sns.color_palette("Blues", len(sorted_categories3))

# Draw bars
bars = ax.bar(
    x=angles,
    height=heights,
    width=width,
    bottom=lowerLimit,
    linewidth=2,
    edgecolor="white",
    color=pal[::-1]
)

# little space between the bar and the label
labelPadding = 4

# Add labels
for bar, angle, label in zip(bars, angles, sorted_categories3.Major_category):

    # Labels are rotated. Rotation must be specified in degrees
    rotation = np.rad2deg(angle)

    # Flip the labels on the left half of the circle so they are not upside down
    if np.pi / 2 <= angle < 3 * np.pi / 2:
        alignment = "right"
        rotation = rotation + 180
    else:
        alignment = "left"

    # Place the label just beyond the outer end of its bar
    ax.text(
        x=angle,
        y=lowerLimit + bar.get_height() + labelPadding,
        s=label,
        ha=alignment,
        va='center',
        rotation=rotation,
        rotation_mode="anchor")

In [None]:
# Major categories to compare in the violin plots.
# NOTE(review): presumably the categories with the largest / smallest income_percent_change —
# the lists are hard-coded, so confirm they still match the data.
top_changed_cats = ['Biology & Life Science', 'Physical Sciences', 'Humanities & Liberal Arts']
bottom_changed_cats = ['Education', 'Arts', 'Engineering']

def get_violins(cats, wide=False):
    """Draw split violins of undergraduate vs graduate income for the given categories.

    Reads the notebook-level `recent_grads` and `grad_students` frames and leaves
    both untouched (the filtered copies are renamed, not the originals).

    Parameters
    ----------
    cats : list-like of str
        Major_category values to keep; rows are filtered with `isin`, so the order
        of `cats` does not affect the order on the x axis.
    wide : bool, default False
        Use an 18x12 figure instead of the default 12x10.
    """
    # rename() without inplace returns new frames, which avoids mutating a filtered
    # slice of the global frames (and the SettingWithCopyWarning that comes with it).
    undergrad = (
        recent_grads[recent_grads.Major_category.isin(cats)]
        .rename(columns={'Median_income': 'Undergraduate Income'})
    )
    grad = (
        grad_students[grad_students.Major_category.isin(cats)]
        .rename(columns={'Grad_median_income': 'Graduate Income'})
    )

    # NOTE(review): the join key is Major_category only, so each undergraduate row is
    # paired with each graduate row of the same category — confirm this is intended.
    merged = pd.merge(undergrad, grad, on='Major_category', how='inner')

    # Long form: one row per (category, group) income value
    data = pd.melt(merged, id_vars=['Major_category'],
                   value_vars=['Undergraduate Income', 'Graduate Income'],
                   var_name='Group', value_name='Income')

    sns.set(style='darkgrid')
    my_pal = {'Undergraduate Income': 'darkblue', 'Graduate Income': 'lightblue'}

    plt.figure(figsize=(18, 12) if wide else (12, 10))

    sns.violinplot(x='Major_category', y='Income', hue='Group',
                   data=data,
                   palette=my_pal,
                   split=True,
                   inner='quartile',
                   scale='count',  # scales density relative to counts across all bins
                   bw=.6  # amount of smoothing
                   )

    plt.xlabel(" ")
    plt.xticks(fontsize=16)
    plt.ylabel("Income (USD)", fontsize=14)
    plt.yticks(fontsize=16)
    plt.title("Income for Undergraduates/Graduates by Category", fontsize=18)

In [None]:
# Reshape recent_grads to long form: the Men and Women columns become rows, with the
# column name in 'Gender' and the head count in 'Total'.
# Median_income is an id column, so the Men row and the Women row of a major carry the same income.
data = pd.melt(recent_grads, id_vars=['Major_category', 'Major', 'Median_income'], value_vars=['Men', 'Women'],
               var_name='Gender', value_name='Total')
data.tail()

In [None]:
# Split violins of Median_income per major category, one half for Men and one for Women.
# NOTE(review): each major contributes one Men row and one Women row with the same
# Median_income, and the 'Total' head counts are not used as weights, so the two halves
# are likely identical — confirm whether a count-weighted view is wanted.
sns.set(style='darkgrid')
my_pal = {'Men': 'darkblue', 'Women': 'lightblue'}

plt.figure(figsize=(12, 10))

cat1 = sns.violinplot(x='Major_category', y='Median_income', hue='Gender',
                      data=data,
                      palette=my_pal,
                      split=True,
                      inner='quartile',
                      scale='count',  # scales density relative to counts across all bins
                      bw=.6  # amount of smoothing
                      )

plt.xlabel(" ")
plt.xticks(fontsize=16)
plt.ylabel("Income (USD)", fontsize=14)
plt.yticks(fontsize=16)
# The hue is Gender, so the title names Men/Women rather than Undergraduates/Graduates
plt.title("Income for Men/Women by Category", fontsize=18)

In [None]:
# Violin plot restricted to the categories listed in top_changed_cats
get_violins(top_changed_cats)
#get_violins(bottom_changed_cats)

In [None]:
# `sorted_categories` is not defined by any cell above (only sorted_categories1/2/3 are).
# get_violins filters with isin(), so the order of the names does not change the plot;
# every category name from df_cat is passed to draw all categories on the wide figure.
# NOTE(review): confirm that "all categories" is what sorted_categories was meant to hold.
get_violins(df_cat['Major_category'].tolist(), wide=True)
plt.xticks([])