# What is the average PhD stipend at Oxford University?

*Step 1: Import Python packages and define helper functions*

In [1]:
import numpy as np
import pandas as pd
import plotly.express as px

def return_avg_result(dataframe, university, department, year, measurement):
    """Summarise one measurement for a university and for one of its departments.

    Rows are first restricted to those whose 'Academic Year' equals `year`
    and whose 'University' equals `university`. The department cohort is
    the subset of those rows whose 'Department' equals `department`.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Must contain the columns 'Academic Year', 'University',
        'Department' and the column named by `measurement`.
    university, department, year
        Values matched exactly against the corresponding columns.
    measurement : str
        Name of the column to summarise.

    Returns
    -------
    pandas.DataFrame
        Two rows, indexed '<university> All Departments <year>' and
        '<university> <department> Department <year>', with float columns
        'mean', 'std' and 'count'. 'count' is the number of rows in the
        cohort (rows with a missing measurement are included in it).
    """
    in_year = dataframe['Academic Year'].isin([year])
    at_university = dataframe['University'].isin([university])
    in_department = dataframe['Department'].isin([department])

    university_values = dataframe.loc[in_year & at_university, measurement]
    department_values = dataframe.loc[in_year & at_university & in_department, measurement]

    # Labels use the `year` argument itself, so the function does not depend
    # on any notebook-level variable having been defined beforehand.
    labels = [
        f'{university} All Departments {year}',
        f'{university} {department} Department {year}',
    ]
    cohorts = [university_values, department_values]

    # Build the whole summary in one go instead of growing an empty frame
    # one cell at a time.
    return pd.DataFrame(
        {
            'mean': [values.mean() for values in cohorts],
            'std': [values.std() for values in cohorts],
            'count': [values.shape[0] for values in cohorts],
        },
        index=labels,
        dtype=float,
    )

def return_popular_universities(dataframe,number_of_values):
    """Return the universities with the most records.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Must contain a 'University' column.
    number_of_values : int
        How many of the most frequent universities to keep.

    Returns
    -------
    pandas.DataFrame
        One column of record counts indexed by university, ordered from
        most to least frequent, with at most `number_of_values` rows.
    """
    record_counts = dataframe['University'].value_counts()
    # Start at position 0 so the most frequent university is included and
    # exactly `number_of_values` rows come back (the previous slice began at
    # position 1, dropping the top entry and returning one row too few).
    return pd.DataFrame(record_counts.iloc[:number_of_values])

def return_popular_departments(dataframe,university,number_of_values):
    """Return the departments with the most records at one university.

    Only rows whose 'University' equals `university` are counted. The
    result is a one-column DataFrame of record counts indexed by
    department, ordered from most to least frequent and limited to the
    first `number_of_values` entries.
    """
    at_university = dataframe['University'].isin([university])
    department_counts = dataframe.loc[at_university, 'Department'].value_counts()
    top_departments = department_counts[:number_of_values]
    return pd.DataFrame(top_departments)

*Step 2: Load and preview the data*

In [2]:
PHD_STIPENDS = pd.read_csv('/kaggle/input/phd-stipends/csv') # load the data

# Strip currency formatting from 'Overall Pay' while keeping digits, the
# decimal point and the minus sign, so a negative or fractional amount is not
# silently turned into a different number. The cleaned column is assigned
# back explicitly: calling replace(..., inplace=True) on a selected column is
# chained assignment and does not update the frame under pandas Copy-on-Write.
overall_pay_text = PHD_STIPENDS['Overall Pay'].astype(str).str.replace(r'[^\d.\-]', '', regex=True)

# Anything that still cannot be parsed becomes NaN and is then recorded as 0.
# NOTE(review): missing pay stored as 0 lowers any mean computed from this
# column later on -- confirm that is the intended treatment.
PHD_STIPENDS['Overall Pay'] = pd.to_numeric(overall_pay_text, errors='coerce').fillna(0)

PHD_STIPENDS[['University','Department','Overall Pay','LW Ratio']].head(10) # preview the data

Unnamed: 0,University,Department,Overall Pay,LW Ratio
0,"City, University of London",Mathematics,16000.0,
1,University of Warwick (UK),Chemistry,19685.0,
2,University of Oxford,Chemistry,19700.0,
3,"Birkbeck, University of London",Biological sciences,20000.0,
4,Ohio State University (OSU),Electrical and Computer Engineering,23972.0,1.19
5,University of Birmingham,PhD metallurgy and Materials,19721.0,
6,Laval university,Forestry,18000.0,
7,Massachusetts Institute of Technology (MIT),Urban Studies and Planning,24500.0,0.88
8,University of Tokyo,Computer Science,36000.0,
9,University College London,Physics,22336.0,


In [3]:
# Record counts for the most common universities, reshaped into the two
# named columns the bar chart reads from.
df = return_popular_universities(PHD_STIPENDS,number_of_values = 10).reset_index(level=0)
df.columns=['University','Number of Records']

fig = px.bar(df, x='University', y="Number of Records",title='Number of Records Per University (Top 10)')
fig.update_layout(xaxis_title='University',yaxis_title='Number of Records',legend_orientation="h",showlegend=True)

# Keep the familiar 0-140 axis, but let it grow (with 5% headroom) whenever a
# bar would otherwise be cut off by the fixed upper limit.
y_axis_max = max(140, df['Number of Records'].max() * 1.05)
fig.update_yaxes(range=[0,y_axis_max])
fig.show()

*Step 3: Visualize the data for Oxford University only*

In [4]:
# University whose departments are charted in this cell.
UNIVERSITY = 'University of Oxford'

# Record counts per department, reshaped into the two named columns the bar
# chart reads from.
df = return_popular_departments(PHD_STIPENDS, UNIVERSITY, number_of_values=10).reset_index(level=0)
df.columns = ['Department', 'Number of Records']

fig = px.bar(
    df,
    x='Department',
    y="Number of Records",
    title='Number of Records Per Department at '+UNIVERSITY+' (Top 10)',
)
fig.update_layout(
    xaxis_title='Department',
    yaxis_title='Number of Records',
    legend_orientation="h",
    showlegend=True,
)
fig.update_yaxes(range=[0,2])
fig.show()

In [5]:
# Cohort selection for the summary charted in this cell.
UNIVERSITY = 'University of Oxford'
YEAR = '2019-2020'
MEASUREMENT = 'Overall Pay'
DEPARTMENT = 'Chemistry'

# One row per cohort; the row labels become the 'Cohort' column and the
# three statistics are renamed for plotting.
df = return_avg_result(PHD_STIPENDS, UNIVERSITY, DEPARTMENT, YEAR, MEASUREMENT).reset_index(level=0)
df.columns = ['Cohort', 'Avg', 'Std', 'n']

fig = px.bar(
    df,
    x='Cohort',
    y="Avg",
    error_y="Std",
    title='Average Overall Pay at '+UNIVERSITY,
)
fig.update_layout(
    xaxis_title='Cohort',
    yaxis_title='Average Overall Pay',
    legend_orientation="h",
    showlegend=True,
)
fig.update_yaxes(range=[0,25000])
fig.show()