We chose Plotly to build our data visualizations and their interactive controls.

In [1]:
import pandas as pd
import plotly
import plotly.graph_objects as go
from plotly.subplots import make_subplots

Read in placement data.

In [2]:
# Load the placement data from 'cleaned_data.csv' (relative path, resolved against the working directory).
df = pd.read_csv('cleaned_data.csv')

Data manipulation/processing to be used for plots.

In [54]:
# Column that every aggregate in this cell is grouped by.
specialization_col = 'Specialization in Higher Secondary Education'

# numerical indicator for secondary specialization group
# (Arts -> 1, Commerce -> 2, any other value -> 3)
df['hses_code'] = df[specialization_col].map({'Arts': 1, 'Commerce': 2}).fillna(3).astype('int64')

# numerical indicator for gender ("M" -> 0, any other value -> 1)
df['g_code'] = df['Gender'].ne('M').astype('int64')

# count of students in secondary specialization groups
all_special_count = df.groupby(specialization_col).count()

# total salaries offered to students in specialization groups.
# numeric_only=True keeps the result restricted to numeric columns; without it,
# recent pandas versions no longer drop the string columns silently.
all_special_salary = df.groupby(specialization_col).sum(numeric_only=True)

# remove students not placed in jobs
df_placed = df.loc[df['Placement Status'] == 'Placed']

# group placed students by secondary specialization
group_by_special = df_placed.loc[
    :, [specialization_col, 'Gender', 'Degree Type', 'Work Experience', 'Salary']
].groupby(specialization_col)

# counts of placed students in secondary specialization groups
special_count = group_by_special.count()

# total salaries offered to placed students in secondary specialization groups
# (numeric columns only, for the same reason as above)
special_salary = group_by_special.sum(numeric_only=True)

all_special_salary.head()

Unnamed: 0_level_0,Secondary Education Percentage (10th Grade),Higher Secondary Education Percentage (12th Grade),Degree Percentage,MBA percentage,Salary,hses_code,g_code
Specialization in Higher Secondary Education,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
Arts,640.0,690.2,676.23,686.23,19738.07,11,6
Commerce,7487.78,7796.9,7481.92,7006.56,300383.68,226,40
Science,6342.45,5774.53,6111.44,5697.02,245045.64,273,30


First visuals displaying student grouping and salaries for those who received job offers in the secondary specialization groups.

In [61]:
# Compare the higher secondary specializations as donut charts:
# all students, total salary offered, and placed students.

# Take the slice labels from the grouped frames themselves so that labels and
# values always line up, even if a specialization is missing from one subset.
labels = all_special_count.index.tolist()
placed_labels = special_count.index.tolist()

# 2x2 grid of 'domain' (pie-compatible) cells; only three of them are filled.
fig = make_subplots(rows=2, cols=2,
                    specs=[[{'type': 'domain'}, {'type': 'domain'}],
                           [{'type': 'domain'}, {'type': 'domain'}]])
fig.add_trace(go.Pie(labels=labels,
                     values=all_special_count.loc[:, 'Gender'], name='All Students'), row=1, col=1)
fig.add_trace(go.Pie(labels=all_special_salary.index.tolist(),
                     values=all_special_salary.loc[:, 'Salary'], name='Salaries'), row=1, col=2)
fig.add_trace(go.Pie(labels=placed_labels,
                     values=special_count.loc[:, 'Gender'], name='Placed Students'), row=2, col=1)

# hole turns each pie into a donut
fig.update_traces(hole=.4, hoverinfo="label+percent+value+name")

# The annotations act as per-chart captions; x/y are hand-placed paper coordinates.
fig.update_layout(title_text='Higher Secondary Education Specialization', title_font_size=25,
                  annotations=[dict(text='All Students', x=0.16, y=1.1, font_size=15, showarrow=False),
                               dict(text='Salaries', x=0.82, y=1.1, font_size=15, showarrow=False),
                               dict(text='Placed Students', x=0.14, y=0.5, font_size=15, showarrow=False)
                               ])
fig.show()

The following cell creates dataframes holding, for each secondary specialization group, the counts of male students, female students, students with work experience, students without work experience, placed students, and unplaced students.

In [7]:
_specialization_col = 'Specialization in Higher Secondary Education'

# Every specialization present in the full data, in the sorted order groupby uses.
_all_specializations = pd.Index(sorted(df[_specialization_col].dropna().unique()),
                                name=_specialization_col)


def count_by_specialization(mask):
    """Count the non-null values per column for the rows of df selected by mask.

    Parameters
    ----------
    mask : boolean Series aligned with df
        True for the rows to include.

    Returns
    -------
    DataFrame indexed by specialization. Every specialization found in df gets
    a row; a group with no selected rows is reported as 0 rather than dropped,
    so results from different masks share the same index.
    """
    counts = df.loc[mask].groupby(_specialization_col).count()
    return counts.reindex(_all_specializations, fill_value=0)


# males in each of specialization groups
df_m = count_by_specialization(df['Gender'] == 'M')

# females in each of specialization groups
df_f = count_by_specialization(df['Gender'] == 'F')

# number with work experience in each group
df_y = count_by_specialization(df['Work Experience'] == 'Yes')

# number without work experience in each group
df_n = count_by_specialization(df['Work Experience'] == 'No')

# number with job offers in each group
df_p = count_by_specialization(df['Placement Status'] == 'Placed')

# number without job offers in each group
df_np = count_by_specialization(df['Placement Status'] == 'Not Placed')

Second visual displaying various categorical factors about the students that make up each secondary specialization group.

In [8]:
# Stacked bar chart of student counts per specialization. A dropdown switches
# the split between gender, work experience, and job offer.
# Any column works as the count source; 'Gender' is used throughout.
y_m = df_m['Gender']
y_f = df_f['Gender']
y_y = df_y['Gender']
y_n = df_n['Gender']
y_p = df_p['Gender']
y_np = df_np['Gender']

# (button label, per-trace y values, per-trace legend names, per-trace colours)
views = [
    ('Gender', [y_m, y_f], ['Male', 'Female'], ['darkblue', 'crimson']),
    ('Work Experience', [y_y, y_n], ['Yes', 'No'], ['seagreen', 'darkorange']),
    ('Job Offer', [y_p, y_np], ['Yes', 'No'], ['indigo', 'goldenrod']),
]

fig = go.Figure()
fig.add_trace(go.Bar(x=labels, y=y_m, name="Male", marker_color="darkblue", width=0.2))
fig.add_trace(go.Bar(x=labels, y=y_f, name="Female", marker_color="crimson", width=0.2))

# Each list in a button's args is applied trace by trace. Colours must be sent
# through the nested "marker.color" key for the bars to be recoloured when the
# view changes.
buttons = [
    dict(
        args=[{"y": ys, "name": names, "marker.color": colors}],
        label=label,
        method='update'
    )
    for label, ys, names, colors in views
]

fig.update_layout(barmode='stack',
                  title_text='Student Composition by Higher Secondary Specialization',
                  yaxis_title='Number of students',
                  xaxis=dict(tickvals=labels),
                  xaxis_tickangle=-45,
                  updatemenus=[dict(buttons=buttons,
                                    direction="down", active=0, showactive=True)])
fig.show()

Data formatting to differentiate between males and females and their categorical data in relation to salary.

In [9]:
# Split the students into two frames by the value of the Gender column.
females, males = (df[df['Gender'] == gender] for gender in ('F', 'M'))

 Third visual displaying male and female salaries in relation to categorical data. 
 
NOTE: Where the smaller crimson circles overlap the larger transparent gold circles, it can look as if there is a third group of small orange circles. This is not the case.

In [10]:
# (dropdown label, source column) for each categorical y-axis choice
y_choices = [
    ('MBA Specialization', 'MBA Specialization'),
    ('Work Experience', 'Work Experience'),
    ('Degree Type', 'Degree Type'),
    ('Specialization in Higher Sec. Edu.', 'Specialization in Higher Secondary Education'),
]

# Per-gender salary and categorical series, unpacked in the order of y_choices.
f_salary, m_salary = females['Salary'], males['Salary']
f_MBA, f_experience, f_degree, f_specialization = (females[col] for label, col in y_choices)
m_MBA, m_experience, m_degree, m_specialization = (males[col] for label, col in y_choices)

# Women: small crimson markers. Men: larger gold markers at half opacity.
# The initial y-axis shows MBA specialization.
fig = go.Figure(data=[
    go.Scatter(x=f_salary, y=f_MBA, mode="markers", name="Women",
               marker=dict(color="crimson", size=10)),
    go.Scatter(x=m_salary, y=m_MBA, mode="markers", name="Men", opacity=0.5,
               marker=dict(color="gold", size=15)),
])

# thin dark outline around every marker
fig.update_traces(selector=dict(mode='markers'),
                  marker=dict(line=dict(width=0.5, color='DarkSlateGrey')))

# One dropdown entry per categorical column; selecting it swaps both traces' y data.
buttons = [
    dict(label=label, method='update', args=[{"y": [females[col], males[col]]}])
    for label, col in y_choices
]

fig.update_layout(title="Gender Earnings",
                  xaxis_title="Annual Salary (in thousands)",
                  updatemenus=[dict(buttons=buttons,
                                    direction="down", active=0, showactive=True)])
fig.show()

Final visual displaying numerical/academic data for all students and salaries offered. Salaries of 0 represent that they were not offered a job. Colorscale values are mapped from the salaries offered.

In [39]:
# Parallel-coordinates plot of numeric/academic data for all students,
# with each line coloured by the salary offered.
salary = df['Salary']

# (axis label, source column) for the four percentage axes, which share one range
percent_axes = [
    ('Secondary Edu. %age', 'Secondary Education Percentage (10th Grade)'),
    ('Higher Secondary Edu. %age', 'Higher Secondary Education Percentage (12th Grade)'),
    ('Degree %age', 'Degree Percentage'),
    ('MBA Percentage', 'MBA percentage'),
]
dimensions = [dict(range=[35, 100], label=label, values=df[col])
              for label, col in percent_axes]

# Salary axis, with an initial brush selection from 1 up to the largest salary.
# NOTE(review): the axis range (2000-7000) and the colour limits below (0-7000)
# are hard-coded — confirm they cover the salaries in the data.
dimensions.append(dict(range=[2000, 7000],
                       constraintrange=[1, salary.max()],
                       label='Salary', values=salary))

line_style = dict(color=salary, colorscale='inferno', showscale=True, cmin=0, cmax=7000)

fig = go.Figure(data=go.Parcoords(line=line_style, dimensions=dimensions))
fig.update_layout(title_text='Student Education Percentage to Salary')
fig.show()