In [75]:
import requests
import pandas as pd
import altair as alt
import json

from sklearn.preprocessing import MultiLabelBinarizer

## Connecting to MongoDB and loading Underdog Devs data into dataframes

In [99]:
mentees = pd.DataFrame(requests.post("http://underdog-devs-ds-a-dev.us-east-1.elasticbeanstalk.com/Mentees/read").json()["result"])[['tech_stack']]
mentors = pd.DataFrame(requests.post("http://underdog-devs-ds-a-dev.us-east-1.elasticbeanstalk.com/Mentors/read").json()["result"])[['tech_stack']]

mentees['user_role'] = 'Mentee'
mentors['user_role'] = 'Mentor'

In [52]:
mentees.head()

Unnamed: 0,tech_stack,user_role
0,iOS,Mentee
1,Frontend,Mentee
2,Backend,Mentee
3,Android,Mentee
4,Career Development,Mentee


In [53]:
mentors.head()

Unnamed: 0,tech_stack,user_role
0,"[Design UI/UX, Android, Career Development]",Mentor
1,[Frontend],Mentor
2,"[Frontend, Android]",Mentor
3,"[Design UI/UX, Android, Frontend]",Mentor
4,[Design UI/UX],Mentor


## EDA and wrangling

### Removing placeholder observations

As the database is still populated with  mock data, there are some "string" placeholders that should be cleaned up

In [91]:
mentors = mentors[:20]

In [56]:
mentees["tech_stack"].value_counts()

Backend               21
Data Science          20
Career Development    18
iOS                   14
Frontend              11
string                11
Design UI/UX          10
Android                6
st4ing                 2
Name: tech_stack, dtype: int64

Filtering out where `mentees tech_stack` is either `"string"` or `"st4ing"`

In [100]:
mask = mentees["tech_stack"] != "st4ing"
mentees = mentees[mask]

In [101]:
mask = mentees["tech_stack"] != "string"
mentees = mentees[mask]

###  Checking data types in tech_stack feature



In [65]:
for i, l in enumerate(mentees['tech_stack'][:10]):
    print("list",i,"is",type(l))

list 0 is <class 'str'>
list 1 is <class 'str'>
list 2 is <class 'str'>
list 3 is <class 'str'>
list 4 is <class 'str'>
list 5 is <class 'str'>
list 6 is <class 'str'>
list 7 is <class 'str'>
list 8 is <class 'str'>
list 9 is <class 'str'>


In [66]:
for i, l in enumerate(mentors['tech_stack'][:10]):
    print("list",i,"is",type(l))

list 0 is <class 'list'>
list 1 is <class 'list'>
list 2 is <class 'list'>
list 3 is <class 'list'>
list 4 is <class 'list'>
list 5 is <class 'list'>
list 6 is <class 'list'>
list 7 is <class 'list'>
list 8 is <class 'list'>
list 9 is <class 'list'>


`mentees` contains strings, `mentors` contains lists. This will need to be resolved so that they are the same data type.

### Checking for null values

In [67]:
mentees.isnull().sum()

tech_stack    0
user_role     0
dtype: int64

In [68]:
mentors.isnull().sum()

tech_stack    0
user_role     0
dtype: int64

There are no null values in either data frame.

### Converting `mentee` tech_stack strings to lists

`mentee tech_stack` objects must be converted to lists in order to work with sklearn's `MultiLabelBinarizer`

In [102]:
def put_string_into_list(stack):
    return [stack]

stack = 'Lorem ipsum'
put_string_into_list(stack)

['Lorem ipsum']

In [103]:
mentees['tech_stack'] = mentees['tech_stack'].apply(put_string_into_list)

In [104]:
mentees.head()

Unnamed: 0,tech_stack,user_role
0,[iOS],Mentee
1,[Frontend],Mentee
2,[Backend],Mentee
3,[Android],Mentee
4,[Career Development],Mentee


Double checking `tech_stack` data types

In [72]:
for i, l in enumerate(mentees['tech_stack'][:10]):
    print("list",i,"is",type(l))

list 0 is <class 'list'>
list 1 is <class 'list'>
list 2 is <class 'list'>
list 3 is <class 'list'>
list 4 is <class 'list'>
list 5 is <class 'list'>
list 6 is <class 'list'>
list 7 is <class 'list'>
list 8 is <class 'list'>
list 9 is <class 'list'>


### Creating one-hot encoded Mentee and Mentor dataframes

##### Mentees data frame

In [105]:
mlb = MultiLabelBinarizer()
mentees_encoded = mentees.join(pd.DataFrame(mlb.fit_transform(
                                                mentees['tech_stack']),
                                                columns=mlb.classes_,
                                                index=mentees.index))

In [106]:
mentees_encoded.head()

Unnamed: 0,tech_stack,user_role,Android,Backend,Career Development,Data Science,Design UI/UX,Frontend,iOS
0,[iOS],Mentee,0,0,0,0,0,0,1
1,[Frontend],Mentee,0,0,0,0,0,1,0
2,[Backend],Mentee,0,1,0,0,0,0,0
3,[Android],Mentee,1,0,0,0,0,0,0
4,[Career Development],Mentee,0,0,1,0,0,0,0


Adjust `user_role` column title for better visualization display

In [107]:
mentees_encoded.rename(columns={"user_role": "User Role"}, inplace=True)

In [108]:
mentees_encoded.head()

Unnamed: 0,tech_stack,User Role,Android,Backend,Career Development,Data Science,Design UI/UX,Frontend,iOS
0,[iOS],Mentee,0,0,0,0,0,0,1
1,[Frontend],Mentee,0,0,0,0,0,1,0
2,[Backend],Mentee,0,1,0,0,0,0,0
3,[Android],Mentee,1,0,0,0,0,0,0
4,[Career Development],Mentee,0,0,1,0,0,0,0


##### Mentors data frame

In [94]:
mentors_encoded = mentors.join(pd.DataFrame(mlb.fit_transform(mentors['tech_stack']),
                          columns=mlb.classes_,
                          index=mentors.index))

In [96]:
mentors_encoded.head()

Unnamed: 0,tech_stack,user_role,Android,Backend,Career Development,Data Science,Design UI/UX,Frontend,iOS
0,"[Design UI/UX, Android, Career Development]",Mentor,1,0,1,0,1,0,0
1,[Frontend],Mentor,0,0,0,0,0,1,0
2,"[Frontend, Android]",Mentor,1,0,0,0,0,1,0
3,"[Design UI/UX, Android, Frontend]",Mentor,1,0,0,0,1,1,0
4,[Design UI/UX],Mentor,0,0,0,0,1,0,0


Adjust `user_role` column title for better visualization display

In [97]:
mentors_encoded.rename(columns={"user_role": "User Role"}, inplace=True)

In [109]:
mentors_encoded.head()

Unnamed: 0,tech_stack,User Role,Android,Backend,Career Development,Data Science,Design UI/UX,Frontend,iOS
0,"[Design UI/UX, Android, Career Development]",Mentor,1,0,1,0,1,0,0
1,[Frontend],Mentor,0,0,0,0,0,1,0
2,"[Frontend, Android]",Mentor,1,0,0,0,0,1,0
3,"[Design UI/UX, Android, Frontend]",Mentor,1,0,0,0,1,1,0
4,[Design UI/UX],Mentor,0,0,0,0,1,0,0


### Getting Sum Totals

In [110]:
mentees_encoded.sum()

tech_stack            [iOS, Frontend, Backend, Android, Career Devel...
User Role             MenteeMenteeMenteeMenteeMenteeMenteeMenteeMent...
Android                                                               6
Backend                                                              21
Career Development                                                   18
Data Science                                                         20
Design UI/UX                                                         10
Frontend                                                             11
iOS                                                                  14
dtype: object

In [111]:
mentors_encoded.sum()

tech_stack            [Design UI/UX, Android, Career Development, Fr...
User Role             MentorMentorMentorMentorMentorMentorMentorMent...
Android                                                               7
Backend                                                               2
Career Development                                                    5
Data Science                                                          6
Design UI/UX                                                          5
Frontend                                                              5
iOS                                                                   3
dtype: object

Create a data frame with Mentee sums

In [115]:
mentees_stack_sums = pd.DataFrame(mentees_encoded.sum())
mentees_stack_sums.reset_index(inplace=True)
mentees_stack_sums.rename(columns={"index": "Tech Stack", 0: "Count"}, inplace=True)
mentees_stack_sums.drop(index=[0, 1], inplace=True)
mentees_stack_sums["User Role"] = "Mentee"

mentees_stack_sums

Unnamed: 0,Tech Stack,Count,User Role
2,Android,6,Mentee
3,Backend,21,Mentee
4,Career Development,18,Mentee
5,Data Science,20,Mentee
6,Design UI/UX,10,Mentee
7,Frontend,11,Mentee
8,iOS,14,Mentee


Create a data frame with Mentor sums

In [114]:
mentors_stack_sums = pd.DataFrame(mentors_encoded.sum())
mentors_stack_sums.reset_index(inplace=True)
mentors_stack_sums.rename(columns={"index": "Tech Stack", 0: "Count"}, inplace=True)
mentors_stack_sums.drop(index=[0, 1], inplace=True)
mentors_stack_sums["User Role"] = "Mentor"

mentors_stack_sums

Unnamed: 0,Tech Stack,Count,User Role
2,Android,7,Mentor
3,Backend,2,Mentor
4,Career Development,5,Mentor
5,Data Science,6,Mentor
6,Design UI/UX,5,Mentor
7,Frontend,5,Mentor
8,iOS,3,Mentor


Create the final data frame to use for visualizations

In [116]:
tech_stack_sums = pd.concat([mentees_stack_sums, mentors_stack_sums], axis=0)
tech_stack_sums

Unnamed: 0,Tech Stack,Count,User Role
2,Android,6,Mentee
3,Backend,21,Mentee
4,Career Development,18,Mentee
5,Data Science,20,Mentee
6,Design UI/UX,10,Mentee
7,Frontend,11,Mentee
8,iOS,14,Mentee
2,Android,7,Mentor
3,Backend,2,Mentor
4,Career Development,5,Mentor


# Final Visualization

In [118]:
alt.Chart(tech_stack_sums, title="Tech Stack Counts by Role").mark_bar(size=30).encode(
    column=alt.Column('Tech Stack'),
    x=alt.X('User Role', axis=alt.Axis(labels=False, title=None)),
    y=alt.Y('Count', scale=alt.Scale(domain=(0, (tech_stack_sums["Count"].max() + 1)))),
    color=alt.Color('User Role', scale=alt.Scale(scheme="dark2")),
    tooltip=alt.Tooltip(list(tech_stack_sums.columns)[1:])).properties(width=100).configure_title(fontSize=18)