# Gender decoder based on use of words
This algorithm focuses on the words that occurred most often among male and female applicants. <br>
These words are then used to predict a person's gender.

## Preprocessing data

In [2]:
import pandas as pd
import re 
import math
# Load the cleaned resume data and preview the first five rows
df = pd.read_csv('resumesCleaned.csv', sep=',')
df.head()

Unnamed: 0,name,gender,skills,experience,education,personal_qualities,femaleConfirmed,maleConfirmed
0,Amy,0,trained in accounting softwares aexeo agresso...,citco fund services singapore pte ltd jan pre...,royal melbourne institute of technology rmit j...,fund accountant with nearly years of experien...,0,0
1,Jonas,0,advanced as excel microsoft office outlook pow...,apr present year fund accountant citco fund ...,university of london bachelor s major banking ...,other information resourceful professional equ...,0,0
2,Lester,1,fluent english mandarin cantonese and hokkien,marketing client service executive aberdeen as...,recipient of the durham postgraduate award re...,market sector leader of the durham university ...,0,0
3,Magdalena,0,advanced compliance financial accounting,finance manager aberdeen asset management asia...,nanyang technological university bachelor s ma...,responsibilities specialised in performing ann...,0,0
4,Esther,0,advanced ifca investran mri ms office oracle o...,experience feb present years months senior ...,oxford brookes university bachelor s major b...,other information since graduation i have accu...,0,0


In [2]:
# Rename by column name instead of assigning a positional list: a positional
# assignment silently mislabels the data if the CSV column order ever changes.
# Rebinding via rename() is also a no-op when the cell is run a second time.
column_renames = {
    'name': 'person_name',
    'gender': 'person_gender',
    'skills': 'person_skills',
    'experience': 'person_experience',
    'education': 'person_education',
    'personal_qualities': 'person_qualities',
}
df = df.rename(columns=column_renames)

# Fail loudly if the CSV did not contain the columns the rest of the notebook relies on
missing_columns = set(column_renames.values()) - set(df.columns)
assert not missing_columns, 'columns missing after rename: {}'.format(sorted(missing_columns))

In [3]:
# Turn every distinct word of the qualities text into its own 0/1 indicator
# column (1 = the word occurs in that row) so word usage can be counted later
qualities_words = df['person_qualities'].str.get_dummies(sep=' ')
df = pd.concat([df, qualities_words], axis=1)
df

Unnamed: 0,person_name,person_gender,person_skills,person_experience,person_education,person_qualities,femaleConfirmed,maleConfirmed,a,aberdeen,...,yours,youth,yoy,yusoff,yusri,zeal,zealand,zest,zone,zones
0,Amy,0,trained in accounting softwares aexeo agresso...,citco fund services singapore pte ltd jan pre...,royal melbourne institute of technology rmit j...,fund accountant with nearly years of experien...,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
1,Jonas,0,advanced as excel microsoft office outlook pow...,apr present year fund accountant citco fund ...,university of london bachelor s major banking ...,other information resourceful professional equ...,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
2,Lester,1,fluent english mandarin cantonese and hokkien,marketing client service executive aberdeen as...,recipient of the durham postgraduate award re...,market sector leader of the durham university ...,0,0,1,1,...,0,0,0,0,0,0,0,0,0,0
3,Magdalena,0,advanced compliance financial accounting,finance manager aberdeen asset management asia...,nanyang technological university bachelor s ma...,responsibilities specialised in performing ann...,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
4,Esther,0,advanced ifca investran mri ms office oracle o...,experience feb present years months senior ...,oxford brookes university bachelor s major b...,other information since graduation i have accu...,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
234,Kelly,0,ms office accounting system oracle peoplesoft ...,oct present mapletree commercial trust manage...,year jun sep institution sheffield hallam un...,able to work independently as an individual a...,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
235,Winnie,0,singapore financial reporting standards malay...,designation accountant july present organisa...,professional qualification association of char...,an ambitious and enthusiastic chartered accoun...,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
236,Monna Liza,0,microsoft office word advanced excel power po...,mc corporate service pte ltd coleman st the ...,bachelor of science in accountancy university...,excellent team building leadership communicat...,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
237,Alvin,1,able to converse well in english mandarin and...,company mdr limited listed in singapore stock ...,i was my school representative for badminton c...,a competent professional who is capable of und...,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0


In [4]:
# Same treatment for the skills text: one 0/1 indicator column per distinct word.
# A word that also occurs in the qualities text ends up with two columns that
# share one label; these are added together when the feature matrix is built.
skills_words = df['person_skills'].str.get_dummies(sep=' ')
df = pd.concat([df, skills_words], axis=1)
df

Unnamed: 0,person_name,person_gender,person_skills,person_experience,person_education,person_qualities,femaleConfirmed,maleConfirmed,a,aberdeen,...,xero,xp,xx,yahoo,yardi,year,years,york,your,youth
0,Amy,0,trained in accounting softwares aexeo agresso...,citco fund services singapore pte ltd jan pre...,royal melbourne institute of technology rmit j...,fund accountant with nearly years of experien...,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
1,Jonas,0,advanced as excel microsoft office outlook pow...,apr present year fund accountant citco fund ...,university of london bachelor s major banking ...,other information resourceful professional equ...,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
2,Lester,1,fluent english mandarin cantonese and hokkien,marketing client service executive aberdeen as...,recipient of the durham postgraduate award re...,market sector leader of the durham university ...,0,0,1,1,...,0,0,0,0,0,0,0,0,0,0
3,Magdalena,0,advanced compliance financial accounting,finance manager aberdeen asset management asia...,nanyang technological university bachelor s ma...,responsibilities specialised in performing ann...,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
4,Esther,0,advanced ifca investran mri ms office oracle o...,experience feb present years months senior ...,oxford brookes university bachelor s major b...,other information since graduation i have accu...,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
234,Kelly,0,ms office accounting system oracle peoplesoft ...,oct present mapletree commercial trust manage...,year jun sep institution sheffield hallam un...,able to work independently as an individual a...,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
235,Winnie,0,singapore financial reporting standards malay...,designation accountant july present organisa...,professional qualification association of char...,an ambitious and enthusiastic chartered accoun...,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
236,Monna Liza,0,microsoft office word advanced excel power po...,mc corporate service pte ltd coleman st the ...,bachelor of science in accountancy university...,excellent team building leadership communicat...,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
237,Alvin,1,able to converse well in english mandarin and...,company mdr limited listed in singapore stock ...,i was my school representative for badminton c...,a competent professional who is capable of und...,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0


## Analyzing resulting data

In [5]:
# Split the data into one frame per gender label (0 -> female frame, 1 -> male frame)
gender_label = df['person_gender']
subset_female = df.loc[gender_label.eq(0)]
subset_male = df.loc[gender_label.eq(1)]
subset_male

Unnamed: 0,person_name,person_gender,person_skills,person_experience,person_education,person_qualities,femaleConfirmed,maleConfirmed,a,aberdeen,...,xero,xp,xx,yahoo,yardi,year,years,york,your,youth
2,Lester,1,fluent english mandarin cantonese and hokkien,marketing client service executive aberdeen as...,recipient of the durham postgraduate award re...,market sector leader of the durham university ...,0,0,1,1,...,0,0,0,0,0,0,0,0,0,0
6,Gaurav,1,proficient with the usage of excel word power...,accenture india subject matter expert team lea...,currently perusing diploma in ifrs from acca ...,goal oriented with excellent ability to manag...,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
8,Ku,1,ms office erp systems peoplesoft cognos oracl...,mar to present chief financial officer cfo w...,jan to jun association of chartered certifie...,experiences more than years in full spectrum...,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
11,Nader,1,languages english and french spoken and writte...,al mal capital dubai uae sep current manager...,notre dame university beirut lebanon faculty o...,highly motivated and well connected business d...,0,0,0,0,...,0,0,1,0,0,0,0,0,0,0
14,NG,1,languages english and mandarin chinese native ...,allard partners limited us bn long only equity...,fudan university shanghai sep jul master of ...,interests reading soccer jogging and food i lo...,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
226,Joshua,1,microsoft office macros and access english lan...,mar present associate at lion global investor...,sep aston university bsc hons international b...,strong leadership skills perform well under pr...,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
227,Ben,1,in finance accounting audit and corporate offi...,head of finance various sme companies across d...,advance certificate in training assessment act...,success oriented with high energy dedicated an...,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
230,Kyo,1,computer skills excel word power point sun sys...,present mainstay asia limited company descri...,present aicpa student member upper iowa uni...,hands on experience on financial reporting an...,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
237,Alvin,1,able to converse well in english mandarin and...,company mdr limited listed in singapore stock ...,i was my school representative for badminton c...,a competent professional who is capable of und...,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0


In [6]:
# Per remaining column of the female subset, count how many rows hold each value:
# row 0 = resumes without the word, row 1 = resumes containing it.
# errors='ignore' keeps the cell re-runnable: on a second run the text and label
# columns are already gone and a plain drop would raise a KeyError.
subset_female = subset_female.drop(
    columns=['person_gender', 'person_name', 'person_skills',
             'person_experience', 'person_education', 'person_qualities'],
    errors='ignore',
)
# pd.Series.value_counts replaces the deprecated top-level pd.value_counts
subset1 = subset_female.apply(pd.Series.value_counts)
subset1

Unnamed: 0,femaleConfirmed,maleConfirmed,a,aberdeen,abilities,ability,able,about,abreast,absorb,...,xero,xp,xx,yahoo,yardi,year,years,york,your,youth
0,101,111,29,114.0,111,82,88,111,114.0,112,...,112,114.0,114.0,113,112,113,110,112,114.0,113
1,13,3,85,,3,32,26,3,,2,...,2,,,1,2,1,4,2,,1


In [7]:
# Per remaining column of the male subset, count how many rows hold each value:
# row 0 = resumes without the word, row 1 = resumes containing it.
# errors='ignore' keeps the cell re-runnable: on a second run the text and label
# columns are already gone and a plain drop would raise a KeyError.
subset_male = subset_male.drop(
    columns=['person_gender', 'person_name', 'person_skills',
             'person_experience', 'person_education', 'person_qualities'],
    errors='ignore',
)
# pd.Series.value_counts replaces the deprecated top-level pd.value_counts
subset2 = subset_male.apply(pd.Series.value_counts)
subset2

Unnamed: 0,femaleConfirmed,maleConfirmed,a,aberdeen,abilities,ability,able,about,abreast,absorb,...,xero,xp,xx,yahoo,yardi,year,years,york,your,youth
0,124,121,40,124,118,95,101,121,124,125.0,...,123,122,124,125.0,124,123,124,125.0,124,125.0
1,1,4,85,1,7,30,24,4,1,,...,2,3,1,,1,2,1,,1,


In [8]:
# Discard the "word absent" counts (row 0); only the row of presence counts is needed
subset1 = subset1.drop(index=0)

In [9]:
# Inspect the remaining row (index 1): per column, the number of female rows holding the value 1
subset1

Unnamed: 0,femaleConfirmed,maleConfirmed,a,aberdeen,abilities,ability,able,about,abreast,absorb,...,xero,xp,xx,yahoo,yardi,year,years,york,your,youth
1,13,3,85,,3,32,26,3,,2,...,2,,,1,2,1,4,2,,1


In [10]:
# Discard the "word absent" counts (row 0) for the male counts as well
subset2 = subset2.drop(index=0)
subset2

Unnamed: 0,femaleConfirmed,maleConfirmed,a,aberdeen,abilities,ability,able,about,abreast,absorb,...,xero,xp,xx,yahoo,yardi,year,years,york,your,youth
1,1,4,85,1,7,30,24,4,1,,...,2,3,1,,1,2,1,,1,


In [11]:
# A word that never occurs for one gender has no count in row 1 (NaN); treat it as 0
subset1 = subset1.fillna(0)
subset2 = subset2.fillna(0)
subset1

Unnamed: 0,femaleConfirmed,maleConfirmed,a,aberdeen,abilities,ability,able,about,abreast,absorb,...,xero,xp,xx,yahoo,yardi,year,years,york,your,youth
1,13,3,85,0.0,3,32,26,3,0.0,2,...,2,0.0,0.0,1,2,1,4,2,0.0,1


In [12]:
# Male count minus female count per word:
# positive -> word occurs in more male resumes, negative -> in more female resumes
subtraction = subset2 - subset1
subtraction

Unnamed: 0,femaleConfirmed,maleConfirmed,a,aberdeen,abilities,ability,able,about,abreast,absorb,...,xero,xp,xx,yahoo,yardi,year,years,york,your,youth
1,-12,1,0,1.0,4,-2,-2,1,1.0,-2.0,...,0,3.0,1.0,-1.0,-1,1,-3,-2.0,1.0,-1.0


In [13]:
import numpy as np

# For each row of `subtraction`, list the labels of the `nlargest` columns
# with the highest values (sorting the negated values gives descending order)
nlargest = 20
descending_positions = np.argsort(-subtraction.values, axis=1)
order = descending_positions[:, :nlargest]
top_labels = ['top' + str(rank) for rank in range(1, nlargest + 1)]
result = pd.DataFrame(subtraction.columns[order], columns=top_labels, index=subtraction.index)

print(result)

        top1        top2  top3 top4 top5       top6     top7       top8  \
1  bloomberg  experience  word   of  and  financial  through  financial   

         top9       top10 top11   top12   top13 top14 top15 top16   top17  \
1  analytical  investment    in  native  equity   the   his   for  driven   

      top18  top19    top20  
1  business  while  various  


  """


In [14]:
# Reorder the columns by the values in row 1, largest difference first
subtraction = subtraction.sort_values(by=1, axis=1, ascending=False)

## Which words are used most often by males and females? 
The table below lists every word together with the difference between the number of male and female applicants who used it. Words with positive values (at the top of the table) were used more often in male applicants' texts, whereas words with negative values (at the bottom of the table) were used more often in female applicants' texts.

In [15]:
from matplotlib import pyplot as plt
# Reshape the single-row difference table to long form: one (word, value) pair per row
melt_df = subtraction.melt(var_name='Cols')
melt_df

Unnamed: 0,Cols,value
0,bloomberg,22.0
1,experience,19.0
2,word,18.0
3,of,17.0
4,and,17.0
...,...,...
4549,independent,-11.0
4550,people,-12.0
4551,sap,-12.0
4552,femaleConfirmed,-12.0


In [16]:
# First 60 rows of the long table (the largest values after the descending sort)
melt_df.iloc[:60]

Unnamed: 0,Cols,value
0,bloomberg,22.0
1,experience,19.0
2,word,18.0
3,of,17.0
4,and,17.0
5,financial,16.0
6,through,15.0
7,analytical,13.0
8,financial,13.0
9,investment,12.0


In [17]:
# Last 60 rows of the long table (the smallest values after the descending sort)
melt_df.iloc[-60:]

Unnamed: 0,Cols,value
4494,cultural,-4.0
4495,mindset,-4.0
4496,people,-4.0
4497,more,-4.0
4498,pc,-4.0
4499,enjoy,-4.0
4500,degree,-4.0
4501,organised,-4.0
4502,table,-4.0
4503,sage,-4.0


In [39]:
# Look up the male-minus-female difference for the word 'he'
subtraction['he']

1    8.0
Name: he, dtype: float64

## Predictive model
A random forest is used for this predictive model because it can work directly with the word-count columns created above, so the text does not have to be vectorized again, as it would for a Naive Bayes (NB) model. As input, the most relevant words were chosen from the table above this paragraph. Words that name a technology (such as Bloomberg) were left out.

In [40]:
# Target variable: the gender label
y = df['person_gender']

# Hand-picked feature columns. Every label is listed exactly once: listing a
# label twice (as 'leadership' was) selects its columns twice and makes the
# grouped sum below double-count that word.
# NOTE(review): 'femaleConfirmed' is not a word from the resume text - confirm
# it does not leak the gender label into the features.
feature_words = [
    'femaleConfirmed', 'people', 'independent', 'player', 'multi', 'information',
    'learn', 'drive', 'experience', 'accounting', 'quality', 'efficient', 'well',
    'produce', 'provide', 'challenging', 'learning', 'she', 'hardworking', 'her',
    'excellence', 'complex', 'mature', 'organized', 'netball', 'interpersonal',
    'time', 'basketball', 'soccer', 'football', 'leadership', 'driven', 'business',
    'badminton', 'equity', 'analytical', 'his', 'investment', 'proven', 'keen',
    'leader', 'individual', 'sports', 'independently', 'he',
]
assert len(feature_words) == len(set(feature_words)), 'duplicate entries in feature_words'

X = df[feature_words]
# A word found in both the qualities and the skills text has two columns with
# the same label; add them into one column per word. Grouping the transposed
# frame by label avoids the deprecated groupby(..., axis=1).
X = X.T.groupby(level=0).sum().T
X

Unnamed: 0,accounting,analytical,badminton,basketball,business,challenging,complex,drive,driven,efficient,...,player,produce,proven,provide,quality,she,soccer,sports,time,well
0,1,1,0,0,1,0,0,0,1,0,...,0,0,0,0,1,0,0,0,0,0
1,2,1,0,0,0,0,1,0,0,0,...,0,1,0,0,0,0,0,0,0,0
2,0,0,1,0,1,0,0,0,1,0,...,0,0,0,0,0,0,0,0,1,1
3,2,0,0,0,0,0,1,0,0,0,...,0,0,0,1,0,0,0,0,0,0
4,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
234,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
235,2,1,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,1,1
236,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
237,2,1,1,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,1


In [41]:
from sklearn.model_selection import train_test_split
# Hold out 30% of the rows for testing; the fixed random_state makes the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=1,
)

In [42]:
from sklearn.ensemble import RandomForestClassifier
# Fit a 100-tree random forest on the training features (fit() returns the fitted estimator)
train_data_features = X_train
forest = RandomForestClassifier(n_estimators=100, random_state=1).fit(train_data_features, y_train)
forest

RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=None, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=100,
                       n_jobs=None, oob_score=False, random_state=1, verbose=0,
                       warm_start=False)

## Accuracy

In [43]:
from sklearn.metrics import accuracy_score
# Predict the held-out rows and report the share of correct predictions
result = forest.predict(X_test)
test_accuracy = accuracy_score(y_test, result)
print(test_accuracy)

0.7083333333333334


In [44]:
# Show the training feature matrix (the X_train alias that the forest was fitted on)
train_data_features

Unnamed: 0,accounting,analytical,badminton,basketball,business,challenging,complex,drive,driven,efficient,...,player,produce,proven,provide,quality,she,soccer,sports,time,well
181,0,0,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
59,1,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,1
167,0,0,0,0,0,0,0,1,0,0,...,1,0,0,0,0,0,0,0,0,1
220,0,0,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,1,0
138,0,1,0,0,1,0,0,0,1,0,...,0,0,0,1,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
137,0,0,0,0,1,0,0,0,1,0,...,0,0,0,0,1,0,0,0,0,0
72,0,0,0,0,1,0,1,0,0,0,...,0,0,1,0,1,0,0,0,0,1
140,0,0,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
235,2,1,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,1,1
