# Data Wrangling
This data contains personality scores for recruits, plus the department they applied for at Umuzi.
Below I'll manipulate it with various functions to make it easy to view.

I begin by importing the relevant modules that we'll use.

In [22]:
import pandas as pd
import re
import numpy as np

Read the csv files.

In [2]:
df_dep= pd.read_csv('data/departments.csv', sep=';', index_col=0)
df_person= pd.read_csv('data/personality_scores.csv', sep=';')

drop any duplicate IDs.

In [3]:
df_person=df_person.drop_duplicates(subset='ID')


Make sure that te leng of both dataframes is equal. It will give an assertion error if not.

In [4]:
assert len(df_person)== len(df_dep)

create a function called scores which will be responsible for summing up the values in the given columns.

In [5]:

def scores(row, trait):
    """
    Takes in a row of a dataframe as well as a trait in the form of a digit.
    """
    total=0
    reg= re.compile('\d+')
    for col in row:
        numbers= reg.findall(col)
        if str(trait)==numbers[0]:
            total+= int(numbers[1])
            
    return total

#dictionary to link each number to a respective personal trait.
traits= dict({
    1 : 'Extraversion', 
    2 : 'Agreeableness',
    3 : 'Conscientiousness', 
    4 : 'Emotional Stability',
    5 : 'Intellect'})

#loop over each trait and add a column. calls the function 'scores' to be applied for each of those columns.
for each in traits.keys():
    df_person[f'{traits[each]}']= df_person.loc[:,'Section 5 of 6 [I am always prepared.]':'Section 5 of 6 [I worry about things.]'].apply(scores, args=(each,) , axis=1)

In [7]:
df_person.head()

Unnamed: 0,ID,Section 5 of 6 [I am always prepared.],Section 5 of 6 [I am easily disturbed.],Section 5 of 6 [I am exacting (demanding) in my work.],Section 5 of 6 [I am full of ideas.],Section 5 of 6 [I am interested in people.],Section 5 of 6 [I am not interested in abstract ideas.],Section 5 of 6 [I am not interested in other people's problems.],Section 5 of 6 [I am not really interested in others.],Section 5 of 6 [I am quick to understand things.],...,Unnamed: 65,Unnamed: 66,Unnamed: 67,Unnamed: 68,IPIP_HIGH_RISK,Extraversion,Agreeableness,Conscientiousness,Emotional Stability,Intellect
0,0,"(3, 5)","(4, 5)","(3, 5)","(5, 5)","(2, 3)","(5, 3)","(2, 3)","(2, 5)","(5, 5)",...,,,,,,30,40,48,36,42
1,1,"(3, 5)","(4, 5)","(3, 5)","(5, 5)","(2, 5)","(5, 3)","(2, 5)","(2, 5)","(5, 5)",...,,,,,,42,46,46,40,42
2,2,"(3, 5)","(4, 3)","(3, 3)","(5, 5)","(2, 5)","(5, 5)","(2, 5)","(2, 5)","(5, 5)",...,,,,,,28,40,40,38,42
3,3,"(3, 5)","(4, 5)","(3, 3)","(5, 5)","(2, 5)","(5, 3)","(2, 3)","(2, 3)","(5, 3)",...,,,,,,30,38,38,40,38
4,4,"(3, 3)","(4, 5)","(3, 3)","(5, 3)","(2, 3)","(5, 3)","(2, 3)","(2, 3)","(5, 5)",...,,,,,,28,34,46,38,36


### merge the dataframes column-wise.

In [None]:
df_all= pd.concat([df_dep, df_person], axis=1)
df_all.head()

verify that indeed the length of the new dataframe is equal to any of the old ones.

In [10]:
assert len(df_all)== len(df_person)

### risk categorization

In [43]:
def risk(row):
    if row['Agreeableness']<30 and row['Conscientiousness']<30 and row['Emotional Stability']:
        return 'high risk'
    else:
        return 'low risk'
        

df_all['risk']= df_all[['Agreeableness', 'Conscientiousness', 'Emotional Stability']].apply(risk, axis=1)


In [44]:
df_high_risk= df_all[df_all.risk=='high risk']
df_high_risk[['Department', 'ID']]

Unnamed: 0,Department,ID
405,Strategy,405
716,Strategy,716
801,Strategy,801
881,Data,881
1197,Copywriting,1197


## High and low risk within each department. To be fixed.

In [63]:

df_all.groupby(['risk', 'Department']).aggregate('count').unstack().iloc[:,:5]

Unnamed: 0_level_0,ID,ID,ID,ID,ID
Department,Copywriting,Data,Design,Strategy,Web Dev
risk,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2
high risk,1.0,1.0,,3.0,
low risk,325.0,328.0,120.0,446.0,170.0
