# Occupation

### Introduction:

Special thanks to: https://github.com/justmarkham for sharing the dataset and materials.

### Step 1. Import the necessary libraries

In [1]:
%matplotlib inline
import pandas as pd
import numpy as np

from IPython.core.display import HTML
# Load the two notebook stylesheets. Each file is opened in a ``with`` block so
# its handle is closed as soon as it has been read, rather than being left
# open until the garbage collector happens to reclaim it.
css_parts = []
for css_path in ('../../style-table.css', '../../style-notebook.css'):
    with open(css_path) as css_file:
        css_parts.append(css_file.read())
css = ''.join(css_parts)

# Last expression of the cell: the HTML object is what the notebook renders,
# which injects the combined CSS into the page.
HTML('<style>%s</style>' % css)

### Step 2. Import the dataset from this [address](https://raw.githubusercontent.com/justmarkham/DAT8/master/data/u.user). 

### Step 3. Assign it to a variable called users.

In [2]:
# Pipe-delimited user table; the separator is passed explicitly to the parser.
url = 'https://raw.githubusercontent.com/justmarkham/DAT8/master/data/u.user'
users = pd.read_csv(url, sep='|')

# Preview the first five rows.
users.head(5)

Unnamed: 0,user_id,age,gender,occupation,zip_code
0,1,24,M,technician,85711
1,2,53,F,other,94043
2,3,23,M,writer,32067
3,4,24,M,technician,43537
4,5,33,F,other,15213


### Step 4. Discover the mean age per occupation

In [3]:
# Number of users in each occupation, most common first.
users['occupation'].value_counts()

student          196
other            105
educator          95
administrator     79
engineer          67
programmer        66
librarian         51
writer            45
executive         32
scientist         31
artist            28
technician        27
marketing         26
entertainment     18
healthcare        16
retired           14
salesman          12
lawyer            12
none               9
doctor             7
homemaker          7
Name: occupation, dtype: int64

In [4]:
# Mean age of each occupation, ordered from the oldest group to the youngest.
g = (
    users
    .groupby('occupation')[['age']]
    .mean()
    .sort_values('age', ascending=False)
)
g

Unnamed: 0_level_0,age
occupation,Unnamed: 1_level_1
retired,63.071429
doctor,43.571429
educator,42.010526
healthcare,41.5625
librarian,40.0
administrator,38.746835
executive,38.71875
marketing,37.615385
lawyer,36.75
engineer,36.38806


### Step 5. Discover the male ratio per occupation and sort it from the highest to the lowest

In [5]:
# Male / female head-counts per occupation and the resulting male ratio.
#
# The nested-dict renamer ``agg({'gender': {'male': ..., 'female': ...}})`` was
# removed in pandas 1.0 and raises ``SpecificationError``. The counts are
# instead obtained by summing 0/1 indicator columns within each occupation.
# The tuple keys build the same ('gender', <label>) MultiIndex columns that
# the old nested-dict aggregation produced.
gender_flags = pd.DataFrame({
    ('gender', 'male'): users['gender'].eq('M').astype(int),
    ('gender', 'female'): users['gender'].eq('F').astype(int),
})
g = gender_flags.groupby(users['occupation']).sum()

# Ratio of men among the users counted as male or female in each occupation.
g[('gender', 'ratio')] = g[('gender', 'male')] / (g[('gender', 'male')] + g[('gender', 'female')])

# Most male-dominated occupations first.
g = g.sort_values(('gender', 'ratio'), ascending=False)
g

Unnamed: 0_level_0,gender,gender,gender
Unnamed: 0_level_1,male,female,ratio
occupation,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2
doctor,7,0,1.0
engineer,65,2,0.970149
technician,26,1,0.962963
retired,13,1,0.928571
programmer,60,6,0.909091
executive,29,3,0.90625
scientist,28,3,0.903226
entertainment,16,2,0.888889
lawyer,10,2,0.833333
salesman,9,3,0.75


In [6]:
# Alternative computation of the male share, expressed as a percentage.
# 1 for male users, 0 otherwise, so a per-occupation sum is the male head-count.
male_flag = (users['gender'] == 'M').astype(int)
male_count = male_flag.groupby(users['occupation']).sum()

# Total users per occupation; the division aligns both Series on occupation.
occupation_size = users['occupation'].value_counts()

g = (male_count / occupation_size * 100).sort_values(ascending=False)
g

doctor           100.000000
engineer          97.014925
technician        96.296296
retired           92.857143
programmer        90.909091
executive         90.625000
scientist         90.322581
entertainment     88.888889
lawyer            83.333333
salesman          75.000000
educator          72.631579
student           69.387755
other             65.714286
marketing         61.538462
writer            57.777778
none              55.555556
administrator     54.430380
artist            53.571429
librarian         43.137255
healthcare        31.250000
homemaker         14.285714
dtype: float64

### Step 6. For each occupation, calculate the minimum and maximum ages

In [7]:
# Youngest and oldest user in each occupation.
#
# The aggregations are named by string rather than by passing the Python
# builtins ``min`` / ``max``: pandas >= 2.1 emits a FutureWarning for builtin
# callables and plans to stop mapping them to its own groupby reductions.
# The resulting column labels ('age', 'min') and ('age', 'max') are unchanged.
g = users[['occupation', 'age']].groupby('occupation').agg(['min', 'max'])
g

Unnamed: 0_level_0,age,age
Unnamed: 0_level_1,min,max
occupation,Unnamed: 1_level_2,Unnamed: 2_level_2
administrator,21,70
artist,19,48
doctor,28,64
educator,23,63
engineer,22,70
entertainment,15,50
executive,22,69
healthcare,22,62
homemaker,20,50
lawyer,21,53


### Step 7. For each combination of occupation and gender, calculate the mean age

In [8]:
# Mean age for every (occupation, gender) pair present in the data.
by_occupation_gender = users.groupby(['occupation', 'gender'])
g = by_occupation_gender[['age']].mean()
g

Unnamed: 0_level_0,Unnamed: 1_level_0,age
occupation,gender,Unnamed: 2_level_1
administrator,F,40.638889
administrator,M,37.162791
artist,F,30.307692
artist,M,32.333333
doctor,M,43.571429
educator,F,39.115385
educator,M,43.101449
engineer,F,29.5
engineer,M,36.6
entertainment,F,31.0


### Step 8. For each occupation, present the percentage of women and men

In [9]:
# Share of men and women in each occupation, as fractions in [0, 1].
#
# The nested-dict renamer ``agg({'gender': {'male': ..., 'female': ...}})`` was
# removed in pandas 1.0 and raises ``SpecificationError``. The mean of a 0/1
# indicator column within a group equals (matching rows) / (rows in group),
# which is what the original lambdas computed.
# The tuple keys build the same ('gender', <label>) MultiIndex columns that
# the old nested-dict aggregation produced.
gender_flags = pd.DataFrame({
    ('gender', 'male'): users['gender'].eq('M').astype(int),
    ('gender', 'female'): users['gender'].eq('F').astype(int),
})
g = gender_flags.groupby(users['occupation']).mean()
g

Unnamed: 0_level_0,gender,gender
Unnamed: 0_level_1,male,female
occupation,Unnamed: 1_level_2,Unnamed: 2_level_2
administrator,0.544304,0.455696
artist,0.535714,0.464286
doctor,1.0,0.0
educator,0.726316,0.273684
engineer,0.970149,0.029851
entertainment,0.888889,0.111111
executive,0.90625,0.09375
healthcare,0.3125,0.6875
homemaker,0.142857,0.857143
lawyer,0.833333,0.166667


In [10]:
# Percentage of each gender within every occupation, in long form with one row
# per (occupation, gender) pair.
g = users[['occupation', 'gender']]
count_gender = {'gender': 'count'}

# Numerator: users per (occupation, gender) pair.
g1 = g.groupby(['occupation', 'gender']).agg(count_gender)
# Denominator: users per occupation.
g2 = g.groupby('occupation').agg(count_gender)

# Divide the pair counts by the per-occupation totals and scale to percent.
g3 = g1.div(g2).mul(100)
g3

Unnamed: 0_level_0,Unnamed: 1_level_0,gender
occupation,gender,Unnamed: 2_level_1
administrator,F,45.56962
administrator,M,54.43038
artist,F,46.428571
artist,M,53.571429
doctor,M,100.0
educator,F,27.368421
educator,M,72.631579
engineer,F,2.985075
engineer,M,97.014925
entertainment,F,11.111111
