### 白人和黑人在求职路上会有种族的歧视吗？

In [1]:
import pandas as pd
import numpy as np
from scipy import stats

In [2]:
data = pd.io.stata.read_stata('us_job_market_discrimination.dta')

In [3]:
data.head()

Unnamed: 0,id,ad,education,ofjobs,yearsexp,honors,volunteer,military,empholes,occupspecific,...,compreq,orgreq,manuf,transcom,bankreal,trade,busservice,othservice,missind,ownership
0,b,1,4,2,6,0,0,0,1,17,...,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,
1,b,1,3,3,6,0,1,1,0,316,...,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,
2,b,1,4,1,6,0,0,0,0,19,...,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,
3,b,1,3,4,6,0,1,0,1,313,...,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,
4,b,1,3,3,22,0,0,0,0,313,...,1.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,Nonprofit


In [4]:
blacks = data[data.race == 'b']
whites = data[data.race == 'w']

In [6]:
blacks.call.describe()

count    2435.000000
mean        0.064476
std         0.245649
min         0.000000
25%         0.000000
50%         0.000000
75%         0.000000
max         1.000000
Name: call, dtype: float64

In [7]:
whites.call.describe()

count    2435.000000
mean        0.096509
std         0.295346
min         0.000000
25%         0.000000
50%         0.000000
75%         0.000000
max         1.000000
Name: call, dtype: float64

## 卡方检验

* 白人获得职位
* 白人被拒绝
* 黑人获得职位
* 黑人被拒绝

## 假设检验
- H0：种族对求职结果没有显著影响
- H1：种族对求职结果有影响

In [10]:
blacks_called = len(blacks[blacks['call'] == True])
blacks_not_called = len(blacks[blacks['call'] == False])
whites_called = len(whites[whites['call'] == True])
whites_not_called = len(whites[whites['call'] == False])

In [11]:
observed = pd.DataFrame({'blacks': {'called': blacks_called, 'not_called': blacks_not_called},
                         'whites': {'called' : whites_called, 'not_called' : whites_not_called}})

In [12]:
observed


Unnamed: 0,blacks,whites
called,157,235
not_called,2278,2200


In [13]:
num_called_back = blacks_called + whites_called
num_not_called = blacks_not_called + whites_not_called

print(num_called_back)
print(num_not_called)

392
4478


In [28]:
rate_of_callbacks = num_called_back / (num_not_called + num_called_back)

得到期望的比率

In [29]:
rate_of_callbacks

0.08049281314168377

In [30]:
expected_called = len(data)  * rate_of_callbacks
expected_not_called = len(data)  * (1 - rate_of_callbacks)

In [31]:
print(expected_called)
print(expected_not_called)

391.99999999999994
4478.0


In [32]:
import scipy.stats as stats
observed_frequencies = [blacks_not_called, whites_not_called, whites_called, blacks_called]
expected_frequencies = [expected_not_called/2, expected_not_called/2, expected_called/2, expected_called/2]


stats.chisquare(f_obs = observed_frequencies,
                f_exp = expected_frequencies)

Power_divergenceResult(statistic=16.879050414270221, pvalue=0.00074839594410972638)

看起来种族歧视是存在的！