# 利用朴素贝叶斯对名字进行性别预测

参考资料：https://blog.csdn.net/weixin_41370083/article/details/82839012

https://blog.csdn.net/u013719780/article/details/76910779

## 读取文件

In [2]:
# Load the training set, the test set and the sample submission, in that
# order, from the ./input directory.
train, test, submit = (
    pd.read_csv(csv_path)
    for csv_path in (
        './input/train.txt',
        './input/test.txt',
        './input/sample_submit.csv',
    )
)

In [6]:
# Preview the first rows of the training set (columns: id, name, gender).
train.head()

Unnamed: 0,id,name,gender
0,1,闳家,1
1,2,玉璎,0
2,3,于邺,1
3,4,越英,0
4,5,蕴萱,0


In [7]:
# Preview the first rows of the test set (columns: id, name).
test.head()

Unnamed: 0,id,name
0,0,辰君
1,1,佳遥
2,2,淼剑
3,3,浩苳
4,4,俪妍


In [8]:
# Preview the first rows of the sample submission (columns: id, gender).
submit.head()

Unnamed: 0,id,gender
0,0,1
1,1,0
2,2,1
3,3,0
4,4,1


## 计算先验概率

In [3]:
# Split the training data into two parts by gender label.
names_female = train[train['gender'].eq(0)]  # female (label 0)
names_male = train[train['gender'].eq(1)]  # male (label 1)

In [10]:
# Number of training names per gender, keyed 'f' (female) and 'm' (male).
totals = dict(f=len(names_female), m=len(names_male))
totals

{'f': 53330, 'm': 66670}

## 统计词频，即计算似然

In [11]:
from collections import defaultdict

def _char_frequencies(names, total):
    """Build a character -> relative-frequency table for one gender.

    Every occurrence of a character in ``names`` adds ``1 / total`` to that
    character's entry, so each value is the occurrence count divided by
    ``total``.
    """
    table = defaultdict(int)
    for full_name in names:
        for ch in full_name:
            table[ch] += 1. / total
    return table


# Per-character frequencies for female and male names respectively.
frequency_list_f = _char_frequencies(names_female['name'], totals['f'])
frequency_list_m = _char_frequencies(names_male['name'], totals['m'])

In [12]:
# Frequency of each character among female names
frequency_list_f

defaultdict(int,
            {'玉': 0.00879429964372766,
             '璎': 0.0036752297018563724,
             '越': 0.000993812113257079,
             '英': 0.004875304706544151,
             '蕴': 0.004125257828614293,
             '萱': 0.008944309019313632,
             '子': 0.009731858241139982,
             '颀': 0.00026251640727545475,
             '靖': 0.0025314082130133125,
             '曦': 0.004387774235889743,
             '凤': 0.004331520720045004,
             '兰': 0.006281642602662635,
             '垚': 0.0016313519594974657,
             '佳': 0.011194449653103206,
             '明': 0.0042377648603037716,
             '芳': 0.007144196512281971,
             '巧': 0.0035064691543221504,
             '彤': 0.010125632852053158,
             '傲': 0.000656291018188637,
             '菲': 0.006619163697731071,
             '曾': 0.00043127695480967574,
             '年': 0.00026251640727545475,
             '蕙': 0.0011250703168948057,
             '芯': 0.00609413088318017,
             

In [13]:
# Frequency of each character among male names
frequency_list_m

defaultdict(int,
            {'闳': 0.0010649467526623667,
             '家': 0.007334633268336643,
             '于': 0.0012749362531873417,
             '邺': 0.0004649767511624417,
             '鲁': 0.0006899655017249133,
             '莱': 0.0002099895005249738,
             '永': 0.004994750262486887,
             '远': 0.0037648117594120197,
             '红': 0.0019499025048747615,
             '孙': 0.00017999100044997752,
             '增': 0.0013349332533373346,
             '景': 0.0038098095095245137,
             '棋': 0.0015149242537873132,
             '寇': 5.99970001499925e-05,
             '涛': 0.00439478026098695,
             '桦': 0.0014099295035248257,
             '琙': 7.499625018749062e-05,
             '闰': 0.0003149842507874606,
             '芷': 0.0007049647517624114,
             '锋': 0.0027898605069746497,
             '连': 0.0015899205039748043,
             '庆': 0.004439778011099445,
             '心': 0.0019649017549122597,
             '健': 0.0029548522573871277,
    

## 拉普拉斯平滑

参考资料：http://sofasofa.io/forum_main_post.php?postid=1001239

In [15]:
def LaplaceSmooth(char, frequency_list, total, alpha=1.0):
    """Return the Laplace (additive) smoothed frequency of ``char``.

    Args:
        char: Character to look up.
        frequency_list: Mapping from character to its relative frequency
            (occurrence count divided by ``total``).
        total: Number of names the frequencies were normalised by.
        alpha: Additive smoothing constant.

    Returns:
        ``(count + alpha) / (total + V * alpha)``, where ``count`` is the
        occurrence count recovered from the stored frequency (0 for a
        character that is not in the table) and ``V`` is the number of
        characters in ``frequency_list``.
    """
    # Look the character up with .get() instead of indexing: when the table
    # is a defaultdict, indexing an unseen character inserts it, which would
    # grow the table and change the vocabulary size used by later calls.
    count = frequency_list.get(char, 0) * total
    distinct_chars = len(frequency_list)
    return (count + alpha) / (total + distinct_chars * alpha)

## 根据贝叶斯公式计算概率（未归一化）

In [17]:
import math
def _log_complement_sum(frequency_list):
    """Sum log(1 - frequency) over every character in the table."""
    return sum(math.log(1 - freq) for freq in frequency_list.values())


# Per-gender baseline of the log-score: the log prior of the gender plus the
# log-probability that none of the recorded characters appears.
# NOTE(review): the baseline uses the raw frequencies while GetLogProb uses
# the Laplace-smoothed ones -- confirm this mismatch is intended.
male_share = train['gender'].mean()  # gender is coded 1 for male, 0 for female

base_f = math.log(1 - male_share) + _log_complement_sum(frequency_list_f)
base_m = math.log(male_share) + _log_complement_sum(frequency_list_m)

bases = dict(f=base_f, m=base_m)

In [18]:
def GetLogProb(char, frequency_list, total):
    """Return the log-odds of ``char`` under its smoothed frequency.

    The frequency ``p`` comes from ``LaplaceSmooth``; the result is
    ``log(p) - log(1 - p)``.
    """
    p = LaplaceSmooth(char, frequency_list, total)
    log_present = math.log(p)
    log_absent = math.log(1 - p)
    return log_present - log_absent

In [19]:
def ComputeLogProb(name, bases, totals, frequency_list_m, frequency_list_f):
    """Compute the unnormalised log-score of ``name`` for each gender.

    Starts from the per-gender baseline in ``bases`` (keys 'm' and 'f') and
    adds ``GetLogProb`` for every character of ``name``.

    Returns:
        A dict with keys 'male' and 'female' holding the two log-scores.
    """
    scores = {'male': bases['m'], 'female': bases['f']}
    per_gender = (
        ('male', 'm', frequency_list_m),
        ('female', 'f', frequency_list_f),
    )
    for ch in name:
        for label, key, table in per_gender:
            scores[label] += GetLogProb(ch, table, totals[key])
    return scores


def GetGender(LogProbs):
    """Return True when the 'male' score is strictly greater than 'female'.

    Ties are therefore reported as False.
    """
    male_score = LogProbs['male']
    female_score = LogProbs['female']
    return female_score < male_score


# Predict a label for every test name: 1 when the male log-score is the
# larger one, otherwise 0.
result = [
    int(
        GetGender(
            ComputeLogProb(name, bases, totals, frequency_list_m,
                           frequency_list_f)))
    for name in test['name']
]

# Store the predictions in the submission frame and write it out without
# the index column.
submit['gender'] = result
submit.to_csv('my_NB_prediction.csv', index=False)

In [20]:
# Attach the predicted labels to the test frame and preview up to 20 rows.
test['pred'] = result
test.head(20)

Unnamed: 0,id,name,pred
0,0,辰君,0
1,1,佳遥,0
2,2,淼剑,1
3,3,浩苳,1
4,4,俪妍,0
5,5,秉毅,1
6,6,妍艺,0
7,7,海防,1
8,8,壬尧,1
9,9,珞千,0
