In [20]:
#coding:utf-8

import math
import pandas as pd
from collections import defaultdict

# Load the training set, the test set and the submission template.
train, test, submit = (
    pd.read_csv(path)
    for path in (
        'nameData/train.txt',
        'nameData/test.txt',
        'nameData/sample_submit.csv',
    )
)

In [21]:
# Preview the first five rows of the training data.
train.head(5)

Unnamed: 0,id,name,gender
0,1,闳家,1
1,2,玉璎,0
2,3,于邺,1
3,4,越英,0
4,5,蕴萱,0


In [22]:
# Split the training rows by label: gender == 0 goes to the female subset,
# gender == 1 to the male subset.
gender_column = train['gender']
names_female = train.loc[gender_column == 0]
names_male = train.loc[gender_column == 1]

# totals holds the number of female ('f') and male ('m') training rows.
totals = {'f': names_female.shape[0], 'm': names_male.shape[0]}

In [23]:
def _char_frequencies(names, total):
    """Return a defaultdict mapping each character to occurrences / total.

    Every occurrence of a character in ``names`` adds ``1 / total``, so a
    character repeated inside one name is counted once per repetition.
    """
    table = defaultdict(int)
    for name in names:
        for char in name:
            table[char] += 1. / total
    return table


# Frequency of each character among all female (male) names. This is the
# estimate used for P(Xi | female) and P(Xi | male).
frequency_list_f = _char_frequencies(names_female['name'], totals['f'])
frequency_list_m = _char_frequencies(names_male['name'], totals['m'])

In [24]:
# Spot-check: the stored frequency of the character '娟' in the female table.
print(frequency_list_f['娟'])

0.004144009000562539


In [25]:
# Spot-check: the stored frequency of the character '钢' in the male table.
print(frequency_list_m['钢'])

0.0006299685015749209


In [26]:
# The prediction set may contain characters that never occur in the training
# set, so the per-character frequencies are Laplace-smoothed.
def LaplaceSmooth(char, frequency_list, total, alpha=1.0):
    """Return the Laplace-smoothed frequency of ``char``.

    Args:
        char: The character to look up.
        frequency_list: Mapping from character to its relative frequency
            (occurrences divided by ``total``).
        total: The count the stored frequencies were normalised by.
        alpha: Smoothing strength added to every count.

    Returns:
        ``(count + alpha) / (total + distinct_chars * alpha)``, where
        ``count`` is the occurrence count recovered from the stored frequency
        and ``distinct_chars`` is the number of keys in ``frequency_list``.
    """
    # Use .get() instead of indexing: the frequency tables are defaultdicts,
    # and indexing an unseen character would insert it, growing the table
    # (and therefore distinct_chars) as a side effect of every lookup.
    count = frequency_list.get(char, 0) * total
    distinct_chars = len(frequency_list)
    return (count + alpha) / (total + distinct_chars * alpha)

In [27]:
# Starting score for each gender: the log prior plus the sum of
# log(1 - frequency) over every character in that gender's table.
# Note: these terms use the raw (unsmoothed) stored frequencies.
male_share = train['gender'].mean()  # mean of the 0/1 gender column

base_f = math.log(1 - male_share) + sum(
    math.log(1 - freq) for freq in frequency_list_f.values()
)
base_m = math.log(male_share) + sum(
    math.log(1 - freq) for freq in frequency_list_m.values()
)

bases = {'f': base_f, 'm': base_m}

In [28]:
def GetLogProb(char, frequency_list, total):
    """Return log(p) - log(1 - p) for the smoothed frequency p of ``char``.

    ``p`` is obtained from ``LaplaceSmooth`` with its default ``alpha``.
    """
    p = LaplaceSmooth(char, frequency_list, total)
    log_present = math.log(p)
    log_absent = math.log(1 - p)
    return log_present - log_absent

In [29]:
def ComputeLogProb(name, bases, totals, frequency_list_m, frequency_list_f):
    """Score ``name`` under the male and the female model.

    Each score starts from the matching entry of ``bases`` ('m' / 'f') and
    adds ``GetLogProb`` for every character of ``name``, in order.

    Returns:
        A dict with the keys 'male' and 'female' holding the two scores.
    """
    scores = {'male': bases['m'], 'female': bases['f']}
    for ch in name:
        scores['male'] += GetLogProb(ch, frequency_list_m, totals['m'])
        scores['female'] += GetLogProb(ch, frequency_list_f, totals['f'])
    return scores

def GetGender(LogProbs):
    """Return True when the 'male' score is strictly greater than 'female'.

    A tie therefore yields False.
    """
    male_score, female_score = LogProbs['male'], LogProbs['female']
    return male_score > female_score

# Predict every test name: 1 when the male score wins, otherwise 0.
result = [
    int(GetGender(
        ComputeLogProb(name, bases, totals, frequency_list_m, frequency_list_f)
    ))
    for name in test['name']
]

# Write the predictions into the submission template and save it.
submit['gender'] = result

submit.to_csv('my_NB_prediction.csv', index=False)

In [37]:
# Sanity check on a single name; the True/False from GetGender is shown as 1/0.
LogProbs = ComputeLogProb('薛婷婷', bases, totals, frequency_list_m, frequency_list_f)
gender = GetGender(LogProbs)
int(gender)

0

In [13]:
# Attach the predictions to the test frame as a 'pred' column and preview
# the first five rows.
test['pred'] = result
test.head(5)

Unnamed: 0,id,name,pred
0,0,辰君,0
1,1,佳遥,0
2,2,淼剑,1
3,3,浩苳,1
4,4,俪妍,0
