In [119]:
import subprocess
import re
import string
from collections import OrderedDict

import numpy as np
import pandas as pd

import svm
from sklearn import naive_bayes

In [138]:
def fit_naive_bayes_model(matrix, labels):
    """Estimate the parameters of a two-class naive Bayes model.

    Args:
        matrix: 2-D array with one row per sample and one column per feature.
        labels: 1-D array with one entry per row of ``matrix``. ``labels`` and
            ``1 - labels`` are used as the class-1 and class-0 sample masks,
            so the entries are expected to be 0 or 1.

    Returns:
        Tuple ``(phi_1, phi_0, phi_y)``: the add-one-smoothed share of each
        feature within class 1 and within class 0, and the mean of ``labels``.
    """
    features_by_sample = matrix.T
    n_features, _ = features_by_sample.shape

    def smoothed_share(sample_mask):
        # Zero the columns (samples) outside the class, then normalise the
        # per-feature totals with add-one smoothing.
        masked = features_by_sample * sample_mask
        return (masked.sum(axis=1) + 1) / (masked.sum() + n_features)

    class_1_share = smoothed_share(labels)
    class_0_share = smoothed_share(1 - labels)
    return class_1_share, class_0_share, labels.mean()

def predict_from_naive_bayes_model(model, matrix, class_0_scale=1.05):
    """Classify samples with a ``(phi_1, phi_0, phi_y)`` naive Bayes model.

    For every sample the feature-weighted log-parameters of each class are
    summed and the log class prior is added. A sample is predicted as class 1
    when its class-1 score is strictly greater than its class-0 score
    multiplied by ``class_0_scale``.

    Args:
        model: tuple ``(phi_1, phi_0, phi_y)`` as returned by
            ``fit_naive_bayes_model``.
        matrix: 2-D array with one row per sample and one column per feature;
            the column count must equal ``phi_1.shape[0]``.
        class_0_scale: multiplier applied to the class-0 score before the
            comparison. The default 1.05 is the value that used to be
            hard-coded. When the class-0 score is negative (as it is for
            log-probabilities with non-negative features), a value above 1
            lowers it further and so tilts the decision toward class 1;
            1.0 gives the unbiased comparison.

    Returns:
        1-D boolean array, ``True`` where the sample is predicted as class 1.
    """
    phi_1, phi_0, phi_y = model
    features_by_sample = matrix.T

    k = phi_1.shape[0]
    # Column vectors so they broadcast across the sample axis.
    log_phi_1 = np.log(phi_1).reshape(k, 1)
    log_phi_0 = np.log(phi_0).reshape(k, 1)

    score_1 = (features_by_sample * log_phi_1).sum(axis=0) + np.log(phi_y)
    score_0 = (features_by_sample * log_phi_0).sum(axis=0) + np.log(1 - phi_y)
    return score_1 > score_0 * class_0_scale

In [121]:
def normalize_name(name):
    """Reduce a name or username to a canonical lowercase spelling.

    Steps, in order:
      1. Lowercase the input and keep only runs of ``a-z`` and ``0``,
         joining the runs with ``_``.
      2. Apply the spelling rewrites below, one after another, so later
         rules see the output of earlier ones.
      3. Collapse every run of two or more identical letters to one letter.

    Args:
        name: string to normalise.

    Returns:
        The normalised string; empty when the input has no ``a-z``/``0``
        characters.
    """
    rewrite_rules = (
        ('iy', 'i'),
        ('ei', 'i'),
        ('ey', 'ei'),
        ('q', 'gh'),
        ('oo', 'u'),
        ('ou', 'u'),
        ('u', 'o'),
        ('y', 'i'),
        ('0', 'o'),
    )

    pieces = re.findall('[a-z0]+', name.lower())
    normalized = '_'.join(pieces)
    for needle, replacement in rewrite_rules:
        normalized = normalized.replace(needle, replacement)

    # One alternative per letter: "a{2,}|b{2,}|...|z{2,}".
    per_letter_runs = '|'.join('%s{2,}' % ch for ch in string.ascii_lowercase)
    repeated_letters = '({})'.format(per_letter_runs)

    def first_letter(match):
        return match.group(1)[0]

    return re.sub(repeated_letters, first_letter, normalized)

In [122]:
# Lookup from the 'pronounce' column of data/final.csv to its 'gender' value.
finglish_to_gender = OrderedDict(
    pd.read_csv('data/final.csv')
    .set_index('pronounce')['gender']
    .to_dict()
)
# Normalised spelling of every pronunciation key, in dictionary order.
names = [normalize_name(pronounce) for pronounce in finglish_to_gender]
chars = string.ascii_lowercase

In [123]:
def transform_usernames(usernames):
    """Build the username-by-name feature matrix using the C++ helper.

    ``calculate_matrix.cpp`` is compiled with ``g++`` and the resulting
    binary is run once. It receives on stdin the matrix dimensions, the
    normalised reference ``names`` and the normalised usernames. Its stdout
    is parsed as one line of whitespace-separated integers per username
    whose normalised form is non-empty, one integer per reference name.

    Args:
        usernames: sequence of raw username strings.

    Returns:
        ``numpy`` array of shape ``(len(usernames), len(names))``. Entry
        ``[i, j]`` is ``3 - value`` when the helper's value for username
        ``i`` and name ``j`` is at most 3, otherwise 0. Rows of usernames
        that normalise to the empty string stay all-zero.

    Raises:
        subprocess.CalledProcessError: if the compilation or the helper
            exits with a non-zero status.
        AssertionError: if a helper value of 0 does not coincide with the
            name being a substring of the normalised username.
    """
    matrix = np.zeros((len(usernames), len(names)))
    normalized_usernames = list(map(normalize_name, usernames))

    # Fail loudly on a broken build instead of silently running a stale or
    # missing binary.
    subprocess.check_call(['g++', '-std=c++14', 'calculate_matrix.cpp', '-o', 'calculate_matrix'])

    data_to_send = '\n'.join([
        f'{len(usernames)} {len(names)}',
        ' '.join(names),
        '\n'.join(normalized_usernames),
    ])
    # Run the binary directly: an argv list needs no shell, and check=True
    # surfaces a crashed helper rather than parsing its truncated output.
    completed = subprocess.run(
        ['./calculate_matrix'],
        input=data_to_send.encode('UTF-8'),
        stdout=subprocess.PIPE,
        check=True,
    )
    result = completed.stdout.decode('UTF-8').split('\n')

    # Output lines are consumed only for non-empty normalised usernames, so
    # the output row index can lag behind the matrix row index.
    output_row = -1
    for i, username in enumerate(normalized_usernames):
        if not username:
            continue
        output_row += 1
        values = list(map(int, result[output_row].split()))
        for j, (name, value) in enumerate(zip(names, values)):
            # Sanity check on the helper: value 0 <=> exact substring match.
            assert (value == 0) == (name in username), (name, usernames[i], username, value, i)
            # Smaller helper values score higher; anything above 3 scores 0.
            matrix[i][j] = (3 - value) if value <= 3 else 0

    return matrix

In [124]:
def get_matrix_and_labels(data_path, return_csv=False):
    """Load a labelled CSV and turn it into a feature matrix and label vector.

    The CSV must have ``username`` and ``gender`` columns. ``gender`` is
    recoded in the loaded frame to 1 where it equals ``'e'`` and 0 otherwise.

    Args:
        data_path: path of the CSV file to read.
        return_csv: when true, also return the loaded (recoded) DataFrame.

    Returns:
        ``(matrix, labels)``, or ``(matrix, labels, dataframe)`` when
        ``return_csv`` is true. ``matrix`` comes from ``transform_usernames``
        and ``labels`` is a 1-D ``numpy`` array of the recoded genders.
    """
    frame = pd.read_csv(data_path)

    def encode_gender(raw_value):
        if raw_value == 'e':
            return 1
        return 0

    frame['gender'] = frame['gender'].transform(encode_gender)
    label_vector = np.array(frame['gender'].to_list())
    feature_matrix = transform_usernames(frame['username'].to_list())

    if not return_csv:
        return feature_matrix, label_vector
    return feature_matrix, label_vector, frame

In [125]:
# Features and labels used to fit the model.
matrix, labels = get_matrix_and_labels('data/labeled_people.csv')
# Features, labels and the recoded DataFrame that test_model evaluates against.
valid_matrix, valid_labels, valid_people_data = get_matrix_and_labels('data/test_people.csv', return_csv=True)

In [126]:
# (phi_1, phi_0, phi_y) parameters estimated from data/labeled_people.csv.
model = fit_naive_bayes_model(matrix, labels)

In [145]:
def test_model(model):
    """Evaluate ``model`` on the validation data and export its positives.

    Prints every validation username whose prediction differs from its
    label (raw username, normalised username, prediction, label), then the
    number of predicted positives and of labelled positives. The rows of
    ``data/test_people.csv`` predicted as 1 are written to
    ``data/certain_people.csv``.

    Reads the module-level ``valid_matrix``, ``valid_labels`` and
    ``valid_people_data``.

    Args:
        model: ``(phi_1, phi_0, phi_y)`` tuple accepted by
            ``predict_from_naive_bayes_model``.

    Returns:
        Fraction of validation samples that are labelled 0 or predicted 1.
        This only counts missed positives as errors; it is not plain
        accuracy.
    """
    predict_labels = predict_from_naive_bayes_model(model, valid_matrix)
    usernames = valid_people_data['username'].to_list()

    for predict_label, valid_label, username in zip(predict_labels, valid_labels, usernames):
        if predict_label != valid_label:
            print(username, normalize_name(username), predict_label, valid_label)
    print((predict_labels == 1).sum())
    print((valid_labels == 1).sum())

    # The raw file is re-read because valid_people_data has its 'gender'
    # column recoded to 0/1 by get_matrix_and_labels.
    data = pd.read_csv('data/test_people.csv')
    # Let pandas own the output handle: pushing the to_csv() string through a
    # text-mode file translates line endings a second time on Windows and
    # depends on the locale encoding. Reading first also avoids truncating
    # the output file when the input cannot be loaded.
    data[predict_labels == 1].to_csv('data/certain_people.csv', index=False)

    return ((valid_labels == 0) | (predict_labels == 1)).mean()

In [146]:
# test_model returns a fraction in [0, 1]; scale it to a percentage for display.
print('naive bayes prediction:', test_model(model) * 100, '%')

mahshammarket mahshamarket False 1
asal_sly asal_sli True 0
setaesh.setareh setaesh_setareh True 0
food_network_by_mili fod_network_bi_mili False 1
gallery_atr_buckingham galeri_atr_bockingham False 1
butane_rastaghi botane_rastaghi False 1
majid.miranii majid_mirani True 0
offroad.shopp ofroad_shop False 1
alimobram____ alimobram True 0
hadi_ebrahimibastami hadi_ebrahimibastami True 0
amirmojiri4 amirmojiri True 0
bestboy.parishilton bestboi_parishilton False 1
yvesmalki ivesmalki True 0
giordano.matty giordano_mati True 0
mikeleathers mikeleathers False 1
peter_casio peter_casio False 1
ed.sherlock.7 ed_sherlock False 1
johnaerys johnaeris True 0
proudfelon prodfelon False 1
snowwhiteondablock snowhiteondablock True 0
nikki_tkv niki_tkv True 0
jamminirene jaminirene True 0
ashleebhypnotherapy ashlebhipnotherapi True 0
scaramellodeassis50 scaramelodeasis_o True 0
laylobrwn lailobrwn True 0
joseph_desanto_jr joseph_desanto_jr True 0
nafise68.nr nafise_nr True 0
mh_hosseynii mh_hoseini 