In [15]:
import json
import subprocess
import re
import string
from collections import OrderedDict

import numpy as np
import pandas as pd

In [2]:
# Ordered (substring, replacement) pairs. They are applied one after another,
# so the order is significant: e.g. 'ey' -> 'ei' has to run before 'ei' -> 'i',
# and 'oo'/'ou' -> 'u' before 'u' -> 'o'.
_NAME_REPLACEMENTS = (
    ('ee', 'i'),
    ('iy', 'i'),
    ('ey', 'ei'),
    ('ei', 'i'),
    ('q', 'gh'),
    ('oo', 'u'),
    ('ou', 'u'),
    ('u', 'o'),
    ('y', 'i'),
    ('0', 'o'),
    ('w', 'v'),
)
# Runs made only of lowercase ASCII letters and the digit zero.
_NAME_CHUNK_RE = re.compile('[a-z0]+')
# A lowercase ASCII letter immediately repeated one or more times.
_REPEATED_LETTER_RE = re.compile(r'([a-z])\1+')


def normalize_name(name):
    """Reduce a name or username to a canonical lowercase spelling.

    Steps, in order:
      1. Lowercase the input and keep only the runs of ``a-z`` / ``0``,
         joined with ``'_'`` (every other character acts as a separator).
      2. Apply the substring replacements in ``_NAME_REPLACEMENTS``
         sequentially.
      3. Collapse every run of a repeated letter to a single letter
         (underscores are not collapsed).

    Args:
        name: the string to normalize.

    Returns:
        The normalized string; ``''`` when the input holds no ``a-z``/``0``
        characters.
    """
    name = '_'.join(_NAME_CHUNK_RE.findall(name.lower()))
    for from_str, to_str in _NAME_REPLACEMENTS:
        name = name.replace(from_str, to_str)
    # The patterns are compiled once at import time instead of being rebuilt
    # from a 26-way alternation on every call.
    return _REPEATED_LETTER_RE.sub(r'\1', name)

In [3]:
# Lookup from the CSV's 'pronounce' column to its 'gender' column.
finglish_to_gender = OrderedDict(
    pd.read_csv('data/final_with_e.csv')
    .set_index('pronounce')['gender']
    .to_dict()
)
# names[j] is the normalized spelling of the j-th key of finglish_to_gender.
names = [normalize_name(pronounce) for pronounce in finglish_to_gender]

In [4]:
def transform_usernames(usernames, threshold=0.8):
    """Score every username against every reference name in ``names``.

    Each username is normalized with ``normalize_name``. The module-level
    ``names`` list and the normalized usernames are then piped to the external
    ``calculate_max_substring`` program (compiled here from
    ``calculate_max_substring.cpp``), whose stdout is parsed as one line of
    ``len(names)`` whitespace-separated floats per non-empty username.
    The scoring itself lives in the C++ helper - presumably a
    longest-common-substring ratio, judging by its name; confirm against
    ``calculate_max_substring.cpp``.

    Args:
        usernames: sequence of raw username strings.
        threshold: scores that are not strictly greater than this value are
            stored as 0. Defaults to 0.8, the previously hard-coded cut-off.

    Returns:
        ``numpy.ndarray`` of shape ``(len(usernames), len(names))``. Rows of
        usernames that normalize to an empty string stay all-zero.

    Raises:
        subprocess.CalledProcessError: if compilation or the helper program
            exits with a non-zero status.
        AssertionError: if an output line does not hold ``len(names)`` values.
    """
    normalized_usernames = list(map(normalize_name, usernames))
    matrix = np.zeros((len(usernames), len(names)))

    # check=True: a failed build must stop here rather than letting a stale or
    # missing binary run. (An alternative helper, calculate_matrix.cpp, was
    # previously wired in at this point.)
    subprocess.run(
        ['g++', '-std=c++14', 'calculate_max_substring.cpp', '-o', 'calculate_max_substring'],
        check=True,
    )

    # NOTE(review): the header announces len(usernames) entries, yet usernames
    # that normalize to '' contribute only an empty line. The parsing below
    # assumes the helper prints no line for them - confirm against the C++
    # source before changing this protocol.
    data_to_send = '\n'.join([
        f'{len(usernames)} {len(names)}',
        ' '.join(names),
        '\n'.join(normalized_usernames),
    ])
    # An argv list is executed directly; combining a list with shell=True would
    # hand only the first item to the shell as the command string.
    completed = subprocess.run(
        ['./calculate_max_substring'],
        input=data_to_send.encode('UTF-8'),
        stdout=subprocess.PIPE,
        check=True,
    )
    result = completed.stdout.decode('UTF-8').split('\n')

    # Output lines are consumed only for non-empty normalized usernames, so the
    # output cursor advances independently of the matrix row index.
    cnt = -1
    for i, username in enumerate(normalized_usernames):
        if not username:
            continue
        cnt += 1
        values = np.array([float(token) for token in result[cnt].split()])
        assert len(values) == len(names), (len(values), len(names))
        # Keep only confident scores; everything else becomes 0.
        matrix[i] = np.where(values > threshold, values, 0)
    return matrix

In [5]:
def get_matrix_and_labels(data_path, return_csv=False, extra_usernames=None, extra_labels=None):
    """Load a labeled CSV and build its filtered feature matrix and labels.

    The CSV at ``data_path`` must provide ``username`` and ``gender`` columns.
    ``extra_usernames`` / ``extra_labels`` are appended after the CSV rows
    before the usernames are passed to ``transform_usernames``. Only rows
    whose scores sum to at least 1 are kept.

    Args:
        data_path: path of the CSV file to read.
        return_csv: when true, also return the CSV rows that survived the
            filter (the extra entries are not part of that frame).
        extra_usernames: optional list of additional usernames.
        extra_labels: optional list of labels for ``extra_usernames``.

    Returns:
        ``(matrix, labels)``, or ``(matrix, labels, labeled)`` when
        ``return_csv`` is true.
    """
    extra_labels = [] if extra_labels is None else extra_labels
    extra_usernames = [] if extra_usernames is None else extra_usernames

    frame = pd.read_csv(data_path)
    csv_usernames = frame['username'].to_list()
    all_labels = np.array(frame['gender'].to_list() + extra_labels)

    scores = transform_usernames(csv_usernames + extra_usernames)
    # Boolean mask over all rows (CSV rows first, then the extras).
    keep = scores.sum(axis=1) >= 1

    # The leading len(frame) mask entries belong to the CSV rows.
    kept_frame = frame[keep[:len(frame)]]
    kept_scores = scores[keep]
    kept_labels = all_labels[keep]

    if not return_csv:
        return kept_scores, kept_labels
    return kept_scores, kept_labels, kept_frame

In [6]:
# Training data: the labeled people plus every reference name itself, appended
# as an extra row with its known gender. `people_data` holds only the CSV rows
# that survived filtering, so it has fewer rows than `matrix`.
matrix, labels, people_data = get_matrix_and_labels(
    'data/labeled_people.csv',
    return_csv=True,
    extra_usernames=list(finglish_to_gender),
    extra_labels=list(finglish_to_gender.values()),
)

# Second dataset, used further down to evaluate the fitted model.
valid_matrix, valid_labels, valid_people_data = get_matrix_and_labels(
    'data/test_people.csv',
    return_csv=True,
)

0
0.114299
0.228598
0.342896
0.457195
0.571494
0.685793
0.800091
0.91439
0


In [7]:
from sklearn.svm import SVC

# NOTE: despite its name, `ridge_model` is a support-vector classifier (SVC).
# fit() returns the estimator, so construction and fitting are chained; the
# bare name on the last line keeps the estimator repr as the cell output.
ridge_model = SVC(max_iter=-1, tol=1e-6).fit(matrix, labels)
ridge_model

SVC(tol=1e-06)

In [8]:
# Predict the validation set and list every misclassified row as
# "<username> <predicted> <actual>".
ridge_predict = ridge_model.predict(valid_matrix)
actual_genders = valid_people_data['gender'].to_list()
# Iterating the columns directly avoids building a Series per row with
# iterrows(); the rows of valid_people_data line up with valid_matrix.
for username, predicted, actual in zip(valid_people_data['username'], ridge_predict, actual_genders):
    if predicted != actual:
        print(username, predicted, actual)

# Share of exact matches, shown as a percentage.
print('ridge prediction:', (ridge_predict == actual_genders).mean() * 100, '%')


mahshammarket f e
esmail_1361s f m
vahideh84 m f
butane_rastaghi m e
johnaerys m f
_niilas_ f e
yaghout_zhikaal.sh m e
jahanbakhsh9979 f m
adafatemeh f e
mortezashiri.marketing f m
nouribime f e
majidnamakinooshabadi f m
ghajarirestaurant f e
khosromehri1796 f m
kiyan_red864 m e
porotoenamirabbass m e
nana_food_00 f e
seyed__akef m e
hanima9494 f e
lil.ifood e f
yare_mehraban__ f e
dr.roghayeh.valipour m f
faraziin.fan.page e f
sorry__yaar____ f m
mezonshayesteh_ f e
arezoo_arayeshi f e
mrs_sr.77 m f
micro.atefeh.jafari f e
n.a.afshar m e
mahdiesmailzadeh5 f m
sirvani.m13 m f
realnimataheri f m
mehr_.bt f e
miakouroshcandy m f
ba_namee m e
a_r_sarem m e
kalimdor_ir m e
eskandar.1977 f m
mahdisafari530 f m
mehradraders_9706 f m
asalsabalan_meshkin f e
kazemkr7777 f m
foroozan_hassan f m
somayeh_.javid m f
khywmrth1266 m e
noroozy2343 m e
qais__khawarin m e
ansaripage e m
z.javanmard1339 f e
mehdinaghash1360 f m
zoya____eshghe___perspolis e f
atr_yadavar m e
ebisoli59 m f
telma_beauty f 

In [9]:
# predict_proba = ridge_model.predict_proba(valid_matrix)

In [10]:
# print(len(predict_proba))
# sure = [i for i, x in enumerate(predict_proba) if max(x) > 0.8]
# print(len(sure))
# print('ridge prediction:', (ridge_predict[sure] == np.array(valid_people_data['gender'].to_list())[sure]).mean() * 100, '%')

In [11]:
# Per-row total of the thresholded scores in the training matrix.
x = matrix.sum(axis=1)
zero_rows = int(np.count_nonzero(x == 0))
print(matrix.shape, x.shape, x.mean(), np.median(x), np.var(x), zero_rows)

# `matrix` also contains the reference-name rows appended after the labeled
# people, whereas `people_data` only holds the labeled rows. The arg-max is
# therefore restricted to the leading len(people_data) rows; an index from the
# appended part would be out of range for people_data.iloc.
labeled_row_sums = x[:len(people_data)]
print(people_data.iloc[labeled_row_sums.argmax()])

(6604, 2252) (6604,) 1.4025861176559662 1.0 1.0518792856647272 0
Unnamed: 0                                                    3730
id                                                      3046562345
profile_pic_url    https://api.kaftaar.ir/media/user_feed/dYqNa1Tz
username                                                mehranabou
gender                                                           f
Name: 3730, dtype: object


In [12]:
# e_matrix = matrix[people_data['gender'] == 'e']
# x = e_matrix.sum(axis=1)
# print(x.mean(), np.median(x), np.var(x), e_matrix.shape)
# values = sorted(x.tolist())
# for idx in x.argsort()[-400:]:
#     row = people_data[people_data['gender'] == 'e'].iloc[idx]
#     print(row['username'], ':')
#     transformed = e_matrix[idx]
#     for j, name in enumerate(names):
#         if transformed[j] > 0:
#             print(transformed[j], name)
#     # print(row)
#     from IPython.display import Image, display
#     display(Image(filename=f'data/pictures/{row["profile_pic_url"].split("/")[-1]}.jpg'))

In [31]:
# Features, labels and surviving CSV rows for the picture-used dataset
# (no extra reference rows are appended here).
picture_used_matrix, picture_used_labels, picture_used_data = get_matrix_and_labels(
    'data/picture-used-data.csv',
    return_csv=True,
)

0


In [38]:
# Predict the picture-used dataset and print one line per row:
# "<predicted> <actual> <username> <verdict>".
picture_used_predict = ridge_model.predict(picture_used_matrix)
for predicted, actual, username in zip(
    picture_used_predict, picture_used_labels, picture_used_data['username']
):
    verdict = 'ok!' if predicted == actual else 'wrong!'
    print(predicted, actual, username, verdict)

# The mean of the element-wise match mask is the share of correct predictions,
# so it is reported as accuracy (it used to be mislabelled 'difference').
print('accuracy:', (picture_used_predict == picture_used_labels).mean())

f f saba.yahyaiee ok!
m m am.irhosin5232 ok!
f f about.saghar ok!
m e s.ziamirhoseini wrong!
m m iman_arra ok!
m m mhdheydarii ok!
m m _mohsen_0 ok!
f e asanseminar wrong!
f f _hanie.j_ ok!
m m mahdi_ara ok!
e m barbad_mokia wrong!
m f mrs_zabihollahzade wrong!
f f bahar.jmohammadi ok!
m f hassanzadeh.coach wrong!
m m mr.emadii ok!
f f mahdie.eilaky ok!
m m alibagheriinsta ok!
f e danestaniaw wrong!
f e atieh.computer wrong!
f m bahaminst wrong!
f f manager_zahrra ok!
f f zahraasadii.ir ok!
e e eiffelcloth ok!
e f star_food_blogger wrong!
m m aradweb.ir ok!
m e mr.amoozsh wrong!
f e adminoruji wrong!
f e marzieh972 wrong!
f f _golemaryam_ ok!
m m nimakhosravi.ir ok!
m m mohammad.tghz ok!
f m monaz.me wrong!
f e h.sara04 wrong!
f m saleh_gh95 wrong!
f f niloofar_sekhavati ok!
m m hesambahman ok!
f f maryam_mhmdi11 ok!
f e solmaz_eftekhari wrong!
m m siavashrahbar ok!
m f nazli_akmali wrong!
f f leila12_e18 ok!
m m arashniiknam ok!
m m mostafa.bagheri71 ok!
f f mona_dolati ok!
m m nasrmo