In [2]:
import pandas as pd 
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report

In [3]:
def count_tfidf(X, y, test_size=0.3, random_state=42, max_iter=1000):
    """Fit logistic regression on a hold-out split and return test accuracy.

    The data is split once into train and test partitions, a
    ``LogisticRegression`` model is fitted on the training part, and the
    confusion matrix of the test predictions is printed as a side effect
    (scikit-learn convention: rows are true labels, columns are predictions).

    Parameters
    ----------
    X : feature matrix accepted by scikit-learn estimators
        In this notebook the callers pass the output of
        ``TfidfVectorizer().fit_transform(...)``.
    y : array-like of labels, aligned row-for-row with ``X``.
    test_size : float or int, default 0.3
        Forwarded to ``train_test_split``.
    random_state : int or None, default 42
        Seed for the shuffle performed by ``train_test_split``.
    max_iter : int, default 1000
        Iteration cap for the logistic-regression solver.

    Returns
    -------
    float
        Accuracy of the fitted model on the held-out test partition.

    Notes
    -----
    This function only splits data that is already vectorised. If the
    vectoriser was fitted on the full data set (as the callers in this
    notebook do), its vocabulary and IDF weights have already seen the test
    rows, so the reported accuracy may be slightly optimistic.
    """
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, random_state=random_state
    )

    lr = LogisticRegression(max_iter=max_iter)
    lr.fit(X_train, y_train)
    test_pred = lr.predict(X_test)

    print(confusion_matrix(y_test, test_pred))
    return accuracy_score(y_test, test_pred)

In [4]:
# Baseline run: TF-IDF features built from the complete name ('nama'),
# with the 'gender' column as the classification target.
df = pd.read_csv('../../data/nama-gender-split.csv')
tf_fml = count_tfidf(
    X=TfidfVectorizer().fit_transform(df['nama']),
    y=df['gender'],
)

[[85852  4949]
 [10180 86292]]


In [5]:
# Reload the CSV and drop every row that has a missing value; the following
# cells reuse this filtered `df`. Then score the first-name column on its own.
df = pd.read_csv('../../data/nama-gender-split.csv').dropna()
tf_f = count_tfidf(
    X=TfidfVectorizer().fit_transform(df['nama_awal']),
    y=df['gender'],
)

[[40209  2037]
 [ 7981 39579]]


In [6]:
# Middle-name column alone as the text input.
tf_m = count_tfidf(
    X=TfidfVectorizer().fit_transform(df['nama_tengah']),
    y=df['gender'],
)

[[32124 10122]
 [ 4169 43391]]


In [7]:
# Last-name column alone as the text input.
tf_l = count_tfidf(
    X=TfidfVectorizer().fit_transform(df['nama_akhir']),
    y=df['gender'],
)

[[28099 14147]
 [ 3463 44097]]


In [8]:
# First and middle name, joined by a single space, as the text input.
tf_fm = count_tfidf(
    X=TfidfVectorizer().fit_transform(
        df['nama_awal'].str.cat(df['nama_tengah'], sep=' ')
    ),
    y=df['gender'],
)

[[40468  1778]
 [ 3227 44333]]


In [9]:
# First and last name, joined by a single space, as the text input.
tf_fl = count_tfidf(
    X=TfidfVectorizer().fit_transform(
        df['nama_awal'].str.cat(df['nama_akhir'], sep=' ')
    ),
    y=df['gender'],
)

[[39183  3063]
 [ 3342 44218]]


In [10]:
# Middle and last name, joined by a single space, as the text input.
tf_ml = count_tfidf(
    X=TfidfVectorizer().fit_transform(
        df['nama_tengah'].str.cat(df['nama_akhir'], sep=' ')
    ),
    y=df['gender'],
)

[[36380  5866]
 [ 2428 45132]]


In [14]:
from tabulate import tabulate

# Summary table: one row per name-component combination, paired with the
# accuracy returned by count_tfidf for that experiment.
headers = ['Name Components', 'TF-IDF']

data = [
    [label, accuracy]
    for label, accuracy in (
        ('Full Name', tf_fml),
        ('First Name', tf_f),
        ('Middle Name', tf_m),
        ('Last Name', tf_l),
        ('First & Middle Name', tf_fm),
        ('First & Last Name', tf_fl),
        ('Middle & Last Name', tf_ml),
    )
]

print(tabulate(data, headers=headers))

Name Components        TF-IDF
-------------------  --------
Full Name            0.919214
First Name           0.888448
Middle Name          0.840868
Last Name            0.803911
First & Middle Name  0.944269
First & Last Name    0.92868
Middle & Last Name   0.907645
