In [1]:
import pandas as pd 
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report

In [2]:
def count_vectorizer(X_train, X_test, y_train, y_test):
    """Fit a character n-gram logistic-regression classifier and score it.

    The training texts are turned into character n-gram counts (n = 1..5),
    a LogisticRegression model is fitted on those counts, and the held-out
    texts are classified with it.

    Parameters
    ----------
    X_train, X_test : iterables of str
        Raw texts used for fitting and for evaluation respectively.
    y_train, y_test : array-like
        Labels aligned with ``X_train`` and ``X_test``.

    Returns
    -------
    float
        Accuracy of the predictions on ``X_test``.

    Side effects
    ------------
    Prints the confusion matrix of the test predictions to stdout.
    """
    # The vocabulary is learned from the training texts only; the test texts
    # are merely projected onto it.
    ngram_counter = CountVectorizer(analyzer='char', ngram_range=(1, 5))
    train_counts = ngram_counter.fit_transform(X_train)
    test_counts = ngram_counter.transform(X_test)

    # max_iter is raised above the library default to give the solver more
    # iterations on this feature space.
    classifier = LogisticRegression(max_iter=1000).fit(train_counts, y_train)
    predicted = classifier.predict(test_counts)

    print(confusion_matrix(y_test, predicted))
    return accuracy_score(y_test, predicted)

In [3]:
# Experiment: predict gender from the complete name string ('nama').
# NOTE: rows with missing values are NOT dropped in this cell, so this split
# may contain more rows than the per-component experiments that run on the
# dropna()'d frame.
df = pd.read_csv('../../data/nama-gender-split.csv')

# Fixed random_state so the 70/30 split (and therefore the reported accuracy)
# is reproducible across kernel restarts.
X_train, X_test, y_train, y_test = train_test_split(
    df['nama'], df['gender'], test_size=0.3, random_state=42
)
cv_fml = count_vectorizer(X_train, X_test, y_train, y_test)

[[87010  3643]
 [ 3112 93508]]


In [4]:
# Experiment: predict gender from the first name only ('nama_awal').
# Reload the CSV and drop every row that has a missing value in any column,
# so the name-part columns read from `df` below contain no NaN.
df = pd.read_csv('../../data/nama-gender-split.csv').dropna()

# Fixed random_state so the 70/30 split (and therefore the reported accuracy)
# is reproducible across kernel restarts.
X_train, X_test, y_train, y_test = train_test_split(
    df['nama_awal'], df['gender'], test_size=0.3, random_state=42
)
cv_f = count_vectorizer(X_train, X_test, y_train, y_test)

[[39001  3203]
 [ 2864 44738]]


In [5]:
# Experiment: predict gender from the middle name only ('nama_tengah').
# Fixed random_state so the 70/30 split (and therefore the reported accuracy)
# is reproducible across kernel restarts.
X_train, X_test, y_train, y_test = train_test_split(
    df['nama_tengah'], df['gender'], test_size=0.3, random_state=42
)
cv_m = count_vectorizer(X_train, X_test, y_train, y_test)

[[37069  5073]
 [ 5373 42291]]


In [6]:
# Experiment: predict gender from the last name only ('nama_akhir').
# Fixed random_state so the 70/30 split (and therefore the reported accuracy)
# is reproducible across kernel restarts.
X_train, X_test, y_train, y_test = train_test_split(
    df['nama_akhir'], df['gender'], test_size=0.3, random_state=42
)
cv_l = count_vectorizer(X_train, X_test, y_train, y_test)

[[32588  9652]
 [ 4862 42704]]


In [7]:
# Experiment: predict gender from first + middle name joined by a space.
first_middle = df['nama_awal'].str.cat(df['nama_tengah'], sep=' ')

# Fixed random_state so the 70/30 split (and therefore the reported accuracy)
# is reproducible across kernel restarts.
X_train, X_test, y_train, y_test = train_test_split(
    first_middle, df['gender'], test_size=0.3, random_state=42
)
cv_fm = count_vectorizer(X_train, X_test, y_train, y_test)

[[40638  1654]
 [ 1494 46020]]


In [8]:
# Experiment: predict gender from first + last name joined by a space.
first_last = df['nama_awal'].str.cat(df['nama_akhir'], sep=' ')

# Fixed random_state so the 70/30 split (and therefore the reported accuracy)
# is reproducible across kernel restarts.
X_train, X_test, y_train, y_test = train_test_split(
    first_last, df['gender'], test_size=0.3, random_state=42
)
cv_fl = count_vectorizer(X_train, X_test, y_train, y_test)

[[40170  1869]
 [ 1656 46111]]


In [9]:
# Experiment: predict gender from middle + last name joined by a space.
middle_last = df['nama_tengah'].str.cat(df['nama_akhir'], sep=' ')

# Fixed random_state so the 70/30 split (and therefore the reported accuracy)
# is reproducible across kernel restarts.
X_train, X_test, y_train, y_test = train_test_split(
    middle_last, df['gender'], test_size=0.3, random_state=42
)
cv_ml = count_vectorizer(X_train, X_test, y_train, y_test)

[[38998  3083]
 [ 2544 45181]]


In [10]:
from tabulate import tabulate

# Column titles for the summary table.
headers = ['Name Components', 'Count Vectorizer']

# One [label, accuracy] row per experiment, in the order the experiments
# were run above.
data = [
    [label, accuracy]
    for label, accuracy in (
        ('Full Name', cv_fml),
        ('First Name', cv_f),
        ('Middle Name', cv_m),
        ('Last Name', cv_l),
        ('First & Middle Name', cv_fm),
        ('First & Last Name', cv_fl),
        ('Middle & Last Name', cv_ml),
    )
]

print(tabulate(data, headers=headers))

Name Components        Count Vectorizer
-------------------  ------------------
Full Name                      0.96393
First Name                     0.932443
Middle Name                    0.883683
Last Name                      0.838385
First & Middle Name            0.964947
First & Last Name              0.960749
Middle & Last Name             0.937343
