In [1]:
import pandas as pd 
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report
from sklearn.pipeline import Pipeline

In [2]:
def count_pipeline(X_train, X_test, y_train, y_test):
    lr = Pipeline([('vect', CountVectorizer()),
                    ('tfidf', TfidfTransformer()),
                    ('clf', MultinomialNB()),
                ]).fit(X_train, y_train)
    
    test_pred = lr.predict(X_test)

    print(confusion_matrix(y_test, test_pred))
    return accuracy_score(y_test, test_pred)

In [3]:
df = pd.read_csv('../../data/nama-gender-split.csv')
X_train, X_test, y_train, y_test = train_test_split(df['nama'], df['gender'], test_size=0.3)
cv_fml = count_pipeline(X_train, X_test, y_train, y_test)

[[79211 11166]
 [ 3188 93708]]


In [11]:
df = pd.read_csv('../../data/nama-gender-split.csv')
df = df.dropna()
X_train, X_test, y_train, y_test = train_test_split(df['nama_awal'], df['gender'], test_size=0.3)
cv_f = count_pipeline(X_train, X_test, y_train, y_test)

[[33987  8083]
 [ 1965 45771]]


In [12]:
X_train, X_test, y_train, y_test = train_test_split(df['nama_tengah'], df['gender'], test_size=0.3)
cv_m = count_pipeline(X_train, X_test, y_train, y_test)

[[31682 10370]
 [ 3933 43821]]


In [13]:
X_train, X_test, y_train, y_test = train_test_split(df['nama_akhir'], df['gender'], test_size=0.3)
cv_l = count_pipeline(X_train, X_test, y_train, y_test)

[[27982 13957]
 [ 3712 44155]]


In [14]:
X_train, X_test, y_train, y_test = train_test_split(df['nama_awal'].str.cat(df['nama_tengah'], sep=' '), df['gender'], test_size=0.3)
cv_fm = count_pipeline(X_train, X_test, y_train, y_test)

[[39274  2928]
 [ 1486 46118]]


In [15]:
X_train, X_test, y_train, y_test = train_test_split(df['nama_awal'].str.cat(df['nama_akhir'], sep=' '), df['gender'], test_size=0.3)
cv_fl = count_pipeline(X_train, X_test, y_train, y_test)

[[38157  4032]
 [ 1569 46048]]


In [16]:
X_train, X_test, y_train, y_test = train_test_split(df['nama_tengah'].str.cat(df['nama_akhir'], sep=' '), df['gender'], test_size=0.3)
cv_ml = count_pipeline(X_train, X_test, y_train, y_test)

[[36917  5152]
 [ 2435 45302]]


In [17]:
from tabulate import tabulate

data = [
    ['Full Name', cv_fml],
    ['First Name', cv_f],
    ['Middle Name', cv_m],
    ['Last Name', cv_l],
    ['First & Middle Name', cv_fm],
    ['First & Last Name', cv_fl],
    ['Middle & Last Name', cv_ml],
]

headers = ['Name Components', 'Pipeline']
print(tabulate(data, headers=headers))

Name Components        Pipeline
-------------------  ----------
Full Name              0.923865
First Name             0.888114
Middle Name            0.840734
Last Name              0.803254
First & Middle Name    0.95085
First & Last Name      0.937632
Middle & Last Name     0.915518
