In [15]:
import pandas as pd 
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report
from sklearn.pipeline import Pipeline

In [16]:
def count_pipeline(X_train, X_test, y_train, y_test):
    """Train a text classifier on one split and report how it does on the other.

    The model is a scikit-learn ``Pipeline`` with three stages, in order:
    ``CountVectorizer`` -> ``TfidfTransformer`` ->
    ``LogisticRegression(max_iter=1000)``.

    Parameters
    ----------
    X_train, X_test
        Training and test inputs, handed unchanged to the pipeline's
        ``fit`` and ``predict`` calls respectively.
    y_train, y_test
        Labels matching ``X_train`` and ``X_test``.

    Returns
    -------
    The value of ``accuracy_score(y_test, predictions)`` for the test split.

    Side effects
    ------------
    Prints the confusion matrix of the test-split predictions to stdout.
    """
    steps = [
        ('vect', CountVectorizer()),
        ('tfidf', TfidfTransformer()),
        ('clf', LogisticRegression(max_iter=1000)),
    ]
    model = Pipeline(steps)
    model.fit(X_train, y_train)

    predictions = model.predict(X_test)

    print(confusion_matrix(y_test, predictions))
    return accuracy_score(y_test, predictions)

In [17]:
# Full-name experiment: classify gender from the complete 'nama' column.
df = pd.read_csv('../../data/nama-gender-split.csv')
# NOTE(review): rows are not filtered with dropna() in this cell, so this score
# may be computed on a different set of rows than experiments that filter
# first - confirm that is intended.
# Fixed seed so the 70/30 split (and therefore the reported score) is
# reproducible across kernel restarts.
X_train, X_test, y_train, y_test = train_test_split(
    df['nama'], df['gender'], test_size=0.3, random_state=42
)
cv_fml = count_pipeline(X_train, X_test, y_train, y_test)

[[85408  4804]
 [10279 86782]]


In [18]:
# First-name experiment: reload the data and keep only rows with no missing
# values in any column before splitting.
df = pd.read_csv('../../data/nama-gender-split.csv')
df = df.dropna()
# Fixed seed so the 70/30 split (and therefore the reported score) is
# reproducible across kernel restarts.
X_train, X_test, y_train, y_test = train_test_split(
    df['nama_awal'], df['gender'], test_size=0.3, random_state=42
)
cv_f = count_pipeline(X_train, X_test, y_train, y_test)

[[40128  2063]
 [ 7936 39679]]


In [19]:
# Middle-name experiment: classify gender from the 'nama_tengah' column only.
# Fixed seed so the 70/30 split (and therefore the reported score) is
# reproducible across kernel restarts.
X_train, X_test, y_train, y_test = train_test_split(
    df['nama_tengah'], df['gender'], test_size=0.3, random_state=42
)
cv_m = count_pipeline(X_train, X_test, y_train, y_test)

[[31765 10467]
 [ 3872 43702]]


In [20]:
# Last-name experiment: classify gender from the 'nama_akhir' column only.
# Fixed seed so the 70/30 split (and therefore the reported score) is
# reproducible across kernel restarts.
X_train, X_test, y_train, y_test = train_test_split(
    df['nama_akhir'], df['gender'], test_size=0.3, random_state=42
)
cv_l = count_pipeline(X_train, X_test, y_train, y_test)

[[28128 14085]
 [ 3694 43899]]


In [21]:
# First + middle name experiment: the two columns are joined with a single
# space into one text field per row.
# Fixed seed so the 70/30 split (and therefore the reported score) is
# reproducible across kernel restarts.
X_train, X_test, y_train, y_test = train_test_split(
    df['nama_awal'].str.cat(df['nama_tengah'], sep=' '),
    df['gender'],
    test_size=0.3,
    random_state=42,
)
cv_fm = count_pipeline(X_train, X_test, y_train, y_test)

[[40299  1774]
 [ 3330 44403]]


In [22]:
# First + last name experiment: the two columns are joined with a single
# space into one text field per row.
# Fixed seed so the 70/30 split (and therefore the reported score) is
# reproducible across kernel restarts.
X_train, X_test, y_train, y_test = train_test_split(
    df['nama_awal'].str.cat(df['nama_akhir'], sep=' '),
    df['gender'],
    test_size=0.3,
    random_state=42,
)
cv_fl = count_pipeline(X_train, X_test, y_train, y_test)

[[39158  3174]
 [ 3101 44373]]


In [23]:
# Middle + last name experiment: the two columns are joined with a single
# space into one text field per row.
# Fixed seed so the 70/30 split (and therefore the reported score) is
# reproducible across kernel restarts.
X_train, X_test, y_train, y_test = train_test_split(
    df['nama_tengah'].str.cat(df['nama_akhir'], sep=' '),
    df['gender'],
    test_size=0.3,
    random_state=42,
)
cv_ml = count_pipeline(X_train, X_test, y_train, y_test)

[[36323  5827]
 [ 2409 45247]]


In [24]:
from tabulate import tabulate

# Summary table: one row per name-component experiment, holding the value
# returned by count_pipeline for that experiment.
headers = ['Name Components', 'Pipeline']

# The dict literal keeps insertion order, so rows appear in the order listed.
data = [
    [label, score]
    for label, score in {
        'Full Name': cv_fml,
        'First Name': cv_f,
        'Middle Name': cv_m,
        'Last Name': cv_l,
        'First & Middle Name': cv_fm,
        'First & Last Name': cv_fl,
        'Middle & Last Name': cv_ml,
    }.items()
]

print(tabulate(data, headers=headers))

Name Components        Pipeline
-------------------  ----------
Full Name              0.91946
First Name             0.88866
Middle Name            0.840334
Last Name              0.802029
First & Middle Name    0.943166
First & Last Name      0.930127
Middle & Last Name     0.908291
