In [1]:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn.tree import DecisionTreeClassifier

In [2]:
def count_pipeline(X_train, X_test, y_train, y_test, random_state=None):
    """Fit a count -> TF-IDF -> decision-tree pipeline and score it on the test split.

    The pipeline chains ``CountVectorizer``, ``TfidfTransformer`` and
    ``DecisionTreeClassifier``. It is fitted on the training split, the
    confusion matrix for the test split is printed, and the test accuracy is
    returned.

    Parameters
    ----------
    X_train, X_test
        Raw text documents for training and evaluation, in whatever form
        ``CountVectorizer`` accepts (the notebook passes pandas Series).
    y_train, y_test
        Class labels aligned with ``X_train`` and ``X_test``.
    random_state : int or None, default=None
        Forwarded to ``DecisionTreeClassifier``. ``None`` keeps the previous
        (unseeded) behaviour; pass an int to make the fitted tree reproducible.

    Returns
    -------
    float
        Value of ``accuracy_score(y_test, predictions)``.
    """
    tree = Pipeline([
        ('vect', CountVectorizer()),
        ('tfidf', TfidfTransformer()),
        ('clf', DecisionTreeClassifier(random_state=random_state)),
    ])
    tree.fit(X_train, y_train)

    test_pred = tree.predict(X_test)

    # Printed as a side effect so each calling cell shows its error breakdown.
    print(confusion_matrix(y_test, test_pred))
    return accuracy_score(y_test, test_pred)

In [3]:
# Full-name model: features come from the 'nama' column.
df = pd.read_csv('../../data/nama-gender-split.csv')
# NOTE(review): unlike the following cells, no dropna() is applied here, so
# this score may be computed on a different set of rows -- confirm intended.
# Seed the split so this cell's confusion matrix and accuracy are reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    df['nama'], df['gender'], test_size=0.3, random_state=42
)
cv_fml = count_pipeline(X_train, X_test, y_train, y_test)

In [4]:
# First-name model: reload the data and keep only rows with no missing values.
# Reloading keeps this cell idempotent when it is re-run on its own.
df = pd.read_csv('../../data/nama-gender-split.csv').dropna()
# Seed the split so this cell's confusion matrix and accuracy are reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    df['nama_awal'], df['gender'], test_size=0.3, random_state=42
)
cv_f = count_pipeline(X_train, X_test, y_train, y_test)

[[34153  7923]
 [ 2056 45674]]


In [5]:
# Middle-name model: features come from the 'nama_tengah' column.
# Seed the split so this cell's confusion matrix and accuracy are reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    df['nama_tengah'], df['gender'], test_size=0.3, random_state=42
)
cv_m = count_pipeline(X_train, X_test, y_train, y_test)

[[31644 10488]
 [ 3815 43859]]


In [6]:
# Last-name model: features come from the 'nama_akhir' column.
# Seed the split so this cell's confusion matrix and accuracy are reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    df['nama_akhir'], df['gender'], test_size=0.3, random_state=42
)
cv_l = count_pipeline(X_train, X_test, y_train, y_test)

[[28038 14178]
 [ 3411 44179]]


In [7]:
# First + middle name model: the two columns are joined with a single space.
first_middle = df['nama_awal'].str.cat(df['nama_tengah'], sep=' ')
# Seed the split so this cell's confusion matrix and accuracy are reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    first_middle, df['gender'], test_size=0.3, random_state=42
)
cv_fm = count_pipeline(X_train, X_test, y_train, y_test)

[[39564  2797]
 [ 1761 45684]]


In [8]:
# First + last name model: the two columns are joined with a single space.
first_last = df['nama_awal'].str.cat(df['nama_akhir'], sep=' ')
# Seed the split so this cell's confusion matrix and accuracy are reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    first_last, df['gender'], test_size=0.3, random_state=42
)
cv_fl = count_pipeline(X_train, X_test, y_train, y_test)

[[38352  3752]
 [ 1784 45918]]


In [9]:
# Middle + last name model: the two columns are joined with a single space.
middle_last = df['nama_tengah'].str.cat(df['nama_akhir'], sep=' ')
# Seed the split so this cell's confusion matrix and accuracy are reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    middle_last, df['gender'], test_size=0.3, random_state=42
)
cv_ml = count_pipeline(X_train, X_test, y_train, y_test)

[[37192  5130]
 [ 2502 44982]]


In [10]:
from tabulate import tabulate

# Row labels and the accuracy computed for each by the cells above; the two
# lists are kept in the same order so zip() pairs them correctly.
labels = [
    'Full Name',
    'First Name',
    'Middle Name',
    'Last Name',
    'First & Middle Name',
    'First & Last Name',
    'Middle & Last Name',
]
scores = [cv_fml, cv_f, cv_m, cv_l, cv_fm, cv_fl, cv_ml]

# One [label, accuracy] row per name-component combination.
data = [[label, score] for label, score in zip(labels, scores)]

headers = ['Name Components', 'Pipeline']
print(tabulate(data, headers=headers))

Name Components        Pipeline
-------------------  ----------
Full Name              0.924874
First Name             0.888883
Middle Name            0.840734
Last Name              0.804144
First & Middle Name    0.949246
First & Last Name      0.938356
Middle & Last Name     0.915017
