In [2]:
import numpy as np 
import pandas as pd 

from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split

from sklearn import preprocessing

In [3]:
data = pd.read_csv('../../data/contoh.csv', encoding = 'unicode_escape')
data.sample(5)

Unnamed: 0,name,gender
174141,eli isnawati,f
145033,deni hermani,m
236022,henys,f
268865,jio paskah tampubolon,m
127336,cantika wati,f


In [4]:
data.isna().sum()

name      0
gender    0
dtype: int64

In [5]:
data['gender'].value_counts()

gender
m    322538
f    301703
Name: count, dtype: int64

In [6]:
data['gender'] = data['gender'].replace({'M' : 1, 'F' : 0})

encoder = preprocessing.LabelEncoder()
encoder.fit(data['name'])
data['name'] = pd.DataFrame(encoder.transform(data['name']))
data.sample(5)

Unnamed: 0,name,gender
257135,249456,f
617978,615199,f
485873,481637,m
345657,339273,m
315899,309014,m


In [7]:
X = data.drop('gender', axis = 1)
Y = data['gender']

In [8]:
X

Unnamed: 0,name
0,250
1,251
2,252
3,253
4,254
...,...
624236,621530
624237,621531
624238,621532
624239,621533


In [9]:
Y

0         f
1         f
2         f
3         m
4         m
         ..
624236    f
624237    f
624238    f
624239    f
624240    f
Name: gender, Length: 624241, dtype: object

In [10]:
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, train_size = 0.2, random_state = 0)

In [11]:
print(X.shape, X_train.shape, X_test.shape)

(624241, 1) (124848, 1) (499393, 1)


In [12]:
%pip install lightgbm

Note: you may need to restart the kernel to use updated packages.


In [13]:
from sklearn.linear_model import LogisticRegression, RidgeClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, ExtraTreesClassifier, GradientBoostingClassifier
from sklearn.naive_bayes import MultinomialNB, BernoulliNB, GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis, QuadraticDiscriminantAnalysis
import lightgbm as lgb
from sklearn.svm import SVC

models = {
    'Logistic Regression' : LogisticRegression(),
    'Ridge Regression' : RidgeClassifier(),
    'Multinominal Naive Bayes' : MultinomialNB(), 
    'Bernoulli Naive Bayes' : BernoulliNB(), 
    'Gaussian Naive Bayes' : GaussianNB(), 
    'KNN' : KNeighborsClassifier(n_neighbors=100), 
    'Decision Tree' : DecisionTreeClassifier(),
    'Random Forest' : RandomForestClassifier(),
    'Extra Trees' : ExtraTreesClassifier(),
    'Gradient Boost' : GradientBoostingClassifier(), 
    'Ada Boost' : AdaBoostClassifier(), 
    'LightGBM' : lgb.LGBMClassifier(), 
    'Linear Discriminant Analysis' : LinearDiscriminantAnalysis(), 
    'Quadratic Discriminant Analysis' : QuadraticDiscriminantAnalysis(), 
    'SVM': SVC(), 
    'MLP (Neural Network)' : MLPClassifier(),
}

for name, model in models.items():
    model.fit(X_train, Y_train)
    print(name + ' trained!')

Logistic Regression trained!
Ridge Regression trained!
Multinominal Naive Bayes trained!
Bernoulli Naive Bayes trained!
Gaussian Naive Bayes trained!
KNN trained!
Decision Tree trained!
Random Forest trained!
Extra Trees trained!
Gradient Boost trained!
Ada Boost trained!
LightGBM trained!
Linear Discriminant Analysis trained!
Quadratic Discriminant Analysis trained!
SVM trained!


In [None]:
from sklearn import metrics

print('Dataset Test')
for name, model in models.items():
    score = model.score(X_test, Y_test)
    print(name + ' Accuracy is: {:.2f}%'.format(score * 100))
    print(metrics.classification_report(X_test, Y_test, digits=5, labels=[1,0]))
    
print('Dataset Train')
for name, model in models.items():
    score = model.score(X_train, Y_train)
    print(name + ' Accuracy is: {:.2f}%'.format(score * 100))
    print(metrics.classification_report(X_train, Y_train, digits=5, labels=[1,0]))

Logistic Regression Accuracy is: 48.32%
Ridge Regression Accuracy is: 57.22%
Multinominal Naive Bayes Accuracy is: 51.68%
Bernoulli Naive Bayes Accuracy is: 51.68%
Gaussian Naive Bayes Accuracy is: 58.06%
KNN Accuracy is: 78.54%
Decision Tree Accuracy is: 84.39%
Random Forest Accuracy is: 84.40%
Extra Trees Accuracy is: 84.38%
Gradient Boost Accuracy is: 71.26%
Ada Boost Accuracy is: 63.58%
LightGBM Accuracy is: 70.09%
Linear Discriminant Analysis Accuracy is: 57.21%
Quadratic Discriminant Analysis Accuracy is: 58.06%
MLP (Neural Network) Accuracy is: 49.71%
