In [1]:
import pickle
import pandas as pd
import numpy as np
from sklearn import metrics
from sklearn.pipeline import Pipeline

In [2]:
# Loading Dataset
df = pd.read_csv("./data/contoh.csv", encoding = 'utf-8-sig')
df.shape

(624241, 2)

In [3]:
# mengecek apakah ada data yang berisi null
df.isnull().values.any()

# mengecek jumlah baris data yang berisi null
len(df[pd.isnull(df).any(axis=1)])

# menghapus baris null dan recheck kembali
df = df.dropna(how='all')
len(df[pd.isnull(df).any(axis=1)])

# mengecek dimensi dataset
df.shape

(624241, 2)

In [4]:
# mengubah isi kolom jenis kelamin dari text menjadi integer (m = 1; p= 0)
map = {"m" : 1, "f" : 0}
df["gender"] = df["gender"].map(map)

df.head(5)

Unnamed: 0,name,gender
0,a'adila yasmin humairah,0
1,a'aliyah ananda rusdi,0
2,a'am,0
3,a'an darmawan,1
4,a'an dwi handika ramadhan,1


In [5]:
# Mengecek distribusi jenis kelamin pada dataset
num_obs = len(df)
num_true = len(df.loc[df['gender'] == 1])
num_false = len(df.loc[df['gender'] == 0])
print("Jumlah Pria:  {0} ({1:2.2f}%)".format(num_true, (num_true/num_obs) * 100))
print("Jumlah Wanita: {0} ({1:2.2f}%)".format(num_false, (num_false/num_obs) * 100))

Jumlah Pria:  322538 (51.67%)
Jumlah Wanita: 301703 (48.33%)


#### Split Dataset
Dataset yang adalah akan dipecah menjadi dua bagian, 70% data akan digunakan sebagai data training untuk melatih mesin. Kemudian 30% sisanya akan digunakan sebagai data testing untuk mengevaluasi akurasi predisksi machine learning.

In [6]:
from sklearn.model_selection import train_test_split

feature_col_names = ["name"]
predicted_class_names = ["gender"]

X = df[feature_col_names].values     
y = df[predicted_class_names].values
split_test_size = 0.30

text_train, text_test, y_train, y_test = train_test_split(X, y, test_size=split_test_size, stratify=y, random_state=42) 

print("Dataset Asli Pria       : {0} ({1:0.2f}%)".format(len(df.loc[df['gender'] == 1]), (len(df.loc[df['gender'] == 1])/len(df.index)) * 100.0))
print("Dataset Asli Wanita     : {0} ({1:0.2f}%)".format(len(df.loc[df['gender'] == 0]), (len(df.loc[df['gender'] == 0])/len(df.index)) * 100.0))
print("")
print("Dataset Training Pria   : {0} ({1:0.2f}%)".format(len(y_train[y_train[:] == 1]), (len(y_train[y_train[:] == 1])/len(y_train) * 100.0)))
print("Dataset Training Wanita : {0} ({1:0.2f}%)".format(len(y_train[y_train[:] == 0]), (len(y_train[y_train[:] == 0])/len(y_train) * 100.0)))
print("")
print("Dataset Test Pria       : {0} ({1:0.2f}%)".format(len(y_test[y_test[:] == 1]), (len(y_test[y_test[:] == 1])/len(y_test) * 100.0)))
print("Dataset Test Wanita     : {0} ({1:0.2f}%)".format(len(y_test[y_test[:] == 0]), (len(y_test[y_test[:] == 0])/len(y_test) * 100.0)))

Dataset Asli Pria       : 322538 (51.67%)
Dataset Asli Wanita     : 301703 (48.33%)

Dataset Training Pria   : 225776 (51.67%)
Dataset Training Wanita : 211192 (48.33%)

Dataset Test Pria       : 96762 (51.67%)
Dataset Test Wanita     : 90511 (48.33%)


In [8]:
# Menyimpan data uji ke dalam file pickle
with open('./models/gender-uji-data.pkl', 'wb') as file:
    pickle.dump(text_test, file)

# Menyimpan label sebenarnya dari data uji ke dalam file pickle
with open('./models/gender-uji-label.pkl', 'wb') as file:
    pickle.dump(y_test, file)

Terlihat hasilnya, dataset yang telah dipecah dua tetap dapat mempertahankan persentase distribusi jenis kelamin seperti pada dataset asli.

### Features Extraction
Proses features extraction, berpengaruh terhadap hasil akurasi yang didapatkan nantinya. Disini saya kan menggunakan metode simple yaitu CountVectorizer yang akan membuat matrix frekwensi kemunculan dari suatu karakter di tiap nama yang diberikan, dengan opsi analisa ngram_range 2 - 6 hanya di dalam satu kata saja. Misal Muhammad Irfani Sahnur, menghasilkan n-gram :
- mu
- ham
- mad
- nur
- dst

Count Vectorizer

In [9]:
from sklearn.feature_extraction.text import CountVectorizer

vectorizer = CountVectorizer(analyzer='char_wb', ngram_range=(2,6))
vectorizer.fit(text_train.ravel())

with open('./models/vectorizer/vectorizer.pkl', 'wb') as f:
    pickle.dump(vectorizer, f)

X_train = vectorizer.transform(text_train.ravel())
X_test = vectorizer.transform(text_test.ravel())

with open('./models/vectorizer/X_train.pkl', 'wb') as f:
    pickle.dump(X_train, f)

with open('./models/vectorizer/X_test.pkl', 'wb') as f:
    pickle.dump(X_test, f)

TF-IDF

In [13]:
from sklearn.feature_extraction.text import TfidfVectorizer

vectorizer_tfidf = TfidfVectorizer(analyzer='char_wb', ngram_range=(2,6))
vectorizer_tfidf.fit(text_train.ravel())

with open('./models/vectorizer/vectorizer_tfidf.pkl', 'wb') as f:
    pickle.dump(vectorizer_tfidf, f)

X_train_tfidf = vectorizer_tfidf.transform(text_train.ravel())
X_test_tfidf = vectorizer_tfidf.transform(text_test.ravel())

with open('./models/vectorizer/X_train_tfidf.pkl', 'wb') as f:
    pickle.dump(X_train_tfidf, f)

with open('./models/vectorizer/X_test_tfidf.pkl', 'wb') as f:
    pickle.dump(X_test_tfidf, f)

In [7]:
import pickle

with open('./models/vectorizer/vectorizer.pkl', 'rb') as f:
    vectorizer = pickle.load(f)
    
with open('./models/vectorizer/X_train.pkl', 'rb') as f:
    X_train = pickle.load(f)

with open('./models/vectorizer/X_test.pkl', 'rb') as f:
    X_test = pickle.load(f)
    
with open('./models/vectorizer/vectorizer_tfidf.pkl', 'rb') as f:
    vectorizer_tfidf = pickle.load(f)
    
with open('./models/vectorizer/X_train_tfidf.pkl', 'rb') as f:
    X_train_tfidf = pickle.load(f)

with open('./models/vectorizer/X_test_tfidf.pkl', 'rb') as f:
    X_test_tfidf = pickle.load(f)

#### **Random Forest**

#### **1. Decision Trees**

##### **1.1 Decision Tree**

In [11]:
%%time
from sklearn.tree import DecisionTreeClassifier

clf_dt =  DecisionTreeClassifier().fit(X_train, y_train.ravel())
with open('./models/dt-cv-models.pkl', 'wb') as f:
    pickle.dump(clf_dt, f)

dt_pred = clf_dt.predict(X_test)
print(metrics.confusion_matrix(y_test, dt_pred, labels=[1, 0]) )
print("")
print("Classification Report")
print(metrics.classification_report(y_test, dt_pred, digits=5, labels=[1,0]))

[[89038  7724]
 [ 7346 83165]]

Classification Report
              precision    recall  f1-score   support

           1    0.92378   0.92018   0.92198     96762
           0    0.91502   0.91884   0.91692     90511

    accuracy                        0.91953    187273
   macro avg    0.91940   0.91951   0.91945    187273
weighted avg    0.91955   0.91953   0.91953    187273

CPU times: user 9min 28s, sys: 3.23 s, total: 9min 31s
Wall time: 9min 31s


In [18]:
%%time
from sklearn.tree import DecisionTreeClassifier

clf_dt_tfidf =  DecisionTreeClassifier().fit(X_train_tfidf, y_train.ravel())
with open('./models/dt-tfidf-models.pkl', 'wb') as f:
    pickle.dump(clf_dt_tfidf, f)

dt_pred_tfidf = clf_dt_tfidf.predict(X_test_tfidf)
print(metrics.confusion_matrix(y_test, dt_pred_tfidf, labels=[1, 0]) )
print("")
print("Classification Report")
print(metrics.classification_report(y_test, dt_pred_tfidf, digits=5, labels=[1,0]))

[[88721  8041]
 [ 7911 82600]]

Classification Report
              precision    recall  f1-score   support

           1    0.91813   0.91690   0.91752     96762
           0    0.91129   0.91260   0.91194     90511

    accuracy                        0.91482    187273
   macro avg    0.91471   0.91475   0.91473    187273
weighted avg    0.91482   0.91482   0.91482    187273

CPU times: user 25min 51s, sys: 7.35 s, total: 25min 58s
Wall time: 14min 2s


##### **1.2 Random forest**

In [15]:
%%time
from sklearn.ensemble import RandomForestClassifier

clf_rf =  RandomForestClassifier(n_estimators=90, n_jobs=-1).fit(X_train, y_train.ravel())
with open('./models/rf-cv-models.pkl', 'wb') as f:
    pickle.dump(clf_rf, f)

rf_pred = clf_rf.predict(X_test)
print(metrics.confusion_matrix(y_test, rf_pred, labels=[1, 0]) )
print("")
print("Classification Report")
print(metrics.classification_report(y_test, rf_pred, digits=5, labels=[1,0]))

[[93045  3717]
 [ 5086 85425]]

Classification Report
              precision    recall  f1-score   support

           1    0.94817   0.96159   0.95483     96762
           0    0.95830   0.94381   0.95100     90511

    accuracy                        0.95299    187273
   macro avg    0.95324   0.95270   0.95292    187273
weighted avg    0.95307   0.95299   0.95298    187273

CPU times: user 3h 22min 44s, sys: 47.5 s, total: 3h 23min 31s
Wall time: 22min 52s


In [19]:
%%time
from sklearn.ensemble import RandomForestClassifier

clf_rf_tfidf =  RandomForestClassifier(n_estimators=90, n_jobs=-1).fit(X_train_tfidf, y_train.ravel())
with open('./models/rf-tfidf-models.pkl', 'wb') as f:
    pickle.dump(clf_rf_tfidf, f)

rf_pred_tfidf = clf_rf_tfidf.predict(X_test_tfidf)
print(metrics.confusion_matrix(y_test, rf_pred_tfidf, labels=[1, 0]) )
print("")
print("Classification Report")
print(metrics.classification_report(y_test, rf_pred_tfidf, digits=5, labels=[1,0]))

[[92967  3795]
 [ 5064 85447]]

Classification Report
              precision    recall  f1-score   support

           1    0.94834   0.96078   0.95452     96762
           0    0.95748   0.94405   0.95072     90511

    accuracy                        0.95269    187273
   macro avg    0.95291   0.95242   0.95262    187273
weighted avg    0.95276   0.95269   0.95268    187273

CPU times: user 2h 51min 31s, sys: 50.4 s, total: 2h 52min 21s
Wall time: 20min 17s


##### **1.3 Extra Trees**

In [22]:
%%time
from sklearn.ensemble import ExtraTreesClassifier

clf_et =  ExtraTreesClassifier().fit(X_train, y_train.ravel())
with open('./models/et-cv-models.pkl', 'wb') as f:
    pickle.dump(clf_et, f)

et_pred = clf_et.predict(X_test)
print(metrics.confusion_matrix(y_test, et_pred, labels=[1, 0]) )
print("")
print("Classification Report")
print(metrics.classification_report(y_test, et_pred, digits=5, labels=[1,0]))

[[93113  3649]
 [ 5055 85456]]

Classification Report
              precision    recall  f1-score   support

           1    0.94851   0.96229   0.95535     96762
           0    0.95905   0.94415   0.95154     90511

    accuracy                        0.95352    187273
   macro avg    0.95378   0.95322   0.95344    187273
weighted avg    0.95360   0.95352   0.95351    187273

CPU times: user 4h 38min 34s, sys: 1min 48s, total: 4h 40min 22s
Wall time: 4h 41min 10s


In [23]:
%%time
from sklearn.ensemble import ExtraTreesClassifier

clf_et_tfidf =  ExtraTreesClassifier().fit(X_train_tfidf, y_train.ravel())
with open('./models/et-tfidf-models.pkl', 'wb') as f:
    pickle.dump(clf_et_tfidf, f)

et_pred_tfidf = clf_et_tfidf.predict(X_test_tfidf)
print(metrics.confusion_matrix(y_test, et_pred_tfidf, labels=[1, 0]) )
print("")
print("Classification Report")
print(metrics.classification_report(y_test, et_pred_tfidf, digits=5, labels=[1,0]))

[[93235  3527]
 [ 4874 85637]]

Classification Report
              precision    recall  f1-score   support

           1    0.95032   0.96355   0.95689     96762
           0    0.96044   0.94615   0.95324     90511

    accuracy                        0.95514    187273
   macro avg    0.95538   0.95485   0.95507    187273
weighted avg    0.95521   0.95514   0.95513    187273

CPU times: user 4h 49min 38s, sys: 2min 18s, total: 4h 51min 56s
Wall time: 4h 53min 34s


#### **2. Boosting Algorithm**

##### **2.1 Gradient Boosting**

In [55]:
%%time
from sklearn.ensemble import GradientBoostingClassifier

clf_gb =  GradientBoostingClassifier().fit(X_train, y_train.ravel())
with open('./models/gb-cv-models.pkl', 'wb') as f:
    pickle.dump(clf_gb, f)

gb_pred = clf_gb.predict(X_test)
print(metrics.confusion_matrix(y_test, gb_pred, labels=[1, 0]) )
print("")
print("Classification Report")
print(metrics.classification_report(y_test, gb_pred, digits=5, labels=[1,0]))

[[87289  9473]
 [ 9194 81317]]

Classification Report
              precision    recall  f1-score   support

           1    0.90471   0.90210   0.90340     96762
           0    0.89566   0.89842   0.89704     90511

    accuracy                        0.90032    187273
   macro avg    0.90018   0.90026   0.90022    187273
weighted avg    0.90034   0.90032   0.90033    187273

CPU times: user 3min 59s, sys: 3.97 s, total: 4min 3s
Wall time: 4min 3s


In [56]:
%%time
from sklearn.ensemble import GradientBoostingClassifier

clf_gb_tfidf =  GradientBoostingClassifier().fit(X_train_tfidf, y_train.ravel())
with open('./models/gb-tfidf-models.pkl', 'wb') as f:
    pickle.dump(clf_gb_tfidf, f)

gb_pred_tfidf = clf_gb_tfidf.predict(X_test_tfidf)
print(metrics.confusion_matrix(y_test, gb_pred_tfidf, labels=[1, 0]) )
print("")
print("Classification Report")
print(metrics.classification_report(y_test, gb_pred_tfidf, digits=5, labels=[1,0]))

[[87071  9691]
 [ 9098 81413]]

Classification Report
              precision    recall  f1-score   support

           1    0.90540   0.89985   0.90261     96762
           0    0.89363   0.89948   0.89654     90511

    accuracy                        0.89967    187273
   macro avg    0.89951   0.89966   0.89958    187273
weighted avg    0.89971   0.89967   0.89968    187273

CPU times: user 12min 49s, sys: 5.14 s, total: 12min 54s
Wall time: 12min 55s


 ##### **2.2 LightGBM**

In [None]:
# https://colab.research.google.com/github/novay/python/blob/main/classification/lightgbm_classifier_example.ipynb

##### **2.3 Ada Boosting**

In [None]:
# https://colab.research.google.com/github/novay/python/blob/main/classification/adaboosts_classifier_example.ipynb

#### **3. SVM (Support Vector Machine)**

#### **4. Naive Bayes**

##### **4.1 Multinominal**

In [45]:
%%time
from sklearn.naive_bayes import MultinomialNB

clf_nbm =  MultinomialNB().fit(X_train, y_train.ravel())
with open('./models/nbm-cv-models.pkl', 'wb') as f:
    pickle.dump(clf_nbm, f)

nbm_pred = clf_nbm.predict(X_test)
print(metrics.confusion_matrix(y_test, nbm_pred, labels=[1, 0]) )
print("")
print("Classification Report")
print(metrics.classification_report(y_test, nbm_pred, digits=5, labels=[1,0]))

# 0.4s

[[91362  5400]
 [ 3676 86835]]

Classification Report
              precision    recall  f1-score   support

           1    0.96132   0.94419   0.95268     96762
           0    0.94145   0.95939   0.95034     90511

    accuracy                        0.95154    187273
   macro avg    0.95139   0.95179   0.95151    187273
weighted avg    0.95172   0.95154   0.95155    187273

CPU times: user 379 ms, sys: 30.1 ms, total: 409 ms
Wall time: 408 ms


In [46]:
%%time

from sklearn.naive_bayes import MultinomialNB

clf_nbm =  MultinomialNB().fit(X_train_tfidf, y_train.ravel())
with open('./models/nbm-tfidf-models.pkl', 'wb') as f:
    pickle.dump(clf_nbm, f)

nbm_pred = clf_nbm.predict(X_test_tfidf)
print(metrics.confusion_matrix(y_test, nbm_pred, labels=[1, 0]) )
print("")
print("Classification Report")
print(metrics.classification_report(y_test, nbm_pred, digits=5, labels=[1,0]))

# 0.4s

[[92284  4478]
 [ 4624 85887]]

Classification Report
              precision    recall  f1-score   support

           1    0.95228   0.95372   0.95300     96762
           0    0.95045   0.94891   0.94968     90511

    accuracy                        0.95140    187273
   macro avg    0.95137   0.95132   0.95134    187273
weighted avg    0.95140   0.95140   0.95140    187273

CPU times: user 387 ms, sys: 22.5 ms, total: 410 ms
Wall time: 408 ms


##### **4.2 Bernoulli**

In [47]:
%%time
from sklearn.naive_bayes import BernoulliNB

clf_nbb =  BernoulliNB().fit(X_train, y_train.ravel())
with open('./models/nbb-cv-models.pkl', 'wb') as f:
    pickle.dump(clf_nbb, f)

nbb_pred = clf_nbb.predict(X_test)
print(metrics.confusion_matrix(y_test, nbb_pred, labels=[1, 0]) )
print("")
print("Classification Report")
print(metrics.classification_report(y_test, nbb_pred, digits=5, labels=[1,0]))

# 0.5s

[[90857  5905]
 [ 3365 87146]]

Classification Report
              precision    recall  f1-score   support

           1    0.96429   0.93897   0.95146     96762
           0    0.93654   0.96282   0.94950     90511

    accuracy                        0.95050    187273
   macro avg    0.95041   0.95090   0.95048    187273
weighted avg    0.95088   0.95050   0.95051    187273

CPU times: user 476 ms, sys: 95.6 ms, total: 572 ms
Wall time: 585 ms


In [50]:
%%time
from sklearn.naive_bayes import BernoulliNB

clf_nbb =  BernoulliNB().fit(X_train_tfidf, y_train.ravel())
with open('./models/nbb-tfidf-models.pkl', 'wb') as f:
    pickle.dump(clf_nbb, f)

nbb_pred = clf_nbb.predict(X_test_tfidf)
print(metrics.confusion_matrix(y_test, nbb_pred, labels=[1, 0]) )
print("")
print("Classification Report")
print(metrics.classification_report(y_test, nbb_pred, digits=5, labels=[1,0]))

# 0.5s

[[90857  5905]
 [ 3365 87146]]

Classification Report
              precision    recall  f1-score   support

           1    0.96429   0.93897   0.95146     96762
           0    0.93654   0.96282   0.94950     90511

    accuracy                        0.95050    187273
   macro avg    0.95041   0.95090   0.95048    187273
weighted avg    0.95088   0.95050   0.95051    187273

CPU times: user 471 ms, sys: 67.5 ms, total: 539 ms
Wall time: 538 ms


#### **5. KNN**

In [11]:
%%time
from sklearn.neighbors import KNeighborsClassifier

clf_knn =  KNeighborsClassifier(n_neighbors=5).fit(X_train, y_train.ravel())
with open('./models/knn-cv-models.pkl', 'wb') as f:
    pickle.dump(clf_knn, f)

knn_pred = clf_knn.predict(X_test)
print(metrics.confusion_matrix(y_test, knn_pred, labels=[1, 0]) )
print("")
print("Classification Report")
print(metrics.classification_report(y_test, knn_pred, digits=5, labels=[1,0]))

[[91206  5556]
 [ 8848 81663]]

Classification Report
              precision    recall  f1-score   support

           1    0.91157   0.94258   0.92681     96762
           0    0.93630   0.90224   0.91896     90511

    accuracy                        0.92309    187273
   macro avg    0.92393   0.92241   0.92289    187273
weighted avg    0.92352   0.92309   0.92302    187273

CPU times: user 37min 46s, sys: 9min 9s, total: 46min 55s
Wall time: 50min 54s


In [8]:
%%time
from sklearn.neighbors import KNeighborsClassifier

clf_knn_tfidf =  KNeighborsClassifier(n_neighbors=5).fit(X_train_tfidf, y_train.ravel())
with open('./models/knn-tfidf-models.pkl', 'wb') as f:
    pickle.dump(clf_knn_tfidf, f)

knn_pred_tfidf = clf_knn_tfidf.predict(X_test_tfidf)
print(metrics.confusion_matrix(y_test, knn_pred_tfidf, labels=[1, 0]) )
print("")
print("Classification Report")
print(metrics.classification_report(y_test, knn_pred_tfidf, digits=5, labels=[1,0]))

#### **6. Regression**

##### **6.1 Logistic Regression**

In [51]:
%%time
from sklearn.linear_model import LogisticRegression

clf_lr =  LogisticRegression(max_iter=2000).fit(X_train, y_train.ravel())
with open('./models/lr-cv-models.pkl', 'wb') as f:
    pickle.dump(clf_lr, f)

lr_pred = clf_lr.predict(X_test)
print(metrics.confusion_matrix(y_test, lr_pred, labels=[1, 0]) )
print("")
print("Classification Report")
print(metrics.classification_report(y_test, lr_pred, digits=5, labels=[1,0]))

[[93727  3035]
 [ 3579 86932]]

Classification Report
              precision    recall  f1-score   support

           1    0.96322   0.96863   0.96592     96762
           0    0.96627   0.96046   0.96335     90511

    accuracy                        0.96468    187273
   macro avg    0.96474   0.96455   0.96464    187273
weighted avg    0.96469   0.96468   0.96468    187273

CPU times: user 49.5 s, sys: 3.41 s, total: 52.9 s
Wall time: 50.6 s


In [52]:
%%time
from sklearn.linear_model import LogisticRegression

clf_lr_tfidf =  LogisticRegression(max_iter=2000).fit(X_train_tfidf, y_train.ravel())
with open('./models/lr-tfidf-models.pkl', 'wb') as f:
    pickle.dump(clf_lr_tfidf, f)

lr_pred_tfidf = clf_lr_tfidf.predict(X_test_tfidf)
print(metrics.confusion_matrix(y_test, lr_pred_tfidf, labels=[1, 0]) )
print("")
print("Classification Report")
print(metrics.classification_report(y_test, lr_pred_tfidf, digits=5, labels=[1,0]))

# 19.7s

[[93687  3075]
 [ 3896 86615]]

Classification Report
              precision    recall  f1-score   support

           1    0.96008   0.96822   0.96413     96762
           0    0.96572   0.95696   0.96132     90511

    accuracy                        0.96278    187273
   macro avg    0.96290   0.96259   0.96272    187273
weighted avg    0.96280   0.96278   0.96277    187273

CPU times: user 18.9 s, sys: 2.87 s, total: 21.8 s
Wall time: 19.5 s


 ##### **6.1 Ridge Regression**

In [53]:
%%time
from sklearn.linear_model import RidgeClassifier

clf_ridge =  RidgeClassifier(max_iter=2000).fit(X_train, y_train.ravel())
with open('./models/ridge-cv-models.pkl', 'wb') as f:
    pickle.dump(clf_ridge, f)

ridge_pred = clf_ridge.predict(X_test)
print(metrics.confusion_matrix(y_test, ridge_pred, labels=[1, 0]) )
print("")
print("Classification Report")
print(metrics.classification_report(y_test, ridge_pred, digits=5, labels=[1,0]))

[[93158  3604]
 [ 5253 85258]]

Classification Report
              precision    recall  f1-score   support

           1    0.94662   0.96275   0.95462     96762
           0    0.95944   0.94196   0.95062     90511

    accuracy                        0.95271    187273
   macro avg    0.95303   0.95236   0.95262    187273
weighted avg    0.95282   0.95271   0.95269    187273

CPU times: user 16min 21s, sys: 11min 11s, total: 27min 33s
Wall time: 3min 6s


In [54]:
%%time
from sklearn.linear_model import RidgeClassifier

clf_ridge_tfidf =  RidgeClassifier(max_iter=2000).fit(X_train_tfidf, y_train.ravel())
with open('./models/ridge-tfidf-models.pkl', 'wb') as f:
    pickle.dump(clf_ridge_tfidf, f)

ridge_pred_tfidf = clf_ridge_tfidf.predict(X_test_tfidf)
print(metrics.confusion_matrix(y_test, ridge_pred_tfidf, labels=[1, 0]) )
print("")
print("Classification Report")
print(metrics.classification_report(y_test, ridge_pred_tfidf, digits=5, labels=[1,0]))

# 

[[93821  2941]
 [ 4346 86165]]

Classification Report
              precision    recall  f1-score   support

           1    0.95573   0.96961   0.96262     96762
           0    0.96699   0.95198   0.95943     90511

    accuracy                        0.96109    187273
   macro avg    0.96136   0.96079   0.96102    187273
weighted avg    0.96117   0.96109   0.96108    187273

CPU times: user 1min 19s, sys: 51.4 s, total: 2min 10s
Wall time: 15 s


#### **7. Discriminant Analysis**

##### **7.1 Linear Discriminant Analysis**

In [1]:
%%time
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
import pickle

with open('./models/vectorizer/X_train.pkl', 'rb') as f:
    X_train = pickle.load(f)

clf_lda =  LinearDiscriminantAnalysis(n_components=1).fit(X_train.toarray(), y_train.ravel())
with open('./models/lda-cv-models.pkl', 'wb') as f:
    pickle.dump(clf_lda, f)

with open('./models/vectorizer/X_test.pkl', 'rb') as f:
    X_test = pickle.load(f)
    
lda_pred = clf_lda.predict(X_test.toarray())
print(metrics.confusion_matrix(y_test, lda_pred, labels=[1, 0]) )
print("")
print("Classification Report")
print(metrics.classification_report(y_test, lda_pred, digits=5, labels=[1,0]))

##### **7.2 Quadratic Discriminant Analysis**

In [None]:
%%time
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis

clf_lda =  LinearDiscriminantAnalysis().fit(X_train.toarray(), y_train.ravel())
with open('./models/lda-cv-models.pkl', 'wb') as f:
    pickle.dump(clf_lda, f)

lda_pred = clf_lda.predict(X_test.toarray())
print(metrics.confusion_matrix(y_test, lda_pred, labels=[1, 0]) )
print("")
print("Classification Report")
print(metrics.classification_report(y_test, lda_pred, digits=5, labels=[1,0]))

MLP

In [24]:
%%time
from sklearn.neural_network import MLPClassifier

clf_mlp =  MLPClassifier().fit(X_train, y_train.ravel())
with open('./models/mlp-cv-models.pkl', 'wb') as f:
    pickle.dump(clf_mlp, f)

mlp_pred = clf_mlp.predict(X_test)
print(metrics.confusion_matrix(y_test, mlp_pred, labels=[1, 0]) )
print("")
print("Classification Report")
print(metrics.classification_report(y_test, mlp_pred, digits=5, labels=[1,0]))



[[93314  3448]
 [ 4083 86428]]

Classification Report
              precision    recall  f1-score   support

           1    0.95808   0.96437   0.96121     96762
           0    0.96164   0.95489   0.95825     90511

    accuracy                        0.95979    187273
   macro avg    0.95986   0.95963   0.95973    187273
weighted avg    0.95980   0.95979   0.95978    187273

CPU times: user 8h 9min 14s, sys: 16h 25min 57s, total: 1d 35min 11s
Wall time: 6h 20min 7s


In [None]:
%%time
from sklearn.neural_network import MLPClassifier

clf_mlp_tfidf =  MLPClassifier().fit(X_train_tfidf, y_train.ravel())
with open('./models/mlp-tfidf-models.pkl', 'wb') as f:
    pickle.dump(clf_mlp_tfidf, f)

mlp_pred_tfidf = clf_mlp_tfidf.predict(X_test_tfidf)
print(metrics.confusion_matrix(y_test, mlp_pred_tfidf, labels=[1, 0]) )
print("")
print("Classification Report")
print(metrics.classification_report(y_test, mlp_pred_tfidf, digits=5, labels=[1,0]))