### Libs

In [1]:
import pandas as pd
import numpy as np
import pickle

### Dataframe

In [2]:
dataframe = pd.read_csv('./data/dataframe.csv')

### Stratified Shuffle

In [3]:
from sklearn.model_selection import StratifiedShuffleSplit
cv = StratifiedShuffleSplit(n_splits=5, test_size=0.2, random_state=42)

### Definitions

In [4]:
y_df = dataframe['sexist']
y = y_df.astype(int)

In [5]:
from modules.select_features import select_features

In [6]:
X_char_qty = select_features(dataframe, column='char-qty')
X_word_qty = select_features(dataframe, column='word-qty')
X_likes = select_features(dataframe, column='likes')
X_dislikes = select_features(dataframe, column='dislikes')
X_likes_dislikes = select_features(dataframe, columns=['likes','dislikes'])
X_char_word_qty = select_features(dataframe, columns=['char-qty','word-qty'])
X_likes_dislike_char_words = select_features(dataframe, columns=['likes','dislikes','char-qty','word-qty'])
X_tf = select_features(dataframe, column='tf')
X_tf_100_sexist_words = select_features(dataframe, column='tf-sexist')
X_tf_100_not_sexist_words = select_features(dataframe, column='tf-no-sexist')

In [7]:
from modules.grid_search import grid_search

# SVM

In [8]:
from sklearn.svm import SVC

# svm_params = dict(gamma=np.logspace(-9, 3, 3), C=np.logspace(-2, 10, 3))
svm_params = dict(gamma=[1.0], C=[10.0])

In [9]:
svm_tf = grid_search(
    './data/svm-tf-grid-search-model',
    SVC(),
    svm_params,
    'SVM with TF',
    X_tf, y,
    reload=True
)
svm_tf_100_sexist_words = grid_search(
    './data/svm-tf_100_sexist_words-grid-search-model',
    SVC(),
    svm_params,
    'SVM with TF for 100 sexist words',
    X_tf_100_sexist_words, y,
    # reload=True
)
svm_tf_100_not_sexist_words = grid_search(
    './data/svm-tf_100_not_sexist_words-grid-search-model',
    SVC(),
    svm_params,
    'SVM with TF for 100 not sexist words',
    X_tf_100_not_sexist_words, y,
    reload=True
)
svm_char_qty = grid_search(
    './data/svm-char-qty-grid-search-model',
    SVC(),
    svm_params,
    'SVM with Char quantity',
    X_char_qty, y,
    # reload=True
)
svm_word_qty = grid_search(
    './data/svm-word-qty-grid-search-model',
    SVC(),
    svm_params,
    'SVM with Word quantity',
    X_word_qty, y,
    # reload=True
)
svm_likes_qty = grid_search(
    './data/svm-likes-grid-search-model',
    SVC(),
    svm_params,
    'SVM with Likes quantity',
    X_likes, y,
    # reload=True
)
svm_likes_qty = grid_search(
    './data/svm-likes-grid-search-model',
    SVC(),
    svm_params,
    'SVM with Likes quantity',
    X_likes, y,
    # reload=True
)
svm_dislikes_qty = grid_search(
    './data/svm-likes-grid-search-model',
    SVC(),
    svm_params,
    'SVM with Dislikes quantity',
    X_dislikes, y,
    # reload=True
)
svm_likes_dislikes_qty = grid_search(
    './data/svm-likes-dislikes-grid-search-model',
    SVC(),
    svm_params,
    'SVM with Likes and Dislikes quantity',
    X_likes_dislikes, y,
    # reload=True
)
svm_chars_words_qty = grid_search(
    './data/svm-chars-words-grid-search-model',
    SVC(),
    svm_params,
    'SVM with Chars and Words quantity',
    X_char_word_qty, y,
    # reload=True
)
svm_likes_dislikes_chars_words_qty = grid_search(
    './data/svm-likes-dislikes-chars-words-grid-search-model',
    SVC(),
    svm_params,
    'SVM with Likes, Dislikes, Chars and Words quantity',
    X_likes_dislike_char_words, y,
    # reload=True
)

Executing Grid Search to SVM with TF.
Model loaded in: @ 4 seconds
The best parameters are {'C': 10.0, 'gamma': 1.0} with a score of 0.91
-------------------------------------------------------------------------------------------
Reading SVM with TF for 100 sexist words. Model
Model loaded in: @ 0 seconds
The best parameters are {'C': 10.0, 'gamma': 1.0} with a score of 0.90
-------------------------------------------------------------------------------------------
Executing Grid Search to SVM with TF for 100 not sexist words.
Model loaded in: @ 1 seconds
The best parameters are {'C': 10.0, 'gamma': 1.0} with a score of 0.84
-------------------------------------------------------------------------------------------
Reading SVM with Char quantity. Model
Model loaded in: @ 0 seconds
The best parameters are {'C': 10.0, 'gamma': 1.0} with a score of 0.51
-------------------------------------------------------------------------------------------
Reading SVM with Word quantity. Model
Model l

# K-NN

In [10]:
from sklearn.neighbors import KNeighborsClassifier

knn_params = dict(n_neighbors=[3, 5, 11, 19],
                  weights=['uniform', 'distance'],
                  metric=['euclidean', 'manhattan'])

In [11]:
knn_tf = grid_search(
    './data/knn-tf-grid-search-model',
    KNeighborsClassifier(),
    knn_params,
    'KNN with TF',
    X_tf, y,
    reload=True
)
knn_tf_100_sexist_words = grid_search(
    './data/knn-tf_100_sexist_words-grid-search-model',
    KNeighborsClassifier(),
    knn_params,
    'KNN with TF for 100 sexist words',
    X_tf_100_sexist_words, y,
    # reload=True
)
knn_tf_100_not_sexist_words = grid_search(
    './data/knn-tf_100_not_sexist_words-grid-search-model',
    KNeighborsClassifier(),
    knn_params,
    'KNN with TF for 100 not sexist words',
    X_tf_100_not_sexist_words, y,
    reload=True
)
knn_char_qty = grid_search(
    './data/knn-char-qty-grid-search-model',
    KNeighborsClassifier(),
    knn_params,
    'KNN with Char quantity',
    X_char_qty, y,
    # reload=True
)
knn_word_qty = grid_search(
    './data/knn-word-qty-grid-search-model',
    KNeighborsClassifier(),
    knn_params,
    'KNN with Word quantity',
    X_word_qty, y,
    # reload=True
)
knn_likes_qty = grid_search(
    './data/knn-likes-grid-search-model',
    KNeighborsClassifier(),
    knn_params,
    'KNN with Likes quantity',
    X_likes, y,
    # reload=True
)
knn_likes_qty = grid_search(
    './data/knn-likes-grid-search-model',
    KNeighborsClassifier(),
    knn_params,
    'KNN with Likes quantity',
    X_likes, y,
    # reload=True
)
knn_dislikes_qty = grid_search(
    './data/knn-likes-grid-search-model',
    KNeighborsClassifier(),
    knn_params,
    'KNN with Dislikes quantity',
    X_dislikes, y,
    # reload=True
)
knn_likes_dislikes_qty = grid_search(
    './data/knn-likes-dislikes-grid-search-model',
    KNeighborsClassifier(),
    knn_params,
    'KNN with Likes and Dislikes quantity',
    X_likes_dislikes, y,
    # reload=True
)
knn_chars_words_qty = grid_search(
    './data/knn-chars-words-grid-search-model',
    KNeighborsClassifier(),
    knn_params,
    'KNN with Chars and Words quantity',
    X_char_word_qty, y,
    # reload=True
)
knn_likes_dislikes_chars_words_qty = grid_search(
    './data/knn-likes-dislikes-chars-words-grid-search-model',
    KNeighborsClassifier(),
    knn_params,
    'KNN with Likes, Dislikes, Chars and Words quantity',
    X_likes_dislike_char_words, y,
    # reload=True
)

Executing Grid Search to KNN with TF.
Model loaded in: @ 7 seconds
The best parameters are {'metric': 'manhattan', 'n_neighbors': 5, 'weights': 'distance'} with a score of 0.83
-------------------------------------------------------------------------------------------
Reading KNN with TF for 100 sexist words. Model
Model loaded in: @ 0 seconds
The best parameters are {'metric': 'manhattan', 'n_neighbors': 5, 'weights': 'distance'} with a score of 0.84
-------------------------------------------------------------------------------------------
Executing Grid Search to KNN with TF for 100 not sexist words.
Model loaded in: @ 2 seconds
The best parameters are {'metric': 'manhattan', 'n_neighbors': 5, 'weights': 'distance'} with a score of 0.78
-------------------------------------------------------------------------------------------
Reading KNN with Char quantity. Model
Model loaded in: @ 0 seconds
The best parameters are {'metric': 'euclidean', 'n_neighbors': 3, 'weights': 'uniform'} wit

# Classification Report

In [12]:
from modules.classification_report import print_means_report, print_datailed_report

In [13]:
print_means_report(X_tf, y, svm_tf, 'svm-tf')
print_means_report(X_tf_100_sexist_words, y, svm_tf_100_sexist_words, 'svm-tf-100-sexist-words')
print_means_report(X_tf_100_not_sexist_words, y, svm_tf_100_not_sexist_words, 'svm-tf-100-not-sexist-words')
print_means_report(X_char_qty, y, svm_char_qty, 'svm-char-qty')
print_means_report(X_word_qty, y, svm_word_qty, 'svm-word-qty')
print_means_report(X_char_word_qty, y, svm_chars_words_qty, 'svm-char_word_qty')
print_means_report(X_likes, y, svm_likes_qty, 'svm-likes-qty')
print_means_report(X_dislikes, y, svm_dislikes_qty, 'svm-dislikes-qty')
print_means_report(X_likes_dislikes, y, svm_likes_dislikes_qty, 'svm-likes-dislikes-qty')
print_means_report(X_likes_dislike_char_words, y, svm_likes_dislikes_chars_words_qty, 'svm-likes-dislikes-chars-words-qty')

>>> svm-tf scores
Precisão média: 0.899344019194509
Revocação média: 0.8980055380611146
Média de F1: 0.895530983664752

>>> svm-tf-100-sexist-words scores
Precisão média: 0.9014523645402601
Revocação média: 0.8972609009951767
Média de F1: 0.8938931764250924

>>> svm-tf-100-not-sexist-words scores
Precisão média: 0.8572936962500561
Revocação média: 0.8418727461085126
Média de F1: 0.8425586758832397

>>> svm-char-qty scores
Precisão média: 0.49629032207222795
Revocação média: 0.49668904035938866
Média de F1: 0.4951008050468875

>>> svm-word-qty scores
Precisão média: 0.5186735006065082
Revocação média: 0.5177273831946816
Média de F1: 0.5145176351629778

>>> svm-char_word_qty scores
Precisão média: 0.5230695424259709
Revocação média: 0.5225619728486052
Média de F1: 0.5201991665967893

>>> svm-likes-qty scores
Precisão média: 0.5416916537395677
Revocação média: 0.5410233925486158
Média de F1: 0.5387134331730462

>>> svm-dislikes-qty scores
Precisão média: 0.5686664936141765
Revocação média

In [14]:
print_datailed_report(X_tf, y, svm_tf, 'svm-tf')
print_datailed_report(X_tf_100_sexist_words, y, svm_tf_100_sexist_words, 'svm-tf-100-sexist-words')
print_datailed_report(X_tf_100_not_sexist_words, y, svm_tf_100_not_sexist_words, 'svm-tf-100-not-sexist-words')
print_datailed_report(X_char_qty, y, svm_char_qty, 'svm-char-qty')
print_datailed_report(X_word_qty, y, svm_word_qty, 'svm-word-qty')
print_datailed_report(X_char_word_qty, y, svm_chars_words_qty, 'svm-char_word_qty')
print_datailed_report(X_likes, y, svm_likes_qty, 'svm-likes-qty')
print_datailed_report(X_dislikes, y, svm_dislikes_qty, 'svm-dislikes-qty')
print_datailed_report(X_likes_dislikes, y, svm_likes_dislikes_qty, 'svm-likes-dislikes-qty')
print_datailed_report(X_likes_dislike_char_words, y, svm_likes_dislikes_chars_words_qty, 'svm-likes-dislikes-chars-words-qty')

>>> svm-tf results
              precision    recall  f1-score   support

           0       0.93      1.00      0.96       319
           1       1.00      0.93      0.96       349

    accuracy                           0.96       668
   macro avg       0.96      0.96      0.96       668
weighted avg       0.97      0.96      0.96       668

Confusion matrix
     T    F
F  324   25
T    0  319
>>> svm-tf-100-sexist-words results
              precision    recall  f1-score   support

           0       0.85      0.99      0.92       319
           1       0.99      0.84      0.91       349

    accuracy                           0.91       668
   macro avg       0.92      0.92      0.91       668
weighted avg       0.93      0.91      0.91       668

Confusion matrix
     T    F
F  294   55
T    2  317
>>> svm-tf-100-not-sexist-words results
              precision    recall  f1-score   support

           0       0.96      0.80      0.87       319
           1       0.84      0.97   

In [15]:
print_means_report(X_tf, y, knn_tf, 'knn-tf')
print_means_report(X_tf_100_sexist_words, y, knn_tf_100_sexist_words, 'knn-tf-100-sexist-words')
print_means_report(X_tf_100_not_sexist_words, y, knn_tf_100_not_sexist_words, 'knn-tf-100-not-sexist-words')
print_means_report(X_char_qty, y, knn_char_qty, 'knn-char-qty')
print_means_report(X_word_qty, y, knn_word_qty, 'knn-word-qty')
print_means_report(X_char_word_qty, y, knn_chars_words_qty, 'knn-char_word_qty')
print_means_report(X_likes, y, knn_likes_qty, 'knn-likes-qty')
print_means_report(X_dislikes, y, knn_dislikes_qty, 'knn-dislikes-qty')
print_means_report(X_likes_dislikes, y, knn_likes_dislikes_qty, 'knn-likes-dislikes-qty')
print_means_report(X_likes_dislike_char_words, y, knn_likes_dislikes_chars_words_qty, 'knn-likes-dislikes-chars-words-qty')

>>> knn-tf scores
Precisão média: 0.8507240257145673
Revocação média: 0.841591493326711
Média de F1: 0.83688132837433

>>> knn-tf-100-sexist-words scores
Precisão média: 0.8481841736076949
Revocação média: 0.836828011578364
Média de F1: 0.8313584360336241

>>> knn-tf-100-not-sexist-words scores
Precisão média: 0.7944908248317231
Revocação média: 0.7874045424796272
Média de F1: 0.7847135135167592

>>> knn-char-qty scores
Precisão média: 0.49649911611517333
Revocação média: 0.4964299667549425
Média de F1: 0.4944943868637234

>>> knn-word-qty scores
Precisão média: 0.5080241329254928
Revocação média: 0.5080427465536008
Média de F1: 0.5062942453790011

>>> knn-char_word_qty scores
Precisão média: 0.5020939379969616
Revocação média: 0.5019276082930058
Média de F1: 0.500990216731499

>>> knn-likes-qty scores
Precisão média: 0.5178890413867661
Revocação média: 0.5165088763494267
Média de F1: 0.5083895021171858

>>> knn-dislikes-qty scores
Precisão média: 0.5684694203297052
Revocação média: 0.

In [16]:
print_datailed_report(X_tf, y, knn_tf, 'knn-tf')
print_datailed_report(X_tf_100_sexist_words, y, knn_tf_100_sexist_words, 'knn-tf-100-sexist-words')
print_datailed_report(X_tf_100_not_sexist_words, y, knn_tf_100_not_sexist_words, 'knn-tf-100-not-sexist-words')
print_datailed_report(X_char_qty, y, knn_char_qty, 'knn-char-qty')
print_datailed_report(X_word_qty, y, knn_word_qty, 'knn-word-qty')
print_datailed_report(X_char_word_qty, y, knn_chars_words_qty, 'knn-char_word_qty')
print_datailed_report(X_likes, y, knn_likes_qty, 'knn-likes-qty')
print_datailed_report(X_dislikes, y, knn_dislikes_qty, 'knn-dislikes-qty')
print_datailed_report(X_likes_dislikes, y, knn_likes_dislikes_qty, 'knn-likes-dislikes-qty')
print_datailed_report(X_likes_dislike_char_words, y, knn_likes_dislikes_chars_words_qty, 'knn-likes-dislikes-chars-words-qty')

>>> knn-tf results
              precision    recall  f1-score   support

           0       0.99      0.90      0.94       319
           1       0.91      0.99      0.95       349

    accuracy                           0.94       668
   macro avg       0.95      0.94      0.94       668
weighted avg       0.95      0.94      0.94       668

Confusion matrix
     T    F
F  345    4
T   33  286
>>> knn-tf-100-sexist-words results
              precision    recall  f1-score   support

           0       0.88      0.98      0.93       319
           1       0.98      0.88      0.93       349

    accuracy                           0.93       668
   macro avg       0.93      0.93      0.93       668
weighted avg       0.93      0.93      0.93       668

Confusion matrix
     T    F
F  306   43
T    5  314
>>> knn-tf-100-not-sexist-words results
              precision    recall  f1-score   support

           0       0.81      0.96      0.88       319
           1       0.95      0.80   