### Libs

In [1]:
import pandas as pd
import numpy as np
import pickle

### Dataframe

In [2]:
dataframe = pd.read_csv('./data/dataframe.csv')

### Stratified Shuffle

In [3]:
from sklearn.model_selection import StratifiedShuffleSplit
cv = StratifiedShuffleSplit(n_splits=5, test_size=0.2, random_state=42)

### Definitions

In [4]:
y_df = dataframe['sexist']
y = y_df.astype(int)

In [5]:
from modules.select_features import select_features

In [6]:
X_char_qty = select_features(dataframe, column='char-qty')
X_word_qty = select_features(dataframe, column='word-qty')
X_likes = select_features(dataframe, column='likes')
X_dislikes = select_features(dataframe, column='dislikes')
X_likes_dislikes = select_features(dataframe, columns=['likes','dislikes'])
X_char_word_qty = select_features(dataframe, columns=['char-qty','word-qty'])
X_likes_dislike_char_words = select_features(dataframe, columns=['likes','dislikes','char-qty','word-qty'])
X_tf_100_sexist_words = select_features(dataframe, column='tf')

In [7]:
from modules.grid_search import grid_search

# SVM

In [8]:
from sklearn.svm import SVC

# svm_params = dict(gamma=np.logspace(-9, 3, 3), C=np.logspace(-2, 10, 3))
svm_params = dict(gamma=[1.0], C=[10.0])

In [9]:
svm_tf_100_sexist_words = grid_search(
    './data/svm-tf_100_sexist_words-grid-search-model',
    SVC(),
    svm_params,
    'SVM with TF for 100 sexist words',
    X_tf_100_sexist_words, y,
    # reload=True
)
svm_char_qty = grid_search(
    './data/svm-char-qty-grid-search-model',
    SVC(),
    svm_params,
    'SVM with Char quantity',
    X_char_qty, y,
    # reload=True
)
svm_word_qty = grid_search(
    './data/svm-word-qty-grid-search-model',
    SVC(),
    svm_params,
    'SVM with Word quantity',
    X_word_qty, y,
    # reload=True
)
svm_likes_qty = grid_search(
    './data/svm-likes-grid-search-model',
    SVC(),
    svm_params,
    'SVM with Likes quantity',
    X_likes, y,
    # reload=True
)
svm_likes_qty = grid_search(
    './data/svm-likes-grid-search-model',
    SVC(),
    svm_params,
    'SVM with Likes quantity',
    X_likes, y,
    # reload=True
)
svm_dislikes_qty = grid_search(
    './data/svm-likes-grid-search-model',
    SVC(),
    svm_params,
    'SVM with Dislikes quantity',
    X_dislikes, y,
    # reload=True
)
svm_likes_dislikes_qty = grid_search(
    './data/svm-likes-dislikes-grid-search-model',
    SVC(),
    svm_params,
    'SVM with Likes and Dislikes quantity',
    X_likes_dislikes, y,
    # reload=True
)
svm_chars_words_qty = grid_search(
    './data/svm-chars-words-grid-search-model',
    SVC(),
    svm_params,
    'SVM with Chars and Words quantity',
    X_char_word_qty, y,
    # reload=True
)
svm_likes_dislikes_chars_words_qty = grid_search(
    './data/svm-likes-dislikes-chars-words-grid-search-model',
    SVC(),
    svm_params,
    'SVM with Likes, Dislikes, Chars and Words quantity',
    X_likes_dislike_char_words, y,
    # reload=True
)

Reading SVM with TF for 100 sexist words. Model
Model loaded in: @ 0 seconds
The best parameters are {'C': 10.0, 'gamma': 1.0} with a score of 0.90
-------------------------------------------------------------------------------------------
Reading SVM with Char quantity. Model
Model loaded in: @ 0 seconds
The best parameters are {'C': 10.0, 'gamma': 1.0} with a score of 0.51
-------------------------------------------------------------------------------------------
Reading SVM with Word quantity. Model
Model loaded in: @ 0 seconds
The best parameters are {'C': 10.0, 'gamma': 1.0} with a score of 0.53
-------------------------------------------------------------------------------------------
Reading SVM with Likes quantity. Model
Model loaded in: @ 0 seconds
The best parameters are {'C': 10.0, 'gamma': 1.0} with a score of 0.57
-------------------------------------------------------------------------------------------
Reading SVM with Likes quantity. Model
Model loaded in: @ 0 seconds
T

# K-NN

In [10]:
from sklearn.neighbors import KNeighborsClassifier

knn_params = dict(n_neighbors=[3, 5, 11, 19],
                  weights=['uniform', 'distance'],
                  metric=['euclidean', 'manhattan'])

In [11]:
knn_tf_100_sexist_words = grid_search(
    './data/knn-tf_100_sexist_words-grid-search-model',
    KNeighborsClassifier(),
    knn_params,
    'KNN with TF for 100 sexist words',
    X_tf_100_sexist_words, y,
    # reload=True
)
knn_char_qty = grid_search(
    './data/knn-char-qty-grid-search-model',
    KNeighborsClassifier(),
    knn_params,
    'KNN with Char quantity',
    X_char_qty, y,
    # reload=True
)
knn_word_qty = grid_search(
    './data/knn-word-qty-grid-search-model',
    KNeighborsClassifier(),
    knn_params,
    'KNN with Word quantity',
    X_word_qty, y,
    # reload=True
)
knn_likes_qty = grid_search(
    './data/knn-likes-grid-search-model',
    KNeighborsClassifier(),
    knn_params,
    'KNN with Likes quantity',
    X_likes, y,
    # reload=True
)
knn_likes_qty = grid_search(
    './data/knn-likes-grid-search-model',
    KNeighborsClassifier(),
    knn_params,
    'KNN with Likes quantity',
    X_likes, y,
    # reload=True
)
knn_dislikes_qty = grid_search(
    './data/knn-likes-grid-search-model',
    KNeighborsClassifier(),
    knn_params,
    'KNN with Dislikes quantity',
    X_dislikes, y,
    # reload=True
)
knn_likes_dislikes_qty = grid_search(
    './data/knn-likes-dislikes-grid-search-model',
    KNeighborsClassifier(),
    knn_params,
    'KNN with Likes and Dislikes quantity',
    X_likes_dislikes, y,
    # reload=True
)
knn_chars_words_qty = grid_search(
    './data/knn-chars-words-grid-search-model',
    KNeighborsClassifier(),
    knn_params,
    'KNN with Chars and Words quantity',
    X_char_word_qty, y,
    # reload=True
)
knn_likes_dislikes_chars_words_qty = grid_search(
    './data/knn-likes-dislikes-chars-words-grid-search-model',
    KNeighborsClassifier(),
    knn_params,
    'KNN with Likes, Dislikes, Chars and Words quantity',
    X_likes_dislike_char_words, y,
    # reload=True
)

Reading KNN with TF for 100 sexist words. Model
Model loaded in: @ 0 seconds
The best parameters are {'metric': 'manhattan', 'n_neighbors': 5, 'weights': 'distance'} with a score of 0.84
-------------------------------------------------------------------------------------------
Reading KNN with Char quantity. Model
Model loaded in: @ 0 seconds
The best parameters are {'metric': 'euclidean', 'n_neighbors': 3, 'weights': 'uniform'} with a score of 0.52
-------------------------------------------------------------------------------------------
Reading KNN with Word quantity. Model
Model loaded in: @ 0 seconds
The best parameters are {'metric': 'euclidean', 'n_neighbors': 19, 'weights': 'distance'} with a score of 0.52
-------------------------------------------------------------------------------------------
Reading KNN with Likes quantity. Model
Model loaded in: @ 0 seconds
The best parameters are {'metric': 'euclidean', 'n_neighbors': 5, 'weights': 'distance'} with a score of 0.54
-----

# Classification Report

In [12]:
from modules.classification_report import print_means_report

In [13]:
print_means_report(X_tf_100_sexist_words, y, svm_tf_100_sexist_words, 'svm-tf-100-sexist-words')
print_means_report(X_char_qty, y, svm_char_qty, 'svm-char-qty')
print_means_report(X_word_qty, y, svm_word_qty, 'svm-word-qty')
print_means_report(X_char_word_qty, y, svm_chars_words_qty, 'svm-char_word_qty')
print_means_report(X_likes, y, svm_likes_qty, 'svm-likes-qty')
print_means_report(X_dislikes, y, svm_dislikes_qty, 'svm-dislikes-qty')
print_means_report(X_likes_dislikes, y, svm_likes_dislikes_qty, 'svm-likes-dislikes-qty')
print_means_report(X_likes_dislike_char_words, y, svm_likes_dislikes_chars_words_qty, 'svm-likes-dislikes-chars-words-qty')

>>> svm-tf-100-sexist-words scores
Precisão média: 0.90415809008716
Revocação média: 0.8999240681976353
Média de F1: 0.8965469507449179

>>> svm-char-qty scores
Precisão média: 0.5093150279206438
Revocação média: 0.5088941801423877
Média de F1: 0.5072531129457081

>>> svm-word-qty scores
Precisão média: 0.5173393812684861
Revocação média: 0.5172931281661959
Média de F1: 0.5131534326147634

>>> svm-char_word_qty scores
Precisão média: 0.5162768391411378
Revocação média: 0.5158539574414709
Média de F1: 0.5130471001438397

>>> svm-likes-qty scores
Precisão média: 0.5368480150585491
Revocação média: 0.5355515787735391
Média de F1: 0.5341910137380477

>>> svm-dislikes-qty scores
Precisão média: 0.5653160537399426
Revocação média: 0.5650314710267782
Média de F1: 0.5646688166192343

>>> svm-likes-dislikes-qty scores
Precisão média: 0.5433840653062836
Revocação média: 0.543082285437117
Média de F1: 0.5415225063733384

>>> svm-likes-dislikes-chars-words-qty scores
Precisão média: 0.582767686224

In [14]:
print_means_report(X_tf_100_sexist_words, y, knn_tf_100_sexist_words, 'knn-tf-100-sexist-words')
print_means_report(X_char_qty, y, knn_char_qty, 'knn-char-qty')
print_means_report(X_word_qty, y, knn_word_qty, 'knn-word-qty')
print_means_report(X_char_word_qty, y, knn_chars_words_qty, 'knn-char_word_qty')
print_means_report(X_likes, y, knn_likes_qty, 'knn-likes-qty')
print_means_report(X_dislikes, y, knn_dislikes_qty, 'knn-dislikes-qty')
print_means_report(X_likes_dislikes, y, knn_likes_dislikes_qty, 'knn-likes-dislikes-qty')
print_means_report(X_likes_dislike_char_words, y, knn_likes_dislikes_chars_words_qty, 'knn-likes-dislikes-chars-words-qty')

>>> knn-tf-100-sexist-words scores
Precisão média: 0.8524632071730709
Revocação média: 0.8464433535427002
Média de F1: 0.8423647163209533

>>> knn-char-qty scores
Precisão média: 0.5127469168498682
Revocação média: 0.5126623623766904
Média de F1: 0.5117391232774571

>>> knn-word-qty scores
Precisão média: 0.495064762061247
Revocação média: 0.49537114796351894
Média de F1: 0.49271623649340335

>>> knn-char_word_qty scores
Precisão média: 0.5257617642507662
Revocação média: 0.5255616457340314
Média de F1: 0.5251177371644631

>>> knn-likes-qty scores
Precisão média: 0.5319802505251762
Revocação média: 0.5295326516486474
Média de F1: 0.5216680510292385

>>> knn-dislikes-qty scores
Precisão média: 0.5613626739612589
Revocação média: 0.5596348967648155
Média de F1: 0.5569184667995835

>>> knn-likes-dislikes-qty scores
Precisão média: 0.5403327384107834
Revocação média: 0.5402858079818486
Média de F1: 0.539145931989379

>>> knn-likes-dislikes-chars-words-qty scores
Precisão média: 0.545503901