# Dependencies

In [1]:
import pandas as pd
import pandas as pd
import numpy as np

from modules.dataframe import Comments

# Importing labeled dataset

In [2]:
labeled_comments = pd.read_csv('data/labeled-comments.csv')
labeled_comments

Unnamed: 0,comment_id,content,likes,dislikes,votes,avg,std,label,char-qty,word-qty
0,1489,MUITO MAIS LEGAL RRSRSRRSRSRS,2.0,0.0,2,0.000000,0.00000,0,30,5
1,273,Canhão de guerra.,2.0,4.0,2,1.000000,0.00000,1,18,4
2,2574,femi o que?,10.0,11.0,2,1.000000,0.00000,1,11,3
3,951,Concordo plenamente Jaqueline!! Outro dia ouvi...,20.0,0.0,3,0.666667,0.57735,1,161,27
4,2520,Feminista é uma mulher encalhada que precisa d...,11.0,11.0,3,1.000000,0.00000,1,66,11
...,...,...,...,...,...,...,...,...,...,...
3566,55,Brigue esquisitinha,0.0,1.0,3,0.666667,0.57735,1,19,2
3567,3858,"Pois é, todo o bozopata é assim! Depois que pe...",3.0,0.0,3,0.333333,0.57735,0,192,35
3568,3984,Será que ninguém tem coragem de enfrentar algu...,0.0,0.0,1,0.000000,,0,368,60
3569,790,Perfeito!,2.0,2.0,3,0.333333,0.57735,0,9,1


<div class="alert alert-info">
    <b>Dataset fields description</b>
    <hline/>
    <p><b>comment_id</b>: unique identifier to each comment from database</p>
    <p><b>content</b>: comment text content</p>
    <p><b>likes</b>: comment likes quantity</p>
    <p><b>dislikes</b>: comment dislikes quantity</p>
    <p><b>votes</b>: number of users that labeled the comment</p>
    <p><b>avg</b>: average of each vote value to the comment</p>
    <p><b>std</b>: standard deviation of each vote value to the comment</p>
    <p><b>label</b>: final label assigned to the comment, label 1 represents sexist comments and label 0 represets not sexist comments</p>
    <p><b>char-qty</b>: number of characters in the comment </p>
    <p><b>word-qty</b>: number of words in the comment</p>
</div>

In [3]:
labeled_comments.describe()

Unnamed: 0,comment_id,likes,dislikes,votes,avg,std,label,char-qty,word-qty
count,3571.0,3323.0,3323.0,3571.0,3571.0,2886.0,3571.0,3571.0,3571.0
mean,1923.08261,15.611496,9.856455,2.534304,0.523321,0.230954,0.523663,140.439373,24.494539
std,1135.048496,40.102726,41.783877,1.032327,0.420923,0.277073,0.49951,178.98476,27.111817
min,4.0,0.0,0.0,1.0,0.0,0.0,0.0,2.0,1.0
25%,988.5,2.0,0.0,2.0,0.0,0.0,0.0,47.0,9.0
50%,1886.0,5.0,1.0,3.0,0.666667,0.0,1.0,94.0,17.0
75%,2783.5,14.0,6.0,3.0,1.0,0.57735,1.0,179.0,32.0
max,4283.0,729.0,1196.0,7.0,1.0,0.57735,1.0,7050.0,819.0


<div class="alert alert-info">
A visual data analysis is avaliable at https://datastudio.google.com/s/sgO8X7JORMU
</div>

# Generating dataset features and information

In [4]:
from classifiers.features import Features

# The Comments class is responsable for structuring data about
# the comments, it's source code can be analized at:
# https://github.com/mlpbraga/sexism-detection-notebooks/blob/main/modules/dataframe.py
comments = Comments()

# The Features class is responsable for turning features structured
# above into an python objetect, it's source code can be analized at:
# https://github.com/mlpbraga/sexism-detection-notebooks/blob/main/classifiers/features.py
features = Features(comments.dataframe)

> No comments without label
> Loading local dataframe


In [5]:
comments.sexist_words.head(15)

Unnamed: 0,word,sexist-freq,not-sexist-freq,undefined-freq,diff
3844,mulheres,0.009752,0.006359,0,0.003393
3834,homens,0.006732,0.003903,0,0.002829
3836,ela,0.00748,0.005013,0,0.002466
3845,mulher,0.01004,0.007907,0,0.002133
3852,de,0.04085,0.038762,0,0.002089
3804,elas,0.002445,0.000471,0,0.001974
3795,feia,0.002129,0.000202,0,0.001927
3847,uma,0.011996,0.010195,0,0.001801
3841,as,0.008803,0.007066,0,0.001737
3837,na,0.007882,0.006191,0,0.001691


In [6]:
comments.not_sexist_words.head(15)

Unnamed: 0,word,sexist-freq,not-sexist-freq,undefined-freq,diff
3830,ser,0.005523,0.007907,0,-0.002384
3727,Brasil,0.001093,0.002692,0,-0.001599
3758,sua,0.001467,0.003028,0,-0.001561
3851,não,0.022007,0.023419,0,-0.001411
3372,comentários,0.000345,0.001581,0,-0.001236
3839,em,0.008487,0.00969,0,-0.001204
3732,pessoas,0.001122,0.002322,0,-0.0012
3550,sobre,0.000518,0.001716,0,-0.001198
3616,lei,0.00069,0.001817,0,-0.001127
3779,você,0.00164,0.002759,0,-0.001119


In [7]:
comments.sexist_bigrams.head(15)

Unnamed: 0,word,sexist-freq,not-sexist-freq,undefined-freq,diff
4213,as mulheres,0.009315,0.005705,0,0.00361
4203,diz que,0.00367,0.000423,0,0.003247
4201,que homens,0.003481,0.000423,0,0.003059
4208,que as,0.00461,0.001902,0,0.002709
4199,que vai,0.003293,0.000845,0,0.002448
4210,os homens,0.005081,0.002853,0,0.002228
4198,mesmo que,0.003199,0.001162,0,0.002037
4211,uma mulher,0.006022,0.004332,0,0.00169
4193,mulheres não,0.002635,0.000951,0,0.001684
4195,mulheres que,0.003011,0.001585,0,0.001426


In [8]:
comments.not_sexist_bigrams.head(15)

Unnamed: 0,word,sexist-freq,not-sexist-freq,undefined-freq,diff
4212,que não,0.00781,0.010037,0,-0.002227
3947,ser humano,0.000565,0.00243,0,-0.001865
3926,ou não,0.000565,0.002007,0,-0.001443
4175,no Brasil,0.001882,0.00317,0,-0.001288
4167,todos os,0.001694,0.002958,0,-0.001265
4112,as pessoas,0.001035,0.002219,0,-0.001184
4202,tem que,0.003481,0.004649,0,-0.001167
3408,pessoas que,0.000282,0.001373,0,-0.001091
2731,maioria dos,0.000188,0.001268,0,-0.00108
4121,que você,0.001035,0.002113,0,-0.001078


# Classification

In [9]:
from classifiers.svm import SVM
from classifiers.knn import KNN
from classifiers.rfc import RFC

svm_classifier = SVM()
svm_params = dict(gamma=[1.0], C=[10.0])
svm_classifier.train_models(svm_params, features)

knn_classifier = KNN()
knn_params = dict(n_neighbors=[3, 5, 11, 19],
                  weights=['uniform', 'distance'],
                  metric=['euclidean', 'manhattan'])
knn_classifier.train_models(knn_params, features)

rfc_classifier = RFC()
rfc_params = { 
    'n_estimators': [200],
    'max_features': ['auto'],
    'max_depth' : [4,5,6,7,8],
    'criterion' :['entropy']
}
rfc_classifier.train_models(rfc_params, features)

Reading SVM with TF of unigrams. Model
-------------------------------------------------------------------------------------------
Reading SVM with TF of 100 sexist unigrams. Model
-------------------------------------------------------------------------------------------
Reading SVM with TF of 100 not sexist unigrams. Model
-------------------------------------------------------------------------------------------
Reading SVM with Char quantity. Model
-------------------------------------------------------------------------------------------
Reading SVM with Word quantity. Model
-------------------------------------------------------------------------------------------
Reading SVM with Likes quantity. Model
-------------------------------------------------------------------------------------------
Reading SVM with Dislikes quantity. Model
-------------------------------------------------------------------------------------------
Reading SVM with Likes and Dislikes quantity. Model
----

In [10]:
svm_classifier.report_results(features)

>>>> SVM with Unigrams TFs results
		 sexist 	 not-sexist
precision	 0.99080 	 0.86832
recall		 0.86283 	 0.99120
f1		 0.92236 	 0.92568

>>>> SVM with TF to 100 sexist unigrams results
		 sexist 	 not-sexist
precision	 0.97812 	 0.80696
recall		 0.78583 	 0.98065
f1		 0.87136 	 0.88530

>>>> SVM with TF to 100 not sexist unigrams results
		 sexist 	 not-sexist
precision	 0.96108 	 0.80386
recall		 0.78503 	 0.96510
f1		 0.86407 	 0.87707

>>>> SVM with Char quantity results
		 sexist 	 not-sexist
precision	 0.63246 	 0.64083
recall		 0.72433 	 0.53842
f1		 0.67519 	 0.58499

>>>> SVM with Word quantity results
		 sexist 	 not-sexist
precision	 0.57593 	 0.57259
recall		 0.70882 	 0.42757
f1		 0.63545 	 0.48943

>>>> SVM with Char and Word quantity results
		 sexist 	 not-sexist
precision	 0.79167 	 0.79814
recall		 0.82380 	 0.76217
f1		 0.80729 	 0.77958

>>>> SVM with Likes quantity results
		 sexist 	 not-sexist
precision	 0.59971 	 0.56191
recall		 0.60160 	 0.55982
f1		 0.60056 	

In [11]:
knn_classifier.report_results(features)

>>>> KNN with Unigrams TFs results
		 sexist 	 not-sexist
precision	 0.97126 	 0.87728
recall		 0.87594 	 0.97155
f1		 0.92109 	 0.92197

>>>> KNN with TF to 100 sexist unigrams results
		 sexist 	 not-sexist
precision	 0.77470 	 0.97960
recall		 0.98690 	 0.68475
f1		 0.86793 	 0.80576

>>>> KNN with TF to 100 not sexist unigrams results
		 sexist 	 not-sexist
precision	 0.77443 	 0.97619
recall		 0.98476 	 0.68504
f1		 0.86696 	 0.80490

>>>> KNN with Char quantity results
		 sexist 	 not-sexist
precision	 0.59060 	 0.57660
recall		 0.67353 	 0.48768
f1		 0.62925 	 0.52826

>>>> KNN with Word quantity results
		 sexist 	 not-sexist
precision	 0.55990 	 0.53671
recall		 0.66043 	 0.43079
f1		 0.60591 	 0.47772

>>>> KNN with Char and Word quantity results
		 sexist 	 not-sexist
precision	 0.59610 	 0.57440
recall		 0.65214 	 0.51496
f1		 0.62269 	 0.54277

>>>> KNN with Likes quantity results
		 sexist 	 not-sexist
precision	 0.57664 	 0.52225
recall		 0.50455 	 0.59384
f1		 0.53813 	

In [12]:
rfc_classifier.report_results(features)

>>>> RFC with Unigrams TFs results
		 sexist 	 not-sexist
precision	 0.97384 	 0.80506
recall		 0.78396 	 0.97683
f1		 0.86847 	 0.88258

>>>> RFC with TF to 100 sexist unigrams results
		 sexist 	 not-sexist
precision	 0.98751 	 0.74811
recall		 0.69572 	 0.99032
f1		 0.81619 	 0.85229

>>>> RFC with TF to 100 not sexist unigrams results
		 sexist 	 not-sexist
precision	 0.96120 	 0.72965
recall		 0.67193 	 0.97009
f1		 0.79070 	 0.83278

>>>> RFC with Char quantity results
		 sexist 	 not-sexist
precision	 0.55230 	 0.57777
recall		 0.81711 	 0.27361
f1		 0.65904 	 0.37100

>>>> RFC with Word quantity results
		 sexist 	 not-sexist
precision	 0.54717 	 0.56312
recall		 0.81845 	 0.25689
f1		 0.65580 	 0.35244

>>>> RFC with Char and Word quantity results
		 sexist 	 not-sexist
precision	 0.55380 	 0.59023
recall		 0.83342 	 0.26334
f1		 0.66538 	 0.36387

>>>> RFC with Likes quantity results
		 sexist 	 not-sexist
precision	 0.57858 	 0.54975
recall		 0.62647 	 0.49971
f1		 0.60149 	

### Wilcoxon statistical test

In [13]:
from scipy.stats import wilcoxon

def get_scores(model):
    scores = []
    for i in range(0,10):
        scores.append(model.cv_results_[f'split{i}_test_score'][0])
    return scores

In [26]:
svm_model_scores = {
    'tf_unigrams': get_scores(svm_classifier.model_tf_unigrams),
    'tf_sexist_unigrams': get_scores(svm_classifier.model_tf_sexist_unigrams),
    'tf_not_sexist_unigrams': get_scores(svm_classifier.model_tf_not_sexist_unigrams),
    'char_qty': get_scores(svm_classifier.model_char_qty),
    'word_qty': get_scores(svm_classifier.model_word_qty),
    'likes_qty': get_scores(svm_classifier.model_likes_qty),
    'dislikes_qty': get_scores(svm_classifier.model_dislikes_qty),
    'likes_dislikes_qty': get_scores(svm_classifier.model_likes_dislikes_qty),
    'likes_dislikes_chars_words_qty': get_scores(svm_classifier.model_likes_dislikes_chars_words_qty),
    'tf_unigrams_likes_dislikes_chars_words': get_scores(svm_classifier.model_tf_unigrams_likes_dislikes_chars_words),
    'tf_bigrams': get_scores(svm_classifier.model_tf_bigrams),
    'tf_sexist_bigrams': get_scores(svm_classifier.model_tf_sexist_bigrams),
    'tf_not_sexist_bigrams': get_scores(svm_classifier.model_tf_not_sexist_bigrams),
    'tf_unigrams_bigrams': get_scores(svm_classifier.model_tf_unigrams_bigrams),
}

knn_model_scores = {
    'tf_unigrams': get_scores(knn_classifier.model_tf_unigrams),
    'tf_sexist_unigrams': get_scores(knn_classifier.model_tf_sexist_unigrams),
    'tf_not_sexist_unigrams': get_scores(knn_classifier.model_tf_not_sexist_unigrams),
    'char_qty': get_scores(knn_classifier.model_char_qty),
    'word_qty': get_scores(knn_classifier.model_word_qty),
    'likes_qty': get_scores(knn_classifier.model_likes_qty),
    'dislikes_qty': get_scores(knn_classifier.model_dislikes_qty),
    'likes_dislikes_qty': get_scores(knn_classifier.model_likes_dislikes_qty),
    'likes_dislikes_chars_words_qty': get_scores(knn_classifier.model_likes_dislikes_chars_words_qty),
    'tf_unigrams_likes_dislikes_chars_words': get_scores(knn_classifier.model_tf_unigrams_likes_dislikes_chars_words),
    'tf_bigrams': get_scores(knn_classifier.model_tf_bigrams),
    'tf_sexist_bigrams': get_scores(knn_classifier.model_tf_sexist_bigrams),
    'tf_not_sexist_bigrams': get_scores(knn_classifier.model_tf_not_sexist_bigrams),
    'tf_unigrams_bigrams': get_scores(knn_classifier.model_tf_unigrams_bigrams),
}

rfc_model_scores = {
    'tf_unigrams': get_scores(rfc_classifier.model_tf_unigrams),
    'tf_sexist_unigrams': get_scores(rfc_classifier.model_tf_sexist_unigrams),
    'tf_not_sexist_unigrams': get_scores(rfc_classifier.model_tf_not_sexist_unigrams),
    'char_qty': get_scores(rfc_classifier.model_char_qty),
    'word_qty': get_scores(rfc_classifier.model_word_qty),
    'likes_qty': get_scores(rfc_classifier.model_likes_qty),
    'dislikes_qty': get_scores(rfc_classifier.model_dislikes_qty),
    'likes_dislikes_qty': get_scores(rfc_classifier.model_likes_dislikes_qty),
    'likes_dislikes_chars_words_qty': get_scores(rfc_classifier.model_likes_dislikes_chars_words_qty),
    'tf_unigrams_likes_dislikes_chars_words': get_scores(rfc_classifier.model_tf_unigrams_likes_dislikes_chars_words),
    'tf_bigrams': get_scores(rfc_classifier.model_tf_bigrams),
    'tf_sexist_bigrams': get_scores(rfc_classifier.model_tf_sexist_bigrams),
    'tf_not_sexist_bigrams': get_scores(rfc_classifier.model_tf_not_sexist_bigrams),
    'tf_unigrams_bigrams': get_scores(rfc_classifier.model_tf_unigrams_bigrams),
}

bert_model_scores = [
    0.571069182389937,
    0.45852187028657615,
    0.5364583333333334,
    0.5258964143426295,
    0.5313351498637601,
    0.4828571428571429,
    0.43333333333333335,
    0.5463659147869674,
    0.5493333333333332,
    0.5195822454308094
]


In [36]:
columns = ['bert_model_scores',
           'svm_tf_unigrams',
           'svm_tf_bigrams',
           'svm_tf_unigrams_bigrams',
           'svm_likes_dislikes_chars_words_qty',
           'svm_tf_unigrams_likes_dislikes_chars_words',
           'knn_tf_unigrams',
           'knn_likes_dislikes_chars_words_qty',
           'knn_tf_bigrams',
           'knn_tf_unigrams_bigrams',
           'rfc_tf_unigrams_bigrams',
          ]

In [37]:
results = []

def select_values(l):
    if 'svm' in l:
        return svm_model_scores[l.replace('svm_','')]
    if 'knn' in l:
        return knn_model_scores[l.replace('knn_','')]
    if 'rfc' in l:
        return rfc_model_scores[l.replace('rfc_','')]   
    if 'bert' in l:
        return bert_model_scores
    if 'baseline_svm' in l:
        return baseline_model_scores['svm']
    if 'baseline_lr' in l:
        return baseline_model_scores['lr']
i = 0
for l in columns:
    results.append([])
    values_y = select_values(l)
    for c in columns:
        values_x = select_values(c)
        if values_x == values_y:
            results[i].append((0, 0))
        else:
            test_stat, test_p = wilcoxon(values_y, values_x)
            results[i].append((test_stat, test_p))
    i += 1

In [38]:
pd.DataFrame(results, columns=columns, index=columns)

Unnamed: 0,bert_model_scores,svm_tf_unigrams,svm_tf_bigrams,svm_tf_unigrams_bigrams,svm_likes_dislikes_chars_words_qty,svm_tf_unigrams_likes_dislikes_chars_words,knn_tf_unigrams,knn_likes_dislikes_chars_words_qty,knn_tf_bigrams,knn_tf_unigrams_bigrams,rfc_tf_unigrams_bigrams
bert_model_scores,"(0, 0)","(0.0, 0.005062032126267864)","(0.0, 0.005062032126267864)","(0.0, 0.005062032126267864)","(0.0, 0.005062032126267864)","(0.0, 0.005062032126267864)","(0.0, 0.005062032126267864)","(13.0, 0.13941397332153205)","(0.0, 0.005062032126267864)","(0.0, 0.005062032126267864)","(0.0, 0.005062032126267864)"
svm_tf_unigrams,"(0.0, 0.005062032126267864)","(0, 0)","(26.0, 0.8784817434328712)","(0.0, 0.005062032126267864)","(0.0, 0.005062032126267864)","(0.0, 0.005062032126267864)","(0.0, 0.005062032126267864)","(0.0, 0.005062032126267864)","(0.0, 0.005062032126267864)","(0.0, 0.005062032126267864)","(0.0, 0.005062032126267864)"
svm_tf_bigrams,"(0.0, 0.005062032126267864)","(26.0, 0.8784817434328712)","(0, 0)","(0.0, 0.005062032126267864)","(0.0, 0.005062032126267864)","(0.0, 0.005062032126267864)","(0.0, 0.005062032126267864)","(0.0, 0.005062032126267864)","(0.0, 0.005062032126267864)","(0.0, 0.005062032126267864)","(0.0, 0.005062032126267864)"
svm_tf_unigrams_bigrams,"(0.0, 0.005062032126267864)","(0.0, 0.005062032126267864)","(0.0, 0.005062032126267864)","(0, 0)","(0.0, 0.005062032126267864)","(0.0, 0.005062032126267864)","(0.0, 0.005062032126267864)","(0.0, 0.005062032126267864)","(0.0, 0.005062032126267864)","(0.0, 0.005062032126267864)","(3.0, 0.012515318690073973)"
svm_likes_dislikes_chars_words_qty,"(0.0, 0.005062032126267864)","(0.0, 0.005062032126267864)","(0.0, 0.005062032126267864)","(0.0, 0.005062032126267864)","(0, 0)","(0.0, 0.005062032126267864)","(0.0, 0.005062032126267864)","(0.0, 0.005062032126267864)","(0.0, 0.005062032126267864)","(0.0, 0.005062032126267864)","(0.0, 0.005062032126267864)"
svm_tf_unigrams_likes_dislikes_chars_words,"(0.0, 0.005062032126267864)","(0.0, 0.005062032126267864)","(0.0, 0.005062032126267864)","(0.0, 0.005062032126267864)","(0.0, 0.005062032126267864)","(0, 0)","(0.0, 0.005062032126267864)","(0.0, 0.005062032126267864)","(0.0, 0.005062032126267864)","(0.0, 0.005062032126267864)","(0.0, 0.005062032126267864)"
knn_tf_unigrams,"(0.0, 0.005062032126267864)","(0.0, 0.005062032126267864)","(0.0, 0.005062032126267864)","(0.0, 0.005062032126267864)","(0.0, 0.005062032126267864)","(0.0, 0.005062032126267864)","(0, 0)","(0.0, 0.005062032126267864)","(0.0, 0.005062032126267864)","(0.0, 0.005062032126267864)","(0.0, 0.005062032126267864)"
knn_likes_dislikes_chars_words_qty,"(13.0, 0.13941397332153205)","(0.0, 0.005062032126267864)","(0.0, 0.005062032126267864)","(0.0, 0.005062032126267864)","(0.0, 0.005062032126267864)","(0.0, 0.005062032126267864)","(0.0, 0.005062032126267864)","(0, 0)","(0.0, 0.005062032126267864)","(0.0, 0.005062032126267864)","(0.0, 0.005062032126267864)"
knn_tf_bigrams,"(0.0, 0.005062032126267864)","(0.0, 0.005062032126267864)","(0.0, 0.005062032126267864)","(0.0, 0.005062032126267864)","(0.0, 0.005062032126267864)","(0.0, 0.005062032126267864)","(0.0, 0.005062032126267864)","(0.0, 0.005062032126267864)","(0, 0)","(21.0, 0.5076243443095237)","(0.0, 0.005062032126267864)"
knn_tf_unigrams_bigrams,"(0.0, 0.005062032126267864)","(0.0, 0.005062032126267864)","(0.0, 0.005062032126267864)","(0.0, 0.005062032126267864)","(0.0, 0.005062032126267864)","(0.0, 0.005062032126267864)","(0.0, 0.005062032126267864)","(0.0, 0.005062032126267864)","(21.0, 0.5076243443095237)","(0, 0)","(0.0, 0.005062032126267864)"


In [35]:
columns

['bert_model_scores',
 'svm_tf_unigrams',
 'svm_tf_bigrams',
 'svm_tf_unigrams_bigrams',
 'svm_likes_dislikes_chars_words_qty',
 'svm_tf_unigrams_likes_dislikes_chars_words',
 'knn_tf_unigrams',
 'knn_likes_dislikes_chars_words_qty',
 'knn_bigrams',
 'knn_unigrams_bigrams',
 'rfc_unigrams_bigrams']

In [20]:
knn_classifier.model

AttributeError: 'KNN' object has no attribute 'model'

In [21]:
knn_classifier.model_tf_unigrams

GridSearchCV(cv=StratifiedShuffleSplit(n_splits=10, random_state=42, test_size=0.2,
            train_size=None),
             estimator=KNeighborsClassifier(), n_jobs=12,
             param_grid={'metric': ['euclidean', 'manhattan'],
                         'n_neighbors': [3, 5, 11, 19],
                         'weights': ['uniform', 'distance']},
             scoring='f1')

In [22]:
knn_classifier.model_tf_unigrams.__dict__

{'scoring': 'f1',
 'estimator': KNeighborsClassifier(),
 'n_jobs': 12,
 'refit': True,
 'cv': StratifiedShuffleSplit(n_splits=10, random_state=42, test_size=0.2,
             train_size=None),
 'verbose': 0,
 'pre_dispatch': '2*n_jobs',
 'error_score': nan,
 'return_train_score': False,
 'param_grid': {'n_neighbors': [3, 5, 11, 19],
  'weights': ['uniform', 'distance'],
  'metric': ['euclidean', 'manhattan']},
 'multimetric_': False,
 'best_index_': 9,
 'best_score_': 0.7970995007583548,
 'best_params_': {'metric': 'manhattan',
  'n_neighbors': 3,
  'weights': 'distance'},
 'best_estimator_': KNeighborsClassifier(metric='manhattan', n_neighbors=3, weights='distance'),
 'refit_time_': 0.00932765007019043,
 'scorer_': make_scorer(f1_score, average=binary),
 'cv_results_': {'mean_fit_time': array([0.02597923, 0.03327157, 0.04386549, 0.03305533, 0.03286896,
         0.02618849, 0.03169377, 0.02937942, 0.02656765, 0.03200812,
         0.02711742, 0.03788321, 0.0301183 , 0.03362391, 0.026756

In [25]:
rfc_classifier.model_tf_unigrams.__dict__

{'scoring': 'f1',
 'estimator': RandomForestClassifier(),
 'n_jobs': 12,
 'refit': True,
 'cv': StratifiedShuffleSplit(n_splits=10, random_state=42, test_size=0.2,
             train_size=None),
 'verbose': 0,
 'pre_dispatch': '2*n_jobs',
 'error_score': nan,
 'return_train_score': False,
 'param_grid': {'n_estimators': [200],
  'max_features': ['auto'],
  'max_depth': [4, 5, 6, 7, 8],
  'criterion': ['entropy']},
 'multimetric_': False,
 'best_index_': 4,
 'best_score_': 0.8638480599666533,
 'best_params_': {'criterion': 'entropy',
  'max_depth': 8,
  'max_features': 'auto',
  'n_estimators': 200},
 'best_estimator_': RandomForestClassifier(criterion='entropy', max_depth=8, n_estimators=200),
 'refit_time_': 0.44132161140441895,
 'scorer_': make_scorer(f1_score, average=binary),
 'cv_results_': {'mean_fit_time': array([0.98905756, 1.08937073, 1.20143616, 1.24455857, 1.1870702 ]),
  'std_fit_time': array([0.19201289, 0.08400931, 0.09635543, 0.16945437, 0.20167996]),
  'mean_score_time'