# Dependencies

In [1]:
import pandas as pd
import pandas as pd
import numpy as np

from modules.dataframe import Comments

# Importing labeled dataset

In [2]:
# Load the labeled comments from the CSV file (path is relative to the
# notebook's working directory) into a DataFrame.
labeled_comments = pd.read_csv('data/labeled-comments.csv')
# Bare last expression: renders the DataFrame as the cell output.
labeled_comments

Unnamed: 0,comment_id,content,likes,dislikes,votes,avg,std,label,char-qty,word-qty
0,1489,MUITO MAIS LEGAL RRSRSRRSRSRS,2.0,0.0,2,0.000000,0.00000,0,30,5
1,273,Canhão de guerra.,2.0,4.0,2,1.000000,0.00000,1,18,4
2,2574,femi o que?,10.0,11.0,2,1.000000,0.00000,1,11,3
3,951,Concordo plenamente Jaqueline!! Outro dia ouvi...,20.0,0.0,3,0.666667,0.57735,1,161,27
4,2520,Feminista é uma mulher encalhada que precisa d...,11.0,11.0,3,1.000000,0.00000,1,66,11
...,...,...,...,...,...,...,...,...,...,...
3566,55,Brigue esquisitinha,0.0,1.0,3,0.666667,0.57735,1,19,2
3567,3858,"Pois é, todo o bozopata é assim! Depois que pe...",3.0,0.0,3,0.333333,0.57735,0,192,35
3568,3984,Será que ninguém tem coragem de enfrentar algu...,0.0,0.0,1,0.000000,,0,368,60
3569,790,Perfeito!,2.0,2.0,3,0.333333,0.57735,0,9,1


<div class="alert alert-info">
    <b>Dataset fields description</b>
    <hr/>
    <p><b>comment_id</b>: unique identifier of each comment in the database</p>
    <p><b>content</b>: comment text content</p>
    <p><b>likes</b>: comment likes quantity</p>
    <p><b>dislikes</b>: comment dislikes quantity</p>
    <p><b>votes</b>: number of users that labeled the comment</p>
    <p><b>avg</b>: average of the vote values given to the comment</p>
    <p><b>std</b>: standard deviation of the vote values given to the comment</p>
    <p><b>label</b>: final label assigned to the comment; label 1 represents sexist comments and label 0 represents not sexist comments</p>
    <p><b>char-qty</b>: number of characters in the comment </p>
    <p><b>word-qty</b>: number of words in the comment</p>
</div>

In [3]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric columns.
# In the output captured with this notebook, `likes`/`dislikes` (3323) and `std` (2886)
# have fewer non-null values than the 3571 rows, i.e. those columns contain missing values.
labeled_comments.describe()

Unnamed: 0,comment_id,likes,dislikes,votes,avg,std,label,char-qty,word-qty
count,3571.0,3323.0,3323.0,3571.0,3571.0,2886.0,3571.0,3571.0,3571.0
mean,1923.08261,15.611496,9.856455,2.534304,0.523321,0.230954,0.523663,140.439373,24.494539
std,1135.048496,40.102726,41.783877,1.032327,0.420923,0.277073,0.49951,178.98476,27.111817
min,4.0,0.0,0.0,1.0,0.0,0.0,0.0,2.0,1.0
25%,988.5,2.0,0.0,2.0,0.0,0.0,0.0,47.0,9.0
50%,1886.0,5.0,1.0,3.0,0.666667,0.0,1.0,94.0,17.0
75%,2783.5,14.0,6.0,3.0,1.0,0.57735,1.0,179.0,32.0
max,4283.0,729.0,1196.0,7.0,1.0,0.57735,1.0,7050.0,819.0


<div class="alert alert-info">
A visual data analysis is available at https://datastudio.google.com/s/sgO8X7JORMU
</div>

# Generating dataset features and information

In [23]:
from classifiers.features import Features

# The Comments class is responsible for structuring data about
# the comments; its source code can be found at:
# https://github.com/mlpbraga/sexism-detection-notebooks/blob/main/modules/dataframe.py
comments = Comments()

# The Features class is responsible for turning the features structured
# above into a Python object; its source code can be found at:
# https://github.com/mlpbraga/sexism-detection-notebooks/blob/main/classifiers/features.py
features = Features(comments.dataframe)

> No comments without label
> Loading local dataframe


In [13]:
# First 15 rows of the unigram frequency table for the sexist class.
# In the captured output, `diff` equals `sexist-freq` minus `not-sexist-freq`
# and the rows appear in descending `diff` order.
comments.sexist_words.head(15)

Unnamed: 0,word,sexist-freq,not-sexist-freq,undefined-freq,diff
3844,mulheres,0.009752,0.006359,0,0.003393
3834,homens,0.006732,0.003903,0,0.002829
3836,ela,0.00748,0.005013,0,0.002466
3845,mulher,0.01004,0.007907,0,0.002133
3852,de,0.04085,0.038762,0,0.002089
3804,elas,0.002445,0.000471,0,0.001974
3795,feia,0.002129,0.000202,0,0.001927
3847,uma,0.011996,0.010195,0,0.001801
3841,as,0.008803,0.007066,0,0.001737
3837,na,0.007882,0.006191,0,0.001691


In [15]:
# First 15 rows of the unigram frequency table for the not-sexist class.
# In the captured output, `diff` (`sexist-freq` minus `not-sexist-freq`) is
# negative and the rows appear with the most negative `diff` first.
comments.not_sexist_words.head(15)

Unnamed: 0,word,sexist-freq,not-sexist-freq,undefined-freq,diff
3830,ser,0.005523,0.007907,0,-0.002384
3727,Brasil,0.001093,0.002692,0,-0.001599
3758,sua,0.001467,0.003028,0,-0.001561
3851,não,0.022007,0.023419,0,-0.001411
3372,comentários,0.000345,0.001581,0,-0.001236
3839,em,0.008487,0.00969,0,-0.001204
3732,pessoas,0.001122,0.002322,0,-0.0012
3550,sobre,0.000518,0.001716,0,-0.001198
3616,lei,0.00069,0.001817,0,-0.001127
3779,você,0.00164,0.002759,0,-0.001119


In [16]:
# First 15 rows of the bigram frequency table for the sexist class.
# In the captured output, `diff` equals `sexist-freq` minus `not-sexist-freq`
# and the rows appear in descending `diff` order.
comments.sexist_bigrams.head(15)

Unnamed: 0,word,sexist-freq,not-sexist-freq,undefined-freq,diff
4213,as mulheres,0.009315,0.005705,0,0.00361
4203,diz que,0.00367,0.000423,0,0.003247
4201,que homens,0.003481,0.000423,0,0.003059
4208,que as,0.00461,0.001902,0,0.002709
4199,que vai,0.003293,0.000845,0,0.002448
4210,os homens,0.005081,0.002853,0,0.002228
4198,mesmo que,0.003199,0.001162,0,0.002037
4211,uma mulher,0.006022,0.004332,0,0.00169
4193,mulheres não,0.002635,0.000951,0,0.001684
4195,mulheres que,0.003011,0.001585,0,0.001426


In [17]:
# First 15 rows of the bigram frequency table for the not-sexist class.
# In the captured output, `diff` (`sexist-freq` minus `not-sexist-freq`) is
# negative and the rows appear with the most negative `diff` first.
comments.not_sexist_bigrams.head(15)

Unnamed: 0,word,sexist-freq,not-sexist-freq,undefined-freq,diff
4212,que não,0.00781,0.010037,0,-0.002227
3947,ser humano,0.000565,0.00243,0,-0.001865
3926,ou não,0.000565,0.002007,0,-0.001443
4175,no Brasil,0.001882,0.00317,0,-0.001288
4167,todos os,0.001694,0.002958,0,-0.001265
4112,as pessoas,0.001035,0.002219,0,-0.001184
4202,tem que,0.003481,0.004649,0,-0.001167
3408,pessoas que,0.000282,0.001373,0,-0.001091
2731,maioria dos,0.000188,0.001268,0,-0.00108
4121,que você,0.001035,0.002113,0,-0.001078


# Classification

In [25]:
from classifiers.svm import SVM
from classifiers.knn import KNN
from classifiers.rfc import RFC

# Each classifier receives a hyper-parameter grid (parameter name -> list of
# candidate values) and the shared `features` object; per the log printed by
# this cell, train_models() runs a grid search for every feature set.

# SVM: a single candidate per parameter, so the grid has exactly one point.
svm_classifier = SVM()
svm_params = dict(gamma=[1.0], C=[10.0])
svm_classifier.train_models(svm_params, features)

# KNN: 4 neighbour counts x 2 weighting schemes x 2 distance metrics.
knn_classifier = KNN()
knn_params = dict(n_neighbors=[3, 5, 11, 19],
                  weights=['uniform', 'distance'],
                  metric=['euclidean', 'manhattan'])
knn_classifier.train_models(knn_params, features)

# RFC: only the tree depth is actually searched.
rfc_classifier = RFC()
rfc_params = dict(n_estimators=[200],
                  # 'sqrt' replaces the former 'auto': for random-forest
                  # classifiers in scikit-learn the two are equivalent, but
                  # 'auto' was deprecated in 1.1 and removed in 1.3.
                  # NOTE(review): assumes RFC wraps scikit-learn's
                  # RandomForestClassifier -- confirm in classifiers/rfc.py.
                  max_features=['sqrt'],
                  max_depth=[4, 5, 6, 7, 8],
                  criterion=['entropy'])
rfc_classifier.train_models(rfc_params, features)

Reading SVM with TF of unigrams. Model
Executing Grid Search to SVM with TF of unigrams.
-------------------------------------------------------------------------------------------
Reading SVM with TF of 100 sexist unigrams. Model
Executing Grid Search to SVM with TF of 100 sexist unigrams.
-------------------------------------------------------------------------------------------
Reading SVM with TF of 100 not sexist unigrams. Model
Executing Grid Search to SVM with TF of 100 not sexist unigrams.
-------------------------------------------------------------------------------------------
Reading SVM with Char quantity. Model
Executing Grid Search to SVM with Char quantity.
-------------------------------------------------------------------------------------------
Reading SVM with Word quantity. Model
Executing Grid Search to SVM with Word quantity.
-------------------------------------------------------------------------------------------
Reading SVM with Likes quantity. Model
Executin

-------------------------------------------------------------------------------------------
Reading RFC with Sexist Bigrams TFs. Model
Executing Grid Search to RFC with Sexist Bigrams TFs.
-------------------------------------------------------------------------------------------
Reading RFC with Not Sexist Bigrams TFs. Model
Executing Grid Search to RFC with Not Sexist Bigrams TFs.
-------------------------------------------------------------------------------------------
Reading RFC with Unigrams TFs, Likes, Dislikes, Chars and Words quantity. Model
Executing Grid Search to RFC with Unigrams TFs, Likes, Dislikes, Chars and Words quantity.
-------------------------------------------------------------------------------------------


In [26]:
# Print precision, recall and F1 for the sexist / not-sexist classes,
# one table per SVM feature set (see the captured output below).
svm_classifier.report_results(features)

>>>> SVM with Unigrams TFs results
		 sexist 	 not-sexist
precision	 0.99593 	 0.91356
recall		 0.91390 	 0.99589
f1		 0.95310 	 0.95291

>>>> SVM with TF to 100 sexist unigrams results
		 sexist 	 not-sexist
precision	 0.98464 	 0.84894
recall		 0.83984 	 0.98563
f1		 0.90641 	 0.91214

>>>> SVM with TF to 100 not sexist unigrams results
		 sexist 	 not-sexist
precision	 0.97169 	 0.82199
recall		 0.80749 	 0.97419
f1		 0.88195 	 0.89161

>>>> SVM with Char quantity results
		 sexist 	 not-sexist
precision	 0.63525 	 0.64626
recall		 0.73048 	 0.53988
f1		 0.67950 	 0.58821

>>>> SVM with Word quantity results
		 sexist 	 not-sexist
precision	 0.56838 	 0.55446
recall		 0.68610 	 0.42845
f1		 0.62165 	 0.48324

>>>> SVM with Char and Word quantity results
		 sexist 	 not-sexist
precision	 0.79441 	 0.79767
recall		 0.82246 	 0.76628
f1		 0.80806 	 0.78147

>>>> SVM with Likes quantity results
		 sexist 	 not-sexist
precision	 0.58781 	 0.55249
recall		 0.60294 	 0.53666
f1		 0.59511 	

In [27]:
# Print precision, recall and F1 for the sexist / not-sexist classes,
# one table per KNN feature set (see the captured output below).
knn_classifier.report_results(features)

>>>> KNN with Unigrams TFs results
		 sexist 	 not-sexist
precision	 0.99766 	 0.91345
recall		 0.91364 	 0.99765
f1		 0.95375 	 0.95366

>>>> KNN with TF to 100 sexist unigrams results
		 sexist 	 not-sexist
precision	 0.98680 	 0.84834
recall		 0.83877 	 0.98768
f1		 0.90670 	 0.91267

>>>> KNN with TF to 100 not sexist unigrams results
		 sexist 	 not-sexist
precision	 0.97010 	 0.82135
recall		 0.80695 	 0.97273
f1		 0.88099 	 0.89062

>>>> KNN with Char quantity results
		 sexist 	 not-sexist
precision	 0.61479 	 0.60027
recall		 0.67433 	 0.53636
f1		 0.64309 	 0.56637

>>>> KNN with Word quantity results
		 sexist 	 not-sexist
precision	 0.54601 	 0.53698
recall		 0.75588 	 0.31056
f1		 0.63397 	 0.39327

>>>> KNN with Char and Word quantity results
		 sexist 	 not-sexist
precision	 0.60033 	 0.58013
recall		 0.65615 	 0.52082
f1		 0.62691 	 0.54873

>>>> KNN with Likes quantity results
		 sexist 	 not-sexist
precision	 0.55377 	 0.52182
recall		 0.63128 	 0.44164
f1		 0.58981 	

In [28]:
# Print precision, recall and F1 for the sexist / not-sexist classes,
# one table per RFC feature set (see the captured output below).
rfc_classifier.report_results(features)

>>>> RFC with Unigrams TFs results
		 sexist 	 not-sexist
precision	 0.97187 	 0.86255
recall		 0.85829 	 0.97273
f1		 0.91143 	 0.91424

>>>> RFC with TF to 100 sexist unigrams results
		 sexist 	 not-sexist
precision	 0.97276 	 0.81082
recall		 0.79198 	 0.97566
f1		 0.87293 	 0.88554

>>>> RFC with TF to 100 not sexist unigrams results
		 sexist 	 not-sexist
precision	 0.95241 	 0.78477
recall		 0.76016 	 0.95836
f1		 0.84542 	 0.86288

>>>> RFC with Char quantity results
		 sexist 	 not-sexist
precision	 0.54888 	 0.56347
recall		 0.80749 	 0.27214
f1		 0.65350 	 0.36683

>>>> RFC with Word quantity results
		 sexist 	 not-sexist
precision	 0.54366 	 0.55220
recall		 0.81604 	 0.24868
f1		 0.65252 	 0.34262

>>>> RFC with Char and Word quantity results
		 sexist 	 not-sexist
precision	 0.56985 	 0.61102
recall		 0.80722 	 0.33167
f1		 0.66803 	 0.42970

>>>> RFC with Likes quantity results
		 sexist 	 not-sexist
precision	 0.57065 	 0.54218
recall		 0.62861 	 0.48152
f1		 0.59805 	