In [91]:
import pandas as pd
import numpy as np
from sklearn.neural_network import MLPClassifier
from sklearn import preprocessing as pre
#from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score, confusion_matrix, f1_score
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import ShuffleSplit
from sklearn.externals import joblib

In [59]:
# Preprocessing dataset comments hate wikipedia
df = pd.read_csv('../MachineLearning/DATASETS/Identificador de comentários toxicos/train.csv/train.csv')
df['is_toxic'] = df['identity_hate'] + df['insult'] + df['obscene'] + df['severe_toxic'] + df['threat'] + df['toxic']
df.drop(['identity_hate', 'insult', 'obscene', 'severe_toxic', 'threat', 'toxic', 'id'], axis=1, inplace=True)
df['is_toxic'] = df['is_toxic'].apply(lambda x: 0 if x == 0 else 1)
df.head()

Unnamed: 0,comment_text,is_toxic
0,Explanation\nWhy the edits made under my usern...,0
1,D'aww! He matches this background colour I'm s...,0
2,"Hey man, I'm really not trying to edit war. It...",0
3,"""\nMore\nI can't make any real suggestions on ...",0
4,"You, sir, are my hero. Any chance you remember...",0


In [62]:
# Preprocessing dataset comments hate twitter
df2 = pd.read_csv('../NLP/labeled_data.csv')
df2.drop(['Unnamed: 0', 'count', 'hate_speech', 'offensive_language', 'neither'], axis=1, inplace=True)
df2['class'] = df2['class'].apply(lambda x: 0 if x == 2 else 1)
df2['comment_text'] = df2['tweet']
df2['is_toxic'] = df2['class']
df2.drop(['class', 'tweet'], axis=1, inplace=True)
df2.head()

Unnamed: 0,comment_text,is_toxic
0,!!! RT @mayasolovely: As a woman you shouldn't...,0
1,!!!!! RT @mleew17: boy dats cold...tyga dwn ba...,1
2,!!!!!!! RT @UrKindOfBrand Dawg!!!! RT @80sbaby...,1
3,!!!!!!!!! RT @C_G_Anderson: @viva_based she lo...,1
4,!!!!!!!!!!!!! RT @ShenikaRoberts: The shit you...,1


In [63]:
# Join datasets
df3 = pd.concat([df,df2], axis=0)
df3.head()

Unnamed: 0,comment_text,is_toxic
0,Explanation\nWhy the edits made under my usern...,0
1,D'aww! He matches this background colour I'm s...,0
2,"Hey man, I'm really not trying to edit war. It...",0
3,"""\nMore\nI can't make any real suggestions on ...",0
4,"You, sir, are my hero. Any chance you remember...",0


In [66]:
print(f'Number of hate comments: {len(df3[df3["is_toxic"] == 1])}')
print(f'Number of non-hateful comments: {len(df3[df3["is_toxic"] == 0])}')

Number of hate comments: 36845
Number of non-hateful comments: 147509


In [73]:
# Extracting features from text using TFiDF
X = df['comment_text']
y = df['is_toxic']

vectorizer = TfidfVectorizer(stop_words='english')
X_tfidf = vectorizer.fit_transform(X)

In [95]:
with open('tfidf_vectorizer.pkl', 'wb') as fin:
    joblib.dump(vectorizer, fin)

In [85]:
print(vectorizer.get_stop_words())

frozenset({'five', 'may', 'my', 'about', 'enough', 'seem', 'is', 'one', 'somehow', 'cry', 'whither', 'whom', 'moreover', 'how', 'up', 'against', 'an', 'take', 'eight', 'etc', 'had', 'latter', 'but', 'formerly', 'into', 'no', 'seemed', 'much', 'almost', 'show', 'bottom', 'sincere', 'was', 'nor', 'might', 'now', 'somewhere', 'wherever', 'nobody', 'not', 'with', 'along', 'mill', 'she', 'becoming', 'besides', 'whoever', 'through', 'whereby', 'whereafter', 'however', 'nowhere', 'hundred', 'never', 'several', 'should', 'whatever', 'seeming', 'whence', 'interest', 'amount', 'by', 'throughout', 'everything', 'if', 'this', 'bill', 'fire', 'sometimes', 'seems', 'per', 'perhaps', 'became', 'fifty', 'hasnt', 'toward', 'herself', 'thereafter', 'becomes', 'upon', 'were', 'keep', 'at', 'eleven', 'de', 'his', 'thru', 'give', 'there', 'what', 'least', 'done', 'below', 'amongst', 'latterly', 'hers', 'of', 'move', 'become', 'thereby', 'her', 'none', 'also', 'describe', 'them', 'that', 'everywhere', 'co',

In [87]:
vectorizer.vocabulary_

{'explanation': 62425,
 'edits': 56947,
 'username': 174749,
 'hardcore': 76786,
 'metallica': 107782,
 'fan': 63714,
 'reverted': 141473,
 'weren': 180154,
 'vandalisms': 175621,
 'just': 91508,
 'closure': 39000,
 'gas': 70200,
 'voted': 178116,
 'new': 115856,
 'york': 185478,
 'dolls': 53793,
 'fac': 63076,
 'don': 53911,
 'remove': 139889,
 'template': 164243,
 'talk': 162732,
 'page': 123573,
 'retired': 141226,
 '89': 8806,
 '205': 3503,
 '38': 5730,
 '27': 4536,
 'aww': 22077,
 'matches': 105297,
 'background': 22670,
 'colour': 40130,
 'seemingly': 148463,
 'stuck': 158915,
 'thanks': 165094,
 '21': 3845,
 '51': 6819,
 'january': 89152,
 '11': 1040,
 '2016': 3376,
 'utc': 174937,
 'hey': 78867,
 'man': 103742,
 'really': 137622,
 'trying': 170113,
 'edit': 56839,
 'war': 178969,
 'guy': 75330,
 'constantly': 42205,
 'removing': 139904,
 'relevant': 139583,
 'information': 85398,
 'talking': 162781,
 'instead': 86154,
 'care': 33942,
 'formatting': 67291,
 'actual': 11502,
 'in

In [90]:
print(vectorizer.idf_)

[  6.49561527   5.89434905  10.78302593 ...,  12.28710333  12.28710333
  12.28710333]


In [92]:
text_clf =  MLPClassifier(solver='lbfgs', alpha=1e-5, hidden_layer_sizes=(5, 2), random_state=1)
cv = ShuffleSplit(n_splits=10, test_size=0.3, random_state=1)
scores = cross_val_score(text_clf, X_tfidf, y, cv=cv, scoring='f1_macro')
print(f'F1 Score: {scores.mean()} (+/- {scores.std() * 2})')

F1 Score: 0.8698825479448024 (+/- 0.009330115733128645)


In [93]:
joblib.dump(text_clf, 'model_mlp_cls.pkl')

['model_mlp_cls.pkl']

In [4]:
!pip install googletrans

Collecting googletrans
  Downloading https://files.pythonhosted.org/packages/89/70/8df7605661051e678e097bbbb6e226583fa0f5742ad1d46aaf1be85217d2/googletrans-2.2.0.tar.gz
Building wheels for collected packages: googletrans
  Running setup.py bdist_wheel for googletrans ... [?25ldone
[?25h  Stored in directory: /home/jovyan/.cache/pip/wheels/03/67/83/38f65b30bb5ca9296e5045c50a3f4876eec8df0c47adaca385
Successfully built googletrans
Installing collected packages: googletrans
Successfully installed googletrans-2.2.0
[33mYou are using pip version 9.0.1, however version 10.0.1 is available.
You should consider upgrading via the 'pip install --upgrade pip' command.[0m


In [8]:
from googletrans import Translator
translator = Translator()
translator.translate('Olá mundo')
# <Translated src=ko dest=en text=Good evening. pronunciation=Good evening.>
trans = translator.translate('Olá mundo', dest='en')
trans.text

'Hello World'

In [11]:
detect = translator.detect('이 문장은 한글로 쓰여졌습니다.')
detect.lang

'ko'