In [30]:
import pandas as pd
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score

In [31]:
# Load the Polish comment texts (indexed by the first CSV column) and the
# per-annotator attack labels.
# 'ANSI' is an alias of the Windows-only `mbcs` codec: it is unavailable on
# Linux/macOS and its meaning depends on the machine's locale, so the code
# page is named explicitly here.
# NOTE(review): cp1250 (Central European) is assumed because the Polish
# diacritics in the preview below decode cleanly -- confirm against the file.
comments = pd.read_csv('data/attack_pl.csv', encoding='cp1250', index_col=0)
annotations = pd.read_csv('data/attack_annotations.tsv',  sep='\t')

In [32]:
# Preview the first rows of the comments table.
comments.head()

Unnamed: 0,comment,logged_in,ns,sample,split,year
37675,`- To nie jest&quot; twórcze &quot;. Są to def...,0.0,article,random,train,2002.0
44816,`:: pojęcie` `standardowy model`` samo w sobie...,0.0,article,random,train,2002.0
49851,"Prawda czy fałsz, sytuacja, która miała miejsc...",0.0,article,random,train,2002.0
89320,"Następnie możesz pracować nad tym, aby być mni...",1.0,article,random,dev,2002.0
93890,Ta strona będzie potrzebowała dwuznaczności.,1.0,article,random,train,2002.0


In [33]:
# Preview the first rows of the annotations table (one row per rev_id/worker pair).
annotations.head()

Unnamed: 0,rev_id,worker_id,quoting_attack,recipient_attack,third_party_attack,other_attack,attack
0,37675,1362,0.0,0.0,0.0,0.0,0.0
1,37675,2408,0.0,0.0,0.0,0.0,0.0
2,37675,1493,0.0,0.0,0.0,0.0,0.0
3,37675,1439,0.0,0.0,0.0,0.0,0.0
4,37675,170,0.0,0.0,0.0,0.0,0.0


In [34]:
# Majority vote per revision: a comment is labelled an attack only when
# strictly more than half of its annotators flagged it.
attack_share = annotations.groupby('rev_id')['attack'].mean()
labels = attack_share.gt(0.5)
# Column assignment aligns the Series on the frame's index, so every comment
# receives the label computed for the rev_id equal to its index value.
comments['attack'] = labels
comments.head()

Unnamed: 0,comment,logged_in,ns,sample,split,year,attack
37675,`- To nie jest&quot; twórcze &quot;. Są to def...,0.0,article,random,train,2002.0,False
44816,`:: pojęcie` `standardowy model`` samo w sobie...,0.0,article,random,train,2002.0,False
49851,"Prawda czy fałsz, sytuacja, która miała miejsc...",0.0,article,random,train,2002.0,False
89320,"Następnie możesz pracować nad tym, aby być mni...",1.0,article,random,dev,2002.0,False
93890,Ta strona będzie potrzebowała dwuznaczności.,1.0,article,random,train,2002.0,False


In [35]:
# Cast the text column to plain Python strings before vectorising.
# Note that a missing value would become the literal string 'nan'.
comments = comments.astype({'comment': str})
comments.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 65116 entries, 37675 to 312182972
Data columns (total 7 columns):
comment      65116 non-null object
logged_in    65116 non-null float64
ns           65116 non-null object
sample       65116 non-null object
split        65116 non-null object
year         65116 non-null float64
attack       65116 non-null bool
dtypes: bool(1), float64(2), object(4)
memory usage: 3.5+ MB


In [36]:
# Partition on the predefined `split` column; rows from any other split
# (e.g. 'dev') are not used by this notebook.
train_comments = comments.loc[comments['split'] == 'train']
test_comments = comments.loc[comments['split'] == 'test']

In [37]:
# Text classifier: word 1- to 3-gram counts -> TF-IDF weighting (L2-normalised
# rows) -> logistic regression.
clf = Pipeline([
    ('vect', CountVectorizer(max_features=None, ngram_range=(1, 3))),
    ('tfidf', TfidfTransformer(norm='l2')),
    # The solver is pinned to liblinear, the one the recorded run used (see
    # the "[LibLinear]" output); newer scikit-learn releases default to lbfgs,
    # which would not reproduce the same fit.
    ('clf', LogisticRegression(solver='liblinear', verbose=True)),
])
# fit() trains the pipeline in place and returns it, so no rebinding is needed.
clf.fit(train_comments['comment'], train_comments['attack'])

# Evaluate on the held-out test split. Column 1 of predict_proba is the
# probability of the second class in `clf.classes_` (True for boolean labels).
test_scores = clf.predict_proba(test_comments['comment'])[:, 1]
auc = roc_auc_score(test_comments['attack'], test_scores)
print('Test ROC AUC: %.3f' % auc)

[LibLinear]

Test ROC AUC: 0.916


In [38]:
# `sklearn.externals.joblib` was deprecated in scikit-learn 0.21 and removed
# in 0.23. Prefer the standalone package and fall back to the vendored copy
# only on old installs that lack it.
try:
    import joblib
except ImportError:
    from sklearn.externals import joblib

# Persist the fitted pipeline (vectoriser + TF-IDF + classifier) as one file.
joblib.dump(clf, 'backend/resources/attack_pl.pkl')

['backend/resources/attack_pl.pkl']

In [39]:
# Smoke test: classify a single Polish word ('głupi' means 'stupid') with the
# fitted pipeline.
clf.predict(['głupi'])

array([ True], dtype=bool)