In [1]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

In [2]:
# Load the three ConvAI corpora. Each corpus ships as a pair of tab-separated
# files: the comments themselves and the per-annotator judgements.
toxic_cmt = pd.read_csv('_RawData/ConvAI/toxicity_annotated_comments.tsv', sep='\t')
toxic_annot = pd.read_csv('_RawData/ConvAI/toxicity_annotations.tsv', sep='\t')

aggr_cmt = pd.read_csv('_RawData/ConvAI/aggression_annotated_comments.tsv', sep='\t')
aggr_annot = pd.read_csv('_RawData/ConvAI/aggression_annotations.tsv', sep='\t')

attack_cmt = pd.read_csv('_RawData/ConvAI/attack_annotated_comments.tsv', sep='\t')
attack_annot = pd.read_csv('_RawData/ConvAI/attack_annotations.tsv', sep='\t')

In [3]:
def JoinAndSanitize(cmt, annot):
    """Attach per-comment mean annotations to the comments and clean the text.

    Parameters
    ----------
    cmt : DataFrame with a 'rev_id' column (one row per comment).
    annot : DataFrame with a 'rev_id' column (one row per annotation).

    Returns
    -------
    DataFrame indexed by 'rev_id': the comment columns plus the mean of every
    annotation column for that 'rev_id', passed through ``Sanitize``.
    """
    # Collapse the annotations to one row per comment by averaging.
    mean_by_rev = annot.groupby(['rev_id']).mean()
    comments_by_rev = cmt.set_index('rev_id')
    # Index-on-index left join keeps every comment row.
    return Sanitize(comments_by_rev.join(mean_by_rev))

In [4]:
def Sanitize(df):
    """Return a copy of ``df`` with its comment column normalised.

    The comment column is 'comment' when present, otherwise 'comment_text'.
    Its text is lower-cased, every 'newline_token' marker is replaced by a
    single space, and missing comments are filled with the literal 'erikov'.

    The input frame is left untouched; callers must use the returned frame.

    Raises
    ------
    KeyError
        If ``df`` has neither a 'comment' nor a 'comment_text' column.
    """
    comment = 'comment' if 'comment' in df else 'comment_text'
    if comment not in df:
        raise KeyError("expected a 'comment' or 'comment_text' column")

    cleaned = (
        df[comment]
        .str.lower()                          # so the marker below matches any casing
        .str.replace('newline_token', ' ')
        .fillna('erikov')                     # missing comments stay missing until here
    )
    # assign() builds a new frame, so re-running a cell never sees a
    # half-modified input.
    return df.assign(**{comment: cleaned})

In [5]:
# One cleaned, rev_id-indexed frame per corpus (comments + mean annotations).
toxic, attack, aggression = (
    JoinAndSanitize(toxic_cmt, toxic_annot),
    JoinAndSanitize(attack_cmt, attack_annot),
    JoinAndSanitize(aggr_cmt, aggr_annot),
)

In [6]:
# Row counts of the attack and aggression frames, shown side by side.
len(attack), len(aggression)

(115864, 115864)

In [7]:
# Check whether the attack and aggression frames hold the same 'comment' column.
attack['comment'].equals(aggression['comment'])

True

In [8]:
# Correlation between the per-comment mean 'attack' and mean 'aggression' scores.
attack['attack'].corr(aggression['aggression'])

0.97181029408703123

In [9]:
# Preview the joined toxicity frame; swap in attack.head() or aggression.head()
# to inspect the other two frames.
toxic.head()

Unnamed: 0_level_0,comment,year,logged_in,ns,sample,split,worker_id,toxicity,toxicity_score
rev_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
2232.0,this: :one can make an analogy in mathematical...,2002,True,article,random,train,2101.2,0.1,0.4
4216.0,` :clarification for you (and zundark's righ...,2002,True,user,random,train,682.8,0.0,0.5
8953.0,elected or electoral? jhk,2002,False,article,random,test,2223.3,0.0,0.1
26547.0,`this is such a fun entry. devotchka i once...,2002,True,article,random,train,2617.4,0.0,0.6
28959.0,please relate the ozone hole to increases in c...,2002,True,article,random,test,2891.8,0.2,0.2


In [10]:
from sklearn.feature_extraction.text import TfidfVectorizer

def Tfidfize(df, max_vocab=200000):
    """Fit a TF-IDF vectoriser on the frame's comment column.

    Parameters
    ----------
    df : mapping/DataFrame holding the text in 'comment' (preferred) or
        'comment_text'.
    max_vocab : int, default 200000
        Upper bound on the vocabulary size (``max_features``).

    Returns
    -------
    (tfidf, tfidfer) : the TF-IDF matrix for ``df`` and the fitted vectoriser,
    which can later ``transform`` other text into the same feature space.
    """
    comment = 'comment' if 'comment' in df else 'comment_text'

    # Boolean flags are passed as real booleans: recent scikit-learn releases
    # validate parameter types and reject the integer 1 here.
    tfidfer = TfidfVectorizer(ngram_range=(1, 2), max_features=max_vocab,
                              use_idf=True, stop_words='english',
                              smooth_idf=True, sublinear_tf=True)
    tfidf = tfidfer.fit_transform(df[comment])

    return tfidf, tfidfer

In [11]:
# Fit one TF-IDF vectoriser per corpus; each call yields (matrix, fitted vectoriser).
X_toxic, tfidfer_toxic = Tfidfize(toxic)
X_attack, tfidfer_attack = Tfidfize(attack)
X_aggression, tfidfer_aggression = Tfidfize(aggression)

# Regression targets: the per-comment mean annotation for each corpus.
y_toxic, y_attack, y_aggression = (
    toxic['toxicity'].values,
    attack['attack'].values,
    aggression['aggression'].values,
)

In [12]:
from sklearn.linear_model import Ridge
from sklearn.model_selection import cross_val_score

ridge = Ridge()

# Cross-validated MSE for each target. The scorer reports *negative* MSE,
# hence the sign flip. cross_val_score fits clones, so `ridge` itself stays unfitted.
mse_toxic, mse_attack, mse_aggression = (
    -cross_val_score(ridge, features, target, scoring='neg_mean_squared_error')
    for features, target in (
        (X_toxic, y_toxic),
        (X_attack, y_attack),
        (X_aggression, y_aggression),
    )
)

In [13]:
# Mean cross-validated MSE per target, in the order toxicity, attack, aggression.
mse_toxic.mean(), mse_attack.mean(), mse_aggression.mean()

(0.021793672359200312, 0.025567422509219988, 0.027521045602787262)

In [14]:
# Fit a separate Ridge per target. Ridge.fit returns the estimator itself, so
# fitting one shared instance three times would bind all three names to the
# same object, holding only the last (aggression) fit.
model_toxic = Ridge().fit(X_toxic, y_toxic)
model_attack = Ridge().fit(X_attack, y_attack)
model_aggression = Ridge().fit(X_aggression, y_aggression)

In [15]:
# Load the train and test CSVs that will be scored with the ConvAI models.
train_orig = pd.read_csv('_RawData/train.csv')
test_orig = pd.read_csv('_RawData/test.csv')

In [16]:
# Clean the comment text of both frames with the same Sanitize step that was
# applied to the ConvAI data, so the fitted vectorisers see comparable text.
train_orig = Sanitize(train_orig)
test_orig = Sanitize(test_orig)

In [17]:
def TfidfAndPredict(tfidfer, model):
    """Score the train and test comments with a fitted vectoriser/model pair.

    Reads the notebook-level frames ``train_orig`` and ``test_orig`` and uses
    their 'comment_text' column; both must exist before this is called.

    Parameters
    ----------
    tfidfer : fitted object exposing ``transform(texts)``.
    model : fitted object exposing ``predict(features)``.

    Returns
    -------
    (train_scores, test_scores) : predictions for the train and test frames.
    """
    def _score(frame):
        # Project the text into the vectoriser's feature space, then predict.
        return model.predict(tfidfer.transform(frame['comment_text']))

    return _score(train_orig), _score(test_orig)

In [18]:
# Toxicity scores for the train/test comments, using tfidfer_toxic and model_toxic.
toxic_tr_scores, toxic_t_scores = TfidfAndPredict(tfidfer_toxic, model_toxic)

In [19]:
# Inspect the shapes of the returned train/test score arrays.
toxic_tr_scores.shape, toxic_t_scores.shape

((95851,), (226998,))

In [20]:
# Attack scores for the train/test comments, using tfidfer_attack and model_attack.
attack_tr_scores, attack_t_scores = TfidfAndPredict(tfidfer_attack, model_attack)

In [21]:
# Inspect the shapes of the returned train/test score arrays.
attack_tr_scores.shape, attack_t_scores.shape

((95851,), (226998,))

In [22]:
# Aggression scores for the train/test comments, using tfidfer_aggression and model_aggression.
aggression_tr_scores, aggression_t_scores = TfidfAndPredict(tfidfer_aggression, model_aggression)

In [23]:
# Inspect the shapes of the returned train/test score arrays.
aggression_tr_scores.shape, aggression_t_scores.shape

((95851,), (226998,))

In [24]:
# toxic_level, to not be confused with original label 'toxic'
# Attach the predicted scores to both frames as new columns (in place).
# The toxicity score is stored as 'toxic_level' so that it is not confused
# with the original label 'toxic'.
train_orig['toxic_level'] = toxic_tr_scores
train_orig['attack'] = attack_tr_scores
train_orig['aggression'] = aggression_tr_scores
test_orig['toxic_level'] = toxic_t_scores
test_orig['attack'] = attack_t_scores
test_orig['aggression'] = aggression_t_scores

In [26]:
# Write the augmented frames next to the raw data; index=False keeps the
# DataFrame row index out of the CSV files.
train_orig.to_csv('_RawData/train_with_convai.csv', index=False)
test_orig.to_csv('_RawData/test_with_convai.csv', index=False)