In [1]:
import nltk
import pandas as pd, numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
import pickle

In [2]:
# nltk.download()

In [2]:
train = pd.read_csv('./train.csv')
test = pd.read_csv('./test.csv')
subm = pd.read_csv('./sample_submission.csv')

In [3]:
label_cols = ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']
train['none'] = 1-train[label_cols].max(axis=1)
train.describe()

COMMENT = 'comment_text'
train[COMMENT].fillna("unknown", inplace=True)
test[COMMENT].fillna("unknown", inplace=True)

In [5]:
train.head()

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate,none
0,0000997932d777bf,Explanation\nWhy the edits made under my usern...,0,0,0,0,0,0,1
1,000103f0d9cfb60f,D'aww! He matches this background colour I'm s...,0,0,0,0,0,0,1
2,000113f07ec002fd,"Hey man, I'm really not trying to edit war. It...",0,0,0,0,0,0,1
3,0001b41b1c6bb37e,"""\nMore\nI can't make any real suggestions on ...",0,0,0,0,0,0,1
4,0001d958c54c6e35,"You, sir, are my hero. Any chance you remember...",0,0,0,0,0,0,1


In [6]:
train.describe()

Unnamed: 0,toxic,severe_toxic,obscene,threat,insult,identity_hate,none
count,159571.0,159571.0,159571.0,159571.0,159571.0,159571.0,159571.0
mean,0.095844,0.009996,0.052948,0.002996,0.049364,0.008805,0.898321
std,0.294379,0.099477,0.223931,0.05465,0.216627,0.09342,0.302226
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.0,0.0,0.0,0.0,0.0,0.0,1.0
50%,0.0,0.0,0.0,0.0,0.0,0.0,1.0
75%,0.0,0.0,0.0,0.0,0.0,0.0,1.0
max,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [7]:
import re, string
from nltk.corpus import stopwords, wordnet
from nltk import pos_tag
from nltk.stem import SnowballStemmer, WordNetLemmatizer

In [8]:
def tokenize(text):
    letters_only = re.sub("[^a-zA-Z]", " ", text).split()
    stops = set(stopwords.words("english"))                  
    meaningful_words = [w for w in letters_only if not w in stops]  
    return meaningful_words

In [9]:
def get_wordnet_pos(treebank_tag):

    if treebank_tag.startswith('J'):
        return wordnet.ADJ
    elif treebank_tag.startswith('V'):
        return wordnet.VERB
    elif treebank_tag.startswith('N'):
        return wordnet.NOUN
    elif treebank_tag.startswith('R'):
        return wordnet.ADV
    else:
        return ''

In [10]:
stemmer = SnowballStemmer("english")
lemmatiser = WordNetLemmatizer()

def analyze(doc, analyzer):
    stemmed = [stemmer.stem(w) for w in analyzer(doc)]
#     tokens = analyzer(doc)
#     tagged = [(p[0], get_wordnet_pos(p[1])) for p in pos_tag(tokens) if get_wordnet_pos(p[1]) != '' ]
#     lemmatised = [lemmatiser.lemmatize(w[0], pos=w[1]) for w in tagged]
#     return (lemmatised)
    return (stemmed)

class StemmedTfidfVectorizer(TfidfVectorizer):
    def build_analyzer(self):
        analyzer = super(StemmedTfidfVectorizer, self).build_analyzer()
        return lambda doc: analyze(doc, analyzer)

In [11]:
print(train[COMMENT][23])
tokens = tokenize(train[COMMENT][23])
stemmed = [stemmer.stem(w) for w in tokens]
tagged = [(p[0], get_wordnet_pos(p[1])) for p in pos_tag(tokens) if get_wordnet_pos(p[1]) != '' ]
lemmatised = [lemmatiser.lemmatize(w[0], pos=w[1]) for w in tagged]

print(stemmed)
print(lemmatised)

"

 The Signpost: 24 September 2012 

 Read this Signpost in full
 Single-page
 Unsubscribe
   
"
['the', 'signpost', 'septemb', 'read', 'signpost', 'full', 'singl', 'page', 'unsubscrib']
['Signpost', 'September', 'Read', 'Signpost', 'full', 'Single', 'page', 'Unsubscribe']


In [11]:
n = train.shape[0]
vec = StemmedTfidfVectorizer(analyzer = "word",   \
                             tokenizer = tokenize,    \
                             preprocessor = None, \
                             ngram_range=(1,2))

trn_term_doc = vec.fit_transform(train[COMMENT])
test_term_doc = vec.transform(test[COMMENT])

In [12]:
with open('trn_term_doc.pkl', 'wb') as output:
    pickle.dump(trn_term_doc, output, pickle.HIGHEST_PROTOCOL)

with open('test_term_doc.pkl', 'wb') as output:
    pickle.dump(test_term_doc, output, pickle.HIGHEST_PROTOCOL)

In [7]:
with open('trn_term_doc.pkl', 'rb') as input:
    trn_term_doc = pickle.load(input)
    
with open('test_term_doc.pkl', 'rb') as input:
    test_term_doc = pickle.load(input)

## NB

In [18]:
from sklearn.naive_bayes import BernoulliNB

clf = BernoulliNB()
x = trn_term_doc
test_x = test_term_doc

def naive_bayes(y):
    y = y.values
    return clf.fit(x,y)

In [15]:
NB_preds = np.zeros((len(test), len(label_cols)))
for i, j in enumerate(label_cols):
    print('fit', j)
    nb = naive_bayes(train[j])
    NB_preds[:,i] = nb.predict_proba(test_x)[:,1]

fit toxic
fit severe_toxic
fit obscene
fit threat
fit insult
fit identity_hate


## SVM

In [None]:
from sklearn.svm import LinearSVC

x = trn_term_doc
test_x = test_term_doc
SVM = lambda y: LinearSVC().fit(x,y.values)

In [None]:
preds = np.zeros((len(test), len(label_cols)))

for i, j in enumerate(label_cols):
    print('fit', j)
    svm = SVM(train[j])
    preds[:,i] = svm.predict_proba(test_x)[:,1]

## LogReg

In [18]:
x = trn_term_doc
test_x = test_term_doc
logreg = lambda y: LogisticRegression(dual=True, C=4).fit(x,y.values)

In [19]:
LogReg_preds = np.zeros((len(test), len(label_cols)))

for i, j in enumerate(label_cols):
    print('fit', j)
    lr = logreg(train[j])
    LogReg_preds[:,i] = lr.predict_proba(test_x)[:,1]

fit toxic
fit severe_toxic
fit obscene
fit threat
fit insult
fit identity_hate


## Ensemble methods

In [16]:
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.naive_bayes import MultinomialNB
x = trn_term_doc
test_x = test_term_doc

clf = MultinomialNB()
ada = AdaBoostClassifier(base_estimator=clf, n_estimators=50)
dec_tree = lambda y: ada.fit(x, y.values)

In [17]:
RF_preds = np.zeros((len(test), len(label_cols)))
for i, j in enumerate(label_cols):
    print('fit', j)
    dt = dec_tree(train[j])
    RF_preds[:,i] = dt.predict_proba(test_x)[:,1]

fit toxic
fit severe_toxic
fit obscene
fit threat
fit insult
fit identity_hate


In [41]:
submid = pd.DataFrame({'id': subm["id"]})
submission = pd.concat([submid, pd.DataFrame(preds, columns = label_cols)], axis=1)
submission.to_csv('submission.csv', index=False)

# Feature Selection

In [4]:
from sklearn.feature_selection import SelectPercentile, chi2, SelectKBest

In [8]:
X_train = trn_term_doc.copy()
X_test = test_term_doc.copy()

selected_features = []
test_selected_features = []

for i, j in enumerate(label_cols):
    y = train[j].values
    selector = SelectKBest(chi2,k=500000)
    ts = selector.fit_transform(X_train,y)
    tested = selector.transform(X_test)
    selected_features.append(ts)
    test_selected_features.append(tested)

  if np.issubdtype(mask.dtype, np.int):
