In [None]:
import nltk
import pandas as pd, numpy as np
from sklearn.svm import LinearSVC
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.feature_selection import SelectPercentile, chi2, SelectKBest
from sklearn.metrics import accuracy_score, f1_score, recall_score, precision_score
from sklearn.model_selection import GridSearchCV, train_test_split, KFold
from sklearn.naive_bayes import BernoulliNB
import pickle

In [None]:
# Read the training set, the test set and the sample submission from the
# working directory, in that order.
train, test, subm = (
    pd.read_csv(csv_path)
    for csv_path in ('./train.csv', './test.csv', './sample_submission.csv')
)

In [None]:
# Target columns; one binary classifier is trained per entry further down.
label_cols = ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']
# 'none' is 1 minus the row-wise maximum over the label columns.
# NOTE(review): this reads as "no label set" only if labels are 0/1 - confirm.
train['none'] = 1-train[label_cols].max(axis=1)

COMMENT = 'comment_text'
# Assign the filled column back instead of calling fillna(inplace=True) on the
# column selection: the chained in-place form is deprecated in pandas and does
# not update the parent frame when Copy-on-Write is enabled.
train[COMMENT] = train[COMMENT].fillna("unknown")
test[COMMENT] = test[COMMENT].fillna("unknown")

# Kept as the last expression so the notebook actually renders the summary.
train.describe()

## TF-IDF data

In [None]:
# Load the pre-computed term-document matrices for the train and test sets.
# NOTE(review): pickle.load can execute arbitrary code - only load files that
# were produced locally / by a trusted source.
# The handles are not called `input`, so the builtin input() is not shadowed
# for the rest of the kernel session.
with open('trn_term_doc.pkl', 'rb') as trn_file:
    trn_term_doc = pickle.load(trn_file)

with open('test_term_doc.pkl', 'rb') as test_file:
    test_term_doc = pickle.load(test_file)

## Named-entity (NE) recognition

In [None]:
from nltk import ne_chunk, pos_tag, word_tokenize
from nltk.tree import Tree

def get_continuous_chunks(text):
    """Return the distinct named entities found in ``text``, in first-seen order.

    The text is tokenized, POS-tagged and passed through ``ne_chunk``. Every
    run of adjacent ``Tree`` nodes in the result is joined with spaces into a
    single entity string.

    Parameters
    ----------
    text : str
        Raw text to analyse.

    Returns
    -------
    list of str
        Unique entity strings; empty when no ``Tree`` node is produced.
    """
    chunked = ne_chunk(pos_tag(word_tokenize(text)))
    continuous_chunk = []
    current_chunk = []

    # A trailing None acts as a sentinel so that an entity sitting at the very
    # end of the text is flushed by the same branch as every other entity.
    for node in list(chunked) + [None]:
        if isinstance(node, Tree):
            current_chunk.append(" ".join(token for token, pos in node.leaves()))
        elif current_chunk:
            named_entity = " ".join(current_chunk)
            if named_entity not in continuous_chunk:
                continuous_chunk.append(named_entity)
            # Always start a fresh run, even when the entity was a duplicate;
            # otherwise the next entity would be glued onto this one.
            current_chunk = []
    return continuous_chunk

# Quick manual check of the entity extractor on a news-style sentence.
my_sent = (
    "WASHINGTON -- In the wake of a string of abuses by New York police officers "
    "in the 1990s, Loretta E. Lynch, the top federal prosecutor in Brooklyn, "
    "spoke forcefully about the pain of a broken trust that African-Americans "
    "felt and said the responsibility for repairing generations of "
    "miscommunication and mistrust fell to law enforcement."
)
get_continuous_chunks(my_sent)

## Language detection

In [None]:
from langdetect import DetectorFactory, detect, lang_detect_exception

# langdetect's algorithm is randomised; pinning the seed makes detect() return
# the same language for the same text on every run, so the language feature
# computed later is reproducible.
DetectorFactory.seed = 0

# Sanity check on a short non-English sentence.
detect("Ja sam Lovro")

## Combining it all

In [None]:
X_train = trn_term_doc.copy()
X_test = test_term_doc.copy()

# Upper bound on the number of chi2-ranked columns kept per label.
N_BEST_FEATURES = 500000
# SelectKBest does not accept k larger than the number of columns as a valid
# setting (error or warning, depending on the scikit-learn version), so clamp.
k_best = min(N_BEST_FEATURES, X_train.shape[1])

# One reduced matrix per entry of label_cols, in the same order.
selected_features = []
test_selected_features = []

# NOTE(review): the loop index `i` stays bound after this cell and a later
# cell appears to read it - confirm before dropping enumerate().
for i, j in enumerate(label_cols):
    y = train[j].values
    selector = SelectKBest(chi2, k=k_best)
    # Fit the selector on the training matrix only, then apply the same
    # column mask to the test matrix.
    selected_features.append(selector.fit_transform(X_train, y))
    test_selected_features.append(selector.transform(X_test))

In [None]:
# Per-comment extra features:
#   train_lang_detect - 10 when the detected language is not English, else 0
#   train_ne          - number of distinct named entities in the comment
train_lang_detect = []
train_ne = []

# enumerate() provides the progress counter; the previous `i==0` was a
# comparison (not an assignment) and relied on `i` leaking from another cell.
for i, txt in enumerate(train[COMMENT]):
    if i % 1000 == 0:
        print(i)

    try:
        is_english = detect(txt) == 'en'
    except Exception:
        # Best effort: when detection fails, treat the comment as English.
        # `Exception` (not a bare except) keeps KeyboardInterrupt working so
        # this long-running loop can still be stopped.
        is_english = True
    train_lang_detect.append(0 if is_english else 10)

    train_ne.append(len(get_continuous_chunks(txt)))

train_lang_detect = np.asarray(train_lang_detect)
train_ne = np.asarray(train_ne)

In [None]:
# Cache both extra-feature arrays on disk, one pickle file per array.
for pkl_path, feature_array in (
    ('extra_features_lang.pkl', train_lang_detect),
    ('extra_features_ne.pkl', train_ne),
):
    with open(pkl_path, 'wb') as sink:
        pickle.dump(feature_array, sink, pickle.HIGHEST_PROTOCOL)

In [None]:
# Reload the cached extra-feature arrays from disk.
# NOTE(review): pickle.load can execute arbitrary code - only load files that
# were produced locally / by a trusted source.
# The handles are not called `input`, so the builtin input() is not shadowed
# for the rest of the kernel session.
with open('extra_features_lang.pkl', 'rb') as lang_file:
    train_lang_detect = pickle.load(lang_file)

with open('extra_features_ne.pkl', 'rb') as ne_file:
    train_ne = pickle.load(ne_file)

In [None]:
from scipy.sparse import hstack

# Turn each extra-feature vector into a single column.
# NOTE(review): assumes both arrays are 1-D with one entry per training row.
lang_dec_T = np.asarray(train_lang_detect).reshape(-1, 1)
ne_T = np.asarray(train_ne).reshape(-1, 1)

# Append the named-entity count and then the language flag as the last two
# columns of every per-label matrix, updating the list in place.
selected_features[:] = [
    hstack((hstack((label_matrix, ne_T)), lang_dec_T))
    for label_matrix in selected_features
]

## Results

In [None]:
def nested_kfold_cv(clf, param_grid, X, y, k1=3, k2=3):
    """Score ``clf`` with nested k-fold cross-validation.

    The outer loop splits the data with a shuffled ``KFold`` (fixed
    ``random_state=42``). For each outer split, a ``GridSearchCV`` with ``k2``
    inner folds selects hyper-parameters on the outer training part, and the
    selected model is scored on the outer held-out part.

    Parameters
    ----------
    clf : estimator
        Estimator handed to ``GridSearchCV``.
    param_grid : dict or list of dict
        Hyper-parameter grid handed to ``GridSearchCV``.
    X : indexable
        Feature matrix; must support row selection with an index array.
    y : indexable
        Target vector aligned with ``X``.
        NOTE(review): the metrics use their default settings, which presumably
        means binary labels are expected - confirm.
    k1 : int, default 3
        Number of outer folds.
    k2 : int, default 3
        ``cv`` argument of the inner grid search.

    Returns
    -------
    tuple of float
        Mean accuracy, precision, recall and F1 over the outer folds.
    """
    acc, precision, recall, f1 = [], [], [], []
    outer_cv = KFold(n_splits=k1, shuffle=True, random_state=42)

    # Outer loop: performance estimation on data unseen by model selection.
    for ind_train, ind_test in outer_cv.split(X):
        X_fit, y_fit = X[ind_train], y[ind_train]
        X_eval, y_eval = X[ind_test], y[ind_test]

        # Inner loop: hyper-parameter selection on the outer training part.
        search = GridSearchCV(clf, param_grid, cv=k2, n_jobs=-1).fit(X_fit, y_fit)

        # GridSearchCV refits the best configuration on all of X_fit by
        # default (refit=True), so best_estimator_ is used as is; fitting it a
        # second time on the same data would only repeat that work.
        h = search.best_estimator_.predict(X_eval)

        acc.append(accuracy_score(y_eval, h))
        precision.append(precision_score(y_eval, h))
        recall.append(recall_score(y_eval, h))
        f1.append(f1_score(y_eval, h))

    return np.mean(acc), np.mean(precision), np.mean(recall), np.mean(f1)

### Evaluation LogReg

In [None]:
# Regularisation grid: powers of two from 2**-5 up to 2**5.
Cs = [2 ** exponent for exponent in range(-5, 6)]
param = [{'C': Cs}]

# One list per metric; each receives one nested-CV mean per label.
avg_acc, avg_precision, avg_recall, avg_f1 = [], [], [], []
metric_buckets = (avg_acc, avg_precision, avg_recall, avg_f1)

for label, label_matrix in zip(label_cols, selected_features):
    print('fit', label)
    label_scores = nested_kfold_cv(
        LogisticRegression(class_weight="balanced"),
        param,
        label_matrix.tocsc(),
        train[label].values,
    )
    for bucket, score in zip(metric_buckets, label_scores):
        bucket.append(score)

In [None]:
# Average each metric over the labels and print accuracy, precision, recall
# and F1 on a single line.
acc, precision, recall, f1 = (
    np.mean(per_label) for per_label in (avg_acc, avg_precision, avg_recall, avg_f1)
)

print(acc, precision, recall, f1)

### Evaluation SVM

In [None]:
# Regularisation grid: powers of two from 2**-5 up to 2**5.
Cs = [2 ** exponent for exponent in range(-5, 6)]
param = [{'C': Cs}]

# One list per metric; each receives one nested-CV mean per label.
avg_acc, avg_precision, avg_recall, avg_f1 = [], [], [], []
metric_buckets = (avg_acc, avg_precision, avg_recall, avg_f1)

for label, label_matrix in zip(label_cols, selected_features):
    print('fit', label)
    label_scores = nested_kfold_cv(
        LinearSVC(class_weight="balanced"),
        param,
        label_matrix.tocsc(),
        train[label].values,
    )
    for bucket, score in zip(metric_buckets, label_scores):
        bucket.append(score)

In [None]:
# Average each metric over the labels and print accuracy, precision, recall
# and F1 on a single line.
acc, precision, recall, f1 = (
    np.mean(per_label) for per_label in (avg_acc, avg_precision, avg_recall, avg_f1)
)

print(acc, precision, recall, f1)