In [None]:
import numpy as np
import pandas as pd
pd.set_option('max_colwidth',400)
import seaborn as sns
import matplotlib.pyplot as plt
sns.set(color_codes=True)
from collections import Counter
import string
import re

from nltk.corpus import stopwords
from nltk.tokenize import TweetTokenizer
from tensorflow.keras.preprocessing.text import Tokenizer
from nltk.util import ngrams

In [None]:
from sklearn.model_selection import train_test_split, cross_val_score
from nltk.util import ngrams
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
from sklearn.multiclass import OneVsRestClassifier
from sklearn.preprocessing import OneHotEncoder
from sklearn.neural_network import MLPClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.gaussian_process.kernels import RBF
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis

In [None]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.layers import Dense, Input, LSTM, Embedding, Dropout, Activation, Conv1D, GRU, BatchNormalization
from tensorflow.keras.layers import Bidirectional, GlobalMaxPool1D, MaxPooling1D, Add, Flatten
from tensorflow.keras.layers import GlobalAveragePooling1D, GlobalMaxPooling1D, concatenate, SpatialDropout1D
from tensorflow.keras.models import Model, load_model
from tensorflow.keras import initializers, regularizers, constraints, optimizers, layers, callbacks
from tensorflow.keras import backend as K
# from tensorflow.keras.engine import InputSpec, Layer
from tensorflow.keras.optimizers import Adam


from tensorflow.keras.callbacks import ModelCheckpoint, TensorBoard, Callback, EarlyStopping

### Read and compute main stats

In [None]:
# Load the train and test splits of the toxic-comment data set.
train_df, test_df = (
    pd.read_csv(csv_path, sep=",")
    for csv_path in ("../data/toxic/train.csv", "../data/toxic/test.csv")
)

In [None]:
# Print (rows, columns) of the training frame, then display its first 20 rows.
print(train_df.shape)
train_df.head(20)

In [None]:
# Print (rows, columns) of the test frame, then display its first 20 rows.
print(test_df.shape)
test_df.head(20)

In [None]:
# Column dtypes and non-null counts of the training frame.
train_df.info()

In [None]:
# Summary statistics of the numeric columns of the training frame.
train_df.describe()

### y distribution

In [None]:
# Report, for every label column, how many training comments carry the label
# and which share of the training set that is.
label_columns = ["toxic", "severe_toxic", "obscene", "threat", "insult", "identity_hate"]
n_comments = train_df.shape[0]
for label in label_columns:
    n_positive = train_df[label].sum()
    # The share is printed with a "%" sign, so scale the fraction by 100.
    print("number {0} is {1:.0f} or {2:.2f}%".format(label, n_positive, 100 * n_positive / n_comments))

In [None]:
# `normal` is 1 when none of the six toxicity labels is set for a comment, else 0.
y = train_df[["toxic","severe_toxic","obscene","threat","insult", "identity_hate"]]
# Vectorised row-wise `any` replaces the slow Python-level iterrows() loop;
# the result is kept as a plain list, as before.
normal = (1 - y.any(axis=1).astype(int)).tolist()
train_df["normal"] = normal

### Sentence analysis

In [None]:
# Frequency of every whitespace-separated, lower-cased token in the training comments.
text = ' '.join(train_df['comment_text'].values).lower().split()
counter = Counter(text)
ranked_tokens = counter.most_common()
# 50 most frequent tokens (printed) and 50 least frequent tokens (cell output).
print(ranked_tokens[:50])
ranked_tokens[-50:]

In [None]:
# Same token-frequency count as above, but with English stop words and
# punctuation-only tokens filtered out.
text = ' '.join(train_df['comment_text'].values).lower()
# Build the stop-word set once: calling stopwords.words() inside the
# comprehension re-evaluates it for every single token of the corpus.
english_stopwords = set(stopwords.words('english'))
text = [token for token in text.split()
        if token not in english_stopwords and token not in string.punctuation]
counter = Counter(text)
print(counter.most_common()[:50])
counter.most_common()[-50:]

In [None]:
# Boolean mask over the training comments: True where the comment contains
# two consecutive newlines (note: despite its name, the mask does not look for smileys).
smile_selector = []
for comment in train_df['comment_text'].values:
    smile_selector.append("\n\n" in comment.lower())
train_df[smile_selector]

### Tokenization

In [None]:
def del_all_nonalphabetic_from_word(word:str)->str:
    """Return `word` with every character outside A-Z, a-z and 0-9 removed.

    Digits are kept despite the function name; non-ASCII letters are dropped.
    The result may be an empty string.
    """
    return re.sub('[^A-Za-z0-9]+', '', word).strip()

def del_all_nonalphabetic_from_list(words:list)->list:
    """Clean every token with `del_all_nonalphabetic_from_word` and drop the empty results.

    The relative order of the surviving tokens is preserved.
    """
    cleaned_tokens = map(del_all_nonalphabetic_from_word, words)
    return [token for token in cleaned_tokens if token]

In [None]:
# Tokenise every training comment with NLTK's TweetTokenizer, then strip
# non-alphanumeric characters from the tokens and drop the ones left empty.
# Token case is preserved; no lower-casing is applied here.
tokenizer = TweetTokenizer()
tokenized_train = (
    train_df.comment_text
    .apply(tokenizer.tokenize)
    .apply(del_all_nonalphabetic_from_list)
)


In [None]:
# Flatten the per-comment token lists, count token frequencies and print
# the number of distinct tokens.
all_words = []
for words in tokenized_train:
    all_words.extend(words)
word_counter = Counter(all_words)
print(len(word_counter))

### Vectorizer (tf-idf)

In [None]:
# Unigram TF-IDF features (at most 10000 terms) using the tweet tokenizer.
tokenizer = TweetTokenizer()
vectorizer = TfidfVectorizer(ngram_range=(1, 1), tokenizer=tokenizer.tokenize, max_features=10000)
# fit_transform fits the vectorizer and vectorises the training comments in
# one pass, instead of tokenising the whole training corpus twice.
train_vectorized = vectorizer.fit_transform(train_df['comment_text'])
# The test set is only transformed with the vocabulary learned on train.
test_vectorized = vectorizer.transform(test_df['comment_text'])
print(train_vectorized.shape)

In [None]:
# Binary target for the models below: 1 = no toxicity label set, 0 = at least one.
# NOTE: this rebinds `y`, which previously held the six-column label frame.
y = train_df['normal']

### Classical machine learning

In [None]:
# Baseline classifiers, each paired with the display name used when reporting results.
named_classifiers = [
    ("LogisticRegression", LogisticRegression()),
    ("Decision Tree", DecisionTreeClassifier(max_depth=5)),
    ("Random Forest", RandomForestClassifier(max_depth=5, n_estimators=10, max_features=1)),
    ("Neural Net", MLPClassifier(alpha=1, max_iter=1000)),
    ("AdaBoost", AdaBoostClassifier()),
]
names = [name for name, _ in named_classifiers]
classifiers = [estimator for _, estimator in named_classifiers]

In [None]:
# Fit every baseline on the full training matrix and report its 2-fold
# cross-validated accuracy.
for name, clf in zip(names, classifiers):
    print(name, "result")
    clf.fit(train_vectorized, y)
    scores = cross_val_score(clf, train_vectorized, y, cv=2, scoring='accuracy', n_jobs=-1)
    mean_pct, std_pct = np.mean(scores) * 100, np.std(scores) * 100
    print('Cross-validation mean accuracy {0:.2f}%, std {1:.2f}.'.format(mean_pct, std_pct))

In [None]:
# Logistic regression with default hyper-parameters, used for the binary `normal` target.
logreg = LogisticRegression()
# ovr = OneVsRestClassifier(logreg)

In [None]:
# %%time
# ovr.fit(train_vectorized, y)
# scores = cross_val_score(ovr, train_vectorized, y, scoring='accuracy', n_jobs=-1, cv=3)
# print('Cross-validation mean accuracy {0:.2f}%, std {1:.2f}.'.format(np.mean(scores) * 100, np.std(scores) * 100))

In [None]:
%%time
logreg.fit(train_vectorized, y)
scores = cross_val_score(logreg, train_vectorized, y, scoring='accuracy', n_jobs=-1, cv=3)
print('Cross-validation mean accuracy {0:.2f}%, std {1:.2f}.'.format(np.mean(scores) * 100, np.std(scores) * 100))

In [None]:
# Share of training comments that the fitted model predicts as class 1 (`normal`).
logreg.predict(train_vectorized).sum() / train_vectorized.shape[0]

In [None]:
%%time
svc = LinearSVC(dual=False)
scores = cross_val_score(svc, train_vectorized, y, scoring='accuracy', n_jobs=-1, cv=3)
print('Cross-validation mean accuracy {0:.2f}%, std {1:.2f}.'.format(np.mean(scores) * 100, np.std(scores) * 100))

In [None]:
# ovr.fit(train_vectorized, y);
# svc.fit(train_vectorized, y);

# Deep learning

In [None]:
# Keras tokenizer with lower-casing enabled and "UNK" as the out-of-vocabulary
# token, fitted on the training comments.
tk = Tokenizer(oov_token="UNK", lower=True)
tk.fit_on_texts(train_df['comment_text'])

In [None]:
# Embedding configuration: vector file, vector dimensionality and vocabulary cap.
embedding_path, embed_size, max_features = "crawl-300d-2M.vec", 300, 30000
# Limit the tokenizer to word indices below max_features + 1.
tk.num_words = 1 + max_features

In [None]:
# Convert the comments of both splits into sequences of word indices.
train_tokenized, test_tokenized = (
    tk.texts_to_sequences(frame['comment_text'])
    for frame in (train_df, test_df)
)

In [None]:
# Bring every index sequence to a fixed length of max_len.
max_len = 50
X_train, X_test = (
    pad_sequences(sequences, maxlen=max_len)
    for sequences in (train_tokenized, test_tokenized)
)

In [None]:
def get_coefs(word, *arr):
    """Return `(word, vector)`, with the remaining arguments converted to a float32 array."""
    vector = np.asarray(arr, dtype='float32')
    return word, vector
# Map every word of the embedding file to its float32 vector.
embedding_index = {}
# Open in a `with` block so the handle is closed, and read as UTF-8 explicitly
# instead of relying on the platform default encoding.
with open(embedding_path, encoding="utf-8") as embedding_file:
    for line in embedding_file:
        word, coefs = get_coefs(*line.strip().split(" "))
        # Skip rows that do not carry exactly embed_size values (e.g. a
        # "<count> <dim>" header line or a malformed row): such a vector would
        # otherwise be broadcast into a full embedding-matrix row later.
        if coefs.shape[0] != embed_size:
            continue
        embedding_index[word] = coefs

In [None]:
# Number of pretrained vectors vs. number of distinct words seen by the tokenizer.
len(embedding_index), len(tk.word_index)

In [None]:
# Build the (nb_words + 1, embed_size) lookup matrix for the Embedding layer.
# Row i holds the pretrained vector of the word with tokenizer index i; rows
# without a pretrained vector (and row 0) stay all-zero.
word_index = tk.word_index
nb_words = min(max_features, len(word_index))
embedding_matrix = np.zeros((nb_words + 1, embed_size))
for word, i in word_index.items():
    # Valid rows are 0..nb_words inclusive. The previous `i >= max_features`
    # check left row `max_features` empty although tk.num_words = max_features + 1
    # lets that index through.
    if i > nb_words:
        continue
    embedding_vector = embedding_index.get(word)
    # Ignore vectors of the wrong shape rather than letting NumPy broadcast them.
    if embedding_vector is not None and embedding_vector.shape == (embed_size,):
        embedding_matrix[i] = embedding_vector

In [None]:
# ohe = OneHotEncoder(sparse=False)
# y_ohe = ohe.fit_transform(y.values.reshape(-1, 1))

In [None]:
def build_model1(lr=0.0, lr_d=0.0, units=0, spatial_dr=0.0, kernel_size1=3, kernel_size2=2, dense_units=128, dr=0.1, conv_size=32):
    """Build, train and return a BiLSTM + Conv1D binary classifier.

    The network embeds the padded index sequences with the frozen pretrained
    `embedding_matrix`, runs a bidirectional LSTM, applies two Conv1D heads
    (kernel sizes `kernel_size1` and `kernel_size2`), pools each head with
    global average and global max pooling, and feeds the concatenated pools
    through two dense layers into a single sigmoid unit.

    Relies on the notebook-level names `max_len`, `embedding_matrix`,
    `X_train` and `y`.

    Parameters:
        lr: learning rate passed to Adam.
        lr_d: learning-rate decay passed to Adam.
        units: number of LSTM units per direction.
        spatial_dr: SpatialDropout1D rate applied to the embedded sequence.
        kernel_size1, kernel_size2: kernel sizes of the two Conv1D heads.
        dense_units: width of the first dense layer (the second uses half).
        dr: dropout rate (fraction of units dropped) after each dense layer.
        conv_size: number of filters in each Conv1D head.

    Returns:
        The model reloaded from the checkpoint with the lowest validation
        loss ("best_model.hdf5").
    """
    file_path = "best_model.hdf5"
    check_point = ModelCheckpoint(file_path, monitor = "val_loss", verbose = 1,
                                  save_best_only = True, mode = "min")
    early_stop = EarlyStopping(monitor = "val_loss", mode = "min", patience = 3)

    inp = Input(shape = (max_len,))
    # Take the layer dimensions from the matrix itself instead of a hard-coded
    # 30001, so the layer always matches the weights it is initialised with.
    vocab_rows, embedding_dim = embedding_matrix.shape
    x = Embedding(vocab_rows, embedding_dim, weights = [embedding_matrix], trainable = False)(inp)
    # `spatial_dr` was accepted but never used; with the default 0.0 this
    # layer is a no-op.
    x = SpatialDropout1D(spatial_dr)(x)

    x_lstm = Bidirectional(LSTM(units, return_sequences = True))(x)

    x1 = Conv1D(conv_size, kernel_size=kernel_size1, padding='valid', kernel_initializer='he_uniform')(x_lstm)
    avg_pool1_lstm = GlobalAveragePooling1D()(x1)
    max_pool1_lstm = GlobalMaxPooling1D()(x1)

    x3 = Conv1D(conv_size, kernel_size=kernel_size2, padding='valid', kernel_initializer='he_uniform')(x_lstm)
    avg_pool3_lstm = GlobalAveragePooling1D()(x3)
    max_pool3_lstm = GlobalMaxPooling1D()(x3)

    # Only the LSTM branch exists: the GRU branch was disabled, and its pooled
    # tensors must not be referenced here (doing so raised a NameError).
    x = concatenate([avg_pool1_lstm, max_pool1_lstm, avg_pool3_lstm, max_pool3_lstm])
    x = BatchNormalization()(x)
    # Keras `rate` is the fraction of units to DROP; `1 - dr` would drop 90%
    # of the activations for dr = 0.1.
    x = Dropout(rate=dr)(Dense(dense_units, activation='relu') (x))
    x = BatchNormalization()(x)
    x = Dropout(rate=dr)(Dense(dense_units // 2, activation='relu') (x))
    x = Dense(1, activation = "sigmoid")(x)
    model = Model(inputs = inp, outputs = x)
    model.compile(loss = "binary_crossentropy", optimizer = Adam(lr = lr, decay = lr_d), metrics = ["accuracy"])
    model.fit(X_train, y, batch_size = 128, epochs = 20, validation_split=0.1,
              verbose = 1, callbacks = [check_point, early_stop])
    # Return the best checkpoint rather than the weights of the last epoch.
    model = load_model(file_path)
    return model

In [None]:
# Train the BiLSTM + Conv1D classifier with the chosen hyper-parameters.
model1_params = dict(
    lr=1e-3,
    lr_d=1e-10,
    units=64,
    spatial_dr=0.3,
    kernel_size1=3,
    kernel_size2=2,
    dense_units=32,
    dr=0.1,
    conv_size=32,
)
model1 = build_model1(**model1_params)

In [None]:
# Predict on the padded test sequences; the model ends in a single sigmoid unit.
pred = model1.predict(X_test, batch_size = 1024, verbose = 1)