## CS725: Foundations of Machine Learning Project
-------

In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input mount and print the full path of every file found.
for dirname, _, filenames in os.walk('/kaggle/input'):
    for full_path in (os.path.join(dirname, filename) for filename in filenames):
        print(full_path)

# Any results you write to the current directory are saved as output.

## Import Train and Test Data
---

In [0]:
import pandas as pd
# Load the four competition CSVs into data frames; the reads are independent
# of one another, so they are done in a single unpacking.
train, test, test_labels, sample_submission = (
    pd.read_csv(csv_path)
    for csv_path in (
        "../input/jigsaw-toxic-comment-classification-challenge/train.csv",
        "../input/jigsaw-toxic-comment-classification-challenge/test.csv",
        "../input/jigsaw-toxic-comment-classification-challenge/test_labels.csv",
        "../input/jigsaw-toxic-comment-classification-challenge/sample_submission.csv",
    )
)

### LSTM Model
---


In [None]:

import  numpy as np
import pandas as pd
import re

from keras.preprocessing.text import Tokenizer
from keras.preprocessing.text import text_to_word_sequence
from keras.preprocessing import sequence
from keras.models import Model
from keras.layers import LSTM, Dense, Dropout, Input, Embedding, Bidirectional, GlobalMaxPool1D

# Locations of the pre-trained GloVe vectors and of the competition CSVs.
embd_file = '../input/glove840b300dtxt/glove.840B.300d.txt'
train_file = '../input/jigsaw-toxic-comment-classification-challenge/train.csv'
test_file = '../input/jigsaw-toxic-comment-classification-challenge/test.csv'
# Target labels; also used as the column headers of the prediction file.
CLASSES = ["toxic", "severe_toxic", "obscene", "threat", "insult", "identity_hate"]

# Upper bound on the vocabulary size handed to the Keras Tokenizer.
max_words_to_take = 1000000

train_data = pd.read_csv(train_file)
test_data = pd.read_csv(test_file)

# Label matrix: every column from the third one onward (presumably the six
# CLASSES columns -- confirm against the CSV header).
# The frame is sliced BEFORE taking .values so the labels keep their numeric
# dtype; calling .values on the whole frame first yields dtype=object because
# the frame also holds the text column 'comment_text'.
ytrain = train_data.iloc[:, 2:].values

def preprocess(x):
    """Lower-case a comment and reduce it to a small fixed alphabet.

    Every character other than ``a-z``, ``0-9`` and the punctuation
    ``, * ! . ' - @ "`` is replaced by a space; afterwards each run of two or
    more whitespace characters is collapsed into a single space (a lone tab
    or newline is left untouched).

    Args:
        x: The raw comment text (a ``str``).

    Returns:
        The cleaned, lower-cased text.
    """
    x = x.lower()
    # The hyphen is escaped on purpose: written as `'-@` it forms a character
    # range (0x27-0x40) that silently also keeps ( ) + / : ; < = > ?.
    x = re.sub(r"[^a-z0-9,*!.'\-@\"]", " ", x)
    x = re.sub(r"\s{2,}", " ", x)
    return x


# Clean every comment with preprocess() before tokenizing.
xtrain = train_data['comment_text'].map(preprocess)
xtest = test_data['comment_text'].map(preprocess)

train_comment = xtrain.tolist()
test_comment = xtest.tolist()

# The vocabulary is fitted on the train and test comments together, then each
# split is converted to sequences of integer word ids.
tokenizer = Tokenizer(num_words=max_words_to_take)
tokenizer.fit_on_texts(train_comment + test_comment)
train_sequences = tokenizer.texts_to_sequences(train_comment)
test_sequences = tokenizer.texts_to_sequences(test_comment)

word_index = tokenizer.word_index
word_index_items = word_index.items()
# Reverse lookup: integer id -> word.
index_to_word = dict(zip(word_index.values(), word_index.keys()))
print(len(word_index))

# Length statistics of the training sequences (original author's notes on a
# previous run: mean ~60, median ~30, max ~2142).
train_lengths = [len(seq) for seq in train_sequences]
average_length = np.mean(train_lengths)
median_length = sorted(train_lengths)[len(train_sequences) // 2]
max_length = np.max(train_lengths)

print("Average sequence length: ", average_length)
print("Median sequence length: ", median_length)

# Every comment is padded / truncated at the end to this many tokens.
max_sequence_length = 150

x_train = sequence.pad_sequences(
    train_sequences, maxlen=max_sequence_length, padding='post', truncating='post')
x_test = sequence.pad_sequences(
    test_sequences, maxlen=max_sequence_length, padding='post', truncating='post')

y_train = ytrain

print('X_train shape: ', x_train.shape)

# word -> pre-trained vector, read from the GloVe text file.
embd_index = {}
embd_dim = 300

# Each line is "<token> <v1> ... <v300>". The last `embd_dim` fields are the
# vector and everything before them is the token, so a token that itself
# contains spaces still parses. The `with` block guarantees the file is
# closed (the previous bare `f.close` never called the method).
with open(embd_file, encoding="utf8") as f:
    for line in f:
        values = line.split()
        word = ' '.join(values[:-embd_dim])
        vec = np.asarray(values[-embd_dim:], dtype='float32')
        embd_index[word] = vec

# +1 because the tokenizer's word ids start at 1; row 0 stays all-zero.
embd_num_words = min(max_words_to_take, len(word_index) + 1)

# Row i holds the vector of the word with id i; words missing from the
# embedding file keep an all-zero row.
embd_matrix = np.zeros((embd_num_words, embd_dim))
for word, i in word_index_items:
    # Skip ids that fall outside the matrix. Comparing against the row count
    # (rather than `i > max_words_to_take`) avoids an IndexError for
    # i == embd_num_words when the vocabulary is capped.
    if i >= embd_num_words:
        continue
    embd_vec = embd_index.get(word)
    if embd_vec is not None:
        embd_matrix[i] = embd_vec
print('Null word embeddings: %d' % np.sum(np.sum(embd_matrix, axis=1) == 0))


def get_model():
    """Build and compile the bidirectional-LSTM comment classifier.

    Reads the module-level ``max_sequence_length``, ``embd_num_words``,
    ``embd_dim`` and ``embd_matrix``. The network is: embedding (seeded with
    ``embd_matrix``) -> bidirectional LSTM returning the full sequence ->
    global max pooling -> Dense(75, relu) -> Dropout(0.1) -> Dense(6, sigmoid).

    Returns:
        A Keras ``Model`` compiled with binary cross-entropy, the Adam
        optimizer and the accuracy metric.
    """
    hidden_size = 200
    token_ids = Input(shape=(max_sequence_length,), dtype="int32")

    # Layers are created in the order they are applied, then folded over the
    # input tensor.
    layer_stack = [
        Embedding(embd_num_words, embd_dim, weights=[embd_matrix],
                  input_length=max_sequence_length),
        Bidirectional(LSTM(hidden_size, return_sequences=True)),
        GlobalMaxPool1D(),
        Dense(75, activation='relu'),
        Dropout(0.1),
        Dense(6, activation='sigmoid'),
    ]
    tensor = token_ids
    for layer in layer_stack:
        tensor = layer(tensor)

    model = Model(inputs=token_ids, outputs=tensor)
    model.compile(loss="binary_crossentropy", optimizer="adam", metrics=['accuracy'])
    return model

    
    
from keras.callbacks import ModelCheckpoint, EarlyStopping

# Training configuration.
batch_size = 256
epochs = 1
file_path = "weights_bestEpoch.hdf5"

model = get_model()

# Keep only the weights of the epoch with the lowest validation loss, and stop
# once validation loss has not improved for 20 epochs.
checkpoint = ModelCheckpoint(
    file_path, monitor='val_loss', verbose=1, save_best_only=True, mode='min')
early = EarlyStopping(monitor="val_loss", mode="min", patience=20)
callbacks_list = [checkpoint, early]

hist = model.fit(
    x_train,
    y_train,
    batch_size=batch_size,
    epochs=epochs,
    validation_split=0.1,
    callbacks=callbacks_list,
)

# Restore the best checkpoint before predicting.
model.load_weights(file_path)
best_score = min(hist.history['val_loss'])
print("out of ",epochs,"epochs best(minimum) validation-loss obtained is",best_score )

y_test_predict = model.predict(x_test, batch_size=1024, verbose=1)

# Prediction table: the id column first, then one probability column per class.
test_ids = test_data["id"].values
predict = pd.DataFrame(data=y_test_predict, columns=CLASSES)
predict.insert(0, "id", test_ids)

predict.to_csv("sample_submission.csv", index=False)



### 2D-CNN


In [None]:
import  numpy as np
import pandas as pd
import re

from keras.preprocessing.text import Tokenizer
from keras.preprocessing.text import text_to_word_sequence
from keras.preprocessing import sequence
from keras.models import Model
from keras.layers import concatenate, CuDNNGRU


from keras.layers import Dense, Dropout, Input, Embedding,GlobalMaxPool1D

# Local copies of the data and of the pre-trained GloVe vectors.
train_file = 'data/train2.csv'
test_file = 'data/test.csv'
embd_file = 'data/glove.840B.300d.txt'
# Target labels; also used as the column headers of the prediction file.
CLASSES = ["toxic", "severe_toxic", "obscene", "threat", "insult", "identity_hate"]
# Upper bound on the vocabulary size handed to the Keras Tokenizer.
max_words_to_take = 1000000
# Every comment is padded / truncated to this many tokens.
max_sequence_length = 200

train_data = pd.read_csv(train_file)
test_data = pd.read_csv(test_file)

# Label matrix: every column from the third one onward (presumably the six
# CLASSES columns -- confirm against the header of train2.csv).
# The frame is sliced BEFORE taking .values so the labels keep their numeric
# dtype; calling .values on the whole frame first yields dtype=object because
# the frame also holds the text column 'comment_text'.
ytrain = train_data.iloc[:, 2:].values

def preprocess(x):
    """Lower-case a comment and reduce it to a small fixed alphabet.

    Every character other than ``a-z``, ``0-9`` and the punctuation
    ``, * ! . ' - @ "`` is replaced by a space; afterwards each run of two or
    more whitespace characters is collapsed into a single space (a lone tab
    or newline is left untouched).

    Args:
        x: The raw comment text (a ``str``).

    Returns:
        The cleaned, lower-cased text.
    """
    x = x.lower()
    # The hyphen is escaped on purpose: written as `'-@` it forms a character
    # range (0x27-0x40) that silently also keeps ( ) + / : ; < = > ?.
    x = re.sub(r"[^a-z0-9,*!.'\-@\"]", " ", x)
    x = re.sub(r"\s{2,}", " ", x)
    return x


# Clean every comment with preprocess() before tokenizing.
xtrain = train_data['comment_text'].map(preprocess)
xtest = test_data['comment_text'].map(preprocess)

def text_to_commentList(x):
    """Return ``x`` unchanged.

    Identity placeholder: the argument is handed back as-is, with no copy and
    no conversion.
    """
    return x


# Plain lists of the cleaned comments. The two names were previously swapped
# (train_comment held the test text and vice versa), which made x_train come
# from the test set and no longer line up with y_train.
train_comment = list(xtrain)
test_comment = list(xtest)

# The vocabulary is fitted on the train and test comments together, then each
# split is converted to sequences of integer word ids.
tokenizer = Tokenizer(num_words=max_words_to_take)
tokenizer.fit_on_texts(train_comment + test_comment)
train_sequences = tokenizer.texts_to_sequences(train_comment)
test_sequences = tokenizer.texts_to_sequences(test_comment)

word_index = tokenizer.word_index
word_index_items = word_index.items()
# Reverse lookup: integer id -> word.
index_to_word = {v: k for k, v in word_index_items}
print(len(word_index))

# Length statistics of the training sequences, computed from one shared list.
train_lengths = [len(seq) for seq in train_sequences]
average_length = np.mean(train_lengths)
median_length = sorted(train_lengths)[len(train_lengths) // 2]
max_length = np.max(train_lengths)

print("Average sequence length: ", average_length)
print("Median sequence length: ", median_length)

# Pad / truncate at the end so every row has max_sequence_length token ids.
x_train = sequence.pad_sequences(
    train_sequences, maxlen=max_sequence_length, padding='post', truncating='post')
x_test = sequence.pad_sequences(
    test_sequences, maxlen=max_sequence_length, padding='post', truncating='post')

y_train = ytrain

print('X_train shape: ', x_train.shape)

# word -> pre-trained vector, read from the GloVe text file.
embd_index = {}
embd_dim = 300

# Each line is "<token> <v1> ... <v300>". The last `embd_dim` fields are the
# vector and everything before them is the token, so a token that itself
# contains spaces still parses. The `with` block guarantees the file is
# closed (the previous bare `f.close` never called the method).
with open(embd_file, encoding="utf8") as f:
    for line in f:
        values = line.split()
        word = ' '.join(values[:-embd_dim])
        vec = np.asarray(values[-embd_dim:], dtype='float32')
        embd_index[word] = vec

# +1 because the tokenizer's word ids start at 1; row 0 stays all-zero.
embd_num_words = min(max_words_to_take, len(word_index) + 1)

# Row i holds the vector of the word with id i; words missing from the
# embedding file keep an all-zero row.
embd_matrix = np.zeros((embd_num_words, embd_dim))
for word, i in word_index_items:
    # Skip ids that fall outside the matrix. Comparing against the row count
    # (rather than `i > max_words_to_take`) avoids an IndexError for
    # i == embd_num_words when the vocabulary is capped.
    if i >= embd_num_words:
        continue
    embd_vec = embd_index.get(word)
    if embd_vec is not None:
        embd_matrix[i] = embd_vec
print('Null word embeddings: %d' % np.sum(np.sum(embd_matrix, axis=1) == 0))


def get_model():
    """Build and compile the 2D-convolution comment classifier.

    Reads the module-level ``max_sequence_length``, ``embd_num_words``,
    ``embd_dim`` and ``embd_matrix``. The embedded sequence is treated as a
    one-channel image of shape (max_sequence_length, embd_dim, 1); for each
    filter height in ``filter_sizes`` a Conv2D spanning the full embedding
    width is applied and globally max-pooled, and the pooled features are
    concatenated before the 6-unit sigmoid output layer.

    Returns:
        A Keras ``Model`` compiled with binary cross-entropy, the Adam
        optimizer and the accuracy metric.
    """
    # Imported locally: the cell-level imports do not bring these layers into
    # scope, so the function previously failed with NameError.
    from keras.layers import (
        Concatenate,
        Conv2D,
        GlobalMaxPooling2D,
        Reshape,
        SpatialDropout1D,
    )

    xinput = Input(shape=(max_sequence_length,), dtype="int32")
    xembd = Embedding(embd_num_words, embd_dim, weights=[embd_matrix],
                      input_length=max_sequence_length)(xinput)
    x1 = SpatialDropout1D(0.4)(xembd)
    # Add a trailing channel axis so Conv2D can slide over (tokens, embedding).
    # The dimensions follow the configured constants instead of literal 200/300.
    x1 = Reshape((max_sequence_length, embd_dim, 1))(x1)

    filter_sizes = [1, 2, 3, 5]
    num_filters = 32
    pooled = []
    for size in filter_sizes:
        # Kernel covers `size` consecutive tokens across the whole embedding.
        conv = Conv2D(num_filters, kernel_size=(size, embd_dim),
                      kernel_initializer='normal', activation='elu')(x1)
        # Pool the convolution output itself (was the undefined `conv_pre`).
        pooled.append(GlobalMaxPooling2D()(conv))
    x1 = Concatenate(axis=1)(pooled)
    x1 = Dropout(0.2)(x1)

    predictions = Dense(6, activation='sigmoid')(x1)
    model = Model(inputs=xinput, outputs=predictions)
    model.compile(loss="binary_crossentropy", optimizer="adam", metrics=['accuracy'])
    return model
    
    
    
from keras.callbacks import ModelCheckpoint, EarlyStopping

# Training configuration.
batch_size = 256
epochs = 1
file_path = "weights_bestEpoch.hdf5"

model = get_model()

# Keep only the weights of the epoch with the lowest validation loss, and stop
# once validation loss has not improved for 20 epochs.
checkpoint = ModelCheckpoint(
    file_path, monitor='val_loss', verbose=1, save_best_only=True, mode='min')
early = EarlyStopping(monitor="val_loss", mode="min", patience=20)
callbacks_list = [checkpoint, early]

hist = model.fit(
    x_train,
    y_train,
    batch_size=batch_size,
    epochs=epochs,
    validation_split=0.1,
    callbacks=callbacks_list,
)

# Restore the best checkpoint before predicting.
model.load_weights(file_path)
best_score = min(hist.history['val_loss'])
print("out of ",epochs,"epochs best(minimum) validation-loss obtained is",best_score )

# Predict on the padded TEST sequences. The previous call passed `xtrain`,
# the Series of cleaned training text, which is neither tokenized nor the
# test split, so the rows could not match test_data["id"] below.
y_test_predict = model.predict(x_test, batch_size=1024, verbose=1)

# Prediction table: the id column first, then one probability column per class.
test_ids = test_data["id"].values
predict = pd.DataFrame(data=y_test_predict, columns=CLASSES)
predict.insert(0, "id", test_ids)

predict.to_csv("sample_submission.csv", index=False)

### Logistic Regression
---

In [None]:
import numpy as np
import pandas as pd
import re

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.model_selection import cross_val_score, KFold
from scipy.sparse import hstack
from sklearn.metrics import log_loss, matthews_corrcoef, roc_auc_score
from datetime import datetime

def timer(start_time=None):
    """Start a stopwatch, or report the time elapsed since one was started.

    Args:
        start_time: A value previously returned by this function, or a falsy
            value (the default ``None``) to start timing.

    Returns:
        ``datetime.now()`` when ``start_time`` is falsy. Otherwise nothing
        (``None``): the elapsed hours, minutes and seconds are printed.
    """
    if not start_time:
        return datetime.now()

    elapsed_seconds = (datetime.now() - start_time).total_seconds()
    thour, remainder = divmod(elapsed_seconds, 3600)
    tmin, tsec = divmod(remainder, 60)
    print('\n Time taken: %i hours %i minutes and %s seconds.' %
          (thour, tmin, round(tsec, 2)))

# Data processing was done as in Bojan's fork of the original script:
# https://www.kaggle.com/tunguz/logistic-regression-with-words-and-char-n-grams

# The six binary targets to be predicted.
class_names = ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']

# Two stopwatches: `traintime` spans the whole feature-building stage,
# `train_time` is restarted for each sub-step.
traintime = timer(None)
train_time = timer(None)

# Missing values are replaced by a single space in both splits.
train, test = (
    pd.read_csv(csv_path).fillna(' ')
    for csv_path in (
        '../input/jigsaw-toxic-comment-classification-challenge/train.csv',
        '../input/jigsaw-toxic-comment-classification-challenge/test.csv',
    )
)
tr_ids = train[['id']]

# Store the labels as int8, then keep them in their own frame.
train[class_names] = train[class_names].astype(np.int8)
target = train[class_names]

print(' Cleaning ...')
# PREPROCESSING PART
# Token -> replacement text used while cleaning the comments.
# Each key appears exactly once. Keys that used to be listed twice keep the
# value of their LAST occurrence (the one that was actually in effect) and
# the position of their first, so the resulting mapping and key order are
# unchanged.
# NOTE: keys written like r"\br\b" are ordinary dictionary keys; a token only
# matches them if it is spelled exactly that way, they are not applied as
# regular expressions by a dictionary lookup.
repl = {
    # exclamations
    "yay!": " good ",
    "yay": " good ",
    "yaay": " good ",
    "yaaay": " good ",
    "yaaaay": " good ",
    "yaaaaay": " good ",
    # emoticons
    ":/": " worry ",
    ":&gt;": " angry ",
    ":')": " sad ",
    ":-(": " sad ",
    ":(": " sad ",
    ":s": " sad ",
    ":-s": " sad ",
    "&lt;3": " heart ",
    ":d": " smile ",
    ":p": " smile ",
    ":dd": " smile ",
    "8)": " smile ",
    ":-)": " smile ",
    ":)": " smile ",
    ";)": " smile ",
    "(-:": " smile ",
    "(:": " smile ",
    # regex-style spellings (see NOTE above)
    r"\br\b": "are",
    r"\bu\b": "you",
    r"\bhaha\b": "ha",
    r"\bhahaha\b": "ha",
    r"\bdon't\b": "do not",
    r"\bdoesn't\b": "does not",
    r"\bdidn't\b": "did not",
    r"\bhasn't\b": "has not",
    r"\bhaven't\b": "have not",
    r"\bhadn't\b": "had not",
    r"\bwon't\b": "will not",
    r"\bwouldn't\b": "would not",
    r"\bcan't\b": "can not",
    r"\bcannot\b": "can not",
    r"\bi'm\b": "i am",
    # shorthand and contractions
    "m": "am",
    "r": "are",
    "u": "you",
    "haha": "ha",
    "hahaha": "ha",
    "don't": "do not",
    "doesn't": "does not",
    "didn't": "did not",
    "hasn't": "has not",
    "haven't": "have not",
    "hadn't": "had not",
    "won't": "will not",
    "wouldn't": "would not",
    "can't": "can not",
    "cannot": "can not",
    "i'm": "i am",
    "i'll": "i will",
    "its": "it is",
    "it's": "it is",
    "'s": " is",
    "that's": "that is",
    "weren't": "were not",
}

def _clean_comment(text):
    """Normalise one comment for the TF-IDF vectorizers.

    The text is split on whitespace and each token is lower-cased. Tokens
    starting with 'http' or 'www' are dropped, tokens that are keys of
    ``repl`` are swapped for their replacement, and every kept token is
    followed by a single space. Finally every character other than ASCII
    letters, space, '?' and '!' is removed.

    Args:
        text: The raw comment; converted with ``str()`` first.

    Returns:
        The cleaned comment string.
    """
    kept = []
    for token in str(text).split():
        token = token.lower()
        if token.startswith(('http', 'www')):
            continue
        # Direct dict lookup: O(1) per token instead of scanning a key list.
        kept.append(repl.get(token, token))
    joined = "".join(token + " " for token in kept)
    return re.sub('[^a-zA-Z ?!]+', '', joined)


# The same cleaning is applied to both splits, so train and test text are
# guaranteed to be normalised identically.
train["comment_text"] = [_clean_comment(c) for c in train["comment_text"].tolist()]
test["comment_text"] = [_clean_comment(c) for c in test["comment_text"].tolist()]

train_text = train['comment_text']
test_text = test['comment_text']
all_text = pd.concat([train_text, test_text])
timer(train_time)

# Keyword arguments shared by the word-level and character-level vectorizers.
# NOTE(review): scikit-learn documents `stop_words` as applying to the word
# analyzer; confirm whether it has any effect on the char vectorizer.
_shared_tfidf_args = dict(
    sublinear_tf=True,
    strip_accents='unicode',
    stop_words='english',
)

# Word unigrams: vocabulary fitted on train + test text, then each split is
# transformed separately.
train_time = timer(None)
print(' Part 1/2 of vectorizing ...')
word_vectorizer = TfidfVectorizer(
    analyzer='word',
    token_pattern=r'\w{1,}',
    ngram_range=(1, 1),
    max_features=10000,
    **_shared_tfidf_args
).fit(all_text)
train_word_features, test_word_features = (
    word_vectorizer.transform(text) for text in (train_text, test_text)
)
timer(train_time)

# Character 2- to 6-grams, fitted and applied the same way.
train_time = timer(None)
print(' Part 2/2 of vectorizing ...')
char_vectorizer = TfidfVectorizer(
    analyzer='char',
    ngram_range=(2, 6),
    max_features=50000,
    **_shared_tfidf_args
).fit(all_text)
train_char_features, test_char_features = (
    char_vectorizer.transform(text) for text in (train_text, test_text)
)
timer(train_time)

# Final design matrices: character features followed by word features, in CSR
# form so they can be row-indexed by fold.
train_features = hstack([train_char_features, train_word_features]).tocsr()
test_features = hstack([test_char_features, test_word_features]).tocsr()
timer(traintime)

# Per-class LogisticRegression hyper-parameters; position j in every list
# belongs to class_names[j].
all_parameters = {
                  'C'             : [1.048113, 0.1930, 0.596362, 0.25595, 0.449843, 0.25595],
                  'tol'           : [0.1, 0.1, 0.046416, 0.0215443, 0.1, 0.01],
                  'solver'        : ['lbfgs', 'newton-cg', 'lbfgs', 'newton-cg', 'newton-cg', 'lbfgs'],
                  'fit_intercept' : [True, True, True, True, True, True],
                  'penalty'       : ['l2', 'l2', 'l2', 'l2', 'l2', 'l2'],
                  'class_weight'  : [None, 'balanced', 'balanced', 'balanced', 'balanced', 'balanced'],
                 }

folds = 3
# scores_classes[j][i] is the validation AUC of class j on fold i.
scores_classes = np.zeros((len(class_names), folds))

submission = pd.DataFrame.from_dict({'id': test['id']})
# Explicit copy: prediction columns are added to this frame below, and it must
# not be a slice of `train`.
submission_oof = train[['id'] + class_names].copy()
kf = KFold(n_splits=folds, shuffle=True, random_state=239)

traintime = timer(None)
for j, class_name in enumerate(class_names):
    classifier = LogisticRegression(
        C=all_parameters['C'][j],
        max_iter=200,
        tol=all_parameters['tol'][j],
        solver=all_parameters['solver'][j],
        fit_intercept=all_parameters['fit_intercept'][j],
        penalty=all_parameters['penalty'][j],
        dual=False,
        class_weight=all_parameters['class_weight'][j],
        verbose=0)

    avreal = target[class_name]
    # Running sum of the test-set probabilities over the folds.
    lr_pred = np.zeros(test_features.shape[0])
    # Out-of-fold probability for every training row.
    lr_avpred = np.zeros(train.shape[0])

    train_time = timer(None)
    for i, (train_index, val_index) in enumerate(kf.split(train_features)):
        X_train, X_val = train_features[train_index], train_features[val_index]
        # KFold yields positional indices, so select rows by position.
        y_train, y_val = target.iloc[train_index], target.iloc[val_index]

        classifier.fit(X_train, y_train[class_name])
        scores_val = classifier.predict_proba(X_val)[:, 1]
        lr_avpred[val_index] = scores_val
        lr_pred += classifier.predict_proba(test_features)[:, 1]
        scores_classes[j][i] = roc_auc_score(y_val[class_name], scores_val)
        print('\n Fold %02d class %s AUC: %.6f' % ((i+1), class_name, scores_classes[j][i]))

    lr_oof_auc = roc_auc_score(avreal, lr_avpred)
    print('\n Average class %s AUC:\t%.6f' % (class_name, np.mean(scores_classes[j])))
    print(' Out-of-fold class %s AUC:\t%.6f' % (class_name, lr_oof_auc))
    timer(train_time)

    # Test prediction is the mean over the fold models.
    submission[class_name] = lr_pred / folds
    submission_oof['prediction_' + class_name] = lr_avpred

print('\n Overall AUC:\t%.6f' % (np.mean(scores_classes)))
submission.to_csv('submission-tuned-LR-01.csv', index=False)
submission_oof.to_csv('oof-tuned-LR-01.csv', index=False)
timer(traintime)
