In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load in 

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list the files in the input directory

import os
# Print the file names found in the attached dataset directory.
print(os.listdir("../input/ktjqdataanalysis"))

# Any results you write to the current directory are saved as output.

**IMPORTING LIBRARIES TO BE USED**

In [None]:
#Importing Libraries to be used

import random
import keras
import re
import nltk
from nltk.corpus import stopwords
from scipy.interpolate import spline
from keras.models import load_model
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras import backend as K
from keras.models import Model
from sklearn.ensemble import VotingClassifier
from sklearn.model_selection import GridSearchCV
from keras.layers import GlobalMaxPool1D, Dropout
from keras.layers import TimeDistributed
from keras.layers import GlobalAveragePooling1D, GlobalMaxPooling1D, SpatialDropout1D, Bidirectional, Embedding,SimpleRNN, Dense, Flatten, Input, Conv1D, MaxPooling1D, CuDNNGRU, concatenate, CuDNNLSTM
from keras.layers import AveragePooling1D, BatchNormalization
from sklearn.metrics import f1_score
from keras.engine.topology import Layer
from keras.callbacks import ModelCheckpoint
from keras import initializers, regularizers, constraints, optimizers, layers
from tqdm import tqdm
# Register tqdm's pandas integration; this is what provides the
# `Series.progress_apply` method used by the preprocessing cells below.
tqdm.pandas()
import matplotlib.pyplot as plt
import gc
# Make sure automatic garbage collection is switched on; explicit
# gc.collect() calls are made later in the notebook.
gc.enable()

**READING IN GIVEN DATA**

In [None]:
# Articles with a `category` column; the cells below read its `headline`,
# `short_description` and `category` columns.
given_data = pd.read_csv('../input/ktjqdataanalysis/dataset_convertedTocsv.csv')
# Articles to be classified for the submission; the cells below read its
# `headline`, `short_description` and `id` columns.
test_data = pd.read_json('../input/test-set/test_set.json')

**CONVERTING EVERY ROW TO STRING, AND INTO LOWERCASE (SOME EMBEDDINGS DON'T UNDERSTAND CAPS)**

In [None]:
# Coerce every cell of both text columns to str, then lower-case it.
# str() comes first so that non-string cells cannot break .lower().
for frame in (given_data, test_data):
    for text_column in ('headline', 'short_description'):
        frame[text_column] = frame[text_column].progress_apply(
            lambda cell: str(cell).lower()
        )

**REMOVING PUNCTUATION**

In [None]:
#Removing Punctuations

# Replacements applied before punctuation stripping: typographic variants are
# folded to ASCII and a few symbols are spelled out. Insertion order is kept
# because clean_special_chars applies the entries sequentially.
punct_mapping = {
    "‘": "'", "₹": "e", "´": "'", "°": "", "€": "e", "™": "tm",
    "√": " sqrt ", "×": "x", "²": "2", "—": "-", "–": "-", "’": "'",
    "_": "-", "`": "'", '“': '"', '”': '"', "£": "e",
    '∞': 'infinity', 'θ': 'theta', '÷': '/', 'α': 'alpha', '•': '.',
    'à': 'a', '−': '-', 'β': 'beta', '∅': '', '³': '3', 'π': 'pi',
}
# Characters that clean_special_chars replaces with a space. The backslash
# before '×' is written as an explicit '\\' so the literal holds the same
# characters without relying on an unrecognised escape sequence.
punct = "/-'!#$%\'()*+-/:;<=>@[\\]^_`{|}~" + '""“”’' + '∞θ÷α•à−β∅³π‘₹´°£€\\×™√²—–&'

def clean_special_chars(text, punct, mapping):
    """Normalise special characters in ``text`` and return the cleaned string.

    Three passes run in order, each working on the previous pass's output:

    1. every key of ``mapping`` is replaced by its mapped value;
    2. every character of ``punct`` is replaced by a single space;
    3. a fixed set of leftover sequences (zero-width space, ellipsis,
       byte-order mark and two Devanagari words) is replaced.

    Args:
        text: String to clean.
        punct: Iterable of characters to blank out.
        mapping: Dict of ``{sequence: replacement}`` applied before ``punct``.

    Returns:
        The cleaned string.
    """
    for source, target in mapping.items():
        text = text.replace(source, target)

    for symbol in punct:
        text = text.replace(symbol, ' ')

    # Other special characters that have to be dealt with last.
    leftovers = {'\u200b': ' ', '…': ' ... ', '\ufeff': '', 'करना': '', 'है': ''}
    for source, target in leftovers.items():
        text = text.replace(source, target)

    return text


# Run the special-character clean-up over both text columns of both frames.
for frame in (given_data, test_data):
    for text_column in ('headline', 'short_description'):
        frame[text_column] = frame[text_column].progress_apply(
            lambda cell: clean_special_chars(cell, punct, punct_mapping)
        )

In [None]:
#CONVERTING CATEGORIES INTO NUMERICAL ID

# factorize() returns (codes, uniques); [0] keeps only the integer codes,
# one per row.
given_data['category_id']=given_data['category'].factorize()[0]
# One row per distinct (category, category_id) pair, ordered by id.
category_id_df = given_data[['category', 'category_id']].drop_duplicates().sort_values('category_id')
# Lookup tables in both directions, built from the two-column rows.
# id_to_category is what turns predicted ids back into category names when
# the submission is written.
category_to_id = dict(category_id_df.values)
id_to_category = dict(category_id_df[['category_id', 'category']].values)
# Last expression of the cell: show the first rows as a quick check.
given_data.head()

In [None]:
#MERGING HEADLINES AND SHORT DESCRIPTIONS INTO A SINGLE COL IN GIVEN_DATA

# Build the model input text: "<headline> <short_description>" per row.
# Working on whole columns keeps rows matched by index, so the result is
# correct even when a frame does not carry a default 0..n-1 index.
# map(str) applies str() to every cell before the concatenation.
for frame in (given_data, test_data):
    frame['merged_headline_shortDescription'] = (
        frame['headline'].map(str) + ' ' + frame['short_description'].map(str)
    )

In [None]:
#REMOVING NUMBERS, SYMBOLS AND STOPWORDS. REDUCING EVERYTHING TO ITS "LEMMA" FORM

from nltk.stem import PorterStemmer, WordNetLemmatizer
lemmetizer = WordNetLemmatizer()
stemmer = PorterStemmer()
# The stop-word set does not depend on the input text, so build it once here
# instead of once per call (get_words runs once per row).
ENGLISH_STOPWORDS = frozenset(stopwords.words('english'))


def get_words(text):
    """Reduce ``text`` to a space-joined string of lemmatised content words.

    Steps:
      1. every character outside ``a-z`` / ``A-Z`` is replaced by a space
         (this drops digits and any remaining symbols);
      2. the result is lower-cased and tokenised with ``nltk.word_tokenize``;
      3. English stop words are removed;
      4. each remaining token is passed through ``WordNetLemmatizer.lemmatize``
         without a part-of-speech argument.

    Args:
        text: Input string.

    Returns:
        The surviving lemmas joined by single spaces ('' if none survive).
    """
    text_only_letters = re.sub('[^a-zA-Z]', ' ', text)
    words = nltk.word_tokenize(text_only_letters.lower())
    return ' '.join(
        lemmetizer.lemmatize(word) for word in words if word not in ENGLISH_STOPWORDS
    )

# Run the letters-only / stop-word / lemmatisation pass over the merged text
# of both frames.
for frame in (given_data, test_data):
    frame['merged_headline_shortDescription'] = (
        frame['merged_headline_shortDescription'].progress_apply(get_words)
    )

**TRYING OUT DEEP MODELS**

**CONVERSION TO ARRAYS, TOKENIZING & PADDING**

In [None]:
labels = given_data['category_id']
max_len = 20        # length every sequence is padded to (passed as maxlen)
max_words = 100000  # vocabulary cap handed to the tokenizer (num_words)

# The vocabulary is fitted on the given and the test text together.
tokenizer = Tokenizer(num_words=max_words)
tokenizer.fit_on_texts(
    np.concatenate(
        [
            np.array(given_data['merged_headline_shortDescription']),
            np.array(test_data['merged_headline_shortDescription']),
        ],
        axis=0,
    )
)
word_index = tokenizer.word_index

# Text -> integer id sequences.
given_data_merged_sequences = tokenizer.texts_to_sequences(
    given_data['merged_headline_shortDescription']
)
test_data_merged_sequences = tokenizer.texts_to_sequences(
    test_data['merged_headline_shortDescription']
)

# Id sequences -> fixed-width arrays, padding added at the end ('post').
given_data_merged_seq_padded = pad_sequences(
    given_data_merged_sequences, maxlen=max_len, padding='post'
)
test_data_merged_seq_padded = pad_sequences(
    test_data_merged_sequences, maxlen=max_len, padding='post'
)

print(f'Shape of given data_merged tensor: {given_data_merged_seq_padded.shape}')
print(f'Shape of test data_merged tensor: {test_data_merged_seq_padded.shape}')
print(f'Shape of label tensor: {labels.shape}')
gc.collect()

**EXTRACTING EMBEDDING MATRIX, TO BE USED AS INITIAL EMBEDDING WEIGHTS IN EMBEDDING LAYER.**

In [None]:
#GLOVE EMBEDDING EXTRACTION

# EMBEDDING_FILE = '../input/quora-insincere-questions-classification/embeddings/glove.840B.300d/glove.840B.300d.txt'
# def get_coefs(word,*arr): return word, np.asarray(arr, dtype='float32')
# embeddings_index = dict(get_coefs(*o.split(" ")) for o in open(EMBEDDING_FILE))
# all_embs = np.stack(embeddings_index.values())
# emb_mean,emb_std = all_embs.mean(), all_embs.std()
# embed_size = all_embs.shape[1]
# del all_embs

# # word_index = tokenizer.word_index
# nb_words = min(max_words, len(word_index))
# glove_embedding_matrix = np.random.normal(emb_mean, emb_std, (nb_words+1, embed_size))
# for word, i in word_index.items():
#     if i >= max_words: continue
#     embedding_vector = embeddings_index.get(word)
#     if embedding_vector is not None: glove_embedding_matrix[i] = embedding_vector        

# del embeddings_index
# gc.collect()

In [None]:
# EMBEDDING_FILE = '../input/quora-insincere-questions-classification/embeddings/paragram_300_sl999/paragram_300_sl999.txt'
# def get_coefs(word,*arr): return word, np.asarray(arr, dtype='float32')
# embeddings_index = dict(get_coefs(*o.split(" ")) for o in open(EMBEDDING_FILE, encoding="utf8", errors='ignore') if len(o)>100)
# emb_mean,emb_std = 0.0, 1.0
# embed_size = 300
# nb_words = min(max_words, len(word_index))
# paragram_embedding_matrix = np.random.normal(emb_mean, emb_std, (nb_words+1, embed_size))
# for word, i in word_index.items():
#     if i >= max_words: continue
#     embedding_vector = embeddings_index.get(word)
#     if embedding_vector is not None: paragram_embedding_matrix[i] = embedding_vector   
        
# del embeddings_index
# gc.collect()

**CONVERTING LABELS INTO ONE-HOT ENCODING FORM**

In [None]:
# onehot_labels = np.zeros((200853, 41))
# onehot_labels[np.arange(200853), labels] = 1

**SPLITTING INTO TRAIN AND TEST SETS**

In [None]:
# #Defining the split into train, validation and test sets.
# training_samples = 160682
# validation_samples = 0
# test_samples = (200853-160682)

# #shuffling
# indices = np.arange(given_data_merged_seq_padded.shape[0])
# np.random.shuffle(indices)
# given_data_merged_seq_padded = given_data_merged_seq_padded[indices]
# #given_shortDescription_data = given_headline_data[indices]
# onehot_labels = onehot_labels[indices]

# #x_headline_train = given_headline_data[:training_samples]
# #x_shortDescription_train = given_shortDescription_data[:training_samples]
# x_merged_train = given_data_merged_seq_padded[:training_samples]
# y_train = onehot_labels[:training_samples]

# #x_validation = qns_data[training_samples: training_samples + validation_samples]
# #y_validation = labels[training_samples: training_samples + validation_samples]

# #x_headline_test = given_headline_data[training_samples + validation_samples: training_samples + validation_samples + test_samples]
# #x_shortDescription_test = given_shortDescription_data[training_samples + validation_samples: training_samples + validation_samples + test_samples]
# x_merged_test = given_data_merged_seq_padded[training_samples + validation_samples: training_samples + validation_samples + test_samples]
# y_test = onehot_labels[training_samples + validation_samples: training_samples + validation_samples + test_samples]

**DEFINITION OF F1 SCORE**

In [None]:
# #Definition of f1 score
# def f1(y_true, y_pred):
#     def recall(y_true, y_pred):
#         """Recall metric.

#         Only computes a batch-wise average of recall.

#         Computes the recall, a metric for multi-label classification of
#         how many relevant items are selected.
#         """
#         true_positives = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
#         possible_positives = K.sum(K.round(K.clip(y_true, 0, 1)))
#         recall = true_positives / (possible_positives + K.epsilon())
#         return recall

#     def precision(y_true, y_pred):
#         """Precision metric.

#         Only computes a batch-wise average of precision.

#         Computes the precision, a metric for multi-label classification of
#         how many selected items are relevant.
#         """
#         true_positives = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
#         predicted_positives = K.sum(K.round(K.clip(y_pred, 0, 1)))
#         precision = true_positives / (predicted_positives + K.epsilon())
#         return precision
#     precision = precision(y_true, y_pred)
#     recall = recall(y_true, y_pred)
#     return 2*((precision*recall)/(precision+recall+K.epsilon()))

**MODEL ARCHITECTURE(S)**

In [None]:
# #MODEL 2_glove ARCHITECTURE (BIDIRECTIONAL LSTMs, DROPOUT LAYERS AND A SINGLE RESIDUAL CONNECTION)

# inputTensor_merged = Input(shape = (max_len, ))
# outputTensor_1 = Embedding(len(word_index)+1, 300, weights = [glove_embedding_matrix], trainable=True)(inputTensor_merged)
# outputTensor_1=SpatialDropout1D(0.2)(outputTensor_1)
# outputTensor_2 = Bidirectional(CuDNNLSTM(units=128, return_sequences=True))(outputTensor_1)
# outputTensor_2 = Bidirectional(CuDNNLSTM(units=128, return_sequences=True))(outputTensor_2)
# outputTensor_2 = Dropout(0.1)(outputTensor_2)
# outputTensor_3 = Bidirectional(CuDNNLSTM(units=128, return_sequences=True))(outputTensor_2)
# outputTensor_3 = Dropout(0.4)(outputTensor_3)
# outputTensor_3= concatenate([outputTensor_3, outputTensor_1], axis=2)
# outputTensor_3= Bidirectional(CuDNNLSTM(units=64, return_sequences=False))(outputTensor_3)
# outputTensor_3 = Dropout(0.4)(outputTensor_3)
# outputTensor_4 = Dense(units=41, activation='softmax')(outputTensor_3)
# outputTensor_final = outputTensor_4

# model2_glove = Model(inputTensor_merged, outputTensor_final)
# model2_glove.summary()

In [None]:
# #MODEL 2_paragram ARCHITECTURE (BIDIRECTIONAL LSTMs, DROPOUT LAYERS AND A SINGLE RESIDUAL CONNECTION)

# inputTensor_merged = Input(shape = (max_len, ))
# outputTensor_1 = Embedding(len(word_index)+1, 300, weights = [paragram_embedding_matrix], trainable=True)(inputTensor_merged)
# outputTensor_1=SpatialDropout1D(0.2)(outputTensor_1)
# outputTensor_2 = Bidirectional(CuDNNLSTM(units=128, return_sequences=True))(outputTensor_1)
# outputTensor_2 = Bidirectional(CuDNNLSTM(units=128, return_sequences=True))(outputTensor_2)
# outputTensor_2 = Dropout(0.1)(outputTensor_2)
# outputTensor_3 = Bidirectional(CuDNNLSTM(units=128, return_sequences=True))(outputTensor_2)
# outputTensor_3 = Dropout(0.4)(outputTensor_3)
# outputTensor_3= concatenate([outputTensor_3, outputTensor_1], axis=2)
# outputTensor_3= Bidirectional(CuDNNLSTM(units=64, return_sequences=False))(outputTensor_3)
# outputTensor_3 = Dropout(0.4)(outputTensor_3)
# outputTensor_4 = Dense(units=41, activation='softmax')(outputTensor_3)
# outputTensor_final = outputTensor_4

# model2_paragram = Model(inputTensor_merged, outputTensor_final)
# model2_paragram.summary()

In [None]:
# #MODEL 4_glove ARCHITECTURE(DEEP LSTM&GRU WITHOUT TRAINING EMBEDDING)

# inputTensor_merged = Input(shape = (max_len, ))
# outputTensor_1 = Embedding(len(word_index)+1, 300, weights = [glove_embedding_matrix], trainable=False)(inputTensor_merged)
# outputTensor_1 = SpatialDropout1D(0.1)(outputTensor_1)
# outputTensor_1 = BatchNormalization()(outputTensor_1)
# outputTensor_2 = Bidirectional(CuDNNLSTM(units=128, return_sequences=True))(outputTensor_1)
# outputTensor_3 = Bidirectional(CuDNNLSTM(units=128, return_sequences=True))(outputTensor_2)
# outputTensor_3 = SpatialDropout1D(0.2)(outputTensor_3)
# outputTensor_3 = concatenate([outputTensor_2, outputTensor_3, outputTensor_1], axis=2)
# outputTensor_3 = BatchNormalization()(outputTensor_3)
# outputTensor_4 = Bidirectional(CuDNNLSTM(units=128, return_sequences=True))(outputTensor_3)
# outputTensor_4 = SpatialDropout1D(0.3)(outputTensor_4)
# outputTensor_5 = Bidirectional(CuDNNLSTM(units=64, return_sequences=True))(outputTensor_4)
# outputTensor_5 = SpatialDropout1D(0.5)(outputTensor_5)
# outputTensor_5 = concatenate([outputTensor_5, outputTensor_2, outputTensor_4], axis=2)
# outputTensor_5 = BatchNormalization()(outputTensor_5)
# outputTensor_6 = Bidirectional(CuDNNLSTM(units=128, return_sequences=True))(outputTensor_5)
# outputTensor_7 = Bidirectional(CuDNNLSTM(units=64, return_sequences=True))(outputTensor_6)
# outputTensor_7 = SpatialDropout1D(0.2)(outputTensor_7)
# outputTensor_8 = Bidirectional(CuDNNLSTM(units=32, return_sequences=False))(outputTensor_7)
# outputTensor_9 = Dense(units=41, activation='softmax')(outputTensor_8)

# model4_glove = Model(inputTensor_merged, outputTensor_9)
# model4_glove.summary()

In [None]:
# #MODEL 4_paragram ARCHITECTURE(DEEP LSTM&GRU WITHOUT TRAINING EMBEDDING)

# inputTensor_merged = Input(shape = (max_len, ))
# outputTensor_1 = Embedding(len(word_index)+1, 300, weights = [glove_embedding_matrix], trainable=False)(inputTensor_merged)
# outputTensor_1 = SpatialDropout1D(0.1)(outputTensor_1)
# outputTensor_1 = BatchNormalization()(outputTensor_1)
# outputTensor_2 = Bidirectional(CuDNNLSTM(units=128, return_sequences=True))(outputTensor_1)
# outputTensor_3 = Bidirectional(CuDNNLSTM(units=128, return_sequences=True))(outputTensor_2)
# outputTensor_3 = SpatialDropout1D(0.2)(outputTensor_3)
# outputTensor_3 = concatenate([outputTensor_2, outputTensor_3, outputTensor_1], axis=2)
# outputTensor_3 = BatchNormalization()(outputTensor_3)
# outputTensor_4 = Bidirectional(CuDNNLSTM(units=128, return_sequences=True))(outputTensor_3)
# outputTensor_4 = SpatialDropout1D(0.3)(outputTensor_4)
# outputTensor_5 = Bidirectional(CuDNNLSTM(units=64, return_sequences=True))(outputTensor_4)
# outputTensor_5 = SpatialDropout1D(0.5)(outputTensor_5)
# outputTensor_5 = concatenate([outputTensor_5, outputTensor_2, outputTensor_4], axis=2)
# outputTensor_5 = BatchNormalization()(outputTensor_5)
# outputTensor_6 = Bidirectional(CuDNNLSTM(units=128, return_sequences=True))(outputTensor_5)
# outputTensor_7 = Bidirectional(CuDNNLSTM(units=64, return_sequences=True))(outputTensor_6)
# outputTensor_7 = SpatialDropout1D(0.2)(outputTensor_7)
# outputTensor_8 = Bidirectional(CuDNNLSTM(units=32, return_sequences=False))(outputTensor_7)
# outputTensor_9 = Dense(units=41, activation='softmax')(outputTensor_8)

# model4_paragram = Model(inputTensor_merged, outputTensor_9)
# model4_paragram.summary()

**COMPILING AND FITTING CELLS**

In [None]:
# #compiling and fitting cell for model 2_glove
# model2_glove.compile(optimizer = 'adam', metrics = ['acc'], loss = 'categorical_crossentropy')
# gc.collect()
# model2_glove.fit(x_merged_train, y_train, verbose=2, validation_split=0.1, batch_size=128, epochs=2)

# #compiling and fitting cell for model 2_paragram
# model2_paragram.compile(optimizer = 'adam', metrics = ['acc'], loss = 'categorical_crossentropy')
# gc.collect()
# model2_paragram.fit(x_merged_train, y_train, verbose=2, validation_split=0.1, batch_size=128, epochs=2)

# #compiling and fitting cell for model 4_glove
# model4_glove.compile(optimizer = 'adam', metrics = ['acc'], loss = 'categorical_crossentropy')
# gc.collect()
# model4_glove.fit(x_merged_train, y_train, verbose=2, validation_split=0.1, batch_size=64, epochs=4)

# #compiling and fitting cell for model 4_paragram
# model4_paragram.compile(optimizer = 'adam', metrics = ['acc'], loss = 'categorical_crossentropy')
# gc.collect()
# model4_paragram.fit(x_merged_train, y_train, verbose=2, validation_split=0.1, batch_size=64, epochs=4)

In [None]:
# #DETERMINATION OF BEST THRESHOLD FOR ROUNDING OFF PREDICTION

# y_predict_x_test=(5*model2_paragram.predict(x_merged_test, verbose=2)+5*model2_glove.predict(x_merged_test, verbose=2)+4*model4_glove.predict(x_merged_test, verbose=2)+4*model4_paragram.predict(x_merged_test, verbose=2))/18
# threshold=0
# best_score=0
# for threshold in np.arange(0, 1, 0.01):
#     score=f1_score((y_predict_x_test>threshold)*1, y_test, average='weighted')
#     #print('Threshold: '+ str(threshold) + '  Corresponding F1 score: ' + str(score))
#     if score > best_score:
#             best_threshold = threshold
#             best_score = score
# print('Best threshold found: '+ str(best_threshold) + ' with a corresponding F1 score of '+ str(best_score))

In [None]:
# #SAVING MODEL 2 AND MODEL 4 

# #MODEL 2
# model2_glove.save('Model2_GloVe.h5')
# model2_paragram.save('Model2_paragram.h5')

# #MODEL 4
# model4_glove.save('Model4_GloVe.h5')
# model4_paragram.save('Model4_paragram.h5')

In [None]:
#IMPORTING TRAINED MODELS FOR PREDICTION ON TEST DATA
from keras.models import load_model

# Load the four saved networks from the attached `trainedmodels` dataset;
# the file names match those written by the (commented-out) saving cell.
# NOTE(review): the commented-out architecture cells build these networks from
# CuDNNLSTM layers, so loading presumably needs a GPU runtime — confirm.
Model2_GloVe = load_model('../input/trainedmodels/Model2_GloVe.h5')
Model2_paragram = load_model('../input/trainedmodels/Model2_paragram.h5')
Model4_GloVe = load_model('../input/trainedmodels/Model4_GloVe.h5')
Model4_paragram = load_model('../input/trainedmodels/Model4_paragram.h5')

In [None]:
#PREDICTING ON TEST DATA

x_merged_test_act = test_data_merged_seq_padded

# Equal-weight ensemble: add up the four models' predictions on the padded
# test sequences and divide by four.
ensemble_members = (Model2_paragram, Model2_GloVe, Model4_GloVe, Model4_paragram)
y_predict_x_test_act = sum(
    member.predict(x_merged_test_act, verbose=1) for member in ensemble_members
) / 4

In [None]:
# Index of the highest-scoring class for every test row.
y_predict_x_test_act_category_id = np.argmax(y_predict_x_test_act, axis=1)
y_predict = pd.DataFrame({'category_id': y_predict_x_test_act_category_id})
# Translate numeric ids back to category names.
y_predict['category'] = y_predict['category_id'].map(id_to_category)
# Predictions are in test_data's row order, so attach the ids by position
# (.values) rather than by index label; label alignment would scramble or
# drop ids if test_data's index is not the default 0..n-1.
y_predict['id'] = test_data['id'].values

y_predict = y_predict[['id', 'category']]

# One JSON object per test row: {"id": ..., "category": ...}.
y_predict.to_json('submission.json', orient='records')