In [1]:
import os
import re
import string

import nltk
import numpy as np
import pandas as pd
from keras.layers import Dense, Input, Dropout, LSTM, Activation
from keras.layers.embeddings import Embedding
from keras.models import Model
from keras.preprocessing import sequence
from nltk.stem import WordNetLemmatizer
from nltk.stem.porter import PorterStemmer
from nltk.tokenize import word_tokenize

Using TensorFlow backend.


In [74]:
# Directory that holds train.csv and test.csv. Set the JANATAHACK_DATA_DIR
# environment variable to run the notebook on another machine; the default is
# the location the notebook was originally written against.
DATA_DIR = os.environ.get(
    'JANATAHACK_DATA_DIR',
    r'C:\Users\mahendra\Desktop\analytics\projects\Janatahack Independence Day 2020 ML Hackathon',
)

train_data = pd.read_csv(os.path.join(DATA_DIR, 'train.csv'))
test_data = pd.read_csv(os.path.join(DATA_DIR, 'test.csv'))

In [75]:
def _combine_title_abstract(frame):
    """Return TITLE and ABSTRACT joined by a single space, one string per row.

    ``na_rep=""`` makes a missing TITLE or ABSTRACT contribute an empty string.
    Without it ``str.cat`` returns NaN for the whole row, and the string
    operations applied to this column later would fail on that float value.
    """
    return frame['TITLE'].str.cat(frame['ABSTRACT'], sep=" ", na_rep="")


# Add the merged text column to both splits.
train_data['text_col'] = _combine_title_abstract(train_data)
test_data['text_col'] = _combine_title_abstract(test_data)

# The separate TITLE / ABSTRACT columns are no longer needed once merged.
train_data1 = train_data.drop(columns=['TITLE', 'ABSTRACT'])
test_data1 = test_data.drop(columns=['TITLE', 'ABSTRACT'])

In [77]:
# Model input: the merged text column of each split.
X_train = train_data1.loc[:, 'text_col']
X_test = test_data1.loc[:, 'text_col']

# Targets: every training column that is neither the text nor the ID column.
y_train = train_data1.drop(['text_col', 'ID'], axis='columns')

In [53]:
# Largest number of whitespace-separated words in any training text.
# Measuring each text in words (instead of taking the text with the most
# characters and then counting its words) gives the true maximum.
maxLen = max(len(text.split()) for text in X_train)

In [54]:
# Preprocessing functions

# Lower-case the text
def text_lowercase(text):
    """Return ``text`` with every character converted to lower case."""
    lowered = text.lower()
    return lowered

# Strip digits
def remove_numbers(text):
    """Return ``text`` with every run of digits deleted."""
    return re.sub(r'\d+', '', text)
# Strip punctuation
def remove_punctuation(text):
    """Return ``text`` with every character of ``string.punctuation`` removed."""
    return text.translate(str.maketrans('', '', string.punctuation))
# Normalise whitespace
def remove_whitespace(text):
    """Collapse each run of whitespace to one space and trim both ends."""
    pieces = text.split()
    return " ".join(pieces)
# Tokenize, drop English stop words, then lemmatize (no stemming is applied)
_STOP_WORDS_CACHE = {}


def _english_stop_words():
    """Return NLTK's English stop-word set, reading the corpus only on first use."""
    if 'english' not in _STOP_WORDS_CACHE:
        _STOP_WORDS_CACHE['english'] = frozenset(nltk.corpus.stopwords.words('english'))
    return _STOP_WORDS_CACHE['english']


def stop_words(text):
    """Tokenize ``text``, remove English stop words and lemmatize the rest.

    Parameters
    ----------
    text : str
        Text to process.

    Returns
    -------
    list of str
        The tokens that are not NLTK English stop words, in their original
        order, each lemmatized as a verb (``pos='v'``).
    """
    english_stop_words = _english_stop_words()
    # One lemmatizer for the whole call instead of a new instance per token.
    lemmatizer = WordNetLemmatizer()
    return [
        lemmatizer.lemmatize(token, pos='v')
        for token in word_tokenize(text)
        if token not in english_stop_words
    ]
def remove_duplicate(list_word):
    """Return the distinct items of ``list_word`` in first-occurrence order.

    A ``set`` would also remove duplicates, but its iteration order is
    arbitrary and, for strings, can change between interpreter runs. Keeping
    the original order makes the output reproducible and preserves the word
    sequence for order-sensitive consumers.

    Parameters
    ----------
    list_word : iterable of hashable
        Tokens, possibly containing repeats.

    Returns
    -------
    list
        Each distinct token once, ordered by first appearance.
    """
    # dict keys are unique and keep insertion order.
    return list(dict.fromkeys(list_word))

In [55]:
def pre_processing(text):
    """Run the full cleaning pipeline on one text and return its tokens.

    Steps, in order: lower-casing, digit removal, punctuation removal,
    whitespace normalisation, stop-word removal with lemmatization, and
    finally duplicate removal.

    Returns
    -------
    list of str
        The cleaned tokens with duplicates removed.
    """
    cleaned = remove_whitespace(
        remove_punctuation(
            remove_numbers(
                text_lowercase(text)
            )
        )
    )
    tokens = stop_words(cleaned)
    return remove_duplicate(tokens)

In [56]:
def sentences_to_indices(X, word_to_index, max_len):
    """Convert raw texts into a zero-padded matrix of vocabulary indices.

    Parameters
    ----------
    X : sequence of str
        Texts to encode (for example a pandas Series, list or array). Items
        are read in iteration order, so a Series with a non-default index
        works as well.
    word_to_index : dict
        Maps each known word to its integer index. Must contain the key
        ``'unkown_word'`` whenever a text holds a word outside the mapping.
    max_len : int
        Number of columns in the output. Texts with more tokens than this are
        truncated; shorter ones are left zero-padded on the right.

    Returns
    -------
    numpy.ndarray
        Array of shape ``(len(X), max_len)`` holding the token indices.
    """
    texts = list(X)  # positional access, independent of any pandas index labels
    m = len(texts)
    X_indices = np.zeros((m, max_len))

    for i, text in enumerate(texts):
        # Keep at most max_len tokens so a long text cannot overflow the row.
        tokens = pre_processing(text)[:max_len]
        for j, w in enumerate(tokens):
            # Direct dict lookup instead of scanning a list of all keys.
            index = word_to_index.get(w)
            if index is None:
                index = word_to_index['unkown_word']
            X_indices[i, j] = index

    return X_indices

In [10]:
# Parse the GloVe text file: the first whitespace-separated field of each line
# is the word, the remaining fields are the components of its vector.
embeddings_dict = {}
with open('glove.6B.300d.txt', encoding="utf8") as glove_file:
    for raw_line in glove_file:
        fields = raw_line.split()
        embeddings_dict[fields[0]] = np.asarray(fields[1:], "float32")

# One column per word, one row per vector component.
word_to_vec_map = pd.DataFrame(embeddings_dict)

In [57]:
# All-zero vector used for words that are missing from the vocabulary.
# A scalar is broadcast to every row; float32 matches the dtype of the parsed vectors.
word_to_vec_map['unkown_word'] = np.float32(0)

In [62]:
# Vocabulary: every column (word) of the embedding frame.
vocab = word_to_vec_map.columns.tolist()

# Word indices start at 1 so that index 0 stays reserved for padding:
# sentences_to_indices fills unused positions with 0, and the embedding matrix
# is allocated with len(word_to_index) + 1 rows, leaving row 0 free.
word_to_index = {word: index for index, word in enumerate(vocab, start=1)}

In [63]:
def pretrained_embedding_layer(word_to_vec_map, word_to_index, trainable=True):
    """Build a Keras ``Embedding`` layer initialised from pretrained word vectors.

    Parameters
    ----------
    word_to_vec_map : mapping
        ``word_to_vec_map[word]`` must return the 1-D vector of ``word``.
    word_to_index : dict
        Maps each word to the row it occupies in the embedding matrix.
    trainable : bool, default True
        Whether the embedding weights are updated during training. The default
        keeps the previous behaviour; pass ``False`` to freeze the vectors.

    Returns
    -------
    keras.layers.Embedding
        A built layer whose weights are the pretrained vectors. Rows that no
        word maps to are left as zeros.

    Raises
    ------
    ValueError
        If ``word_to_index`` is empty.
    """
    if not word_to_index:
        raise ValueError("word_to_index must contain at least one word")

    # One more row than there are words, so indices up to len(word_to_index) are valid.
    vocab_len = len(word_to_index) + 1
    # Read the vector size from any vocabulary word instead of a hard-coded one.
    any_word = next(iter(word_to_index))
    emb_dim = np.asarray(word_to_vec_map[any_word]).shape[0]

    emb_matrix = np.zeros((vocab_len, emb_dim))
    for word, idx in word_to_index.items():
        emb_matrix[idx, :] = word_to_vec_map[word]

    embedding_layer = Embedding(vocab_len, emb_dim, trainable=trainable)

    # The layer must be built before its weights can be set.
    embedding_layer.build((None,))

    # Load the pretrained vectors into the layer.
    embedding_layer.set_weights([emb_matrix])

    return embedding_layer

In [64]:
def classifier_model(input_shape, word_to_vec_map, word_to_index,
                     n_classes=6, output_activation='softmax'):
    """Build the two-layer LSTM text classifier on top of pretrained embeddings.

    Parameters
    ----------
    input_shape : tuple
        Shape of one input sample, e.g. ``(max_len,)``.
    word_to_vec_map : mapping
        Word -> vector mapping passed to ``pretrained_embedding_layer``.
    word_to_index : dict
        Word -> index mapping passed to ``pretrained_embedding_layer``.
    n_classes : int, default 6
        Number of units in the output layer.
    output_activation : str, default 'softmax'
        Activation applied to the output layer.
        NOTE(review): softmax assumes exactly one label per sample. If the
        target columns can be multi-hot, 'sigmoid' with a binary cross-entropy
        loss is the usual choice — confirm against the data.

    Returns
    -------
    keras.models.Model
        The uncompiled model mapping index sequences to class scores.
    """
    sentence_indices = Input(input_shape, dtype='int32')

    # Embedding layer initialised with the pretrained vectors.
    embedding_layer = pretrained_embedding_layer(word_to_vec_map, word_to_index)
    embeddings = embedding_layer(sentence_indices)

    # First LSTM returns the full sequence so the second LSTM can consume it.
    X = LSTM(128, return_sequences=True)(embeddings)
    X = Dropout(.5)(X)
    # Second LSTM keeps only its final hidden state.
    X = LSTM(128, return_sequences=False)(X)
    X = Dropout(.5)(X)
    # Output layer: one unit per class, followed by the chosen activation.
    X = Dense(n_classes)(X)
    X = Activation(output_activation)(X)

    model = Model(sentence_indices, X)

    return model

In [65]:
# Build the classifier for inputs of maxLen token indices and print its layers.
model = classifier_model(
    input_shape=(maxLen,),
    word_to_vec_map=word_to_vec_map,
    word_to_index=word_to_index,
)
model.summary()

Model: "model_2"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_2 (InputLayer)         (None, 462)               0         
_________________________________________________________________
embedding_2 (Embedding)      (None, 462, 300)          120000600 
_________________________________________________________________
lstm_3 (LSTM)                (None, 462, 128)          219648    
_________________________________________________________________
dropout_3 (Dropout)          (None, 462, 128)          0         
_________________________________________________________________
lstm_4 (LSTM)                (None, 128)               131584    
_________________________________________________________________
dropout_4 (Dropout)          (None, 128)               0         
_________________________________________________________________
dense_2 (Dense)              (None, 6)                 774 

In [66]:
# NOTE(review): categorical cross-entropy expects exactly one active label per
# sample — confirm that the target columns are mutually exclusive.
model.compile(
    optimizer='adam',
    loss='categorical_crossentropy',
    metrics=['accuracy'],
)

In [67]:
# Encode every training text as a row of maxLen vocabulary indices.
X_train_indices = sentences_to_indices(
    X=X_train,
    word_to_index=word_to_index,
    max_len=maxLen,
)

In [80]:
# Alias only: no encoding is applied here, y_train_oh refers to the same object as y_train.
# NOTE(review): the "_oh" suffix suggests one-hot targets — confirm the label columns are already 0/1 indicators.
y_train_oh=y_train