# NAMAN GOEL NLP SUBMISSION

## Importing Libraries for NLP and LSTM

In [1]:
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Dense, Flatten, LSTM, Conv1D, MaxPooling1D, Dropout, Activation
from keras.layers.embeddings import Embedding
import nltk
import string
import numpy as np
import pandas as pd
from nltk.corpus import stopwords
from sklearn.manifold import TSNE
from string import punctuation
from nltk.stem import SnowballStemmer
import re


Using TensorFlow backend.


## importing default stopwords

In [2]:
import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Naman\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

## Visualizing the provided data using pandas

### training data

In [6]:
train_data = pd.read_csv('Subtask-A-master/Subtask-A-master/V1.4_Training.csv',header=None,names=['Id','Text','Classification'])
train_data.head()

Unnamed: 0,Id,Text,Classification
0,663_3,"""Please enable removing language code from the...",1
1,663_4,"""Note: in your .csproj file, there is a Suppor...",0
2,664_1,"""Wich means the new version not fully replaced...",0
3,664_2,"""Some of my users will still receive the old x...",0
4,664_3,"""The store randomly gives the old xap or the n...",0


### Evaluation data

In [7]:
eval_data = pd.read_csv('Subtask-A-master/Subtask-A-master/SubtaskA_EvaluationData.csv',header=None)
eval_data.head()

Unnamed: 0,0,1,2
0,9566,This would enable live traffic aware apps.,X
1,9569,Please try other formatting like bold italics ...,X
2,9576,Since computers were invented to save time I s...,X
3,9577,Allow rearranging if the user wants to change ...,X
4,9579,Add SIMD instructions for better use of ARM NE...,X


## Testing data

In [8]:
test_data = pd.read_csv('Subtask-A-master/Subtask-A-master/SubtaskA_Trial_Test_Labeled.csv',encoding='ISO-8859-1', engine='python' )
test_data.head()

Unnamed: 0,id,sentence,label
0,1310_1,I'm not asking Microsoft to Gives permission l...,1
1,1312_1,somewhere between Android and iPhone.,0
2,1313_1,And in the Windows Store you can flag the App ...,0
3,1313_2,"Many thanks Sameh Hi, As we know, there is a l...",0
4,1313_3,The idea is that we can develop a regular app ...,1


## Function to clean the data using NLP

In [9]:
def clean_text(text):
    
    text = text.translate(string.punctuation)
    text = text.lower().split()
    stops = set(stopwords.words("english"))
    text = [w for w in text if not w in stops and len(w) >= 3]
    
    text = " ".join(text)

    text = re.sub(r"[^A-Za-z0-9^,!.\/'+-=]", " ", text)
    text = re.sub(r"what's", "what is ", text)
    text = re.sub(r"\'s", " ", text)
    text = re.sub(r"\'ve", " have ", text)
    text = re.sub(r"n't", " not ", text)
    text = re.sub(r"i'm", "i am ", text)
    text = re.sub(r"\'re", " are ", text)
    text = re.sub(r"\'d", " would ", text)
    text = re.sub(r"\'ll", " will ", text)
    text = re.sub(r",", " ", text)
    text = re.sub(r"\.", " ", text)
    text = re.sub(r"!", " ! ", text)
    text = re.sub(r"\/", " ", text)
    text = re.sub(r"\^", " ^ ", text)
    text = re.sub(r"\+", " + ", text)
    text = re.sub(r"\-", " - ", text)
    text = re.sub(r"\=", " = ", text)
    text = re.sub(r"'", " ", text)
    text = re.sub(r"(\d+)(k)", r"\g<1>000", text)
    text = re.sub(r":", " : ", text)
    text = re.sub(r" e g ", " eg ", text)
    text = re.sub(r" b g ", " bg ", text)
    text = re.sub(r" u s ", " american ", text)
    text = re.sub(r"\0s", "0", text)
    text = re.sub(r" 9 11 ", "911", text)
    text = re.sub(r"e - mail", "email", text)
    text = re.sub(r"j k", "jk", text)
    text = re.sub(r"\s{2,}", " ", text)
    
    text = text.split()
    stemmer = SnowballStemmer('english')
    stemmed_words = [stemmer.stem(word) for word in text]
    text = " ".join(stemmed_words)

    return text

## Updating the trained data with clean text

In [12]:
train_data['Text'] = train_data['Text'].map(lambda x: clean_text(x))

## Renaming the Columns

In [14]:
train_data = train_data.astype({'Classification': str})
train_data = train_data[train_data['Classification'].map(len)<2]

In [16]:
train_data.head()

Unnamed: 0,Id,Text,Classification
0,663_3,pleas enabl remov languag code dev center lang...,1
1,663_4,note : csproj file supportedcultur entri like ...,0
2,664_1,wich mean new version fulli replac old version...,0
3,664_2,some user still receiv old xap version app,0
4,664_3,the store random give old xap new xap version app,0


## Only Text and Classification is required

In [17]:
train_data = train_data.iloc[:,1:3]
train_data.head()

Unnamed: 0,Text,Classification
0,pleas enabl remov languag code dev center lang...,1
1,note : csproj file supportedcultur entri like ...,0
2,wich mean new version fulli replac old version...,0
3,some user still receiv old xap version app,0
4,the store random give old xap new xap version app,0


## Prepairing data for LSTM model 

In [18]:
vocabulary_size = 20500
tokenizer = Tokenizer(num_words= vocabulary_size)
tokenizer.fit_on_texts(train_data['Text'])

sequences = tokenizer.texts_to_sequences(train_data['Text'])
data = pad_sequences(sequences, maxlen=150)

## Function to create model

In [21]:
def create_conv_model():
    model_conv = Sequential()
    model_conv.add(Embedding(vocabulary_size, 100, input_length=150))
    model_conv.add(Dropout(0.5))
    model_conv.add(Conv1D(64, 5, activation='relu'))
    model_conv.add(MaxPooling1D(pool_size=4))
    model_conv.add(LSTM(115))
    model_conv.add(Dense(1, activation='relu'))
    
    
    model_conv.compile(loss='binary_crossentropy', optimizer='adam',    metrics=['accuracy'])
    return model_conv

## Running the function for 10 epochs giving 91.26 accuracy

In [22]:
labels = train_data['Classification']

model_conv = create_conv_model()
model_conv.fit(data,np.array(labels),validation_split=0.15,epochs=10)

Train on 7012 samples, validate on 1488 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x1f181242240>

## Using the trained model for evaluation data

In [31]:
evaluation = eval_data[1]
sol = tokenizer.texts_to_sequences(evaluation)
final = pad_sequences(sol, maxlen=150)

In [33]:
eval_sol = model_conv.predict_classes(final)

In [40]:
eval_data_2=eval_data.drop([2],axis=1)

In [43]:
eval_data_2['label']=eval_sol

In [45]:
eval_data_2.head()

Unnamed: 0,0,1,label
0,9566,This would enable live traffic aware apps.,0
1,9569,Please try other formatting like bold italics ...,0
2,9576,Since computers were invented to save time I s...,0
3,9577,Allow rearranging if the user wants to change ...,0
4,9579,Add SIMD instructions for better use of ARM NE...,0


## saving the evaluation file

In [47]:
eval_data_2.to_csv('naman_goel.csv')