Importing Libraries

In [1]:
#Basic essential libraries
import re
import csv
import pandas as pd
import nltk
import os
import nltk
import numpy as np 
import pandas as pd 

#stopwords
from nltk.corpus import stopwords
stop = stopwords.words('english')

#scikit-learn and NLTK
from nltk.tokenize import RegexpTokenizer
from nltk.corpus import stopwords
from nltk import FreqDist
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score,confusion_matrix,classification_report,f1_score

#Keras (on TensorFlow) for CNN model construction and evaluation
from keras.preprocessing import sequence,text
from keras.preprocessing.text import Tokenizer
from keras.models import Sequential
from keras.layers import Dense,Dropout,Embedding,LSTM,Conv1D,GlobalMaxPooling1D,Flatten,MaxPooling1D,GRU,SpatialDropout1D,Bidirectional
from keras.callbacks import EarlyStopping
from keras.utils import to_categorical
from keras.losses import categorical_crossentropy
from keras.optimizers import Adam


import warnings
# Silence library warnings so the cell outputs stay readable.
warnings.filterwarnings("ignore")
# Show full column contents without truncation. `None` is the supported way to
# disable the width limit; the legacy `-1` sentinel is rejected by pandas >= 2.0.
# Very old pandas releases only accept integers here, hence the fallback.
try:
    pd.set_option('display.max_colwidth', None)
except ValueError:
    pd.set_option('display.max_colwidth', -1)

Reading files

In [2]:
# Load the three input tables (training text, test text, training labels)
# from CSV files in the current working directory.
trainfile, testfile, labelfile = (
    pd.read_csv(csv_name)
    for csv_name in ("train_data.csv", "test_data.csv", "train_label.csv")
)

Creating empty working DataFrames

In [3]:
# Peek at the labels. Nothing is rendered here because only the last
# expression of a cell is displayed.
labelfile.head()
# Two empty frames sharing the same schema; they are filled in the next cell.
train_df, testing_df = (pd.DataFrame(columns=['ID','text']) for _ in range(2))

Transferring the data into the working DataFrames and checking the row counts

In [4]:
# Copy the id and text columns of each raw table into its working frame,
# normalising the id column name to 'ID', then report the row count.
for frame, source, id_column in (
    (train_df, trainfile, 'trn_id'),
    (testing_df, testfile, 'test_id'),
):
    frame['ID'] = source[id_column]
    frame['text'] = source['text']
    print(len(frame))

650000
50000


Transferring the Labels for Training

In [5]:
# Keep the label column as its own Series and show how many labels there are.
trainlabel = labelfile.loc[:, 'label']
trainlabel.shape[0]

650000

Joining the Dataframes

In [6]:
# Stack the training rows on top of the test rows so both receive exactly the
# same preprocessing. The order matters: the frame is later split back by position.
frames = [train_df, testing_df]
# ignore_index=True gives the result a fresh, unique 0..n-1 index. Without it
# each half keeps its own labels starting at 0, so every test-row label also
# appears on a training row, which makes label-based access or alignment unsafe.
combined = pd.concat(frames, ignore_index=True)

Length of Combined Dataframes

In [7]:
# Row count of the stacked frame (training rows + test rows).
combined.shape[0]

700000

Text Preprocessing

In [8]:
# Tokeniser that extracts runs of word characters (regex \w+),
# which drops punctuation and whitespace.
tokenise = RegexpTokenizer(r'\w+')

# English stop-word list from the NLTK corpus.
stop = stopwords.words('english')

# Normalise case before tokenising.
combined['text'] = combined['text'].str.lower()

Tokenising

In [9]:
# Replace each review string with its list of word tokens.
combined['text'] = combined['text'].apply(tokenise.tokenize)

In [10]:
# Sanity check: tokenising must not change the number of rows or columns.
combined.shape

(700000, 2)

Removing Stopwords

In [12]:
# Build a set once so every membership test is a constant-time hash lookup;
# scanning the `stop` list for each token of each review is needlessly slow.
# The surviving tokens and their order are unchanged.
stop_words = set(stop)
combined['text'] = combined['text'].apply(
    lambda tokens: [token for token in tokens if token not in stop_words]
)


Re-combining the words post processing

In [13]:
# Turn each token list back into a single space-separated string.
combined['text'] = combined['text'].map(lambda tokens: ' '.join(tokens))


In [14]:
# Count every token across all reviews and keep the ten most common ones
# (value_counts sorts by descending count).
freq = (
    pd.Series(' '.join(combined['text']).split())
    .value_counts()
    .head(10)
)
freq

food       408071
good       379196
place      351781
like       293693
service    276443
get        266153
time       264204
one        263065
would      261263
great      246707
dtype: int64

Removing most frequent words

In [15]:
# `freq` is rebound from the counts Series to the plain list of its words.
freq = freq.index.tolist()
# Rebuild every review without the words listed in `freq`.
combined['text'] = combined['text'].apply(
    lambda review: " ".join(word for word in review.split() if word not in freq)
)
combined.head()

Unnamed: 0,ID,text
0,trn_1,well got write first review yelp reviews figur...
1,trn_2,greek restaurant tasty tried chicken kabob ten...
2,trn_3,website says open google says open yelp says o...
3,trn_4,could give zero stars walked big lady asked ne...
4,trn_5,definitely excellent tried mochi mango flavore...


Removing Least Frequent Words

In [16]:
# Recount the tokens and take the last ten entries of the descending
# frequency table, i.e. ten of the rarest words.
freq = (
    pd.Series(' '.join(combined['text']).split())
    .value_counts()
    .tail(10)
)
freq

experiensia    1
poopeyes       1
allrighty      1
arrrgggh       1
mogador        1
normak         1
thailands      1
nailbar        1
teeeeeeny      1
sanguinely     1
dtype: int64

In [17]:
# `freq` is rebound from the counts Series to the plain list of its words.
freq = freq.index.tolist()
# Rebuild every review without the words listed in `freq`.
combined['text'] = combined['text'].apply(
    lambda review: " ".join(word for word in review.split() if word not in freq)
)

Splitting the combined DF to train and test

In [18]:
# The training rows were stacked first when `combined` was built, so the first
# len(train_df) rows are the training set and the remainder is the test set.
# Deriving the split point from the data instead of hard-coding 650000 keeps
# the notebook correct if the input files change size.
n_train = len(train_df)
df_train = combined.iloc[:n_train, :]
df_test = combined.iloc[n_train:, :]

Transferring values into Series

In [24]:
# Pull the cleaned review text out of the frames as plain arrays.
train_text = df_train['text'].values
test_text = df_test['text'].values

# One-hot encode the labels; to_categorical sizes the matrix from the largest
# label value, so each label maps to the column with the same index.
target = labelfile['label']
y = to_categorical(target)

print(train_text.shape, target.shape, y.shape, test_text.shape)

(650000,) (650000,) (650000, 6) (50000,)


Splitting the training data into training and validation sets for model evaluation

In [25]:
# Hold out 20% of the labelled reviews for validation. The split is stratified
# on the labels and uses a fixed seed so it is reproducible.
X_train_text, X_val_text, y_train, y_val = train_test_split(
    train_text,
    y,
    test_size=0.2,
    stratify=y,
    random_state=123,
)
print(X_train_text.shape, y_train.shape)
print(X_val_text.shape, y_val.shape)

(520000,) (520000, 6)
(130000,) (130000, 6)


Number of Unique Words

In [28]:
# Tokenise the whole training corpus as a single string and count the number
# of distinct tokens; this becomes the vocabulary size of the model.
all_words = word_tokenize(' '.join(X_train_text))
dist = FreqDist(all_words)
num_unique_word = len(dist)
num_unique_word

175132

Maximum review length

In [30]:
# Token count of every training review. A comprehension keeps the per-review
# scratch variables local: the original loop used `text` as its loop variable,
# which overwrote the `keras.preprocessing.text` module imported at the top of
# the notebook.
r_len = [len(word_tokenize(review)) for review in X_train_text]

# Longest review in tokens; used below as the padded sequence length.
# Converted to a built-in int so downstream code does not receive a numpy scalar.
MAX_REVIEW_LEN = int(np.max(r_len))
MAX_REVIEW_LEN

2005

Define the parameters for CNN model

In [31]:
# Sizes derived from the training corpus.
max_features = num_unique_word   # vocabulary size for the tokenizer and the embedding
max_words = MAX_REVIEW_LEN       # every sequence is padded/truncated to this length

# Width of the softmax output; must equal the number of columns of the
# one-hot label matrix `y`.
num_classes=6

# Training schedule.
batch_size = 128
epochs = 3

Fitting the Keras tokenizer on the training text only, then using it to encode the training, validation and test texts as integer sequences

In [32]:
# Learn the word -> integer index mapping from the training split only.
tokenizer = Tokenizer(num_words=max_features)
tokenizer.fit_on_texts(list(X_train_text))

# Encode all three corpora with that same mapping.
X_train, X_val, X_test = (
    tokenizer.texts_to_sequences(corpus)
    for corpus in (X_train_text, X_val_text, test_text)
)

Let's pad the sequences to a fixed length, turning them into 2D arrays

In [33]:
# Pad (or truncate) every encoded review to exactly `max_words` entries so each
# split becomes a rectangular 2D array.
X_train, X_val, X_test = (
    sequence.pad_sequences(encoded, maxlen=max_words)
    for encoded in (X_train, X_val, X_test)
)
print(X_train.shape, X_val.shape, X_test.shape)

(520000, 2005) (130000, 2005) (50000, 2005)


We build a 1D convolutional neural network (CNN): an embedding layer, one convolution layer with a window of 3 tokens, global max pooling and one hidden dense layer. The model is compiled with the Adam optimizer.

In [34]:
# 1D CNN text classifier, declared as a layer list.
model2 = Sequential([
    # Map each word index to a 100-dimensional vector.
    Embedding(max_features, 100, input_length=max_words),
    Dropout(0.2),
    # 64 filters over windows of 3 consecutive tokens; 'same' padding keeps the length.
    Conv1D(64, kernel_size=3, padding='same', activation='relu', strides=1),
    # Keep the strongest response of each filter over the whole review.
    GlobalMaxPooling1D(),
    Dense(128, activation='relu'),
    Dropout(0.2),
    # One probability per class.
    Dense(num_classes, activation='softmax'),
])

model2.compile(
    loss='categorical_crossentropy',
    optimizer='adam',
    metrics=['accuracy'],
)

model2.summary()

Instructions for updating:
Colocations handled automatically by placer.
Instructions for updating:
Please use `rate` instead of `keep_prob`. Rate should be set to `rate = 1 - keep_prob`.
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 2005, 100)         17513200  
_________________________________________________________________
dropout_1 (Dropout)          (None, 2005, 100)         0         
_________________________________________________________________
conv1d_1 (Conv1D)            (None, 2005, 64)          19264     
_________________________________________________________________
global_max_pooling1d_1 (Glob (None, 64)                0         
_________________________________________________________________
dense_1 (Dense)              (None, 128)               8320      
_________________________________________________________________
dropout_2 (Dropout)  

Train the model and report the training and validation accuracy

In [35]:
%%time
history2=model2.fit(X_train, y_train, validation_data=(X_val, y_val),epochs=epochs, batch_size=batch_size, verbose=1)

Instructions for updating:
Use tf.cast instead.
Instructions for updating:
Deprecated in favor of operator or tf.math.divide.
Train on 520000 samples, validate on 130000 samples
Epoch 1/3
Epoch 2/3
Epoch 3/3
Wall time: 3h 57min 5s


Predict the labels for test data

In [36]:
# `Sequential.predict_classes` was removed in TensorFlow 2.6. For a softmax
# output it is exactly the argmax over the predicted class probabilities, which
# works on both old and current Keras releases.
ypred = np.argmax(model2.predict(X_test, verbose=1), axis=-1)



Get column names 

In [37]:
# Check which columns are available on the test frame before building the output.
df_test.columns

Index(['ID', 'text'], dtype='object')

Make final output dataframe

In [38]:
# Pair every test ID with its predicted class index.
final = df_test[["ID"]].assign(label=ypred)

Generate the output file

In [40]:
# Write the predictions to the working directory; index=False keeps the row index out of the file.
final.to_csv("output3.csv",index=False)