In [None]:
import pandas as pd
import os
import re
import spacy
from gensim.models.phrases import Phrases, Phraser
from time import time 
import multiprocessing
from gensim.models import Word2Vec
import bokeh.plotting as bp
from bokeh.models import HoverTool, BoxSelectTool
from bokeh.plotting import figure, show, output_notebook
from sklearn.manifold import TSNE
from sklearn.model_selection import train_test_split
import numpy as np
from sklearn.preprocessing import scale
import keras 
from keras.models import Sequential, Model 
from keras import layers
from keras.layers import Dense, Conv1D, MaxPooling1D, Flatten, Dropout, Input, Embedding
from keras.layers.merge import Concatenate
from sklearn.feature_extraction.text import TfidfVectorizer
from wordcloud import WordCloud
from nltk.tokenize import RegexpTokenizer
from sklearn.metrics import confusion_matrix

In [None]:
df = pd.read_csv('corpus.csv')

In [None]:
df = df.iloc[:, [1,2]]

In [None]:
df_clean = df
from nltk.tokenize import RegexpTokenizer
t = time()


tokenizer = RegexpTokenizer(r'\w+')
df_clean['clean'] = df_clean['messages'].astype('str') 
df_clean.dtypes

df_clean["tokens"] = df_clean["clean"].apply(tokenizer.tokenize)
# delete Stop Words

print('Time to tokenize everything: {} mins'.format(round((time() - t) / 60, 2)))
df_clean.head()

Time to tokenize everything: 0.03 mins


Unnamed: 0,messages,target,clean,tokens
0,cam babe look look companion friendship love d...,1,cam babe look look companion friendship love d...,"[cam, babe, look, look, companion, friendship,..."
1,want make money order confirm order ship janua...,1,want make money order confirm order ship janua...,"[want, make, money, order, confirm, order, shi..."
2,food thought join take free tour click remov,1,food thought join take free tour click remov,"[food, thought, join, take, free, tour, click,..."
3,pharmaci ta would want cheap perscript http ww...,1,pharmaci ta would want cheap perscript http ww...,"[pharmaci, ta, would, want, cheap, perscript, ..."
4,bigger breast pill imag load cli k info ship s...,1,bigger breast pill imag load cli k info ship s...,"[bigger, breast, pill, imag, load, cli, k, inf..."


In [None]:

#WORD2VEC()
cores = multiprocessing.cpu_count() # Count the number of cores in a computer, important for a parameter of the model
w2v_model = Word2Vec(min_count=20,
                     window=2,
                     size=300,
                     sample=6e-5, 
                     alpha=0.03, 
                     min_alpha=0.0007, 
                     negative=20,
                     workers=cores-1)

#BUILD_VOCAB()
t = time()
w2v_model.build_vocab(df_clean["tokens"], progress_per=1000)
print('Time to build vocab: {} mins'.format(round((time() - t) / 60, 2)))


Time to build vocab: 0.08 mins


In [None]:
y = df['target'].values
X = np.array(df_clean["tokens"])

#And here is the train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 0)

In [None]:
vectorizer = TfidfVectorizer(analyzer=lambda x: x, min_df=10)
matrix = vectorizer.fit_transform([x for x in X_train])
tfidf = dict(zip(vectorizer.get_feature_names(), vectorizer.idf_))
print ('vocab size :', len(tfidf))

vocab size : 18891




In [None]:
def buildWordVector(tokens, size):
    vec = np.zeros(size).reshape((1, size))
    count = 0.
    for word in tokens:
        try:
            vec += w2v_model[word].reshape((1, size)) * tfidf[word]
            count += 1.
        except KeyError: # handling the case where the token is not
                         # in the corpus. useful for testing.
            continue
    if count != 0:
        vec /= count
    return vec

In [None]:
train_vecs_w2v = np.concatenate([buildWordVector(z, 300) for z in map(lambda x: x, X_train)])
train_vecs_w2v = scale(train_vecs_w2v)

test_vecs_w2v = np.concatenate([buildWordVector(z, 300) for z in map(lambda x: x, X_test)])
test_vecs_w2v = scale(test_vecs_w2v)

print ('shape for training set : ',train_vecs_w2v.shape,
      '\nshape for test set : ', test_vecs_w2v.shape)

  
  


shape for training set :  (41236, 300) 
shape for test set :  (10310, 300)


ANN

In [None]:
model = Sequential()

model.add(Dense(128, activation='leaky_relu', input_dim=300))
model.add(Dense(64, activation='leaky_relu'))
model.add(Dense(32, activation='leaky_relu'))
model.add(Dropout(0.6))
model.add(Dense(1, activation='sigmoid'))
model.compile(optimizer='adadelta',
              loss='binary_crossentropy',
              metrics=['accuracy'])
model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense (Dense)               (None, 128)               38528     
                                                                 
 dense_1 (Dense)             (None, 64)                8256      
                                                                 
 dense_2 (Dense)             (None, 32)                2080      
                                                                 
 dropout (Dropout)           (None, 32)                0         
                                                                 
 dense_3 (Dense)             (None, 1)                 33        
                                                                 
Total params: 48,897
Trainable params: 48,897
Non-trainable params: 0
_________________________________________________________________


In [None]:
model.fit(train_vecs_w2v, y_train, epochs=50, batch_size=50,
                   validation_data=(test_vecs_w2v,y_test))
loss, accuracy = model.evaluate(train_vecs_w2v, y_train, verbose=False)
print("Training Accuracy: {:.4f}".format(accuracy))
loss, accuracy = model.evaluate(test_vecs_w2v, y_test, verbose=False)
print("Testing Accuracy:  {:.4f}".format(accuracy))

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50
Training Accuracy: 0.7217
Testing Accuracy:  0.7194


# TEST

In [45]:
test = pd.read_csv('test.csv')

In [46]:
id = test['Id']

In [47]:
test = test[['Subject','Body']]

In [48]:
test['messages'] = test['Subject']+' '+test['Body']

In [49]:
import re
import nltk
import ssl
##ssl error aaya toh
try:
    _create_unverified_https_context = ssl._create_unverified_context
except AttributeError:
    pass
else:
    ssl._create_default_https_context = _create_unverified_https_context

nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [50]:
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer

In [51]:
ps = PorterStemmer()
test_corpus = []

In [52]:
for i in range(len(test)):
    review = re.sub('[^a-zA-Z]',' ',str(test['messages'][i]))
    review = review.lower().split()
    review = [ps.stem(word) for word in review if not word in stopwords.words('english')]
    review = ' '.join(review)
    test_corpus.append(review)

## Creation of submission,csv from here

In [55]:
f_test = pd.DataFrame()

In [56]:
f_test['Body'] = test_corpus

In [57]:
df_clean_f = f_test
from nltk.tokenize import RegexpTokenizer
t = time()


tokenizer = RegexpTokenizer(r'\w+')
df_clean_f['clean'] = df_clean_f['Body'].astype('str') 
df_clean_f.dtypes

df_clean_f["tokens"] = df_clean_f["clean"].apply(tokenizer.tokenize)
# delete Stop Words

print('Time to tokenize everything: {} mins'.format(round((time() - t) / 60, 2)))
df_clean_f.head()

Time to tokenize everything: 0.01 mins


Unnamed: 0,Body,clean,tokens
0,ext decemb meet request thank sent hold time c...,ext decemb meet request thank sent hold time c...,"[ext, decemb, meet, request, thank, sent, hold..."
1,ccac sale ee joint technolog vision session da...,ccac sale ee joint technolog vision session da...,"[ccac, sale, ee, joint, technolog, vision, ses..."
2,cancel uudex discuss microsoft team meet join ...,cancel uudex discuss microsoft team meet join ...,"[cancel, uudex, discuss, microsoft, team, meet..."
3,check hannah microsoft team meet join comput m...,check hannah microsoft team meet join comput m...,"[check, hannah, microsoft, team, meet, join, c..."
4,hold ccac sale convers cybersecur one confirm ...,hold ccac sale convers cybersecur one confirm ...,"[hold, ccac, sale, convers, cybersecur, one, c..."


In [58]:
#WORD2VEC()
cores = multiprocessing.cpu_count() # Count the number of cores in a computer, important for a parameter of the model
w2v_model = Word2Vec(min_count=20,
                     window=2,
                     size=300,
                     sample=6e-5, 
                     alpha=0.03, 
                     min_alpha=0.0007, 
                     negative=20,
                     workers=cores-1)

#BUILD_VOCAB()
t = time()
w2v_model.build_vocab(df_clean_f["tokens"], progress_per=1000)
print('Time to build vocab: {} mins'.format(round((time() - t) / 60, 2)))


Time to build vocab: 0.02 mins


In [59]:
df_clean_f.columns

Index(['Body', 'clean', 'tokens'], dtype='object')

In [60]:
X_f = np.array(df_clean_f["tokens"])

In [61]:
vectorizer = TfidfVectorizer(analyzer=lambda x: x, min_df=10)
matrix = vectorizer.fit_transform([x for x in X_f])
tfidf = dict(zip(vectorizer.get_feature_names(), vectorizer.idf_))
print ('vocab size :', len(tfidf))

vocab size : 4843




In [62]:
train_vecs_w2v_f = np.concatenate([buildWordVector(z, 300) for z in map(lambda x: x, X_f)])
train_vecs_w2v_f = scale(train_vecs_w2v_f)

print ('shape for training set : ',train_vecs_w2v_f.shape)

  


shape for training set :  (4898, 300)


In [63]:
test_prob = model.predict(train_vecs_w2v_f)

In [65]:
test_prob = test_prob.flatten()

In [66]:
submission = pd.DataFrame()

In [67]:
submission['Id'] = id

In [68]:
submission['Flag'] = test_prob

In [69]:
submission = submission.set_index('Id')

In [70]:
submission.to_csv('submission.csv')