# Text Processing on Haptik Dataset
#### Firstly we will import all the packages required for all the processing required like data cleaning, 
#### data prerocessing, building model

In [135]:
#future imports
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
from __future__ import unicode_literals

#Data Cleaning packages
from pandas import read_csv, Series
from numpy import array
from sklearn.preprocessing import LabelEncoder
from nltk.tokenize import TreebankWordTokenizer
from stop_words import get_stop_words
from nltk.stem.porter import PorterStemmer
from nltk.stem.lancaster import LancasterStemmer
from wordcloud import WordCloud
%matplotlib inline

#Time
import time   

#Model Fitting
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score

In [136]:
#timeit Decorator
def timeit(method):
    def timed(*args, **kw):
        ts = time.time()
        result = method(*args, **kw)
        te = time.time()
        if 'log_time' in kw:
            name = kw.get('log_name', method.__name__.upper())
            kw['log_time'][name] = int((te - ts) * 1000)
        else:
            print('%r  %2.2f ms' % \
                  (method.__name__, (te - ts) * 1000))
        return result
    return timed

In [137]:
@timeit
def load_dataset(filepath):
    dataset = read_csv(filepath, encoding = 'UTF-8')
    
    return dataset

train = load_dataset('./train_data.csv')

test = load_dataset('./test_data.csv') 

'load_dataset'  64.94 ms
'load_dataset'  15.79 ms


In [138]:
#separating feature and target variable
@timeit
def feature_target(df):
    feature = df.iloc[:,0]
    target = df.iloc[:,1:]
    return feature, target

X_train, y_train = feature_target(train)
X_test, y_test = feature_target(test)
# X is the Feature while y is the Label

'feature_target'  0.73 ms
'feature_target'  0.73 ms


In [139]:
X_test.head()

0                                Nearest metro station
1                    Pick up n drop service trough cab
2                                I wants to buy a bick
3                                        Show me pizza
4    What is the cheapest package to andaman and ni...
Name: message, dtype: object

In [140]:
# One hot encoding for changing the categorical target variable to binary target variable(numerical)
# do this on both train and test target variable i.e y_train, y_test
@timeit
def encode(target):
    target = target.astype(str).replace({'T':1, 'F':0}, axis=1)
    target = target.idxmax(axis = 1)
    return target

target = encode(y_train)
test_target = encode(y_test)



'encode'  424.63 ms
'encode'  110.43 ms


In [141]:
#label Encoding for creating the array of labeled target variable
@timeit
def labelEncode(y):
    lab_en = LabelEncoder()
    return (lab_en.fit_transform(y))
    

y_train = labelEncode(target)
y_test = labelEncode(test_target)

'labelEncode'  38.49 ms
'labelEncode'  8.44 ms


In [142]:
y_test

array([3, 8, 4, ..., 0, 4, 0])

#### now we will work on train_feature dataframe


In [143]:
X_train.head(10)

0                                      7am everyday
1                                    chocolate cake
2    closed mortice and tenon joint door dimentions
3                               train eppo kelambum
4      yesterday i have cancelled the flight ticket
5                          chamge it to 12pm to 9pm
6                        i want too going rajasthan
7                                              room
8            can you please arrange flight tickets?
9                             what kind of reminder
Name: message, dtype: object

In [144]:
type(X_train)

pandas.core.series.Series

In [145]:
#Tokenize
@timeit
def tokenizer(X):
    tokenize = TreebankWordTokenizer()
    X = X.apply(lambda row: row.lower())
    X = X.apply(lambda row: tokenize.tokenize(row))
    return tokenize, X

In [146]:
tokenize, X_train = tokenizer(X_train)
tokenize

'tokenizer'  5487.07 ms


<nltk.tokenize.treebank.TreebankWordTokenizer at 0x7fbb42ca5950>

In [147]:
X_train.head()

0                                      [7am, everyday]
1                                    [chocolate, cake]
2    [closed, mortice, and, tenon, joint, door, dim...
3                              [train, eppo, kelambum]
4    [yesterday, i, have, cancelled, the, flight, t...
Name: message, dtype: object

In [148]:
#stop word removal 
@timeit
def remove_stopWord(X):
    #create english stop word list
    en_stop = get_stop_words('en')
    X = X.apply(lambda row: [i for i in row if i not in en_stop])
    return en_stop, X

In [149]:
en_stop, X_train = remove_stopWord(X_train)
X_train.head(10)

'remove_stopWord'  1269.60 ms


0                                      [7am, everyday]
1                                    [chocolate, cake]
2    [closed, mortice, tenon, joint, door, dimentions]
3                              [train, eppo, kelambum]
4               [yesterday, cancelled, flight, ticket]
5                                  [chamge, 12pm, 9pm]
6                             [want, going, rajasthan]
7                                               [room]
8           [can, please, arrange, flight, tickets, ?]
9                                     [kind, reminder]
Name: message, dtype: object

In [150]:
en_stop

[u'a',
 u'about',
 u'above',
 u'after',
 u'again',
 u'against',
 u'all',
 u'am',
 u'an',
 u'and',
 u'any',
 u'are',
 u"aren't",
 u'as',
 u'at',
 u'be',
 u'because',
 u'been',
 u'before',
 u'being',
 u'below',
 u'between',
 u'both',
 u'but',
 u'by',
 u"can't",
 u'cannot',
 u'could',
 u"couldn't",
 u'did',
 u"didn't",
 u'do',
 u'does',
 u"doesn't",
 u'doing',
 u"don't",
 u'down',
 u'during',
 u'each',
 u'few',
 u'for',
 u'from',
 u'further',
 u'had',
 u"hadn't",
 u'has',
 u"hasn't",
 u'have',
 u"haven't",
 u'having',
 u'he',
 u"he'd",
 u"he'll",
 u"he's",
 u'her',
 u'here',
 u"here's",
 u'hers',
 u'herself',
 u'him',
 u'himself',
 u'his',
 u'how',
 u"how's",
 u'i',
 u"i'd",
 u"i'll",
 u"i'm",
 u"i've",
 u'if',
 u'in',
 u'into',
 u'is',
 u"isn't",
 u'it',
 u"it's",
 u'its',
 u'itself',
 u"let's",
 u'me',
 u'more',
 u'most',
 u"mustn't",
 u'my',
 u'myself',
 u'no',
 u'nor',
 u'not',
 u'of',
 u'off',
 u'on',
 u'once',
 u'only',
 u'or',
 u'other',
 u'ought',
 u'our',
 u'ours',
 u'ourselves',

#### We can see that all the stopwords like 'i', 'have' etc have been removed from our tokenized list.

In [151]:
#stemming is done to keep the root word and remove the extra part of it.
@timeit
def stemming(X):
    stemmer = PorterStemmer()
    X = X_train.apply(lambda row:[stemmer.stem(word) for word in row])
    return stemmer, X

In [152]:
stemmer, X_train = stemming(X_train)
X_train.head(10)

'stemming'  5528.53 ms


0                                [7am, everyday]
1                                 [chocol, cake]
2    [close, mortic, tenon, joint, door, diment]
3                        [train, eppo, kelambum]
4            [yesterday, cancel, flight, ticket]
5                             [chamg, 12pm, 9pm]
6                          [want, go, rajasthan]
7                                         [room]
8        [can, pleas, arrang, flight, ticket, ?]
9                                 [kind, remind]
Name: message, dtype: object

In [153]:
stemmer

<PorterStemmer>

In [154]:
text = list(X_train)
text_string = []
for item in text:
    text_string.append(' '.join(item))
text_string

[u'7am everyday',
 u'chocol cake',
 u'close mortic tenon joint door diment',
 u'train eppo kelambum',
 u'yesterday cancel flight ticket',
 u'chamg 12pm 9pm',
 u'want go rajasthan',
 u'room',
 u'can pleas arrang flight ticket ?',
 u'kind remind',
 u'jamshedpur jharkhand',
 u'noidaa secot 44',
 u'flight spicejet',
 u'uber',
 u'3.3.17',
 u'fare high',
 u'know train run jalgaon pune',
 u'pl send current statu train ticket',
 u"ye 've got remind now",
 u'pleas wake today 6am',
 u'patli aligarh train week',
 u'can look flight option ?',
 u'thrursday remind ?',
 u'remind',
 u'payment made ola money',
 u'silchar guwahati',
 u'five peopl',
 u'like cancel wake call sun , 05 march 05:00 { reminder_list : 708936 , user_id : 1982962 , task_nam : wakeup , offset : 0 , api_nam : exotel }',
 u'hey , can find nearest medic store ? { task }',
 u'pleas detail train 12630',
 u'9761004710',
 u'inform',
 u'within one & half hour',
 u'cashback train book ?',
 u'need u contact',
 u'canara bank atm',
 u'like c

In [155]:
# Count Vectorizer
@timeit
def count_vector(train, test):
    vect = CountVectorizer(stop_words=en_stop,
                  ngram_range=(1, 2),
                  tokenizer=tokenize.tokenize,
                  min_df=2,
                  max_df=0.5)
    train_dtm = vect.fit_transform(train)
    test_dtm = vect.transform(test)
    return train_dtm, test_dtm

train_dtm, test_dtm = count_vector(text_string, X_test)

'count_vector'  8185.48 ms


In [156]:
train_dtm

<40659x22039 sparse matrix of type '<type 'numpy.int64'>'
	with 298595 stored elements in Compressed Sparse Row format>

In [157]:
test_dtm

<10000x22039 sparse matrix of type '<type 'numpy.int64'>'
	with 37477 stored elements in Compressed Sparse Row format>

In [165]:
@timeit
def Mnb(train1, train2, test1, test2):
    nb = MultinomialNB()
    nb.fit(train1, train2)
    prediction = nb.predict(test1)
#     print (prediction.shape)
#     print (test2.shape)
    return accuracy_score(test2, prediction)
    
Mnb(train_dtm, y_train, test_dtm, y_test)

'Mnb'  30.36 ms


0.6411

In [161]:
print (train_dtm.shape)
print (test_dtm.shape)

(40659, 22039)
(10000, 22039)


In [162]:
for i in [train_dtm, y_train, test_dtm, y_test]:
    print ( type(i), i.shape)

<class 'scipy.sparse.csr.csr_matrix'> (40659, 22039)
<type 'numpy.ndarray'> (40659,)
<class 'scipy.sparse.csr.csr_matrix'> (10000, 22039)
<type 'numpy.ndarray'> (10000,)


In [78]:
y_test.shape

(40659,)

In [80]:
prediction.shape

NameError: name 'prediction' is not defined