# Text Processing on Haptik Dataset
#### Firstly We will import all the packages required for all the processing required like data cleaning, 
#### data prerocessing,building model

In [188]:
#future imports
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
from __future__ import unicode_literals

#Data Cleaning packages
from pandas import read_csv, Series
from sklearn.preprocessing import LabelEncoder
import nltk
from nltk.tokenize import TreebankWordTokenizer
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from wordcloud import WordCloud
%matplotlib inline

In [189]:
#timeit Decorator
import time                                                

def timeit(method):
    def timed(*args, **kw):
        ts = time.time()
        result = method(*args, **kw)
        te = time.time()
        if 'log_time' in kw:
            name = kw.get('log_name', method.__name__.upper())
            kw['log_time'][name] = int((te - ts) * 1000)
        else:
            print('%r  %2.2f ms' % \
                  (method.__name__, (te - ts) * 1000))
        return result
    return timed

In [190]:
@timeit
def load_dataset(filepath):
    dataset = read_csv(filepath, encoding = 'UTF-8' )
    
    return dataset

train = load_dataset('./train_data.csv')

test = load_dataset('./test_data.csv') 

'load_dataset'  74.39 ms
'load_dataset'  14.64 ms


In [191]:
#separating feature and target variable
@timeit
def feature_target(df):
    feature = df.iloc[:,0]
    target = df.iloc[:,1:]
    return feature, target

X_train, y_train = feature_target(train)
X_test, y_test = feature_target(test)


'feature_target'  0.67 ms
'feature_target'  0.51 ms


In [192]:
y_test

Unnamed: 0,food,recharge,support,reminders,travel,nearby,movies,casual,other
0,F,F,F,F,F,T,F,F,F
1,F,F,F,F,T,F,F,F,F
2,F,F,F,F,F,F,F,F,T
3,T,F,F,F,F,F,F,F,F
4,F,F,F,F,T,F,F,F,F
5,F,F,F,T,F,F,F,F,F
6,F,F,F,F,F,T,F,F,F
7,T,F,F,F,F,F,F,F,F
8,F,F,F,F,F,F,T,F,F
9,F,F,F,T,F,F,F,F,F


In [193]:
# One hot encoding for changing the categorical target variable to binary target variable(numerical)
#do this on both train and test target variable i.e y_train ,y_test
@timeit
def encode(target):
    target = target.astype(str).replace({'T':1, 'F':0}, axis =1)
    target = target.idxmax(axis = 1)
    return target

target = encode(y_train)
test_target = encode(y_test)



'encode'  388.85 ms
'encode'  95.21 ms


In [194]:
#label Encoding for creating the array of labeled target variable
@timeit
def labelEncode(y_train):
    lab_en = LabelEncoder()
    return (lab_en.fit_transform(y_train))
    

y_train = labelEncode(target)
y_train

'labelEncode'  35.91 ms


array([6, 1, 7, ..., 4, 0, 0])

#### now we will work on train_feature dataframe


In [195]:
X_train.head(10)

0                                      7am everyday
1                                    chocolate cake
2    closed mortice and tenon joint door dimentions
3                               train eppo kelambum
4      yesterday i have cancelled the flight ticket
5                          chamge it to 12pm to 9pm
6                        i want too going rajasthan
7                                              room
8            can you please arrange flight tickets?
9                             what kind of reminder
Name: message, dtype: object

In [196]:
type(X_train)

pandas.core.series.Series

In [197]:
#Tokenize

def tokenizer(X_train):
    tokenize = TreebankWordTokenizer()
    X_train = X_train.apply(lambda row: row.lower())
    X_train = X_train.apply(lambda row: tokenize.tokenize(row))
    return X_train

In [198]:
X_train = tokenizer(X_train)


In [199]:
X_train.head()

0                                      [7am, everyday]
1                                    [chocolate, cake]
2    [closed, mortice, and, tenon, joint, door, dim...
3                              [train, eppo, kelambum]
4    [yesterday, i, have, cancelled, the, flight, t...
Name: message, dtype: object

In [200]:
#stop word removal 
def remove_stopWord(X_train):
    #create english stop word list
    en_stop = get_stop_words('en')
    X_train = X_train.apply(lambda row: [i for i in row if i not in en_stop])
    return X_train

In [201]:
X_train = remove_stopWord(X_train)
X_train

0                                          [7am, everyday]
1                                        [chocolate, cake]
2        [closed, mortice, tenon, joint, door, dimentions]
3                                  [train, eppo, kelambum]
4                   [yesterday, cancelled, flight, ticket]
5                                      [chamge, 12pm, 9pm]
6                                 [want, going, rajasthan]
7                                                   [room]
8               [can, please, arrange, flight, tickets, ?]
9                                         [kind, reminder]
10                                 [jamshedpur, jharkhand]
11                                     [noidaa, secot, 44]
12                                      [flight, spicejet]
13                                                  [uber]
14                                                [3.3.17]
15                                            [fare, high]
16                   [know, train, running, jalgaon, pun

In [202]:
#stemming
def stemming(X_train):
    stemmer = PorterStemmer()
    X_train = X_train.apply(lambda row:[stemmer.stem(word) for word in row])
    return X_train

In [203]:
X_train = stemming(X_train)
X_train.head(10)

0                                [7am, everyday]
1                                 [chocol, cake]
2    [close, mortic, tenon, joint, door, diment]
3                        [train, eppo, kelambum]
4            [yesterday, cancel, flight, ticket]
5                             [chamg, 12pm, 9pm]
6                          [want, go, rajasthan]
7                                         [room]
8        [can, pleas, arrang, flight, ticket, ?]
9                                 [kind, remind]
Name: message, dtype: object

In [204]:
# Count Vectorizer
from sklearn.feature_extraction.text import CountVectorizer
tk = TreebankWordTokenizer()
en_stop = get_stop_words('en')
vect = CountVectorizer(stop_words = en_stop, ngram_range = (1, 2), tokenizer = tk.tokenize, min_df = 2, max_df = 0.5)
vect.fit(X_train)
train_dtm = vect.transform(X_train)
train_dtm
test_dtm = vect.transform(X_test)
test_dtm

AttributeError: 'list' object has no attribute 'lower'

In [179]:
nb.fit(X_train, y_train)

ValueError: setting an array element with a sequence.