[text_processing_kaggle](https://www.kaggle.com/sudalairajkumar/a-look-at-different-embeddings)

[Text_data_preprocessing](https://www.kaggle.com/shashanksai/text-preprocessing-using-python)

In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split

In [29]:
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

In [3]:
# Parse the labelled-question file. Each record line has the form
#   "<LABEL>:<LABEL_DEFINITION> <question text>"
# and is split into three parallel lists that become the DataFrame columns.
ls_class = []
ls_class_defn = []
ls_q = []
with open('5_label.txt', 'r', encoding="utf-8") as fin:
    # Iterate the file object lazily rather than materialising every line
    # up front with readlines().
    for line in fin:
        line = line.rstrip('\n')
        if not line.strip():
            # Blank lines (e.g. a trailing empty line at end of file) carry no
            # record; without this guard the tag split below raises IndexError.
            continue
        # Everything before the first space is the "LABEL:DEFINITION" tag,
        # everything after it is the question text, kept verbatim.
        tag, _, question = line.partition(' ')
        tt = tag.split(':')
        ls_class.append(tt[0])
        ls_class_defn.append(tt[1])
        ls_q.append(question)

df = pd.DataFrame({'LABEL': ls_class, 'LABEL_DEFINITION': ls_class_defn, 'QUESTION': ls_q})

In [4]:
# Distinct coarse labels: how many there are, and which ones
label_values = df["LABEL"].unique()
len(label_values), label_values

(6, array(['DESC', 'ENTY', 'ABBR', 'HUM', 'NUM', 'LOC'], dtype=object))

In [5]:
# Distinct fine-grained label definitions: how many there are, and which ones
definition_values = df["LABEL_DEFINITION"].unique()
len(definition_values), definition_values

(47, array(['manner', 'cremat', 'animal', 'exp', 'ind', 'gr', 'title', 'def',
        'date', 'reason', 'event', 'state', 'desc', 'count', 'other',
        'letter', 'religion', 'food', 'country', 'color', 'termeq', 'city',
        'body', 'dismed', 'mount', 'money', 'product', 'period',
        'substance', 'sport', 'plant', 'techmeth', 'volsize', 'instru',
        'abb', 'speed', 'word', 'lang', 'perc', 'code', 'dist', 'temp',
        'symbol', 'ord', 'veh', 'weight', 'currency'], dtype=object))

In [6]:
# (rows, columns) of the parsed question table
df.shape

(5452, 3)

In [7]:
# Preview the first rows to sanity-check the LABEL / LABEL_DEFINITION / QUESTION parsing
df.head()

Unnamed: 0,LABEL,LABEL_DEFINITION,QUESTION
0,DESC,manner,How did serfdom develop in and then leave Russ...
1,ENTY,cremat,What films featured the character Popeye Doyle ?
2,DESC,manner,How can I find a list of celebrities ' real na...
3,ENTY,animal,What fowl grabs the spotlight after the Chines...
4,ABBR,exp,What is the full form of .com ?


In [8]:
# Row count per coarse label, largest class first
(
    df.groupby("LABEL")
    .count()
    .sort_values("QUESTION", ascending=False)
)

Unnamed: 0_level_0,LABEL_DEFINITION,QUESTION
LABEL,Unnamed: 1_level_1,Unnamed: 2_level_1
ENTY,1250,1250
HUM,1223,1223
DESC,1162,1162
NUM,896,896
LOC,835,835
ABBR,86,86


In [9]:
# Plain Python list holding every question string
ls_sent = df["QUESTION"].tolist()

In [10]:
import numpy as np
# Mean number of space-separated tokens per question
token_counts = [len(question.split(' ')) for question in ls_sent]
np.average(token_counts)

10.204512105649304

In [11]:
#df.groupby(by=['LABEL','LABEL_DEFINITION']).count().sort_values(by=['LABEL','QUESTION'], ascending=False)

- Tokenize the text column and convert each question to a sequence of integer word indices.
- Pad the sequences as needed:
  - if the number of words in the text is greater than `maxlen`, truncate it to `maxlen`, or
  - if the number of words in the text is less than `maxlen`, add zeros for the remaining values.

In [12]:
# Modelling frame: everything except the fine-grained LABEL_DEFINITION column
df_train = df.drop(labels='LABEL_DEFINITION', axis='columns')

In [13]:
# Same row count as df, one column fewer after dropping LABEL_DEFINITION
df_train.shape

(5452, 2)

In [14]:
## Hold out 20% of the rows for validation; the fixed seed keeps the split repeatable
train_df, val_df = train_test_split(
    df_train,
    test_size=0.2,
    random_state=2019,
)

In [15]:
# Shapes of the training and validation partitions, in that order
tuple(part.shape for part in (train_df, val_df))

((4361, 2), (1091, 2))

In [16]:
## Configuration constants
# size of each word vector
embed_size = 300
# number of unique words to use (i.e. number of rows in the embedding matrix)
max_features = 50_000
# maximum number of words of a question to use
maxlen = 40

In [19]:
## Replace any missing question with the "_na_" placeholder and pull out the raw arrays
train_X_values, val_X_values = (
    frame["QUESTION"].fillna("_na_").values for frame in (train_df, val_df)
)

In [30]:

# Toy example: words that were not in the fitted text ("very", "long")
# are dropped when a new sentence is encoded.
tok = Tokenizer()
fitted_text = "this comment is not toxic"
tok.fit_on_texts([fitted_text])
for text in (fitted_text, "this very long comment is not toxic"):
    print(tok.texts_to_sequences([text]))

[[1, 2, 3, 4, 5]]
[[1, 2, 3, 4, 5]]


In [26]:
## Fit the vocabulary on the training questions only, then encode both splits with it
tokenizer = Tokenizer(num_words=max_features)
tokenizer.fit_on_texts(train_X_values.tolist())
train_X_tok, val_X_tok = (
    tokenizer.texts_to_sequences(texts) for texts in (train_X_values, val_X_values)
)

In [27]:
# Raw training question strings, as fed to the tokenizer
train_X_values

array(['How can you become an FBI agent ?',
       'What is capitalism according to Max Weber ?',
       'Where can I find all the information I need to know about the English Civil War , 1642-1649 , ?',
       ..., 'Where can I get mailing lists ?',
       'What is a fear of jealousy ?',
       'How many equal sides are there on a scalene triangle ?'],
      dtype=object)

In [31]:
## Bring every encoded question to exactly `maxlen` entries
train_X, val_X = (
    pad_sequences(encoded, maxlen=maxlen) for encoded in (train_X_tok, val_X_tok)
)

In [36]:
# First two encoded training questions, before padding
train_X_tok[:2]

[[7, 26, 24, 167, 35, 1834, 1835], [2, 3, 2973, 1836, 11, 2974, 2975]]

In [33]:
# Padded training matrix produced by pad_sequences
train_X

array([[   0,    0,    0, ...,   35, 1834, 1835],
       [   0,    0,    0, ...,   11, 2974, 2975],
       [   0,    0,    0, ...,   61, 1837, 1838],
       ...,
       [   0,    0,    0, ...,   48, 7401, 7402],
       [   0,    0,    0, ...,   66,    4, 7403],
       [   0,    0,    0, ...,    6, 7404, 1225]], dtype=int32)

In [37]:
## Pull the LABEL column out of each split as the prediction target
train_y, val_y = (part['LABEL'].values for part in (train_df, val_df))

In [38]:
# Training targets: the LABEL value of each training row
train_y

array(['DESC', 'DESC', 'LOC', ..., 'LOC', 'ENTY', 'NUM'], dtype=object)