In [1]:
import pandas as pd

In [2]:
# Load the dataset from a CSV file (path is relative to the notebook) and preview the first rows.
df = pd.read_csv('data/data.csv')
df.head()

Unnamed: 0,doc_id,text,class,h1,h2,h3,h4,h5
0,doc0,Last updated: 12 April 2020,LAST_UPDATED_DATE,Drupal.org Privacy Policy,,,,
1,doc0,We value your privacy and strive to protect yo...,UNKNOWN,Drupal.org Privacy Policy,,,,
2,doc0,(2) [p] Disclaimer: This summary is not itself...,UNKNOWN,Drupal.org Privacy Policy,,Human Readable Summary,,
3,doc0,Right to be Informed - A data subject has the ...,PERSONAL_DATA_RIGHTS,Drupal.org Privacy Policy,,Human Readable Summary,Rights of the Data Subject,
4,doc0,Right to Restrict Processing - A data subject ...,PERSONAL_DATA_RIGHTS,Drupal.org Privacy Policy,,Human Readable Summary,Rights of the Data Subject,


In [3]:
# Size of the frame as (rows, columns).
df.shape

(2348, 8)

In [4]:
# Number of missing values in each column.
df.isnull().sum()

doc_id       0
text         0
class        0
h1         574
h2        1360
h3        1337
h4        2043
h5        2252
dtype: int64

In [5]:
# Number of rows per class label, most frequent first.
df['class'].value_counts()

UNKNOWN                            428
PERSONAL_DATA_USES                 271
THIRDPARTY_INFORMATION_SHARING     246
PERSONAL_DATA_WE_COLLECT           245
PERSONAL_DATA_RIGHTS               201
COOKIES_AND_TRACKING_TECHNOLOGY    192
JURISDICTION                       176
HOW_WE_COLLECT_PERSONAL_DATA       133
CONTACT                            124
SECURITY                           101
UPDATES_AND_NOTIFICATIONS           66
LAST_UPDATED_DATE                   50
CHILDREN                            50
MARKETING_OPT_OUT                   37
DO_NOT_TRACK                        28
Name: class, dtype: int64

In [6]:
# Normalise case: lower-case the body text and each of the five heading columns.
for column in ['text', 'h1', 'h2', 'h3', 'h4', 'h5']:
    df[column] = df[column].str.lower()

In [7]:
# Strip annotation markers from the body text.
# `regex` is passed explicitly on every call: pandas >= 2.0 defaults
# `Series.str.replace` to literal matching, under which the two patterns
# below would silently match nothing.
df['text'] = df['text'].str.replace(r"(\(\d+\))", '', regex=True)        # numeric markers such as "(2)"
df['text'] = df['text'].str.replace(r"(\[[a-zA-Z]+\])", '', regex=True)  # bracketed letter tags such as "[p]"
# The pipe is meant as a literal character here, not as regex alternation.
df['text'] = df['text'].str.replace(r"|", ' ', regex=False)

In [8]:
# Preview the first two rows after lower-casing and marker removal.
df.head(2)

Unnamed: 0,doc_id,text,class,h1,h2,h3,h4,h5
0,doc0,last updated: 12 april 2020,LAST_UPDATED_DATE,drupal.org privacy policy,,,,
1,doc0,we value your privacy and strive to protect yo...,UNKNOWN,drupal.org privacy policy,,,,


In [43]:
from keras.layers import  Dropout, Dense, Embedding, Flatten
from keras.models import Sequential
from keras.utils import to_categorical

In [17]:
def _join_present(row):
    """Join the non-null values of one row into a single space-separated string."""
    return ' '.join(row.dropna().astype(str))


# One string per row: the body text followed by whichever headings are present.
all_text = df[['text', 'h1', 'h2', 'h3', 'h4', 'h5']].apply(_join_present, axis=1)

In [54]:
import re
import nltk
## for word embedding
import gensim
import gensim.downloader as gensim_api

In [57]:
def utils_preprocess_text(text, flg_stemm=False, flg_lemm=True, lst_stopwords=None):
    '''
    Preprocess a string.
    :parameter
        :param text: string - the text to clean (any other value is passed through str())
        :param flg_stemm: bool - whether stemming is to be applied
        :param flg_lemm: bool - whether lemmatisation is to be applied
        :param lst_stopwords: list - list of stopwords to remove (None keeps every word)
    :return
        cleaned text: lowercase tokens joined by single spaces
    '''
    ## clean (convert to lowercase, strip, then drop every character that is
    ## neither a word character nor whitespace)
    text = re.sub(r'[^\w\s]', '', str(text).lower().strip())

    ## Tokenize (convert from string to list)
    lst_text = text.split()

    ## remove Stopwords (a set makes each membership test constant-time)
    if lst_stopwords is not None:
        stopword_set = set(lst_stopwords)
        lst_text = [word for word in lst_text if word not in stopword_set]

    ## Stemming (remove -ing, -ly, ...)
    if flg_stemm:
        ps = nltk.stem.porter.PorterStemmer()
        lst_text = [ps.stem(word) for word in lst_text]

    ## Lemmatisation (convert the word into root word)
    if flg_lemm:
        lem = nltk.stem.wordnet.WordNetLemmatizer()
        lst_text = [lem.lemmatize(word) for word in lst_text]

    ## back to string from list
    return " ".join(lst_text)

In [59]:
from nltk.corpus import stopwords
# English stopword list from NLTK; passed to utils_preprocess_text when cleaning the text column.
# NOTE(review): presumably requires the NLTK "stopwords" corpus to be available locally — confirm.
lst_stopwords = stopwords.words('english')

In [61]:
# Clean every row of the text column: lemmatise (no stemming) and drop stopwords.
# Keyword arguments given to Series.apply are forwarded to the function.
df["text_clean"] = df["text"].apply(
    utils_preprocess_text,
    flg_stemm=False,
    flg_lemm=True,
    lst_stopwords=lst_stopwords,
)
df.head()

Unnamed: 0,doc_id,text,class,h1,h2,h3,h4,h5,text_clean
0,doc0,last updated: 12 april 2020,LAST_UPDATED_DATE,drupal.org privacy policy,,,,,last updated 12 april 2020
1,doc0,we value your privacy and strive to protect yo...,UNKNOWN,drupal.org privacy policy,,,,,value privacy strive protect personal informat...
2,doc0,disclaimer: this summary is not itself a par...,UNKNOWN,drupal.org privacy policy,,human readable summary,,,disclaimer summary part privacy policy legal d...
3,doc0,right to be informed - a data subject has the ...,PERSONAL_DATA_RIGHTS,drupal.org privacy policy,,human readable summary,rights of the data subject,,right informed data subject right know whether...
4,doc0,right to restrict processing - a data subject ...,PERSONAL_DATA_RIGHTS,drupal.org privacy policy,,human readable summary,rights of the data subject,,right restrict processing data subject right r...


In [62]:
corpus = df["text_clean"]

## tokenise: one list of unigrams (whitespace-separated words) per document
lst_corpus = [document.split() for document in corpus]

## fit phrase detectors: bigrams first, then trigrams on top of the bigram output
## NOTE(review): neither detector is applied to lst_corpus in the cells shown here
phrase_options = dict(delimiter=" ".encode(), min_count=5, threshold=10)
bigrams_detector = gensim.models.phrases.Phraser(
    gensim.models.phrases.Phrases(lst_corpus, **phrase_options))
trigrams_detector = gensim.models.phrases.Phraser(
    gensim.models.phrases.Phrases(bigrams_detector[lst_corpus], **phrase_options))

## train the word2vec model on the unigram corpus (300-dimensional vectors)
nlp = gensim.models.word2vec.Word2Vec(
    lst_corpus,
    size=300,
    window=8,
    min_count=1,
    sg=1,
    iter=30,
)

In [70]:
# word = "data"
# nlp[word]

In [86]:
from keras import preprocessing as kprocessing
from keras import backend as K

## build the word -> integer index from the tokenised corpus
tokenizer = kprocessing.text.Tokenizer(
    lower=True,
    split=' ',
    oov_token="NaN",
    filters='!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n',
)
tokenizer.fit_on_texts(lst_corpus)
dic_vocabulary = tokenizer.word_index

## encode every document as a sequence of word indices
lst_text2seq = tokenizer.texts_to_sequences(lst_corpus)

## pad / truncate at the end so that every sequence has length 300
X = kprocessing.sequence.pad_sequences(
    lst_text2seq,
    maxlen=300,
    padding="post",
    truncating="post",
)
X.shape

(2348, 300)

In [98]:
## numpy is imported here because no earlier cell of the notebook imports it
import numpy as np

## keyed word vectors of the trained w2v model; indexing the model object
## itself (nlp[word]) is the deprecated access path
word_vectors = nlp.wv

## start the matrix (length of vocabulary + 1 x vector size) with all 0s
## NOTE(review): the extra row presumably covers index 0, which the padded
## sequences use as filler — confirm
embeddings = np.zeros((len(dic_vocabulary) + 1, word_vectors.vector_size))

for word, idx in dic_vocabulary.items():
    ## copy the vector when the word is known to the model; otherwise
    ## (e.g. the tokenizer's OOV token) the row stays all 0s
    if word in word_vectors:
        embeddings[idx] = word_vectors[word]

  import sys


In [99]:
# from keras.preprocessing.text import one_hot

# vocab_size = 500
# X = [one_hot(d, vocab_size,filters='!"#$%&()*+,-./:;<=>?@[\]^_`{|}~', lower=True, split=' ') for d in all_text]


In [100]:
# from keras.preprocessing.sequence import pad_sequences

# max_length = 300
# X = pad_sequences(X, maxlen=max_length, padding='post')

In [101]:
# Target: the class label of every row (the values of the 'class' column).
y = df['class']

In [102]:
# classes one hot
from sklearn.preprocessing import LabelEncoder

# Read the labels straight from the frame rather than from `y`: this cell
# rebinds `y` to the one-hot matrix, so reading `y` here would fail when the
# cell is executed a second time.
labels = df['class']
n_classes = labels.nunique()

# string label -> integer id
label_encoder = LabelEncoder()
y_vec = label_encoder.fit_transform(labels)

# integer id -> one-hot row
y = to_categorical(y_vec, num_classes=n_classes)
y.shape

(2348, 15)

In [103]:
from sklearn.model_selection import train_test_split

# Stratified hold-out split with a fixed seed: 33% of the rows go to the test set.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.33,
    random_state=42,
    stratify=y,
)
# sizes of the four parts, in the order they were returned
tuple(len(part) for part in (X_train, X_test, y_train, y_test))

(1573, 775, 1573, 775)

In [104]:
# # shape = X_train.shape[1]
# shape =  vocab_size

In [105]:
# model = Sequential()

# model.add(Embedding(vocab_size, 8, input_length=max_length))
# model.add(Flatten())
# model.add(Dropout(0.5))
# model.add(Dense(256, activation='relu'))
# model.add(Dropout(0.5))
# model.add(Dense(128, activation='relu'))
# model.add(Dropout(0.5))
# model.add(Dense(128, activation='relu'))
# model.add(Dropout(0.5))


# model.add(Dense(n_classes, activation='softmax'))

# model.compile(loss='categorical_crossentropy',
#               optimizer='adam',
#               metrics=['accuracy'])


In [106]:
# model.fit(X_train, y_train, validation_data=(X_test, y_test), epochs=100, batch_size=64, verbose=1)

# predicted = model.predict_classes(X_test)

In [115]:
from keras import layers, models

## code attention layer
def attention_layer(inputs, neurons):
    """Re-weight `inputs` element-wise with softmax scores.

    The two non-batch axes of `inputs` are swapped, a softmax Dense layer
    with `neurons` units is applied, the axes are swapped back (that layer
    is named "attention") and the result is multiplied with `inputs`.

    NOTE(review): the final multiply presumably requires `neurons` to equal
    the size of the first non-batch axis of `inputs` — confirm.
    """
    swapped = layers.Permute((2, 1))(inputs)
    scores = layers.Dense(neurons, activation="softmax")(swapped)
    weights = layers.Permute((2, 1), name="attention")(scores)
    return layers.multiply([inputs, weights])

## sequence length is taken from the padded input matrix and the number of
## output units from n_classes, so the network follows the prepared data
## instead of repeating the literals 300 and 15
seq_len = X.shape[1]

## input: one padded sequence of word indices per sample
x_in = layers.Input(shape=(seq_len,))
## embedding: frozen lookup table initialised from the word2vec matrix
x = layers.Embedding(input_dim=embeddings.shape[0],
                     output_dim=embeddings.shape[1],
                     weights=[embeddings],
                     input_length=seq_len, trainable=False)(x_in)
## apply attention (one softmax unit per sequence position)
x = attention_layer(x, neurons=seq_len)
## 2 layers of bidirectional lstm
x = layers.Bidirectional(layers.LSTM(units=300, dropout=0.2,
                         return_sequences=True))(x)
x = layers.Bidirectional(layers.LSTM(units=300, dropout=0.2))(x)
## final dense layers: one softmax output unit per class
x = layers.Dense(64, activation='relu')(x)
y_out = layers.Dense(n_classes, activation='softmax')(x)
## compile
model = models.Model(x_in, y_out)
model.compile(loss='categorical_crossentropy',
              optimizer='adam', metrics=['accuracy'])

model.summary()

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_7 (InputLayer)            (None, 300)          0                                            
__________________________________________________________________________________________________
embedding_12 (Embedding)        (None, 300, 300)     1454100     input_7[0][0]                    
__________________________________________________________________________________________________
permute_6 (Permute)             (None, 300, 300)     0           embedding_12[0][0]               
__________________________________________________________________________________________________
dense_33 (Dense)                (None, 300, 300)     90300       permute_6[0][0]                  
__________________________________________________________________________________________________
attention 

In [116]:
# Train for up to 100 epochs in batches of 64, reporting metrics on (X_test, y_test) after each epoch.
# NOTE(review): the test split doubles as validation data here, so the validation
# metrics do not come from an independent hold-out — confirm this is intended.
model.fit(X_train, y_train, validation_data=(X_test, y_test), epochs=100, batch_size=64, verbose=1)

Train on 1573 samples, validate on 775 samples
Epoch 1/100
Epoch 2/100
Epoch 3/100


KeyboardInterrupt: 