In [75]:
import pandas as pd

In [None]:
# Download glove-wiki-gigaword-100 dimension pre-trained vectors.

import gensim.downloader as api
# model = api.load("glove-wiki-gigaword-50")
model = api.load("glove-wiki-gigaword-100")

In [76]:
# Load the CSV data
df = pd.read_csv('data/data.csv')
df.head()

Unnamed: 0,doc_id,text,class,h1,h2,h3,h4,h5
0,doc0,Last updated: 12 April 2020,LAST_UPDATED_DATE,Drupal.org Privacy Policy,,,,
1,doc0,We value your privacy and strive to protect yo...,UNKNOWN,Drupal.org Privacy Policy,,,,
2,doc0,(2) [p] Disclaimer: This summary is not itself...,UNKNOWN,Drupal.org Privacy Policy,,Human Readable Summary,,
3,doc0,Right to be Informed - A data subject has the ...,PERSONAL_DATA_RIGHTS,Drupal.org Privacy Policy,,Human Readable Summary,Rights of the Data Subject,
4,doc0,Right to Restrict Processing - A data subject ...,PERSONAL_DATA_RIGHTS,Drupal.org Privacy Policy,,Human Readable Summary,Rights of the Data Subject,


In [77]:
df.shape

(2348, 8)

In [78]:
# Count the number of missing values.
df.isnull().sum()

doc_id       0
text         0
class        0
h1         574
h2        1360
h3        1337
h4        2043
h5        2252
dtype: int64

In [79]:
# Class-wise counts
df['class'].value_counts()

UNKNOWN                            428
PERSONAL_DATA_USES                 271
THIRDPARTY_INFORMATION_SHARING     246
PERSONAL_DATA_WE_COLLECT           245
PERSONAL_DATA_RIGHTS               201
COOKIES_AND_TRACKING_TECHNOLOGY    192
JURISDICTION                       176
HOW_WE_COLLECT_PERSONAL_DATA       133
CONTACT                            124
SECURITY                           101
UPDATES_AND_NOTIFICATIONS           66
LAST_UPDATED_DATE                   50
CHILDREN                            50
MARKETING_OPT_OUT                   37
DO_NOT_TRACK                        28
Name: class, dtype: int64

In [87]:
# Lower all the text
df['text'] = df['text'].str.lower()
df['h1'] = df['h1'].str.lower()
df['h2'] = df['h2'].str.lower()
df['h3'] = df['h3'].str.lower()
df['h4'] = df['h4'].str.lower()
df['h5'] = df['h5'].str.lower()

In [88]:
df['text'] = df['text'].str.replace(r"(\(\d+\))", '') # Remove information such as (10), (30)
df['text'] = df['text'].str.replace(r"(\[[a-zA-Z]+\])", '') # Remove information such as (p), (ul), etc.
df['text'] = df['text'].str.replace(r"|", ' ') # Remove | char

In [89]:
df.head(2)

Unnamed: 0,doc_id,text,class,h1,h2,h3,h4,h5,text_clean
0,doc0,last updated: 12 april 2020,LAST_UPDATED_DATE,drupal.org privacy policy,,,,,last updated 12 april 2020
1,doc0,we value your privacy and strive to protect yo...,UNKNOWN,drupal.org privacy policy,,,,,value privacy strive protect personal informat...


In [90]:
# Remove stop words for the text column and apply lemmatization

def utils_preprocess_text(text, flg_stemm=False, flg_lemm=True, lst_stopwords=None):
    text = re.sub(r'[^\w\s]', '', str(text).lower().strip())
            
    lst_text = text.split()
    if lst_stopwords is not None:
        lst_text = [word for word in lst_text if word not in 
                    lst_stopwords]
                
    if flg_stemm == True:
        ps = nltk.stem.porter.PorterStemmer()
        lst_text = [ps.stem(word) for word in lst_text]
                
    if flg_lemm == True:
        lem = nltk.stem.wordnet.WordNetLemmatizer()
        lst_text = [lem.lemmatize(word) for word in lst_text]
            
    ## back to string from list
    text = " ".join(lst_text)
    return text

In [91]:
# Import stopwords list and apply over the text column

import re, nltk
from nltk.corpus import stopwords

lst_stopwords = stopwords.words('english')

df["text_clean"] = df["text"].apply(lambda x: 
          utils_preprocess_text(x, flg_stemm=False, flg_lemm=True, 
          lst_stopwords=lst_stopwords))
df.head()

Unnamed: 0,doc_id,text,class,h1,h2,h3,h4,h5,text_clean
0,doc0,last updated: 12 april 2020,LAST_UPDATED_DATE,drupal.org privacy policy,,,,,last updated 12 april 2020
1,doc0,we value your privacy and strive to protect yo...,UNKNOWN,drupal.org privacy policy,,,,,value privacy strive protect personal informat...
2,doc0,disclaimer: this summary is not itself a par...,UNKNOWN,drupal.org privacy policy,,human readable summary,,,disclaimer summary part privacy policy legal d...
3,doc0,right to be informed - a data subject has the ...,PERSONAL_DATA_RIGHTS,drupal.org privacy policy,,human readable summary,rights of the data subject,,right informed data subject right know whether...
4,doc0,right to restrict processing - a data subject ...,PERSONAL_DATA_RIGHTS,drupal.org privacy policy,,human readable summary,rights of the data subject,,right restrict processing data subject right r...


In [223]:
# Get vectors of each token in the document and average out all the vectors for every document.

dimension = 100
text_vectors = []
for t in df['text_clean']:
    text_split = t.split()
    vectors = np.array([model[x] for x in text_split if x in model])
    vec = np.mean(vectors, axis=0)
    if np.isnan(vec).any():
        text_vectors.append(list(np.zeros(dimension)))
    else:
        text_vectors.append(list(vec))

In [224]:
# convert it to csr matrix

import numpy as np
from scipy import sparse

text_vectors = sparse.csr_matrix(np.array(text_vectors))

In [225]:
# Apply TFIDF over cleaned text

from sklearn.feature_extraction.text import TfidfVectorizer

vectorizer = TfidfVectorizer(ngram_range=(1, 2))
text_vector = vectorizer.fit_transform(df['text_clean'])
len(vectorizer.get_feature_names()), text_vector.shape

(49643, (2348, 49643))

In [226]:
# Apply dimension reduction on cleaned text
y = df['class']

from sklearn.decomposition import PCA

pca = PCA(n_components=0.99)
text_vector = pca.fit_transform(text_vector.toarray(), y)
text_vector.shape

In [228]:
# Merge all heading columns into single column(to avoid NULL values.)
heading = df[['h1', 'h2', 'h3', 'h4', 'h5']].apply(
    lambda x: ' '.join(x.dropna().astype(str)),
    axis=1
)

In [229]:
# Apply TFIDF over merged heading column

from sklearn.feature_extraction.text import TfidfVectorizer

vectorizer = TfidfVectorizer(ngram_range=(1, 3))
heading_vector = vectorizer.fit_transform(heading)
len(vectorizer.get_feature_names()), heading_vector.shape

(4220, (2348, 4220))

In [230]:
from scipy.sparse import hstack

X = hstack((text_vector, heading_vector, text_vectors)) #.toarray()
y = df['class']

In [233]:
# Train and test split (80 - 20 ratio)

from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

In [281]:
# model training
from sklearn.svm import LinearSVC
from sklearn.model_selection import GridSearchCV

clf = LinearSVC(C=1.25, penalty='l1', loss='squared_hinge', tol=0.1, dual=False, class_weight='balanced', random_state=42)
clf.fit(X_train, y_train)

print('\nTraining Accuracy: ')
train_acc = clf.score(X_train, y_train) * 100
print(train_acc)

print('\nTest Accuracy: ')
test_acc = clf.score(X_test, y_test) * 100
print(test_acc)


Training Accuracy: 
89.24387646432375

Test Accuracy: 
71.91489361702128


In [282]:
# Model evaluation on validation data set.

from sklearn.metrics import classification_report

y_pred = clf.predict(X_test)

print(classification_report(y_test, y_pred))

                                 precision    recall  f1-score   support

                       CHILDREN       0.91      1.00      0.95        10
                        CONTACT       0.82      0.92      0.87        25
COOKIES_AND_TRACKING_TECHNOLOGY       0.72      0.79      0.76        39
                   DO_NOT_TRACK       0.43      0.50      0.46         6
   HOW_WE_COLLECT_PERSONAL_DATA       0.19      0.19      0.19        27
                   JURISDICTION       0.78      0.83      0.81        35
              LAST_UPDATED_DATE       1.00      1.00      1.00        10
              MARKETING_OPT_OUT       0.44      1.00      0.61         7
           PERSONAL_DATA_RIGHTS       0.78      0.72      0.75        40
             PERSONAL_DATA_USES       0.76      0.69      0.72        54
       PERSONAL_DATA_WE_COLLECT       0.50      0.49      0.49        49
                       SECURITY       0.94      0.85      0.89        20
 THIRDPARTY_INFORMATION_SHARING       0.73      0.