In [1]:
import pandas as pd
import nltk
from nltk.corpus import stopwords
import numpy as np
from sklearn.svm import SVC

In [10]:
data = pd.read_csv('validation.csv',
                       encoding='utf-8', names=['text', 'class'], skiprows =1)

In [11]:
data.drop_duplicates(subset ="text",
                     keep = 'first', inplace = True)
data

Unnamed: 0,text,class
0,Sci-News.com does not knowingly collect or sol...,Privacy contact information
1,Submitting an Order When you submit an order ...,First Party Collection/Use
3,We use cookies to enhance the browsing and sho...,First Party Collection/Use
5,This privacy statement covers the site new.www...,Introductory/Generic
6,Information Collection and Use Information Co...,First Party Collection/Use
8,We also require our credit card transaction pr...,Data Security
9,This privacy statement covers the use of cook...,Third Party Sharing/Collection
11,Any comments or materials sent to Caribou Coff...,First Party Collection/Use
12,Caribou keeps your cell phone number private a...,Data Security
14,"Clear Gifs can """"work with"""" existing cookies ...",Introductory/Generic


In [12]:
dict = {'First Party Collection/Use': 0,
         'Third Party Sharing/Collection': 1,
         'User Choice/Control': 2,
         'Privacy contact information': 3,
         'Introductory/Generic': 4,
         'Practice not covered': 5,
         'Data Security': 6,
         'User Access, Edit and Deletion': 7,
         'Policy Change': 8,
         'Do Not Track': 9,
         'International and Specific Audiences': 10,
         'Data Retention': 11}

In [13]:
def replace_classes_with_numbers(text):
    return dict[text]

data['class'] =data['class'].apply(replace_classes_with_numbers)

In [14]:
data['text'] = data['text'].str.lower()

In [16]:
nltk.download('stopwords')
stop_words = stopwords.words('english')
stop_words.append('may') #???
data['text'] = data['text'].apply(lambda x: ' '.join([word for word in x.split() if word not in (stop_words)]))
print(data['text'])

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Milan\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\stopwords.zip.


0      sci-news.com knowingly collect solicit persona...
1      submitting order submit order ask name, email ...
3      use cookies enhance browsing shopping experien...
5      privacy statement covers site new.www.redorbit...
6      information collection use information collect...
8      also require credit card transaction processor...
9      privacy statement covers use cookies redorbit,...
11     comments materials sent caribou coffee site, i...
12     caribou keeps cell phone number private times....
14     clear gifs ""work with"" existing cookies comp...
15     d. except specified herein, policy also apply ...
16     aggregate information (non-personally identifi...
17     b. cookies. (i) hearst (or third party service...
19     share personal information third parties third...
20     e. location information. may, enable advertise...
22     f. third parties. receive information third pa...
23     b. combine use information collect either onli...
24     4. disclosure informatio

In [18]:
nltk.download('wordnet')
w_tokenizer = nltk.tokenize.WhitespaceTokenizer()
lemmatizer = nltk.stem.WordNetLemmatizer()

def lemmatize_text(text):
    return ' '.join([lemmatizer.lemmatize(w) for w in w_tokenizer.tokenize(text)])

data['text'] = data['text'].apply(lemmatize_text)

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Milan\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\wordnet.zip.


In [20]:
data.to_csv('preprocessed_validation.csv')

In [2]:
#from here SVM test
train_data = pd.read_csv('preprocessed.csv',
                       encoding='utf-8', names=['text', 'class'], skiprows =1)
validation = pd.read_csv('preprocessed_validation.csv',
                       encoding='utf-8', names=['text', 'class'], skiprows =1)

In [3]:
from nltk.tokenize import sent_tokenize, word_tokenize
import gensim
from gensim.models import Word2Vec

In [4]:
x_tokenized = [[w for w in sentence.split(" ") if w != ""] for sentence in train_data['text']]
x_validation_tokenized = [[w for w in sentence.split(" ") if w != ""] for sentence in validation['text']]

In [5]:
import time

start = time.time()

model = gensim.models.Word2Vec(x_tokenized,
                 vector_size=100
                )
validation_model = gensim.models.Word2Vec(x_tokenized,
                 vector_size=100
                )

end = round(time.time()-start,2)
print("This process took",end,"seconds.")

This process took 1.07 seconds.


In [9]:
model.wv.most_similar("free")

[('call', 0.9989032745361328),
 ('day', 0.9984313249588013),
 ('opportunity', 0.9983790516853333),
 ('stop', 0.9982364177703857),
 ('account,', 0.9982260465621948),
 ('latin', 0.9981867671012878),
 ('opted', 0.9981570243835449),
 ('exchange', 0.9980195164680481),
 ('e-mail,', 0.9979535937309265),
 ('sending', 0.9978770017623901)]

In [6]:
class Sequencer():
    
    def __init__(self,
                 all_words,
                 max_words,
                 seq_len,
                 embedding_matrix
                ):
        
        self.seq_len = seq_len
        self.embed_matrix = embedding_matrix
        """
        temp_vocab = Vocab which has all the unique words
        self.vocab = Our last vocab which has only most used N words.
    
        """
        temp_vocab = list(set(all_words))
        self.vocab = []
        self.word_cnts = {}
        """
        Now we'll create a hash map (dict) which includes words and their occurencies
        """
        for word in temp_vocab:
            # 0 does not have a meaning, you can add the word to the list
            # or something different.
            count = len([0 for w in all_words if w == word])
            self.word_cnts[word] = count
            counts = list(self.word_cnts.values())
            indexes = list(range(len(counts)))
        
        # Now we'll sort counts and while sorting them also will sort indexes.
        # We'll use those indexes to find most used N word.
        cnt = 0
        while cnt + 1 != len(counts):
            cnt = 0
            for i in range(len(counts)-1):
                if counts[i] < counts[i+1]:
                    counts[i+1],counts[i] = counts[i],counts[i+1]
                    indexes[i],indexes[i+1] = indexes[i+1],indexes[i]
                else:
                    cnt += 1
        
        for ind in indexes[:max_words]:
            self.vocab.append(temp_vocab[ind])
                    
    def textToVector(self,text):
        # First we need to split the text into its tokens and learn the length
        # If length is shorter than the max len we'll add some spaces (100D vectors which has only zero values)
        # If it's longer than the max len we'll trim from the end.
        tokens = text.split()
        len_v = len(tokens)-1 if len(tokens) < self.seq_len else self.seq_len-1
        vec = []
        for tok in tokens[:len_v]:
            try:
                vec.append(self.embed_matrix[tok])
            except Exception as E:
                pass
        
        last_pieces = self.seq_len - len(vec)
        for i in range(last_pieces):
            vec.append(np.zeros(100,))
        
        return np.asarray(vec).flatten()

In [7]:
sequencer = Sequencer(all_words = [token for seq in x_tokenized for token in seq],
              max_words = 1200,
              seq_len = 15,
              embedding_matrix = model.wv
             )

In [16]:
test_vec = sequencer.textToVector(validation['text'][0])
test_vec.shape

(1500,)

In [8]:
x_vecs = np.asarray([sequencer.textToVector(" ".join(seq)) for seq in x_tokenized])
x_vecs.shape

(2639, 1500)

In [9]:
x_validation_vecs = np.asarray([sequencer.textToVector(" ".join(seq)) for seq in x_validation_tokenized])
x_validation_vecs.shape

(550, 1500)

In [10]:
from sklearn.decomposition import PCA
pca_model = PCA(n_components=50)
pca_model.fit(x_vecs)
print("Sum of variance ratios: ",sum(pca_model.explained_variance_ratio_))



Sum of variance ratios:  0.9102913023207337


In [11]:
x_comps = pca_model.transform(x_vecs)
x_comps.shape

(2639, 50)

In [12]:
x_validation_comps = pca_model.transform(x_validation_vecs)
x_validation_comps.shape

(550, 50)

In [13]:
start = time.time() 

svm_classifier = SVC()
svm_classifier.fit(x_comps, train_data['class'])

end = time.time()
process = round(end-start,2)
print("Support Vector Machine Classifier has fitted, this process took {} seconds".format(process))

Support Vector Machine Classifier has fitted, this process took 0.61 seconds


In [14]:
svm_classifier.score(x_validation_comps, validation['class'])

0.49272727272727274

In [23]:
from sklearn.metrics import classification_report

prediction = svm_classifier.predict(x_validation_comps)
result = classification_report(validation['class'], prediction)
print(result)

              precision    recall  f1-score   support

           0       0.50      0.79      0.61       175
           1       0.48      0.57      0.53       127
           2       0.57      0.11      0.18        38
           3       0.53      0.40      0.46        20
           4       0.45      0.53      0.49        60
           5       0.00      0.00      0.00        16
           6       0.50      0.04      0.08        24
           7       0.00      0.00      0.00        22
           8       0.80      0.33      0.47        24
           9       0.00      0.00      0.00         4
          10       0.35      0.21      0.26        33
          11       0.00      0.00      0.00         7

    accuracy                           0.49       550
   macro avg       0.35      0.25      0.26       550
weighted avg       0.46      0.49      0.44       550



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [19]:
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC

clf = make_pipeline(StandardScaler(with_mean=False), SVC(gamma='auto', kernel='sigmoid', C=2.5))
clf.fit(x_tokenized, train_data['class'])
y_pred = clf.predict(x_validation_tokenized)

acc = accuracy_score(validation['class'], y_pred)

print(acc)

ValueError: setting an array element with a sequence.