In [17]:
import os
import json
import csv
import pandas as pd
import nltk
from nltk.corpus import stopwords 
from nltk.stem.wordnet import WordNetLemmatizer
import string
from sklearn.model_selection import train_test_split
from sklearn import preprocessing
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score
from sklearn.svm import SVC
import numpy as np
from sklearn.metrics import classification_report

In [18]:
def readPolicyFile(fileLocation):
    policySegments = []
    for filename in os.listdir(fileLocation):
        absFilename = "{}/{}".format(fileLocation,filename)
        with open(absFilename) as csv_file:
            #print absFilename
            categoryId = 0
            csv_reader = csv.reader(csv_file, delimiter=',')
            for row in csv_reader:
                if row[5] == "First Party Collection/Use":
                    categoryId = 1
                elif row[5] == "Third Party Sharing/Collection":
                    categoryId = 2
                elif row[5] == "User Choice/Control":
                    categoryId = 3
                elif row[5] == "User Access, Edit and Deletion":
                    categoryId = 4
                elif row[5] == "Data Retention":
                    categoryId = 5
                elif row[5] == "Data Security":
                    categoryId = 6
                elif row[5] == "Policy Change":
                    categoryId = 7
                elif row[5] == "Do Not Track":
                    categoryId = 8 
                elif row[5] == "International and Specific Audiences":
                    categoryId = 9
                elif row[5] == "Introductory/Generic":
                    categoryId = 10
                elif row[5] == "Privacy contact information":
                    categoryId = 11
                elif row[5] == "Practice not covered":
                    categoryId = 12
                else:
                    continue
                    
                policySegment = ''
                jsonData=json.loads(row[6])
                for (k, v) in jsonData.items():
                    for (k, v) in v.items():
                        if k == 'selectedText':
                            policySegment = ''.join(v)
                
                policySegments.append([policySegment, categoryId, row[5]])
            #print policySegments
            #print policySegments
    df = pd.DataFrame(policySegments, columns = ['text', 'label', 'label_name'])
    return df

In [19]:
def cleanDocs(dataFrame):
    cleanNull = dataFrame[df.text != 'null'].reset_index(drop=True)
    stop = set(stopwords.words('english'))
    exclude = set(string.punctuation) 
    lemma = WordNetLemmatizer()
    clean_docs = []
    bigram_docs = []
    for index, entry in enumerate(cleanNull['text']):
        stop_free = " ".join([i for i in entry.lower().split() if i not in stop])
        punc_free = ''.join(ch for ch in stop_free if ch not in exclude)
        digit_free = [word for word in punc_free.split() if not word.isdigit() and len(word) > 2]
        normalized = " ".join(lemma.lemmatize(word) for word in digit_free)
        nouns = [word[0] for word in nltk.pos_tag(normalized.split()) if word[1] == 'NN' or word[1] == 'VB']
        cleanNull.loc[index,'text_final'] = str(nouns)

	#bigram_transformer = phrases.Phrases(clean_docs)
	
	#for doc in bigram_transformer[clean_docs]:
	#		bigram_docs.append(doc)
    cleanEmpty = cleanNull[cleanNull.text_final != '[]']
    return cleanEmpty

In [20]:
def loadTestDataset(fileName):
    df = pd.read_csv(fileName)
    return df

In [21]:
def buildModel(Corpus):
    Train_data, Test_data, Train_label, Test_label = train_test_split(Corpus['text_final'],Corpus['label'],test_size=0.3)
    #Encoder = preprocessing.LabelEncoder()
    #Train_label = Encoder.fit_transform(Train_label)
    #Test_label = Encoder.fit_transform(Test_label)
    Tfidf_vect = TfidfVectorizer(max_features=5000)
    Tfidf_vect.fit(Corpus['text_final'])
    Train_data_Tfidf = Tfidf_vect.transform(Train_data)
    Test_data_Tfidf = Tfidf_vect.transform(Test_data)
    
#     print(Tfidf_vect.vocabulary_)
    
    # fit the training dataset on the NB classifier
#     Naive = MultinomialNB()
#     Naive.fit(Train_data_Tfidf,Train_label)
#     # predict the labels on validation dataset
#     predictions_NB = Naive.predict(Test_data_Tfidf)
#     # Use accuracy_score function to get the accuracy
#     print(classification_report(Test_label, predictions_NB))
#     print("Naive Bayes Accuracy Score -> ",accuracy_score(predictions_NB, Test_label)*100)
    
    # Classifier - Algorithm - SVM
    # fit the training dataset on the classifier
    SVM = SVC(C=1.0, kernel='linear', degree=3, gamma='auto')
    SVM.fit(Train_data_Tfidf,Train_label)
    # predict the labels on validation dataset
    predictions_SVM = SVM.predict(Test_data_Tfidf)
    #print Test_data_Tfidf
    #print predictions_SVM
    # Use accuracy_score function to get the accuracy
    print(classification_report(Test_label, predictions_SVM))
    print("SVM Accuracy Score -> ",accuracy_score(predictions_SVM, Test_label)*100)
    
    webPolicyCorpus = loadTestDataset('topic_10_only_nouns_n_verb_run_1.csv')
    webPolicy_TFidf = Tfidf_vect.transform(webPolicyCorpus['0'])
    webPolicyPredition = SVM.predict(webPolicy_TFidf)
    print webPolicyPredition
    webPolicyDf = mergeData(webPolicyCorpus, webPolicyPredition)
    webPolicyDf.to_csv('topic_10_only_nouns_n_verb_run_1_prediction.csv', index=False)
    
    gdprPolicyCorpus = loadTestDataset('topic_10_gdpr_only_nouns_n_verb_run_1.csv')
    gdprPolicy_TFidf = Tfidf_vect.transform(gdprPolicyCorpus['0'])
    gdprPolicyPredition = SVM.predict(gdprPolicy_TFidf)
    gdprPolicyDf = mergeData(gdprPolicyCorpus, gdprPolicyPredition)
    gdprPolicyDf.to_csv('topic_10_gdpr_only_nouns_n_verb_run_1_prediction.csv', index=False)
    
    return SVM

In [22]:
def predictLable(model, corpus):
    print corpus
    Tfidf_vect = TfidfVectorizer(max_features=5000)
    webPolicy_TFidf = Tfidf_vect.transform(corpus['0'])
    webPolicyPredition = model.predict(webPolicy_TFidf)
    
    return webPolicyPredition;

In [23]:
def mergeData(corpus, predictedResult):
    labels = [[1, 'First Party Collection/Use'], 
              [2, 'Third Party Sharing/Collection'], 
              [3, 'User Choice/Control'], 
              [4, 'User Access, Edit and Deletion'], 
              [5, 'Data Retention'],
              [6, 'Data Security'],
              [7, 'Policy Change'], 
              [8, 'Do Not Track'],
              [9, 'International and Specific Audiences'],
              [10, 'Introductory/Generic'],
              [11, 'Privacy contact information'],
              [12, 'Privacy contact information']]
    
    dfLabel = pd.DataFrame(labels, columns=['label', 'discription'])
    dfPredictedResult = pd.DataFrame(predictedResult)
    dfContact = pd.concat([corpus, dfPredictedResult], axis=1)
    dfContact.columns = ['topic_number', 'corpus', 'label'] 
    return pd.merge(dfContact, dfLabel, on='label')

In [24]:
np.random.seed(500)
fileLocation = '/home/lahiru/Research/policy_analysis/data/usableprivacy/OPP-115/annotations'

df = readPolicyFile(fileLocation)
Corpus = cleanDocs(df)
print Corpus['label_name'].unique()
Corpus.to_csv('clean_OOP-115_policy_corpus.csv', index=False)
model = buildModel(Corpus)

# webPolicyCorpus = loadTestDataset('topic_10_only_nouns_n_verb_run_1.csv')
# gdprPolicyCorpus = loadTestDataset('topic_10_gdpr_only_nouns_n_verb_run_1.csv')

# webPolicyPrediction = predictLable(model, webPolicyCorpus)
# gdprPolicyPrediction = predictLable(model, gdprPolicyCorpus)

# mergeData(webPolicyCorpus, webPolicyPrediction)


['First Party Collection/Use' 'Data Retention' 'User Choice/Control'
 'User Access, Edit and Deletion' 'Third Party Sharing/Collection'
 'Data Security' 'International and Specific Audiences' 'Policy Change'
 'Do Not Track']


  'precision', 'predicted', average, warn_for)


              precision    recall  f1-score   support

           1       0.62      0.90      0.73      1765
           2       0.84      0.81      0.83      1377
           3       0.77      0.37      0.50       497
           4       0.88      0.07      0.13       222
           5       0.00      0.00      0.00       110
           6       0.95      0.69      0.80       289
           7       0.94      0.51      0.66       166
           8       1.00      0.16      0.27        19
           9       0.89      0.82      0.86       251

   micro avg       0.72      0.72      0.72      4696
   macro avg       0.77      0.48      0.53      4696
weighted avg       0.75      0.72      0.70      4696

('Naive Bayes Accuracy Score -> ', 72.38074957410562)
              precision    recall  f1-score   support

           1       0.68      0.84      0.75      1765
           2       0.82      0.85      0.83      1377
           3       0.70      0.46      0.56       497
           4       0.71 