In [133]:
import json
import pandas as pd
import numpy as np
import re
import io
import string
from allennlp.predictors.predictor import Predictor
import en_core_web_sm
from ast import literal_eval
from gensim.parsing.preprocessing import preprocess_string
from gensim.models import Word2Vec
from gensim.models import KeyedVectors
import os
import logging
from pprint import pprint
from nltk.corpus import framenet 
from nltk.corpus import wordnet
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report

In [134]:
# logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

In [135]:
def parsedPolicySegments(policyFile):
    """Load policy segments from a JSON file into a DataFrame.

    The file must contain a JSON object mapping a segment id to the segment
    text. A segment is kept only when it has at least three
    whitespace-separated words and contains no ``<...>`` markup.

    Parameters
    ----------
    policyFile : str or path-like
        Path of the JSON file to read.

    Returns
    -------
    pandas.DataFrame
        Columns ``segment_id`` and ``statement``, one row per kept segment,
        in the order the segments appear in the file.
    """
    # JSON text is UTF-8 by specification; being explicit avoids decoding
    # errors on platforms whose default encoding is something else.
    with open(policyFile, encoding='utf-8') as f:
        policySegments = json.load(f)

    policyData = [
        [key, value]
        for key, value in policySegments.items()
        # re.search stops at the first tag; only presence matters here.
        if len(value.split()) >= 3 and not re.search(r'(<[^>]*>)', value)
    ]

    return pd.DataFrame(policyData, columns=['segment_id', 'statement'])

In [136]:
def selectPolicySentences(segmentDf, selections):
    """Keep the rows whose segment id starts with one of the selected numbers.

    The integer before the first comma of ``segment_id`` is compared against
    ``selections``.

    Parameters
    ----------
    segmentDf : pandas.DataFrame
        Frame with ``segment_id`` (string) and ``statement`` columns.
    selections : container of int
        Numbers to keep; only membership (``in``) is used.

    Returns
    -------
    pandas.DataFrame
        New frame with columns ``segment_id`` and ``statement`` and a fresh
        0-based index.
    """
    def _leadingNumber(segmentId):
        # Everything before the first comma, read as an integer.
        return int(segmentId.split(",", 1)[0])

    keptRows = [
        [record['segment_id'], record['statement']]
        for _, record in segmentDf.iterrows()
        if _leadingNumber(record['segment_id']) in selections
    ]
    return pd.DataFrame(keptRows, columns=['segment_id', 'statement'])

In [137]:
def parsedAppPolicyData(filePath):
    """Tokenize every line of every file in a directory.

    Each line of each regular file directly inside ``filePath`` is passed to
    gensim's ``preprocess_string`` and the resulting token list is collected.

    Parameters
    ----------
    filePath : str
        Directory holding the policy text files. A trailing path separator
        is accepted but no longer required.

    Returns
    -------
    list of list of str
        One token list per input line, in ``os.listdir`` order.
    """
    tokenizedSatatements = []
    for filename in os.listdir(filePath):
        # os.path.join works with or without a trailing separator, whereas
        # plain string concatenation silently built a wrong path without one.
        fullPath = os.path.join(filePath, filename)
        if not os.path.isfile(fullPath):
            # Sub-directories cannot be opened as text files; skip them.
            continue
        with open(fullPath) as f:
            for sentence in f:
                tokenizedSatatements.append(preprocess_string(sentence))
    return tokenizedSatatements

In [138]:
# def cleanSegments(segmentDf):
#     for i, row in segmentDf.iterrows():
#         url_free = re.sub(r'(<[^>]*>.*?<[^>]*>)', '', row['statement'], flags=re.MULTILINE)
#         segmentDf.at[i,'statement'] = url_free
#     return segmentDf

In [139]:
def preProcessedSegments(segmentDf):
    """Tokenize the ``statement`` column of ``segmentDf``.

    Each statement is passed through gensim's ``preprocess_string``.

    Returns
    -------
    list of list of str
        One token list per row, in row order.
    """
    return [
        preprocess_string(record['statement'])
        for _, record in segmentDf.iterrows()
    ]

In [140]:
def trainWordEmbedding(tokenizedSatatements,
                       modelPath='word_embedding_model/word2vec_app_policy_modal.model'):
    """Train a CBOW Word2Vec model on tokenized statements and save it.

    Parameters
    ----------
    tokenizedSatatements : iterable of list of str
        Training corpus, one token list per sentence.
    modelPath : str, optional
        Where the trained model is saved. Defaults to the path this function
        always used, so existing calls behave as before.

    Returns
    -------
    gensim.models.Word2Vec
        The trained model (previously nothing was returned).
    """
    # NOTE(review): `size` is the gensim 3.x keyword; gensim 4 renamed it to
    # `vector_size`. Left unchanged because the installed version is not
    # visible from this file - confirm before upgrading gensim.
    model = Word2Vec(tokenizedSatatements, size=100, window=5, min_count=5, workers=4, sg=0)

    # Saving fails if the target directory is missing, so create it first.
    targetDir = os.path.dirname(modelPath)
    if targetDir:
        os.makedirs(targetDir, exist_ok=True)
    model.save(modelPath)
    return model

In [141]:
def loadFastTextModel(filePath):
    """Load a saved Word2Vec model and print the words most similar to "use".

    Despite the name, the file is loaded with ``Word2Vec.load``.

    Parameters
    ----------
    filePath : str
        Path of a model previously written with ``model.save``.

    Returns
    -------
    gensim.models.Word2Vec
        The loaded model, so callers can keep querying it (previously the
        model was discarded).
    """
    model = Word2Vec.load(filePath)
    # NOTE(review): this raises KeyError if "use" is not in the model's
    # vocabulary - confirm the training corpus contains that exact token.
    print(model.wv.most_similar("use"))
    return model

In [142]:
def loadGoogleDataModel(filePath):
    """Load binary word2vec-format vectors and print the neighbours of 'use'.

    Parameters
    ----------
    filePath : str
        Path of a binary file in the word2vec format.

    Returns
    -------
    gensim.models.KeyedVectors
        The loaded vectors, so callers can keep querying them (previously
        they were discarded).
    """
    model = KeyedVectors.load_word2vec_format(filePath, binary=True)
    print(model.most_similar('use'))
    return model

In [143]:
# def extractor(segmentDf):
# #     slrPredictor = Predictor.from_path("/home/lahiru/Research/policy_analysis/src/scripts/commitment_analysis/bert-base-srl-2019.06.17.tar.gz")
# #     nerPredictor = predictor = Predictor.from_path("/home/lahiru/Research/policy_analysis/src/scripts/commitment_analysis/ner-model-2018.12.18.tar.gz")
# #     depandencyPredictor = Predictor.from_path("/home/lahiru/Research/policy_analysis/src/scripts/commitment_analysis/biaffine-dependency-parser-ptb-2018.08.23.tar.gz")
#     j = 0;
#     for i, row in segmentDf.iterrows():
#         print(j)
#         sentence = row['statement']
#         print(sentence)
#         if(pd.notna(sentence) and sentence.strip() != ""):
#             if(pd.notna(row['slr_relation'])):
#                 segmentDf.at[i,'slr_based_class'] = str(annotatedBySRL(row['slr'], row['ner'])
# #             segmentDf.at[i,'slr'] = extractSLR(sentence, slrPredictor)
# #             segmentDf.at[i,'actor'] = extractActor(sentence)
# #             segmentDf.at[i,'scope'] = extractScope(sentence)
# #             dependancyTree = extractDependancy(sentence, depandencyPredictor)
# #             segmentDf.at[i,'dependancy'] = str(dependancyTree)
# #             segmentDf.at[i,'ner'] = str(nameEntityRecognition(sentence, nerPredictor))
# #             result = {}
# #             result['children'] = []
# #             segmentDf.at[i,'concept'] = extractConcept(literal_eval(row['dependancy']), result)
# #             segmentDf.at[i,'slr_relation'] = str(extractSLRRelation(row['slr'], row['dependancy']))
# #             if(pd.notna(row['slr_relation'])):
# #                segmentDf.at[i,'class'] = str(annotate(row['slr_relation']))
# #                 arributes = extractAttributes(row['slr_relation'])
# #                 if arributes is not None:
# #                     segmentDf.at[i,'attributes'] = str(arributes)
# #                     segmentDf.at[i,'actor'] = str(arributes['actor']) 
# #                     segmentDf.at[i,'action'] = str(arributes['action'])
# #                     segmentDf.at[i,'object'] = str(arributes['object']) 
# #                     segmentDf.at[i,'object_source'] = str(arributes['object_source'])
# #                     segmentDf.at[i,'target'] = str(arributes['target'])
# #                     segmentDf.at[i,'purpose'] = str(arributes['purpose'])        
# #             if(pd.notna(row['action'])):
# #                 segmentDf.at[i,'lex_name'] = str(getLexName(row['action']))
#         j+=1
#     return segmentDf

In [144]:
def extractor(segmentDf, slrPredictor=None, depandencyPredictor=None, nerPredictor=None):
    """Add SRL, dependency-tree and NER columns for every non-empty statement.

    For each row whose ``statement`` is non-null and non-blank, the columns
    ``slr``, ``dependancy`` and ``ner`` are written in place on ``segmentDf``.
    The running row number and the statement are printed as progress output.

    Parameters
    ----------
    segmentDf : pandas.DataFrame
        Frame with a ``statement`` column. It is modified in place.
    slrPredictor, depandencyPredictor, nerPredictor : optional
        Objects exposing ``predict(sentence)``. When one is omitted, a
        module-level variable of the same name is used if it exists. This
        keeps the earlier behavior of relying on predictors defined in
        another cell.

    Returns
    -------
    pandas.DataFrame
        The same ``segmentDf`` object.

    Raises
    ------
    NameError
        If a statement has to be processed and a predictor was neither passed
        in nor defined at module level.
    """
    # Loader lines kept for reference; paths are machine-specific.
#     slrPredictor = Predictor.from_path("/home/lahiru/Research/policy_analysis/src/scripts/commitment_analysis/bert-base-srl-2019.06.17.tar.gz")
#     nerPredictor = predictor = Predictor.from_path("/home/lahiru/Research/policy_analysis/src/scripts/commitment_analysis/ner-model-2018.12.18.tar.gz")
#     depandencyPredictor = Predictor.from_path("/home/lahiru/Research/policy_analysis/src/scripts/commitment_analysis/biaffine-dependency-parser-ptb-2018.08.23.tar.gz")

    supplied = {
        'slrPredictor': slrPredictor,
        'depandencyPredictor': depandencyPredictor,
        'nerPredictor': nerPredictor,
    }
    # Explicit arguments win; otherwise fall back to same-named globals.
    predictors = {
        name: (value if value is not None else globals().get(name))
        for name, value in supplied.items()
    }
    missing = [name for name, value in predictors.items() if value is None]

    for j, (i, row) in enumerate(segmentDf.iterrows()):
        print(j)
        sentence = row['statement']
        print(sentence)
        if pd.notna(sentence) and sentence.strip() != "":
            if missing:
                # Fail with an actionable message instead of an opaque
                # NameError from deep inside the loop.
                raise NameError(
                    "extractor needs predictor(s) {}: pass them as arguments or "
                    "define them at module level".format(", ".join(missing))
                )

            segmentDf.at[i,'slr'] = extractSLR(sentence, predictors['slrPredictor'])

            dependancyTree = extractDependancy(sentence, predictors['depandencyPredictor'])
            segmentDf.at[i,'dependancy'] = str(dependancyTree)
            segmentDf.at[i,'ner'] = str(nameEntityRecognition(sentence, predictors['nerPredictor']))

#             segmentDf.at[i,'slr_relation'] = str(extractSLRRelation(sentence, row['slr'], row['dependancy']))

#             if(pd.notna(row['slr_relation'])):
#                 segmentDf.at[i,'slr_based_class'] = str(annotatedBySRL(row['slr'], row['ner']))

#             if(pd.notna(row['slr_relation'])):
#                 arributes = extractAttributes(row['slr_relation'])
#                 if arributes is not None:
#                     segmentDf.at[i,'attributes'] = str(arributes)
#                     segmentDf.at[i,'actor'] = str(arributes['actor']) 
#                     segmentDf.at[i,'action'] = str(arributes['action'])
#                     segmentDf.at[i,'object'] = str(arributes['object']) 
#                     segmentDf.at[i,'object_source'] = str(arributes['object_source'])
#                     segmentDf.at[i,'target'] = str(arributes['target'])
#                     segmentDf.at[i,'purpose'] = str(arributes['purpose'])
#                     segmentDf.at[i,'conditions'] = str(arributes['conditions'])

#             if(pd.notna(row['action'])):
#                 segmentDf.at[i,'lex_name'] = str(getLexName(row['action']))
    return segmentDf

In [145]:
def extractActor(sentence):
    """Return the distinct pronouns of ``sentence`` as a stringified list.

    Tokens whose spaCy part-of-speech tag is ``PRON`` are collected and
    duplicates are dropped while keeping first-appearance order, so the
    result is the same on every run (the earlier set-based version returned
    an arbitrary order).

    Parameters
    ----------
    sentence : str
        Text to analyse.

    Returns
    -------
    str
        ``str()`` of the list of unique pronoun strings.
    """
    # Loading the spaCy pipeline is expensive; do it once and reuse it on
    # later calls instead of reloading for every sentence.
    nlp = getattr(extractActor, '_nlp', None)
    if nlp is None:
        nlp = en_core_web_sm.load()
        extractActor._nlp = nlp

    pronouns = [token.orth_ for token in nlp(sentence) if token.pos_ == 'PRON']
    # dict.fromkeys de-duplicates while preserving insertion order.
    return str(list(dict.fromkeys(pronouns)))

def extractSLR(sentence, predictor):
    """Run the semantic-role-labelling predictor and return its output as text.

    Parameters
    ----------
    sentence : str
        Sentence to label.
    predictor : object
        Anything exposing ``predict(sentence)``.

    Returns
    -------
    str or None
        ``str()`` of the prediction, or ``None`` when the predictor raised.
        The failure is logged rather than silently discarded.
    """
    try:
        return str(predictor.predict(sentence))
    except Exception as error:
        # Best-effort by design: one bad sentence must not stop a batch run.
        # Catching Exception (not a bare except) lets KeyboardInterrupt and
        # SystemExit propagate so a long run can still be interrupted.
        logging.warning("SRL prediction failed for %r: %s", sentence, error)
        return None

def extractScope(sentence):
    """Return, as a stringified list, every whole-word 'law' or 'regulation' in ``sentence``."""
    scopePattern = re.compile(r'\b(?:law|regulation)\b')
    scopeTerms = scopePattern.findall(sentence)
    return str(scopeTerms)

def extractDependancy(sentence, predictor):
    """Return the ``['hierplane_tree']['root']`` entry of the predictor's output.

    ``predictor`` must expose ``predict(sentence)`` returning a mapping.
    Missing keys propagate as ``KeyError``.
    """
    parse = predictor.predict(sentence)
    hierplaneTree = parse['hierplane_tree']
    return hierplaneTree['root']

def extractConcept(dependancyTree, result):
    """Extract concepts from a dependency-parse tree.

    Walks the tree depth-first and fills ``result`` in place:

    * ``result['root']`` is set to the node's ``word`` when any value of the
      node equals ``'root'``;
    * every descendant whose ``nodeType`` is ``nsubj``, ``aux`` or ``poss``
      is appended to ``result['children']`` as ``{nodeType: word}``.

    Parameters
    ----------
    dependancyTree : dict
        Tree node; nested nodes live under the ``children`` key.
    result : dict
        Accumulator. It must already contain a ``children`` list.

    Returns
    -------
    str
        ``str(result)`` after the walk.
    """
    trackedTypes = ('nsubj', 'aux', 'poss')

    # The node is the root when one of its values is the string 'root'.
    if 'root' in dependancyTree.values():
        result['root'] = dependancyTree['word']

    for node in dependancyTree.get('children', ()):
        nodeType = node['nodeType']
        if nodeType in trackedTypes:
            result['children'].append({nodeType: node['word']})
        # Descend into grandchildren through a wrapper that only carries them.
        if 'children' in node:
            extractConcept({'children': node['children']}, result)

    return str(result)

def extractSLRRelation(sentence,slr,depandancy):
    """Pick the SRL verb frame that corresponds to the dependency root word.

    ``slr`` and ``depandancy`` are string representations that are parsed
    with ``literal_eval``. The root word's occurrences are counted among the
    whitespace-separated words of ``sentence[:end]``, where ``end`` comes
    from the root's first span. If it occurs at most once, the first SRL
    frame whose ``verb`` equals the root word is returned. Otherwise the
    frame whose ordinal among matching frames equals that count is printed
    and returned.

    Returns
    -------
    dict or None
        The chosen frame, or ``None`` when the dependency dict has no
        ``word`` key or no frame qualifies.
    """
    dependencyRoot = literal_eval(depandancy)
    srlOutput = literal_eval(slr)
    if 'word' not in dependencyRoot:
        return None

    prefixEnd = dependencyRoot['spans'][0]['end']
    rootWord = dependencyRoot['word']
    occurrences = sum(
        1 for token in sentence[:prefixEnd].split() if token.strip() == rootWord
    )

    matchesSeen = 0
    for frame in srlOutput['verbs']:
        if frame['verb'] != rootWord:
            continue
        matchesSeen += 1
        if occurrences <= 1:
            return frame
        if matchesSeen == occurrences:
            print(frame)
            return frame
    return None

# def extractPUC(slr):
#     pronoun = ['you','your']
#     healpingVerb = ['must']
            
# def extractPUP(slr):

# def extractPOC(slr):
    
# def extractPOP(slr):

def nameEntityRecognition(sentence, predictor):
    """Return the first word the NER predictor tags as ``U-ORG``.

    Parameters
    ----------
    sentence : str
        Sentence to analyse.
    predictor : object
        Anything exposing ``predict(sentence)`` that returns a mapping with
        parallel ``tags`` and ``words`` sequences.

    Returns
    -------
    str or None
        The first word tagged ``U-ORG``, or ``None`` when there is none or
        the prediction failed. Failures are logged.
    """
    # NOTE(review): only the single tag 'U-ORG' is checked, so an
    # organization labelled with any other ORG tag is not returned - confirm
    # that this is intended.
    try:
        prediction = predictor.predict(sentence)
        for word, tag in zip(prediction['words'], prediction['tags']):
            if tag == 'U-ORG':
                return word
    except Exception as error:
        # Best-effort by design, but do not hide the failure, and let
        # KeyboardInterrupt/SystemExit propagate.
        logging.warning("NER prediction failed for %r: %s", sentence, error)
    return None
    
def annotate(slr):
    """Classify a single SRL verb frame as PUC, PUPR, POC or POPR.

    ``slr`` is the string form of one verb frame (a dict with ``verb`` and
    ``description`` keys). ``ARG0``, ``ARGM-MOD`` and ``V`` are read from the
    description and matched against fixed word lists:

    * ARG0 you/your with must/will/would -> ``'PUC'``
    * ARG0 you/your with may/might/can/could -> ``'PUPR'``
    * ARG0 we/us/our with must/will/would -> ``'POC'``
    * ARG0 we/us/our with may/might/can/could -> ``'POPR'``
    * otherwise ARG0 you/your with the verb agree/consent -> ``'PUC'``

    Returns
    -------
    str or None
        The class label, or ``None`` when nothing matches or the input is
        not a verb frame.
    """
    userActors = ('you', 'your')
    organizationActors = ('we', 'us', 'our')
    commitmentModals = ('must', 'will', 'would')
    permissionModals = ('may', 'might', 'can', 'could')
    # 'concent' was a misspelling that could never match the verb "consent".
    # It is kept next to the corrected spelling so earlier behavior is
    # unchanged.
    commitmentVerbs = ('agree', 'consent', 'concent')

    slrDic = literal_eval(slr)
    # The literal "None" (or anything that is not a frame) has no class.
    if not isinstance(slrDic, dict) or 'verb' not in slrDic:
        return None

    entities = re.findall(r'([a-zA-Z0-9- ]*:[a-zA-Z0-9\- ]*)', slrDic['description'])
    dic = dict(s.split(':') for s in entities)

    arg0 = dic.get('ARG0', '').strip().lower()
    mod = dic.get('ARGM-MOD', '').strip().lower()
    verb = dic.get('V', '').strip().lower()

    if arg0 in userActors and mod in commitmentModals:
        return 'PUC'
    if arg0 in userActors and mod in permissionModals:
        return 'PUPR'
    if arg0 in organizationActors and mod in commitmentModals:
        return 'POC'
    if arg0 in organizationActors and mod in permissionModals:
        return 'POPR'

    if arg0 in userActors and verb in commitmentVerbs:
        return 'PUC'
    return None

In [146]:
def annotatedBySRL(srl, ner):
    """Classify a statement from its full SRL output and recognised organization.

    Every verb frame in ``srl`` is inspected in order, and the label of the
    first frame that matches any rule is returned. Within one frame the
    rules are tried in this order:

    1. pronoun actor plus modal (you/your or we/us/our with
       must/will/would or may/might/can/could);
    2. pronoun actor plus verb word list;
    3. actor equal to the recognised organization, plus modal or verb.

    Parameters
    ----------
    srl : str
        String form of the SRL output (a dict with a ``verbs`` list whose
        items carry a ``description``).
    ner : str
        Organization name found by NER. Blank values, the strings
        ``"None"``/``"nan"`` and non-string values mean "no organization".

    Returns
    -------
    str or None
        ``'PUC'``, ``'PUPR'``, ``'POC'`` or ``'POPR'``; ``None`` when no rule
        matches.
    """
    userActors = ('you', 'your')
    organizationActors = ('we', 'us', 'our')
    commitmentModals = ('must', 'will', 'would')
    permissionModals = ('may', 'might', 'can', 'could')
    # 'concent' was a misspelling that never matched "consent"; it is kept
    # next to the corrected spelling so earlier behavior is unchanged.
    userCommitmentVerbs = ('agree', 'consent', 'concent', 'permit', 'required',
                           'submit', 'register', 'agreement', 'disclose')
    userPermissionVerbs = ('choose', 'are')
    organizationPermissionVerbs = ('use', 'uses', 'perform', 'collect', 'allow', 'provides', 'gather')
    organizationCommitmentVerbs = ('store', 'maintains')

    if not isinstance(srl, str):
        # A missing SRL cell (e.g. NaN from a CSV) cannot be classified.
        return None
    slrDic = literal_eval(srl)
    if not isinstance(slrDic, dict) or 'verbs' not in slrDic:
        return None

    # Normalise the organization. An empty or placeholder value must not
    # equal an empty ARG0, otherwise frames without any actor would be
    # classified as organization statements.
    org = ner.strip().lower() if isinstance(ner, str) else ""
    if org in ("none", "nan"):
        org = ""

    def _classify(arg0, mod, verb):
        """Return the first label matching one frame, or None."""
        isUser = arg0 in userActors
        isOrganizationPronoun = arg0 in organizationActors
        isNamedOrganization = org != "" and arg0 == org

        # 1. pronoun actor + modal
        if isUser and mod in commitmentModals:
            return 'PUC'
        if isUser and mod in permissionModals:
            return 'PUPR'
        if isOrganizationPronoun and mod in commitmentModals:
            return 'POC'
        if isOrganizationPronoun and mod in permissionModals:
            return 'POPR'

        # 2. pronoun actor + verb
        if isUser and verb in userCommitmentVerbs:
            return 'PUC'
        if isUser and verb in userPermissionVerbs:
            return 'PUPR'
        if isOrganizationPronoun and verb in organizationPermissionVerbs:
            return 'POPR'
        if isOrganizationPronoun and verb in organizationCommitmentVerbs:
            return 'POC'

        # 3. named organization as actor
        if isNamedOrganization:
            if mod in commitmentModals:
                return 'POC'
            if mod in permissionModals:
                return 'POPR'
            if verb in organizationCommitmentVerbs:
                return 'POC'
            if verb in organizationPermissionVerbs:
                return 'POPR'
        return None

    for frame in slrDic['verbs']:
        entities = re.findall(r'([a-zA-Z0-9- ]*:[a-zA-Z0-9\- ]*)', frame['description'])
        dic = dict(s.split(':') for s in entities)

        label = _classify(
            dic.get('ARG0', '').strip().lower(),
            dic.get('ARGM-MOD', '').strip().lower(),
            dic.get('V', '').strip().lower(),
        )
        if label is not None:
            # Only the first label was ever used, so stop at the first hit.
            return label
    return None
            

In [147]:
def extractAttributes(srl):
    """Turn one SRL verb frame into an actor/action/object attribute dict.

    ``srl`` is the string form of a verb frame (a dict with a
    ``description``) or the literal ``"None"``. Roles are read from the
    description: ``ARG0`` -> actor, ``V`` -> action, ``ARG1`` -> object,
    ``ARG2`` -> target, ``ARGM-PRP`` -> purpose, ``ARGM-ADV`` -> conditions.
    ``object_source`` is ``"user(s)"`` when the object mentions
    you/your/yours, ``"organization"`` when it mentions our/ours/us, and
    ``""`` otherwise.

    Returns
    -------
    dict or None
        The attribute dict, or ``None`` when ``srl`` is ``"None"`` or the
        frame has no ``ARG0``.
    """
    slrDic = literal_eval(srl)
    if slrDic is None:
        return None

    entities = re.findall(r'([a-zA-Z0-9- ]*:[a-zA-Z0-9\- ]*)', slrDic['description'])
    dic = dict(s.split(':') for s in entities)

    arg0 = dic.get('ARG0', '').strip().lower()
    verb = dic.get('V', '').strip().lower()
    # NOTE(review): unlike actor and action, these values are not stripped,
    # so they keep the space that follows the role label. Left as-is because
    # downstream comparisons may depend on it - confirm.
    arg1 = dic.get('ARG1', '').lower()
    arg2 = dic.get('ARG2', '').lower()
    argm_prp = dic.get('ARGM-PRP', '').lower()
    argm_adv = dic.get('ARGM-ADV', '').lower()

    if arg0 == "":
        return None

    # The alternatives are grouped so that the word boundaries apply to each
    # whole word. The ungrouped form `\bour|us\b` matched any word merely
    # ending in "us" (e.g. "various", "status").
    if re.search(r'\b(?:you|your|yours)\b', arg1):
        objectSource = "user(s)"
    elif re.search(r'\b(?:our|ours|us)\b', arg1):
        objectSource = "organization"
    else:
        objectSource = ""

    return {
        'actor': arg0,
        'action': verb,
        'object': arg1,
        'object_source': objectSource,
        'target': arg2,
        'purpose': argm_prp,
        'conditions': argm_adv,
    }

In [148]:
def getFrame(verb):
    """Return whatever ``framenet.frames`` yields for ``verb``.

    NOTE(review): NLTK documents the argument of ``frames`` as a pattern
    matched against frame *names*, not lexical units - confirm that looking
    a verb up this way is what is intended.
    """
    matchingFrames = framenet.frames(verb)
    return matchingFrames

In [149]:
def getLexName(verb):
    """Return the lexname of the first WordNet verb synset of ``verb``.

    Reference: https://wordnet.princeton.edu/documentation/lexnames5wn

    Returns ``None`` when WordNet has no verb synset for ``verb``.
    """
    firstSynset = next(iter(wordnet.synsets(verb, pos=wordnet.VERB)), None)
    if firstSynset is None:
        return None
    return firstSynset.lexname()

In [150]:
# def annotatedByDepandancy(depandancy):
#     depandancyDic = literal_eval(depandancy)
#     if 'children' in depandancyDic:
        

In [151]:
def printStatistics(df):
    """Print evaluation statistics for an annotated, pipeline-processed frame.

    Printed sections, in order:

    * instance counts per ``annotated_class`` plus organization/user totals;
    * coverage (share of rows with an SRL-derived class), the confusion
      matrix, accuracy and the classification report of ``slr_based_class``
      against ``annotated_class``;
    * frequency of ``action`` and ``lex_name`` values;
    * the number of rows whose extracted attribute matches its manual
      ``a_*`` annotation.

    Parameters
    ----------
    df : pandas.DataFrame
        Must provide the columns ``statement``, ``annotated_class``,
        ``slr_based_class``, ``action``, ``lex_name``, ``actor``, ``object``,
        ``object_source``, ``target``, ``purpose``, ``conditions`` and the
        annotation columns ``a_actor``, ``a_action``, ``a_object``,
        ``a_object_source``, ``a_target``, ``a_purpose``, ``a_condition``.

    Returns
    -------
    None
    """
#     print("===========Corpus Information===============")
#     print("Word count: {}".format())

    print("===========Instance Count===============")
    instaceCount = df.groupby('annotated_class').count()['statement']
    print(instaceCount)
    # Look the classes up by label. Positional access on a label-indexed
    # Series is deprecated in pandas and silently reports the wrong classes
    # whenever one is absent or an extra one is present.
    organizationTotal = instaceCount.get('POC', 0) + instaceCount.get('POPR', 0)
    userTotal = instaceCount.get('PUC', 0) + instaceCount.get('PUPR', 0)
    print("Actor organization:{}".format(organizationTotal))
    print("Actor user:{}".format(userTotal))

    # Rows without a prediction are either the string "None" or a real NaN;
    # both must be excluded before scoring.
    predictedClass = df['slr_based_class']
    srlClassed = df[predictedClass.notna() & (predictedClass != "None")]
    print("===========Conusion Matrix===============")
    # Coverage is relative to the number of rows given, not a fixed 100.
    totalRows = len(df)
    coverage = (srlClassed.shape[0] / totalRows) * 100 if totalRows else 0.0
    print("Coverage:{}%".format(coverage))

    actual = srlClassed['annotated_class']
    predicted = srlClassed['slr_based_class']
    print(confusion_matrix(actual,predicted))
    print("Accuray:{}".format(accuracy_score(actual, predicted)))
    print(classification_report(actual,predicted))

    print("===========Attribute details===============")
    print(df.groupby('action').count()['statement'].sort_values(ascending=False))
    print(df.groupby('lex_name').count()['action'].sort_values(ascending=False))

    matched = {
        'Actor': 0,
        'Action': 0,
        'Object': 0,
        'Object Source': 0,
        'Target': 0,
        'Purpose': 0,
        'Condition': 0,
    }

    # Missing values become empty strings so the comparisons below are
    # plain string comparisons.
    replacedDf = df.replace(np.nan, '', regex=True)
    for _, row in replacedDf.iterrows():
        if row['a_actor'] != "" and row['actor'] == row['a_actor']:
            matched['Actor'] += 1
        # Action is compared ignoring surrounding whitespace.
        if row['a_action'] != "" and row['action'].strip() == row['a_action'].strip():
            matched['Action'] += 1
        if row['a_object'] != "" and row['a_object'] == row['object']:
            matched['Object'] += 1
        if row['a_object_source'] != "" and row['a_object_source'] == row['object_source']:
            matched['Object Source'] += 1
        if row['a_target'] != "" and row['a_target'] == row['target']:
            matched['Target'] += 1
        if row['a_purpose'] != "" and row['a_purpose'] == row['purpose']:
            matched['Purpose'] += 1
        # Condition counts as matched when the annotation is contained in
        # the extracted text.
        if row['a_condition'] != "" and row['a_condition'] in row['conditions']:
            matched['Condition'] += 1

    print("================Matched Attribute Count==================")
    for label, count in matched.items():
        print("{}:{}".format(label, count))
    

In [152]:
# dataFolderPath = "/home/lahiru/Research/policy_analysis/data/usableprivacy/OptOutChoice-2017_v1.0/SentenceDict.json"
# segmentDf = parsedPolicySegments(dataFolderPath).sample(n=100)
# cleanSegments = cleanSegments(segmentDf)
# filtredSegments =cleanSegments[cleanSegments['statement'].str.strip() != ""].sample(n=1000)
# segmentDf.to_csv('processed_privacy_policy_segments_sample_100_4.csv',',')

# extractSLR(cleanSegments)
# extractedSegments = extractActor(cleanSegments)
# extractedSegments.to_csv('processed_privacy_policy_segments.csv',',')

# segmentDf = pd.read_csv('processed_privacy_policy_segments_sample_2000_1.csv')
# extractedSegments = extractor(segmentDf)
# extractedSegments.to_csv('processed_privacy_policy_segments_sample_2000_1.csv',',')

# annotatedStatement = pd.read_csv('annotated_privacy_policy_segments_100_nlp_pipeline.csv')
# extractedSegments = extractor(annotatedStatement)
# extractedSegments.to_csv('annotated_privacy_policy_segments_100_nlp_pipeline.csv',',')
# printStatistics(annotatedStatement)

# tokenizedSatatements = preProcessedSegments(cleanSegments)
# trainWordEmbedding(tokenizedSatatements)
# loadFastTextModel('word_embedding_model/word2vec_policy_modal.model')
# loadGoogleDataModel('word_embedding_model/GoogleNews-vectors-negative300.bin')

# tokenizedAppPolicy = parsedAppPolicyData('/home/lahiru/Research/policy_analysis/src/scripts/crawl_privacy_page/app_privacy/')
# trainWordEmbedding(tokenizedAppPolicy)
# loadFastTextModel('word_embedding_model/word2vec_app_policy_modal.model')
# segmentDf = pd.read_csv('processed_privacy_policy_segments_sample_2000_1.csv')
# dfClass = segmentDf[(segmentDf['slr_based_class'] != 'None') & (segmentDf['slr_based_class'] != null)]
# dfClass.to_csv('processed_privacy_policy_segments_sample_2000_1_class.csv',',')

# dfNone = segmentDf[(segmentDf['slr_based_class'] == 'None') | (segmentDf['slr_based_class'] == ' ')]
# dfNone.to_csv('processed_privacy_policy_segments_sample_2000_1_None.csv',',')

# Hand-picked example policy statements. In the visible notebook this list
# is only referenced by a commented-out DataFrame construction.
new_data = ["We use cookies on this Site to ensure the integrity of the registration process and to personalize the Site.",
            "Our Sites' registration system requires users to give us various Personal Data, such as their name and e-mail address, ZIP code, sex, age or income level",
            "We may change the terms of this Notice at any time.",
            "Generally, you can access and browse our Web sites without disclosing any personally identifiable information.",
            "E-mail Alert Service Registration Form: If you complete the E- mail Alert Service Registration Form, you will receive e-mail alerts regarding new press releases posted to the Web Site(s).",
           "From time to time, we may request personal information from you at our sites in order to deliver requested materials to you."]

# df = pd.DataFrame(new_data, columns = ['statement'])

# df = pd.read_csv('processed_privacy_policy_segments_paper.csv')
# extractedSegments = extractor(df)
# extractedSegments.to_csv('processed_privacy_policy_segments_paper.csv',',')

#=====================================Diversify experiment==============================================
# dataFolderPath = "/home/lahiru/Research/policy_analysis/data/usableprivacy/OptOutChoice-2017_v1.0/SentenceDict.json"
# segmentDf = parsedPolicySegments(dataFolderPath)
# newsWebsites = [481,862,1089,1360,1361,1637,1683]
# healthWebsites = [202,517,581,642,884,891,898,1221]
# shoppingWebsites = [640,807,1498]

# newsWebsiteDf = selectPolicySentences(segmentDf, newsWebsites)
# print(newsWebsiteDf)
# newsWebsiteDf.to_csv('selected_website_analysis/privacy_policy_segments_news_websites.csv',',')

# healthWebsiteDf = selectPolicySentences(segmentDf, healthWebsites)
# healthWebsiteDf.to_csv('selected_website_analysis/privacy_policy_health_websites.csv',',')

# shoppingWebsiteDf = selectPolicySentences(segmentDf, shoppingWebsites)
# shoppingWebsiteDf.to_csv('selected_website_analysis/privacy_policy_shopping_websites.csv',',')

# annotatedNewsPolicy = pd.read_csv('selected_website_analysis/privacy_policy_segments_news_websites_nlp_pipeline.csv')
# extractedNewsPolicy = extractor(annotatedNewsPolicy)
# extractedNewsPolicy.to_csv('selected_website_analysis/privacy_policy_segments_news_websites_nlp_pipeline.csv',',')

# annotatedHealthPolicy = pd.read_csv('selected_website_analysis/privacy_policy_segments_health_websites_nlp_pipeline.csv')
# extractedHealthPolicy = extractor(annotatedHealthPolicy)
# extractedHealthPolicy.to_csv('selected_website_analysis/privacy_policy_segments_health_websites_nlp_pipeline.csv',',')

# annotatedShoppingPolicy = pd.read_csv('selected_website_analysis/privacy_policy_segments_shopping_websites_nlp_pipeline.csv')
# extractedShoppingPolicy = extractor(annotatedShoppingPolicy)
# extractedShoppingPolicy.to_csv('selected_website_analysis/privacy_policy_segments_shopping_websites_nlp_pipeline.csv',',')

# resultNewsPolicy = annotatedNewsPolicy.groupby('slr_based_class').count()['statement']
# print(resultNewsPolicy)

# resultHelathPolicy = annotatedHealthPolicy.groupby('slr_based_class').count()['statement']
# print(resultHelathPolicy)

# resultShoppingPolicy = annotatedShoppingPolicy.groupby('slr_based_class').count()['statement']
# print(resultShoppingPolicy)

#=====================================Specific website==============================================
# dataFolderPath = "/home/lahiru/Research/policy_analysis/data/usableprivacy/OptOutChoice-2017_v1.0/SentenceDict.json"
# segmentDf = parsedPolicySegments(dataFolderPath)

# everyDayHealthDF = selectPolicySentences(segmentDf, [891])
# print(everyDayHealthDF)
# everyDayHealthDF.to_csv('selected_website_analysis/privacy_policy_segments_everydayhealth_website.csv',',')

# Load the everydayhealth CSV written by the NLP pipeline; the path is
# relative to the notebook's working directory.
annotatedEveryDayHealthPolicy = pd.read_csv('selected_website_analysis/privacy_policy_segments_everydayhealth_website_nlp_pipeline.csv')
# extractedEveryDayHealthPolicy = extractor(annotatedEveryDayHealthPolicy)
# extractedEveryDayHealthPolicy.to_csv('selected_website_analysis/privacy_policy_segments_everydayhealth_website_nlp_pipeline.csv',',')

# Count the statements per SRL-derived class.
# NOTE(review): the variable is called resultNewsPolicy although the frame
# holds the everydayhealth data; the name is left unchanged here in case
# other cells refer to it.
resultNewsPolicy = annotatedEveryDayHealthPolicy.groupby('slr_based_class').count()['statement']
print(resultNewsPolicy)

slr_based_class
None    255
POC       4
POPR     39
PUC      36
PUPR     16
Name: statement, dtype: int64
