In [4]:
import json
import pandas as pd 
import glob
import os
import re
from bs4 import BeautifulSoup 
from IPython.display import display, HTML
import pickle

In [5]:
## Locate the sanitized policy HTML files
# NOTE(review): the three commented-out lines below are Python 2 era leftovers for
# loading JSON annotations; the annotations are read from a pickle further down.
#annotations={}
#annofiles=glob.glob("Data/opp115-parsed-annotation-1.0/*.json")
#print len(annofiles)
# NOTE(review): hard-coded absolute path -- this cell only works on a machine that
# has the corpus mounted at this location. glob.glob returns matches in arbitrary order.
policyFiles=glob.glob("/share/pub/OPP-115/sanitized_policies/*.html")
print(len(policyFiles))

115


###  Read in the annotations and original policies

In [6]:
def readAnno(filelist):
  """Load JSON annotation files into a dict keyed by website name.

  The website name is the file's base name with a trailing ``.json``
  extension removed (``Data/honda.com.json`` -> ``honda.com``).

  Args:
    filelist: iterable of paths to JSON files.

  Returns:
    dict mapping website name -> parsed JSON content.
  """
  annotations={}
  for filename in filelist:
    # Remove only a literal trailing ".json". The unescaped pattern ".json" used
    # before matched ANY character followed by "json", anywhere in the name.
    website=re.sub(r"\.json$", '', os.path.basename(filename))
    # The with-block closes the file; no explicit close() is required.
    with open(filename, "r") as f:
      annotations[website]=json.load(f)
  return annotations

def readPolicies(filelist):
  """Parse policy HTML files into BeautifulSoup trees keyed by website name.

  The website name is the second "_"-separated field of the file's base name
  with a trailing ``.html`` removed.
  NOTE(review): this presumes names shaped like ``<id>_<website>.html``; a name
  without "_" raises IndexError -- confirm against the corpus layout.

  Args:
    filelist: iterable of paths to HTML files.

  Returns:
    dict mapping website name -> BeautifulSoup document ('html.parser').
  """
  soups={}
  for filename in filelist:
    base=os.path.basename(filename).split("_")[1]
    # Escape the dot and anchor at the end so only the extension is removed.
    website=re.sub(r"\.html$", '', base)
    # Read inside a with-block so the file handle is closed deterministically.
    with open(filename, "r") as f:
      soups[website]=BeautifulSoup(f.read(), 'html.parser')
  return soups

In [7]:
# Load the pre-parsed annotations. Later cells index the result as
# annotations[website][category][topic], which yields a table with the columns
# endIndexInSegment, section, selectedText, startIndexInSegment and value.
# encoding='latin1' -- presumably the pickle was written by Python 2; TODO confirm.
# SECURITY: pickle.load executes arbitrary code from the file; only load pickles
# produced by a trusted source.
with open("Data/parsed-annotation-0.5.pk", 'rb') as f:
  annotations=pickle.load(f, encoding='latin1')

In [8]:
annotations["playstation.com"]["Data Retention"]['Retention Purpose']

Unnamed: 0,endIndexInSegment,section,selectedText,startIndexInSegment,value
0,153,18,promotional purpose through one of our websites,106,Marketing
1,201,18,or to make a purchase from the PlayStation Shop,154,Perform service
2,226,22,Email addresses collected from consumers durin...,0,Perform service
3,200,24,so that we may assist these customers with cur...,124,Perform service
4,159,40,necessary to fulfill the purposes outlined in ...,102,Other


In [9]:
### read in sanitized policy texts
policySoups=readPolicies(policyFiles)

In [10]:
## make sure that the website names are the same in both dictionaries
# test1: policy sites that also have annotations.
# test2: annotated sites that also have a policy text.
# If both printed counts equal the number of policies, the key sets line up.
test1=filter(lambda website: website in annotations.keys(), policySoups.keys())
test2=filter(lambda website: website in policySoups.keys(), annotations.keys())
print(len(list(test1)))
print(len(list(test2)))

115
115


## Get a list of tokens from all of the policies, with frequencies

In [11]:
## Get the texts from beautifulsoup objects

def extractTexts(soups):
  """Map every website to the plain text of its parsed document.

  Args:
    soups: dict of website -> object exposing ``get_text()``.

  Returns:
    dict of website -> result of ``get_text()``.
  """
  return {website: soup.get_text() for website, soup in soups.items()}

In [12]:
## get texts from soup objects
policyTexts=extractTexts(policySoups)

In [13]:
## get the training and validation set
import random
# random.sample() needs a sequence: passing the dict view directly raises TypeError
# on Python 3.11+. A list gives the same sample on older versions, where the view
# was converted to a tuple internally.
# NOTE(review): the split still depends on the key order of policyTexts, which
# follows the (arbitrary) order glob.glob returned the files in -- the seed alone
# does not make the split reproducible across machines.
websites=list(policyTexts.keys())
seed=124
random.seed(seed)
# 105 sites for training; the remaining ones form the validation set.
trainWebsites=random.sample(websites, 105)
valWebstes=[web for web in websites if web not in trainWebsites]

trainTexts={web:policyTexts[web] for web in trainWebsites}

In [14]:
from nltk.corpus import stopwords
from nltk import word_tokenize
from nltk.tokenize import sent_tokenize
import re
import itertools
import string
engstop=set(stopwords.words("english"))


In [15]:
print (string.punctuation)

!"#$%&'()*+,-./:;<=>?@[\]^_`{|}~


In [16]:
from collections import Counter
import itertools


def cleantext(rawtext, remove):
  """Lower-case ``rawtext`` and delete every match of the compiled pattern ``remove``."""
  lowered=rawtext.lower()
  return remove.sub("", lowered)


def getTokens(rawtext, length, stopwords):
  """Return the distinct n-grams found in ``rawtext``.

  Args:
    rawtext: text to tokenise.
    length: number of words per token (1 = unigrams, 2 = bigrams, ...).
    stopwords: collection of words dropped from the unigram result. It is not
      applied when ``length`` >= 2.

  Returns:
    For ``length`` == 1 a list of distinct word strings; for ``length`` >= 2 a
    list of distinct word tuples that never span a sentence boundary. Any other
    ``length`` falls through and returns None.
  """
  punctPattern = re.compile('[%s]' % re.escape(string.punctuation))

  if length==1:
    words=word_tokenize(cleantext(rawtext, punctPattern))
    kept=[word for word in words if word not in stopwords]
    return list(set(kept))

  if length>=2:
    ngrams=[]
    for sentence in sent_tokenize(rawtext):
      words=word_tokenize(cleantext(sentence, punctPattern))
      # Zipping the word list against itself shifted by 0..length-1 positions
      # yields the sliding window of consecutive words.
      ngrams.extend(zip(*[words[offset:] for offset in range(length)]))
    return list(set(ngrams))

In [17]:
def getAllTokens(textDict, ngram, stopwords):
  """Concatenate the n-gram lists of every text in ``textDict``.

  Each text contributes the list returned by ``getTokens``; the lists are
  joined without de-duplicating across texts.
  """
  return [token
          for text in textDict.values()
          for token in getTokens(text, ngram, stopwords)]

#### Generate Training data

In [18]:
%%time
# Collect the n-grams (n = 1, 2, 3) of the training policies.
# NOTE: getAllTokens concatenates each site's distinct n-grams, so an n-gram that
# occurs on several sites appears several times in these lists.
trainUnigrams=getAllTokens(trainTexts, 1, engstop)
trainBigrams=getAllTokens(trainTexts, 2, engstop)
trainTrigrams=getAllTokens(trainTexts, 3, engstop)

CPU times: user 8.74 s, sys: 56 ms, total: 8.8 s
Wall time: 8.8 s


In [19]:
## Length of each n-gram list.
## NOTE: n-grams are distinct within one site but repeated across sites, so these
## are not counts of globally unique n-grams; use len(set(...)) for that.
print(len(trainUnigrams))
print(len(trainBigrams))
print(len(trainTrigrams))

51734
161586
199155


In [21]:
## Get annotation of the training documents
trainAnno={key:annotations[key] for key in trainTexts.keys()}

In [22]:
hondaSoup=policySoups["honda.com"]
hondaText=hondaSoup.get_text()
hondaText.split("|||")[52][250:360]

" name, email address, and your friend's email address. This information is collected and used only in a manner"

In [24]:
psSoup=policySoups['playstation.com']

In [25]:
annotations['playstation.com']['First Party Collection/Use']["Collection Mode"].keys()

Index(['endIndexInSegment', 'section', 'selectedText', 'startIndexInSegment',
       'value'],
      dtype='object')

In [79]:
# Inspect the "Personal Information Type" annotations of playstation.com:
# print the first start index, then build a table with a convenient column order.
pitFrame=annotations['playstation.com']['First Party Collection/Use']["Personal Information Type"]

print (pitFrame["startIndexInSegment"][0])

textinfo=pd.DataFrame({
    "selectedText": pitFrame["selectedText"],
    "startIndexInSegment": pitFrame["startIndexInSegment"],
    "endIndexInSegment": pitFrame['endIndexInSegment'],
    "value": pitFrame['value'],
})
textinfo=textinfo[["selectedText", "value", "startIndexInSegment", "endIndexInSegment"]]

344


In [80]:
psSections=psSoup.get_text().split("|||")

In [81]:
psSections[6][41:680]

'ionNetwork   In parts of North America and South America, Sony Network Entertainment America Inc. ("SNEA") operates Sony Online Services, a network of online games, movies, music, other media and content and communication services. PlayStation Network ("PSN") is one of these Sony Online Services. With a Sony Online Services or Sony Entertainment Network account, users can purchase goods and services from SNEA through Sony Online Services and may have the opportunity to participate in various network community activities. Users can register for and log into a Sony Entertainment Network account via us.playstation.com. Collection and '

## Determine Relevance of each sentence in training set

### 1. Get the index of each sentence in the policy texts

In [19]:
## Label the sentences in a policy as relevant or irrelevant
## 
# Topic: "Personal Information Type"

In [26]:
def getSentIdx(raw):
  """Split a policy into sections and sentences and record sentence offsets.

  Sections are separated by the literal "|||". Every section is split into
  sentences with ``sent_tokenize`` and each sentence is located in its section.

  Args:
    raw: full policy text.

  Returns:
    dict mapping section index -> list of
    ``(sentence, section index, start offset, end offset)`` tuples, offsets
    being character positions inside the section text.

  Raises:
    ValueError: if a tokenised sentence cannot be found in its section.
  """
  allIdx={}
  for i, secText in enumerate(raw.split("|||")):
    secIdxLists=[]
    cursor=0
    for sent in sent_tokenize(secText):
      # Search from the end of the previous sentence: searching the whole
      # section would give a repeated sentence the offsets of its first copy.
      start=secText.find(sent, cursor)
      if start==-1:
        # Fall back to a whole-section search.
        start=secText.find(sent)
      if start==-1:
        raise ValueError("sentence not found in section {}: {!r}".format(i, sent))
      end=start+len(sent)
      secIdxLists.append((sent, i, start, end))
      cursor=max(cursor, end)
    allIdx[i]=secIdxLists
  return allIdx

def getAllIdx(textDict):
  """Run ``getSentIdx`` on every policy text.

  Args:
    textDict: dict of website -> policy text.

  Returns:
    dict of website -> result of ``getSentIdx``. A website whose text raises
    IndexError is printed and left out of the result.
  """
  results={}
  for website in textDict:
    try:
      results[website]=getSentIdx(textDict[website])
    except IndexError:
      print(website)
  return results


In [27]:
%%time
# Sentence offsets for every training policy:
# trainSentIdx[website][section] -> list of (sentence, section, start, end).
trainSentIdx=getAllIdx(trainTexts)

CPU times: user 7.18 s, sys: 28 ms, total: 7.21 s
Wall time: 7.21 s


In [28]:
### test to see if the indices are correct
trainAnno["playstation.com"]['First Party Collection/Use']['Personal Information Type'].loc[2]

endIndexInSegment                          912
section                                      8
selectedText           credit card information
startIndexInSegment                        889
value                                Financial
Name: 2, dtype: object

In [29]:
trainSentIdx["playstation.com"][8][3]

('Collection of personal information required to access certain website services may include the collection of date of birth, name, mailing address, email address or credit card information.',
 8,
 709,
 897)

In [30]:
# Position of the annotated phrase inside sentence 3 of section 8 (sentence-relative).
print( re.search("credit card information", trainSentIdx["playstation.com"][8][3][0]).start())
print (re.search("credit card information", trainSentIdx["playstation.com"][8][3][0]).end())

# 709 is the section offset of that sentence; 164 and 187 are the values printed above.
print ("start index in text", 709+164)
print ("end index in text", 709+187)

# 889/912 come from the annotation row, 873/896 from the corpus offsets just computed.
print ("Length of sentence in Annotation, and raw text", 912-889, 896-873)

## for playstation.com the annotation indices are 16 characters ahead of the corpus indices

164
187
start index in text 873
end index in text 896
Length of sentence in Annotation, and raw text 23 23


In [32]:
trainSentIdx["playstation.com"][8]

[('WHAT WE COLLECT:   Collection of Personal Information through our Websites   We do not require that website visitors reveal any personally identifying information in order to gain general access to our websites.',
  8,
  0,
  211),
 ('However, visitors who do not wish to, or are not allowed by law to share personally identifying information, may not be able to access certain areas of our websites, participate in certain activities, or make a purchase from the PlayStationShop.',
  8,
  212,
  457),
 ('Although personally identifying information may be required to participate in certain promotions or features offered through our websites or to make a purchase from the PlayStationShop, participants provide this information on a voluntary basis only.',
  8,
  458,
  708),
 ('Collection of personal information required to access certain website services may include the collection of date of birth, name, mailing address, email address or credit card information.',
  8,
  709,
  897)]

In [31]:
## Another example: locate an annotated span inside the sentence-indexed corpus
exampleSite="honda.com"
annoEg=trainAnno[exampleSite]['First Party Collection/Use']['Personal Information Type'].loc[6]
print (annoEg)
selecTextAnno=annoEg["selectedText"]

print (selecTextAnno)

print ("\n")
# Reuse exampleSite instead of repeating the literal, so editing the site above is enough.
corpusText=trainSentIdx[exampleSite][annoEg['section']]
print ("Text in corpus : ")
display(corpusText)

print ("\n")
sentenceNumber=0
# The annotation text is a literal, not a pattern: escape it so characters such as
# "(" or "." match verbatim, and run the search once instead of twice.
spanMatch=re.search(re.escape(selecTextAnno), corpusText[sentenceNumber][0])
corpusStart=spanMatch.start()
corpusEnd=spanMatch.end()

# Add the sentence's section offset to turn sentence-relative into section-relative indices.
print ("start index in corpus section :", corpusText[sentenceNumber][2]+corpusStart)
print ("end index in corpus section :", corpusText[sentenceNumber][2]+corpusEnd)



endIndexInSegment                                             246
section                                                         5
selectedText           social security number, and account number
startIndexInSegment                                           204
value                                         Personal identifier
Name: 6, dtype: object
social security number, and account number


Text in corpus : 


[('"Personally identifiable information or personal information" refers to any information that identifies or can be used to contact or locate you such as name, mailing address, phone number, email address, social security number, and account number.',
  5,
  0,
  247),
 ('Use of certain features and tools on this Site requires that you provide us with personally identifiable information.',
  5,
  248,
  365),
 ('However, you always have the option not to provide personal information by choosing not to use a particular feature.',
  5,
  366,
  482)]



start index in corpus section : 204
end index in corpus section : 246


In [32]:
##a third example

In [33]:
## A third example: measure how far the annotation index is from the corpus index
exampleSite="uptodate.com"
annoEg=trainAnno[exampleSite]['First Party Collection/Use']['Personal Information Type'].loc[4]
print (annoEg)
selecTextAnno=annoEg["selectedText"]

print (selecTextAnno)

print ("\n")
corpusText=trainSentIdx[exampleSite][annoEg['section']]
# The annotated text sits in the first sentence of the section.
sentenceNumber=0

print ("Text in corpus : ")
display(corpusText)

print ("\n")

# The annotation text is a literal, not a pattern: escape it and search once.
spanMatch=re.search(re.escape(selecTextAnno), corpusText[sentenceNumber][0])
corpusStart=spanMatch.start()
corpusEnd=spanMatch.end()

# Section-relative position = sentence offset in the section + position in the sentence.
sectionStart=corpusText[sentenceNumber][2]+corpusStart
sectionEnd=corpusText[sentenceNumber][2]+corpusEnd
print ("start index in corpus section :", sectionStart)
print ("end index in corpus section :", sectionEnd)

# Compare like with like: both indices are section-relative here. Using the
# sentence-relative corpusStart was only right because the sentence starts at 0.
print ("Annotation indices is ahead of corpus indices by {} characters ".format(annoEg["startIndexInSegment"]-sectionStart))


endIndexInSegment                                              119
section                                                          2
selectedText           nformation that specifically identifies you
startIndexInSegment                                             76
value                                 Generic personal information
Name: 4, dtype: object
nformation that specifically identifies you


Text in corpus : 


[('Subscriber Information   UpToDate never automatically collects any information that specifically identifies you such as your name, address, or e-mail address.',
  2,
  0,
  158),
 ('This information is collected only when you voluntarily provide it as part of the subscription process ("Subscriber Information").',
  2,
  159,
  289),
 ('We will ask you whenever we need Subscriber Information that identifies you or allows us to contact you.',
  2,
  290,
  394)]



start index in corpus section : 68
end index in corpus section : 111
Annotation indices is ahead of corpus indices by 8 characters 


In [34]:
trainAnno[exampleSite]['First Party Collection/Use']['Personal Information Type'].loc[4]["startIndexInSegment"]

76

### Gather the sentences and put them in a convenient structure

In [35]:
%%time
# For each annotated selectedText, find the corpus sentences it touches and attach
# the (category, topic, value) label to each of them.
# NOTE: an annotation that spans several sentences is NOT discarded -- every sentence
# overlapping the annotated span receives the label.
def labelRel(annotations, sentIdx):
  """Attach annotation labels to the sentences they overlap.

  Args:
    annotations: dict website -> category -> topic -> DataFrame with the columns
      ``startIndexInSegment``, ``endIndexInSegment``, ``section`` and ``value``.
    sentIdx: dict website -> section -> list of
      ``(sentence, section, start, end)`` tuples.

  Returns:
    dict website -> section -> list of
    ``[sentence, section, start, end, labels]`` where ``labels`` is a list of
    ``(category, topic, value)`` tuples. ``sentIdx`` itself is not modified.

  Rows with ``startIndexInSegment == -1`` or ``value == "Unspecified"`` are skipped.

  NOTE(review): the original locals were swapped (``anno_start`` held the END
  index), so the working rule has always been an interval-overlap test. That rule
  is kept on purpose: annotation offsets run a few characters ahead of the corpus
  offsets in this notebook's examples, and a strict containment test would miss them.
  """
  sentlabels={}
  for website, siteanno in annotations.items():
    sentlabels[website]={}
    for section, sentList in sentIdx[website].items():
      # Copy each tuple into a list and add an empty label list at position 4.
      sentlabels[website][section]=[list(sentTuple)+[[]] for sentTuple in sentList]

    for category in siteanno:
      for topic in siteanno[category]:
        topicFrame=siteanno[category][topic]
        for idx in topicFrame.index:
          # Look the row up once instead of three times per row.
          entry=topicFrame.loc[idx]
          if entry["startIndexInSegment"]==-1 or entry["value"]=="Unspecified":
            continue
          anno_lo=entry["startIndexInSegment"]
          anno_hi=entry["endIndexInSegment"]
          label=(category, topic, entry["value"])
          for sent in sentlabels[website][entry["section"]]:
            sent_lo=sent[2]
            sent_hi=sent[3]
            # Sentence and annotation intervals overlap (or touch).
            overlaps=sent_lo<=anno_hi and sent_hi>=anno_lo
            # Sentence lies within 20 characters of a short annotation.
            nearby=abs(sent_lo-anno_hi)<20 and abs(sent_hi-anno_lo)<20
            # Only reachable for a malformed row whose start exceeds its end;
            # kept so such rows are treated exactly as before.
            inverted=anno_hi<=sent_lo and anno_lo>=sent_hi
            if overlaps or nearby or inverted:
              sent[4].append(label)

  return sentlabels

# labeledTrainSents[website][section] -> list of [sentence, section, start, end, labels]
labeledTrainSents=labelRel(trainAnno, trainSentIdx)

CPU times: user 53.4 s, sys: 92 ms, total: 53.5 s
Wall time: 53.4 s


In [36]:
print(labeledTrainSents["playstation.com"][2][2])
egAnno=trainAnno["playstation.com"]['Data Security']['Security Measure']
print(egAnno.loc[egAnno["section"]==2, "selectedText"].values)

['As part of the privacy program, we are subject to frequent audits of our sites and other enforcement and accountability mechanisms administered independently by ESRB.', 2, 467, 633, [('Other', 'Other Type', 'Practice not covered'), ('Data Security', 'Security Measure', 'Privacy/Security program')]]
[ 'To protect your privacy to the maximum extent possible, we have undertaken this privacy initiative and our websites have been reviewed and certified by ESRB Privacy Online to meet established online information collection and use practices. As part of the privacy program, we are subject to frequent audits of our sites and other enforcement and accountability mechanisms administered independently by ESRB.']


In [37]:
### Collect all the sentences , along with their topics, for ease of processing later
### Leave out the value for now

def gatherSentsTopics(labeldIdxSet):
  """Flatten the labelled corpus into a topic set and a sentence list.

  Args:
    labeldIdxSet: dict website -> section -> list of
      ``[sentence, section, start, end, labels]`` with ``labels`` a list of
      ``(category, topic, value)`` tuples.

  Returns:
    ``(topics, sentences)``: ``topics`` is the set of every
    ``(category, topic)`` pair seen, including category "Other"; ``sentences``
    is a list of ``[sentence, pairs]`` where ``pairs`` keeps duplicates but
    leaves out category "Other".
  """
  topics=set()
  sentences=[]
  for corpus in labeldIdxSet.values():
    for sentLists in corpus.values():
      for sent in sentLists:
        pairs=[(label[0], label[1]) for label in sent[4]]
        topics.update(pairs)
        sentences.append([sent[0], [pair for pair in pairs if pair[0]!="Other"]])
  return topics, sentences

# alltopics: set of (category, topic) pairs; allSentences: list of [sentence, pairs]
alltopics, allSentences=gatherSentsTopics(labeledTrainSents)

In [38]:
allSentences[2]

['This Privacy Policy explains what information of yours will be collected by Company when you use the Website, any mobile versions of the Website, any applications published by the Company that you download from the Website or from a third-party, and other related services (the "Service"), how the information will be used and how you can control the collection, correction and/or deletion of information.',
 [('First Party Collection/Use', 'Action First-Party'),
  ('First Party Collection/Use', 'Action First-Party'),
  ('First Party Collection/Use', 'Action First-Party'),
  ('First Party Collection/Use', 'Does/Does Not'),
  ('First Party Collection/Use', 'Does/Does Not'),
  ('First Party Collection/Use', 'Does/Does Not')]]

In [108]:
## for each topic, we get a list of related sentences and a list of unrelated sentences

def getTopicRelevanceList(labeledSentences, topiclist):
  """Split the sentences into related and unrelated ones for every topic.

  Args:
    labeledSentences: list of ``[sentence, labels]`` entries, ``labels`` being
      the topics attached to the sentence.
    topiclist: iterable of topics.

  Returns:
    dict topic -> ``{"Related": [...], "Unrelated": [...]}`` where each list
    holds distinct sentences, in no particular order.
  """
  buckets={topic:{"Related":[], "Unrelated":[]} for topic in topiclist}
  for entry in labeledSentences:
    attached=set(entry[1])
    for topic in topiclist:
      side="Related" if topic in attached else "Unrelated"
      buckets[topic][side].append(entry[0])

  # Collapse repeated sentences per topic and side.
  return {
    topic:{
      "Related":list(set(buckets[topic]["Related"])),
      "Unrelated":list(set(buckets[topic]["Unrelated"])),
    }
    for topic in topiclist
  }


# relevantSetences[(category, topic)] -> {"Related": [sentences], "Unrelated": [sentences]}
relevantSetences=getTopicRelevanceList(allSentences, alltopics)

In [110]:
relevantSetences[('User Choice/Control', 'Purpose')]["Related"][2]

'You may deactivate JavaScript via your browser settings or activate it the same way.'

In [104]:
# Input for the -2log(lambda) (log-likelihood ratio) scores:
# gather the O11, O12, O21 and O22 contingency counts per topic and n-gram
# (see the Lin and Hovy paper). n is the length of the n-gram.
from scipy.stats import binom
def getCounts(relevantSents, ngramList, n):
  """Count, per topic and n-gram, the sentences that do / do not contain it.

  Args:
    relevantSents: dict topic -> ``{"Related": [sentences], "Unrelated": [sentences]}``.
    ngramList: n-grams to score. Duplicates are ignored; they used to inflate
      every count by the number of times the n-gram was listed.
    n: n-gram length handed to ``getTokens``.

  Returns:
    dict topic -> n-gram -> Counter with the keys
      O11: related sentences containing the n-gram,
      O21: related sentences without it,
      O12: unrelated sentences containing it,
      O22: unrelated sentences without it.
    Keys whose count is zero are absent (a Counter reads them as 0). A topic
    without any sentence maps to an empty dict.
  """
  # Distinct n-grams in first-seen order, plus a set for fast intersection.
  vocab=list(dict.fromkeys(ngramList))
  vocabSet=set(vocab)

  ngramCounts={}
  sentNgram={topic:{"Related":[], "Unrelated":[]} for topic in relevantSents}
  for topic, sentMap in relevantSents.items():
    print("Setting Default for topic {}".format(str(topic)))
    ngramCounts.setdefault(topic, {})
    for relevance, sentlist in sentMap.items():
      for sent in sentlist:
        # Keep each sentence's n-grams as a set: presence is all that matters here.
        sentNgram[topic][relevance].append(set(getTokens(sent, length=n,  stopwords=engstop)))

  for topic, releNgrams in sentNgram.items():
    print("Calculating Signature Scores for topic {}".format(str(topic)))
    related=releNgrams["Related"]
    unrelated=releNgrams["Unrelated"]
    if not related and not unrelated:
      continue  # nothing to count; the topic keeps its empty dict

    # Count the sentences containing each n-gram once; the "absent" cells follow
    # by subtraction, avoiding a pass over the whole vocabulary per sentence.
    relHits=Counter()
    for tokens in related:
      relHits.update(vocabSet & tokens)
    unrelHits=Counter()
    for tokens in unrelated:
      unrelHits.update(vocabSet & tokens)

    for ngram in vocab:
      # Unary plus drops zero counts, matching what in-place Counter addition leaves behind.
      ngramCounts[topic][ngram]=+Counter({
        "O11":relHits[ngram],
        "O21":len(related)-relHits[ngram],
        "O12":unrelHits[ngram],
        "O22":len(unrelated)-unrelHits[ngram],
      })

  return ngramCounts

In [None]:
%%time
# Get the contingency counts for unigrams; the bigram and trigram runs are disabled.
unigramCounts=getCounts(relevantSetences, trainUnigrams, 1)
#bigramCounts=getCounts(relevantSetences, trainBigrams, 2)
#trigramCounts=getCounts(relevantSetences, trainTrigrams, 3)

In [117]:
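# Input for the -2log(lambda) (log-likelihood ratio) scores, second attempt:
# getDefault tokenises the sentences and builds zeroed count templates,
# getCountsTwo fills in the O11, O12, O21 and O22 counts (see the Lin and Hovy paper).
# n is the length of the n-gram.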
from scipy.stats import binom
def getDefault(relevantSents, ngramList, n):
  """Tokenise the sentences and build zeroed count templates.

  Args:
    relevantSents: dict topic -> ``{"Related": [sentences], "Unrelated": [sentences]}``.
    ngramList: n-grams that get a template entry under every topic.
    n: n-gram length handed to ``getTokens``.

  Returns:
    ``(ngramCounts, sentNgram)``: ``ngramCounts[topic][ngram]`` is a Counter with
    O11, O12, O21 and O22 all set to 0; ``sentNgram[topic][relevance]`` lists the
    ``getTokens`` result of every sentence.
  """
  ngramCounts={}
  sentNgram={topic:{"Related":[], "Unrelated":[]} for topic in list(relevantSents.keys())}

  for topic in relevantSents:
    template=ngramCounts.setdefault(topic, {})
    for relevance, sentlist in relevantSents[topic].items():
      for sent in sentlist:
        sentNgram[topic][relevance].append(getTokens(sent, length=n,  stopwords=engstop))

    for ngram in ngramList:
      # A repeated n-gram keeps the Counter created for its first occurrence.
      if ngram not in template:
        template[ngram]=Counter({"O11":0, "O12":0, "O21":0, "O22":0})

  return ngramCounts, sentNgram

def _tallyTokens(sentences):
  """Return (occurrences, presence) Counters for a list of token lists."""
  occurrences=Counter()
  presence=Counter()
  for tokens in sentences:
    occurrences.update(tokens)    # every occurrence, repeats inside a sentence included
    presence.update(set(tokens))  # one per sentence that contains the token
  return occurrences, presence


def getCountsTwo(ngramCounts, sentNgram,ngramList, n):
  """Add the O11/O12/O21/O22 sentence counts to ``ngramCounts`` in place.

  Args:
    ngramCounts: dict topic -> n-gram -> Counter, e.g. the template built by
      ``getDefault``. It is MODIFIED IN PLACE, so calling this twice on the same
      object (for instance after an interrupted run) adds the counts twice.
    sentNgram: dict topic -> ``{"Related": [token lists], "Unrelated": [token lists]}``.
    ngramList: vocabulary; only these n-grams receive the "absent" counts O21/O22.
    n: unused, kept for call compatibility.

  Returns:
    The same ``ngramCounts`` object. Per n-gram:
      O11 / O12: occurrences in related / unrelated sentences,
      O21 / O22: related / unrelated sentences that do not contain it.
    Counters that were updated no longer carry keys whose count is zero.
  """
  ngramSet=set(ngramList)
  for topic, releNgrams in sentNgram.items():
    print("Calculating Signature Scores for topic {}".format(str(topic)))
    related=releNgrams["Related"]
    unrelated=releNgrams["Unrelated"]

    # Tally each side once, then apply ONE update per n-gram. The earlier version
    # walked the whole vocabulary for every sentence.
    relOccur, relPresence=_tallyTokens(related)
    unrelOccur, unrelPresence=_tallyTokens(unrelated)

    # With at least one sentence, every vocabulary n-gram is updated.
    touched=set(relOccur).union(unrelOccur)
    if related or unrelated:
      touched|=ngramSet

    for ngram in touched:
      inVocab=ngram in ngramSet
      delta=Counter({
        "O11":relOccur[ngram],
        "O21":len(related)-relPresence[ngram] if inVocab else 0,
        "O12":unrelOccur[ngram],
        "O22":len(unrelated)-unrelPresence[ngram] if inVocab else 0,
      })
      cell=ngramCounts[topic].setdefault(ngram, Counter({"O11":0, "O12":0, "O21":0, "O22":0}))
      # In-place Counter addition mutates the stored Counter and drops zero counts.
      cell+=delta

  return ngramCounts

In [116]:
# Zeroed count templates plus the tokenised sentences for the unigram run.
# NOTE(review): getDefault must be defined before this cell runs; the execution
# counters show the cells were run out of order.
unigramCountTemplate, sentTokens=getDefault(relevantSetences, trainUnigrams, 1)

In [118]:
%%time
# NOTE(review): getCountsTwo updates unigramCountTemplate in place, so re-running
# this cell without rebuilding the template adds the counts a second time.
unigramCounts=getCountsTwo(unigramCountTemplate, sentTokens,trainUnigrams, 1)
#bigramCounts=getCounts(relevantSetences, trainBigrams, 2)
#trigramCounts=getCounts(relevantSetences, trainTrigrams, 3)

Calculating Signature Scores for topic ('First Party Collection/Use', 'Choice Type')
Calculating Signature Scores for topic ('Policy Change', 'Notification Type')
Calculating Signature Scores for topic ('User Choice/Control', 'Choice Scope')
Calculating Signature Scores for topic ('Third Party Sharing/Collection', 'User Type')


KeyboardInterrupt: 

In [64]:
unigramCounts[('First Party Collection/Use', 'Choice Type')]

{}

In [2]:
# Scratch check: in-place addition of two Counters sums the counts per key.
from collections import Counter
a=Counter({"a":4})
a+=Counter({"a":2})
print(a)

Counter({'a': 6})


In [129]:
# Scratch check: setdefault leaves an existing key untouched ("a" stays 2).
a=Counter({"a":2, 'b':3})
a.setdefault("a",4)
print(a)

Counter({'b': 3, 'a': 2})


In [None]:
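# Input for the -2log(lambda) (log-likelihood ratio) scores, third attempt:
# gather the O11, O12, O21 and O22 contingency counts, stored as one Counter
# per cell and topic (see the Lin and Hovy paper). n is the length of the n-gram.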
from scipy.stats import binom
def getCountsV3(relevantSents, ngramList, n):
  """Build the O11/O12/O21/O22 sentence counts as one Counter per cell.

  Args:
    relevantSents: dict topic -> ``{"Related": [sentences], "Unrelated": [sentences]}``.
    ngramList: vocabulary; only these n-grams appear in the "absent" cells O21/O22.
    n: n-gram length handed to ``getTokens``.

  Returns:
    dict topic -> ``{"O11": Counter, "O12": Counter, "O21": Counter, "O22": Counter}``:
      O11 / O12: number of related / unrelated sentences containing the n-gram,
      O21 / O22: number of related / unrelated sentences NOT containing it.
    All four Counters exist for every topic, even with an empty ``ngramList``.
    Counters only hold n-grams whose count is positive.
  """
  ngramCounts={}
  ngramSet=set(ngramList)
  sentNgram={topic:{"Related":[], "Unrelated":[]} for topic in relevantSents}

  for topic, sentMap in relevantSents.items():
    for relevance, sentlist in sentMap.items():
      for sent in sentlist:
        sentNgram[topic][relevance].append(getTokens(sent, length=n,  stopwords=engstop))
    # Create the four cells once per topic. They used to be created inside a loop
    # over ngramList, which skipped them entirely for an empty vocabulary.
    ngramCounts[topic]={"O11":Counter(), "O12":Counter(), "O21":Counter(), "O22":Counter()}

  for topic, releNgrams in sentNgram.items():
    cells=ngramCounts[topic]
    for presentKey, absentKey, relevance in (("O11", "O21", "Related"), ("O12", "O22", "Unrelated")):
      sentences=releNgrams[relevance]
      present=cells[presentKey]
      for tokens in sentences:
        present.update(set(tokens))  # one per sentence containing the n-gram
      # Sentences without the n-gram = all sentences - sentences with it. This
      # avoids building a vocabulary-sized Counter for every sentence.
      total=len(sentences)
      cells[absentKey].update({ngram:total-present[ngram]
                               for ngram in ngramSet if total>present[ngram]})

  return ngramCounts


