In [110]:
# Example: analysing a policy corpus with LDA (Latent Dirichlet Allocation)

In [111]:
import nltk
from nltk.corpus import stopwords 
from nltk.stem.wordnet import WordNetLemmatizer
import string
import gensim
from gensim import corpora
from gensim.models import ldamodel
from gensim.models import phrases
import pyLDAvis.gensim
import logging
import json
from sklearn.feature_extraction.text import TfidfVectorizer
import pandas as pd
import re
import io

In [112]:
# Show INFO-level log records as "timestamp : level : message" (e.g. model loading progress).
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s : %(levelname)s : %(message)s',
)

In [113]:
def parsedPolicySegments(policyFile):
    """Load policy segments from a JSON file and return them as a list.

    Args:
        policyFile: Path of a JSON file whose top-level value is an object
            (dictionary). The keys are ignored; only the values are kept.

    Returns:
        A list containing every value of the top-level JSON object.
    """
    # Read as UTF-8 explicitly (same as readTxtFile) instead of relying on
    # the platform default encoding.
    with io.open(policyFile, 'r', encoding='utf-8') as f:
        policySegments = json.load(f)

    # dict.values() exists on both Python 2 and 3, unlike iteritems().
    return list(policySegments.values())

In [114]:
def readTxtFile(fileName):
    """Read a UTF-8 text file and return its lines as a list.

    Args:
        fileName: Path of the text file to read.

    Returns:
        A list with one entry per line of the file, in file order. Line
        endings are kept, exactly as yielded by iterating over the file.
    """
    # Indentation is uniform here; the previous mix of tabs and spaces is
    # rejected by Python 3 with a TabError.
    with io.open(fileName, 'r', encoding="utf-8") as f:
        return list(f)

In [115]:
def findTermFrequence(policyDocuments):
    """Build a TF-IDF matrix for a collection of documents.

    Note that, despite the function name, the values are TF-IDF weights
    produced by TfidfVectorizer with its default settings, not raw term
    counts.

    Args:
        policyDocuments: Iterable of documents, each a single string.

    Returns:
        A pandas DataFrame with one row per document and one column per
        vocabulary term.
    """
    vectorizer = TfidfVectorizer()
    vectors = vectorizer.fit_transform(policyDocuments)

    # get_feature_names() was removed in scikit-learn 1.2; prefer its
    # replacement when available and fall back for older installations.
    if hasattr(vectorizer, 'get_feature_names_out'):
        feature_names = list(vectorizer.get_feature_names_out())
    else:
        feature_names = vectorizer.get_feature_names()

    # Densify once; the frame is built directly from the resulting array.
    return pd.DataFrame(vectors.toarray(), columns=feature_names)

In [116]:
def cleanDocsAsSentences(posts):
    """Clean each document and return it as one space-joined string.

    Every document goes through these steps, in order:
      1. lower-case, split on whitespace and drop English stop words;
      2. strip all punctuation characters;
      3. drop tokens that are purely digits or shorter than 3 characters;
      4. lemmatize each remaining token with WordNet;
      5. POS-tag the lemmas and keep only those tagged 'NN' or 'VB'
         (other noun and verb tags such as 'NNS' or 'VBD' are dropped).

    Args:
        posts: Iterable of raw document strings.

    Returns:
        A list with one cleaned string per input document, in input order.
    """
    stop = set(stopwords.words('english'))
    exclude = set(string.punctuation)
    lemma = WordNetLemmatizer()
    clean_docs = []

    for post in posts:
        # Stop words are removed before punctuation, so a stop word that is
        # still attached to punctuation (e.g. "the,") is not filtered here.
        stop_free = " ".join(word for word in post.lower().split() if word not in stop)
        punc_free = ''.join(ch for ch in stop_free if ch not in exclude)
        tokens = [word for word in punc_free.split() if not word.isdigit() and len(word) > 2]
        lemmas = [lemma.lemmatize(word) for word in tokens]
        kept = [word for word, tag in nltk.pos_tag(lemmas) if tag in ('NN', 'VB')]
        clean_docs.append(" ".join(kept))

    return clean_docs

In [117]:
def cleanDocs(posts):
    """Clean each document and return it as a list of tokens.

    Every document goes through these steps, in order:
      1. remove URLs (http(s)://... and www....);
      2. lower-case, split on whitespace and drop English stop words;
      3. strip all punctuation characters;
      4. drop tokens that are purely digits or shorter than 3 characters;
      5. lemmatize each remaining token with WordNet;
      6. POS-tag the lemmas and keep only those tagged 'NN' or 'VB'
         (other noun and verb tags such as 'NNS' or 'VBD' are dropped).

    Args:
        posts: Iterable of raw document strings.

    Returns:
        A list with one token list per input document, in input order.
    """
    stop = set(stopwords.words('english'))
    exclude = set(string.punctuation)
    lemma = WordNetLemmatizer()
    # Compiled once, outside the loop, since the pattern never changes.
    url_pattern = re.compile(
        r'(https?:\/\/(?:www\.|(?!www))[a-zA-Z0-9][a-zA-Z0-9-]+[a-zA-Z0-9]\.[^\s]{2,}|www\.[a-zA-Z0-9][a-zA-Z0-9-]+[a-zA-Z0-9]\.[^\s]{2,}|https?:\/\/(?:www\.|(?!www))[a-zA-Z0-9]+\.[^\s]{2,}|www\.[a-zA-Z0-9]+\.[^\s]{2,})',
        flags=re.MULTILINE,
    )
    clean_docs = []

    # Indentation is uniform here; the previous mix of tabs and spaces in
    # the loop body is rejected by Python 3 with a TabError.
    for post in posts:
        url_free = url_pattern.sub('', post)
        stop_free = " ".join(word for word in url_free.lower().split() if word not in stop)
        punc_free = ''.join(ch for ch in stop_free if ch not in exclude)
        tokens = [word for word in punc_free.split() if not word.isdigit() and len(word) > 2]
        lemmas = [lemma.lemmatize(word) for word in tokens]
        kept = [word for word, tag in nltk.pos_tag(lemmas) if tag in ('NN', 'VB')]
        clean_docs.append(kept)

    # Optional bigram detection, currently disabled:
    # bigram_transformer = phrases.Phrases(clean_docs)
    # bigram_docs = [doc for doc in bigram_transformer[clean_docs]]

    return clean_docs

In [118]:
def buildLADModel(clean_docs, model_name, num_topics=10, passes=50, random_state=None):
    """Train an LDA topic model and save it together with its inputs.

    Three files are written, all prefixed with ``model_name``:
      * ``<model_name>.dict``  - the gensim term dictionary,
      * ``<model_name>.mm``    - the bag-of-words corpus (Matrix Market),
      * ``<model_name>.model`` - the trained LdaModel.

    Args:
        clean_docs: List of tokenised documents (each a list of tokens).
            It is iterated twice, so it must not be a one-shot generator.
        model_name: Path prefix used for the three output files.
        num_topics: Number of topics to fit (default 10).
        passes: Number of training passes over the corpus (default 50).
        random_state: Optional seed forwarded to LdaModel so that runs can
            be reproduced; None keeps gensim's default behaviour.
    """
    # Create the term dictionary of the corpus, where every unique term is
    # assigned an index.
    dictionary = corpora.Dictionary(clean_docs)
    dictionary.save(model_name + '.dict')

    # Convert every tokenised document into its bag-of-words vector.
    doc_term_matrix = [dictionary.doc2bow(doc) for doc in clean_docs]
    corpora.MmCorpus.serialize(model_name + '.mm', doc_term_matrix)

    lda_model = ldamodel.LdaModel(
        doc_term_matrix,
        num_topics=num_topics,
        id2word=dictionary,
        passes=passes,
        random_state=random_state,
    )
    lda_model.save(model_name + '.model')

    # A single formatted string prints identically on Python 2 and 3; the
    # two-argument form printed a tuple on Python 2.
    # NOTE(review): gensim documents log_perplexity() as returning the
    # per-word likelihood bound rather than the perplexity itself - confirm
    # whether the label should change.
    print('\nPerplexity: %s' % lda_model.log_perplexity(doc_term_matrix))

    #print(lda_model.print_topics(num_topics=3, num_words=3))

In [119]:
def loadModel(modelname):
    """Load a saved LDA model and write its topics to a text report.

    The report is written to ``modelname + '.txt'``. For every topic it
    contains a "Topic: <id>" header line followed by one line per term,
    holding the term and its weight separated by a tab, for the 50
    highest-weighted terms of that topic.

    Args:
        modelname: Path of a model file previously saved with
            ``LdaModel.save``.
    """
    lda_model = ldamodel.LdaModel.load(modelname)

    # Text mode with an explicit encoding, wrapped in a context manager so
    # the handle is always closed. The previous version opened the file in
    # binary mode, never closed it and passed show_topic()'s tuples straight
    # to write(), which raises TypeError.
    with io.open(modelname + '.txt', 'w', encoding='utf-8') as report:
        for topic in range(lda_model.num_topics):
            report.write(u"Topic: {}\n".format(topic))
            # show_topic() yields (term, weight) pairs.
            for word, weight in lda_model.show_topic(topic, topn=50):
                report.write(u"{}\t{}\n".format(word, weight))

In [120]:
def visulaizeModel(corpusfile, dictionaryfile, modelfile, visfile):
    """Render a saved gensim LDA model as an interactive pyLDAvis HTML page.

    Args:
        corpusfile: Path of the Matrix Market corpus file.
        dictionaryfile: Path of the saved gensim Dictionary.
        modelfile: Path of the saved LdaModel.
        visfile: Path of the HTML file to write.
    """
    # Indentation is uniform here; the previous mix of tabs and
    # "spaces + tab" is rejected by Python 3 with a TabError.

    # Load the artifacts produced by the model-building step.
    corpus = corpora.MmCorpus(corpusfile)
    dictionary = corpora.Dictionary.load(dictionaryfile)  # needed by pyLDAvis
    lda_model = ldamodel.LdaModel.load(modelfile)

    # Build the interactive visualisation and save it as HTML.
    # NOTE(review): newer pyLDAvis releases appear to expose this module as
    # pyLDAvis.gensim_models - confirm against the installed version.
    vis = pyLDAvis.gensim.prepare(lda_model, corpus, dictionary)
    pyLDAvis.save_html(vis, visfile)

In [121]:
# Input selection.
# NOTE(review): despite its name, dataFolderPath holds the path of a single input
# file, and it is an absolute path on the author's machine - adjust before running.
# Alternative input (JSON segment dictionary, to be read with parsedPolicySegments):
# dataFolderPath = "/home/lahiru/Research/policy_analysis/data/usableprivacy/OptOutChoice-2017_v1.0/SegmentDict.json"
dataFolderPath = "/home/lahiru/Research/policy_analysis/data/gdpr_doc/CELEX_32016R0679_EN_TXT.txt"
# Loading is currently disabled; enable the reader that matches the chosen file:
# policySegments = parsedPolicySegments(dataFolderPath)
# policySegments = readTxtFile(dataFolderPath)

In [122]:
# clean_docs = cleanDocs(policySegments)
#clean_docs_sentences = cleanDocsAsSentences(policySegments)

In [123]:
#term_frequence = findTermFrequence(clean_docs_sentences)
#term_frequence.to_csv('term_frequence_sentence.csv', sep=',')

In [124]:
# Base name shared by the saved artifacts: buildLADModel writes
# <modelname>.dict, <modelname>.mm and <modelname>.model.
modelname = 'topic_10_gdpr_only_nouns_n_verb_run_1'
# Training is disabled here; uncomment to (re)build the model files from clean_docs.
# buildLADModel(clean_docs, modelname)

In [125]:
# Requires '<modelname>.model' to exist in the working directory (it is written by
# buildLADModel). With the training call commented out and no saved model present,
# this raises IOError, as the recorded output below shows.
loadModel(modelname + '.model')

2019-11-11 20:23:38,194 : INFO : loading LdaModel object from topic_10_gdpr_only_nouns_n_verb_run_1.model


IOError: [Errno 2] No such file or directory: 'topic_10_gdpr_only_nouns_n_verb_run_1.model'

In [None]:
# visulaizeModel(modelname + '.mm', modelname + '.dict', modelname + '.model', modelname + '.html')