In [60]:
#Import
import os as os
import fnmatch
import re
import glob
from io import StringIO
import numpy as np
import sys
import networkx as nx
import matplotlib.pyplot as plt
import random
from pathlib import Path

#Import DB Models
from murdochpolicyapp.models import Category,StopWord
from murdochpolicyapp.models import Document
from murdochpolicyapp.models import DocLink
from django.db.models import Q

#PDF Import
from pdfminer.converter import TextConverter
from pdfminer.layout import LAParams
from pdfminer.pdfdocument import PDFDocument
from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
from pdfminer.pdfpage import PDFPage
from pdfminer.pdfparser import PDFParser
from pdfminer.pdftypes import resolve1
from pdfminer.psparser import PSLiteral, PSKeyword
from pdfminer.utils import decode_text

#NLTK
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer

# Import all of the scikit learn stuff
import sklearn
from sklearn.decomposition import TruncatedSVD
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.preprocessing import Normalizer
from sklearn import metrics
from sklearn.cluster import KMeans, MiniBatchKMeans
from sklearn.model_selection import train_test_split
from sklearn.metrics import log_loss
import scipy


# Set Paths

In [97]:
# Let the Django ORM be used from the notebook kernel.
os.environ["DJANGO_ALLOW_ASYNC_UNSAFE"] = "true"

# Project root: the parent of the directory the notebook is started from.
BASE_DIR = os.path.normpath(os.path.join(os.getcwd(), os.pardir))

# Uploaded policy PDFs, one sub-folder per category.
DOC_DIR = os.path.join(BASE_DIR, 'media', 'policy_documents')

# Static assets: the extra stop-word list and the generated network diagrams.
STATIC_DIR = os.path.join(BASE_DIR, 'static')
IMG_DIR = os.path.join(STATIC_DIR, 'images')
STOP_WORD_FILE = os.path.join(STATIC_DIR, 'stop_words.txt')
HOMEPAGE_MAP = os.path.join(IMG_DIR, 'home_diagram.png')
DOCUMENT_MAP = os.path.join(IMG_DIR, 'document_diagram.png')

DOC_DIR

'/Users/pamelapham/Documents/Murdoch/ICT302/Final_year_project/murdoch_policy/media/policy_documents'

In [62]:
#Preprocessing - Convert lower case
def convert_lower_case(data):
    """Lower-case ``data`` element-wise with ``np.char.lower``.

    Returns the NumPy array produced by ``np.char.lower`` (a 0-d array when
    ``data`` is a single string).
    """
    lowered = np.char.lower(data)
    return lowered

In [79]:
#Find files in folder matching pattern case insensitive
def findfiles(which, where='.'):
    """List the entries of ``where`` whose name matches the glob ``which``.

    Matching ignores case. Each result is ``where + '/' + name``; the order is
    whatever ``os.listdir`` returns.
    """
    pattern = re.compile(fnmatch.translate(which), re.IGNORECASE)
    matches = []
    for entry in os.listdir(where):
        if pattern.match(entry):
            matches.append(where + '/' + entry)
    return matches

In [80]:
def initStopWords():
    """Load the policy stop words from ``STOP_WORD_FILE`` into the StopWord table.

    The file is read as comma-separated words, one or more per line. Each word
    is stripped of surrounding whitespace and empty entries (blank lines,
    trailing commas) are skipped, so only usable words are stored.

    NOTE(review): calling this twice stores every word twice; clear the
    StopWord table first when re-running.
    """
    policy_stop_words = []
    with open(STOP_WORD_FILE) as f:
        for line in f:
            for word in line.split(','):
                word = word.strip()
                if word:
                    policy_stop_words.append(word)

    for word in policy_stop_words:
        # create() already saves the row; no extra save() is needed.
        StopWord.objects.create(value=word)

# Document Processing Functions

In [81]:
#Preprocessing - Remove Stop words
def remove_stop_words(data):
    """Drop stop words and short tokens from ``data``.

    ``data`` is converted with ``str()`` and tokenised with ``word_tokenize``.
    A token is kept only if it is longer than 3 characters and is neither an
    NLTK English stop word nor a ``StopWord`` row from the database.

    Returns the kept tokens as one string, each preceded by a single space
    (so a non-empty result starts with a space); returns "" if nothing is kept.
    """
    # A set gives constant-time membership tests for every token.
    excluded = set(stopwords.words('english'))
    excluded.update(word.value for word in StopWord.objects.all())

    # Tokens of length <= 3 are removed as well.
    kept = [w for w in word_tokenize(str(data)) if len(w) > 3 and w not in excluded]
    return "".join(" " + w for w in kept)


In [82]:
#Preprocessing - Remove Punctuation
def remove_punctuation(data):
    """Replace punctuation and newlines in ``data`` with spaces and delete commas.

    Each symbol is replaced by a space, followed by one pass that turns double
    spaces into single ones. Commas are removed without leaving a space.
    Apostrophes are not touched here.

    Returns the NumPy array produced by ``np.char.replace``.
    """
    # The backslash is written as "\\": a bare "\]" is an invalid escape sequence.
    symbols = "!\"#$%&()*+-./:;<=>?@[\\]^_`{|}~\n"
    for symbol in symbols:
        data = np.char.replace(data, symbol, ' ')
        data = np.char.replace(data, "  ", " ")
    data = np.char.replace(data, ',', '')
    return data

In [83]:
#Preprocessing - Remove Apostrophe
def remove_apostrophe(data):
    """Delete every ``'`` from ``data`` and return the resulting NumPy array."""
    without_apostrophes = np.char.replace(data, "'", "")
    return without_apostrophes

In [84]:
#Preprocessing - Stemming
def stemming(data):
    """Porter-stem every token of ``data``.

    ``data`` is converted with ``str()`` and tokenised with ``word_tokenize``.
    Returns the stems as one string, each preceded by a single space.
    """
    porter = PorterStemmer()
    stems = [porter.stem(token) for token in word_tokenize(str(data))]
    return "".join(" " + stem for stem in stems)

In [85]:
#Preprocessing - convert numbers
def convert_numbers(data):
    """Spell out integer tokens of ``data`` as words (best effort).

    ``data`` is converted with ``str()`` and tokenised with ``word_tokenize``.
    Tokens that cannot be converted are kept unchanged. Hyphens in the result
    are replaced by spaces.

    Returns the NumPy array produced by ``np.char.replace``; the text inside
    has every token preceded by a single space.

    NOTE(review): ``num2words`` is not among this notebook's visible imports.
    Unless it is defined elsewhere, every call below raises NameError, which
    is swallowed, so numbers pass through unconverted. Import it
    (``from num2words import num2words``) to enable the conversion.
    """
    converted = []
    for w in word_tokenize(str(data)):
        try:
            w = num2words(int(w))
        except Exception:
            # Not an integer, or the conversion failed: keep the token as is.
            # Unlike a bare except, this lets KeyboardInterrupt/SystemExit through.
            pass
        converted.append(w)
    new_text = "".join(" " + w for w in converted)
    return np.char.replace(new_text, "-", " ")

In [86]:
#Preprocessing Data
def preprocess(data):
    """Run the full text-cleaning pipeline over ``data`` and return the result.

    The steps run strictly in the order listed. Several are repeated on
    purpose: spelling out numbers can introduce hyphens, commas and stop
    words ("one hundred and one"), which the later passes clean up again.
    """
    pipeline = (
        convert_lower_case,
        remove_punctuation,   # commas are removed inside this step
        remove_apostrophe,
        remove_stop_words,
        convert_numbers,
        stemming,
        remove_punctuation,
        convert_numbers,
        stemming,             # stem the words produced by the number conversion
        remove_punctuation,   # hyphens/commas produced by the number conversion
        remove_stop_words,    # stop words produced by the number conversion
    )
    for step in pipeline:
        data = step(data)
    return data

In [87]:
def remove_empty_lines(filename):
    """Rewrite ``filename`` in place without its empty or whitespace-only lines."""
    with open(filename, 'r+') as handle:
        kept = [line for line in handle.readlines() if line.strip()]
        # Rewind, write the surviving lines, then cut off the leftover tail.
        handle.seek(0)
        handle.write("".join(kept))
        handle.truncate()
        

In [88]:
def pdf_to_txt(pdfFileName):
    """Extract the text of a PDF, preprocess it and save it next to the PDF.

    The output file has the same path as ``pdfFileName`` with the extension
    replaced by ``.txt``. It is written only after the PDF has been parsed
    successfully, so a failed parse no longer leaves an empty text file.

    Returns the path of the text file that was written.
    """
    # splitext only looks at the file name, so dots in directory names are safe.
    txtFileName = os.path.splitext(pdfFileName)[0] + '.txt'

    output_string = StringIO()
    with open(pdfFileName, 'rb') as pdf_file:
        parser = PDFParser(pdf_file)
        doc = PDFDocument(parser)
        rsrcmgr = PDFResourceManager()
        device = TextConverter(rsrcmgr, output_string, laparams=LAParams())
        interpreter = PDFPageInterpreter(rsrcmgr, device)
        for page in PDFPage.create_pages(doc):
            interpreter.process_page(page)

    # The with-block closes the file even if preprocessing raises.
    with open(txtFileName, "w") as txt_file:
        txt_file.write(preprocess(output_string.getvalue()))

    remove_empty_lines(txtFileName)
    return txtFileName

# Similarity Functions

In [89]:
def get_similarity(text1, text2):
    """Return the TF-IDF similarity between ``text1`` and ``text2``.

    Both texts are vectorised together (English stop words removed) and the
    result is the dot product of their two TF-IDF rows, i.e. the cosine
    similarity, assuming the vectorizer's default L2 row normalisation.
    """
    tfidf_matrix = TfidfVectorizer(min_df=1,stop_words="english").fit_transform([text1, text2])
    # Entry [0, 1] of the row-by-row product pairs the first text with the second.
    similarities = tfidf_matrix @ tfidf_matrix.T
    return similarities[0, 1]

In [90]:
def get_feature_words(text):
    """Return the TF-IDF vocabulary of ``text`` as a list of strings.

    English stop words are excluded by the vectorizer. Works with both current
    scikit-learn (``get_feature_names_out``) and old releases that only have
    ``get_feature_names``.
    """
    vectorizer = TfidfVectorizer(min_df=1,stop_words="english")
    vectorizer.fit([text])
    if hasattr(vectorizer, "get_feature_names_out"):
        # tolist() keeps the historical return type: a list of plain str.
        return vectorizer.get_feature_names_out().tolist()
    return vectorizer.get_feature_names()


In [57]:
# Compute the feature words of one sample document (primary key 306).
# The value is only set on the in-memory object; nothing here saves it.
doc = Document.objects.get(pk=306)
doc.feature_words = get_feature_words(doc.doc_text)  # reuse doc instead of fetching it again

In [59]:
# Show the feature words attached to `doc` by the cell above.
print(doc.feature_words)

['accept', 'access', 'accord', 'achiev', 'action', 'administr', 'advanc', 'affirm', 'aggress', 'alleg', 'alloc', 'amend', 'anticip', 'appear', 'applic', 'approv', 'assault', 'audienc', 'australia', 'author', 'behaviour', 'belief', 'belittl', 'breach', 'brunt', 'bulli', 'carri', 'case', 'chang', 'circumst', 'code', 'colour', 'comment', 'commiss', 'commit', 'common', 'commonli', 'commun', 'conduct', 'confer', 'confid', 'confidenti', 'conflict', 'consid', 'constantli', 'constitut', 'consult', 'contact', 'copi', 'cover', 'creat', 'critic', 'cultur', 'deadlin', 'dealt', 'deci', 'defin', 'definit', 'degrad', 'delib', 'deliv', 'deni', 'describ', 'design', 'dictionari', 'differ', 'difficult', 'digniti', 'direct', 'director', 'disabl', 'disciplinari', 'discrimin', 'disturb', 'document', 'effect', 'electron', 'elimin', 'email', 'employ', 'engag', 'ensur', 'entitl', 'environ', 'equal', 'equip', 'ethnic', 'event', 'exampl', 'exhaust', 'expect', 'extent', 'extern', 'fair', 'famili', 'favour', 'fell

In [14]:
def draw_network_documents(docs,docLinks,img_path):
    """Draw documents and their similarity links as a network and save the image.

    Args:
        docs: iterable of Document objects; each becomes a node named by
            ``doc_name``, coloured by ``category.id`` and sized by
            ``node_size / 10``.
        docLinks: iterable of DocLink objects; each becomes an edge between
            the documents whose ids are ``source`` and ``target``, weighted
            by ``value``.
        img_path: file path the figure is saved to.

    The figure is saved, shown and then closed. Node positions come from an
    unseeded spring layout, so the picture differs between runs.
    """
    graph = nx.Graph()

    # Remember each document's name by id so edges can be resolved without
    # querying the database for documents that were passed in.
    names_by_id = {}
    for doc in docs:
        graph.add_node(doc.doc_name, group=doc.category.id, nodesize=doc.node_size)
        names_by_id[doc.id] = doc.doc_name

    def _doc_name(doc_id):
        # Fall back to the database only for documents missing from `docs`.
        if doc_id not in names_by_id:
            names_by_id[doc_id] = Document.objects.get(pk=doc_id).doc_name
        return names_by_id[doc_id]

    for link in docLinks:
        graph.add_edge(_doc_name(link.source), _doc_name(link.target), weight=link.value)

    # One colour per category: fixed palette first, random colours after that.
    # NOTE(review): '#eebcbc' appears twice, so two categories share a colour.
    colors = ['#f09494', '#eebcbc', '#72bbd0', '#91f0a1', '#629fff','#bcc2f2','#eebcbc',
              '#f1f0c0', '#d2ffe7', '#caf3a6', '#ffdf55', '#ef77aa', '#d6dcff', '#d2f5f0']
    color_map = {}
    for index, category in enumerate(Category.objects.all()):
        if index < len(colors):
            color_map[category.id] = colors[index]
        else:
            color_map[category.id] = "#"+''.join([random.choice('0123456789ABCDEF') for j in range(6)])

    fig = plt.figure(figsize=(20,20))
    options = {
        'edge_color': '#FFDEA2',
        'width': 1,
        'with_labels': True,
        'font_weight': 'regular',
    }

    nodeColors = [color_map[graph.nodes[node]['group']] for node in graph]
    sizes = [graph.nodes[node]['nodesize']/10 for node in graph]

    nx.draw(graph, node_color=nodeColors, node_size=sizes,
            pos=nx.spring_layout(graph, k=0.45, iterations=30), **options)

    # The first collection on the axes holds the nodes; give them a dark outline.
    ax = plt.gca()
    if ax.collections:
        ax.collections[0].set_edgecolor("#555555")

    plt.savefig(img_path)
    plt.show()
    # Release the figure so repeated calls do not accumulate open figures.
    plt.close(fig)

In [15]:
def draw_doc_relationship(doc):
    """Draw ``doc`` together with every document linked to it.

    Collects all DocLinks that have ``doc`` as source or target, loads the
    documents at the other end of those links, and draws the resulting
    network to ``DOCUMENT_MAP``.
    """
    # One query for all links touching this document.
    docLinks = list(DocLink.objects.filter(Q(source__exact=doc.id) | Q(target__exact=doc.id)))

    neighbour_ids = set()
    for link in docLinks:
        neighbour_ids.update((link.source, link.target))
    neighbour_ids.discard(doc.id)

    # One query for all neighbours instead of one per link.
    docs = [doc]
    docs.extend(Document.objects.filter(pk__in=neighbour_ids))

    draw_network_documents(docs,docLinks,DOCUMENT_MAP)
    

# Retrieve Folders and Documents

In [98]:
# Every directory under DOC_DIR; the first entry is DOC_DIR itself.
folders = [dirpath for dirpath, _subdirs, _files in os.walk(DOC_DIR)]
folders

['/Users/pamelapham/Documents/Murdoch/ICT302/Final_year_project/murdoch_policy/media/policy_documents',
 '/Users/pamelapham/Documents/Murdoch/ICT302/Final_year_project/murdoch_policy/media/policy_documents/Research',
 '/Users/pamelapham/Documents/Murdoch/ICT302/Final_year_project/murdoch_policy/media/policy_documents/Health Safety & Environment',
 '/Users/pamelapham/Documents/Murdoch/ICT302/Final_year_project/murdoch_policy/media/policy_documents/Physical Facilities',
 '/Users/pamelapham/Documents/Murdoch/ICT302/Final_year_project/murdoch_policy/media/policy_documents/People Culture',
 '/Users/pamelapham/Documents/Murdoch/ICT302/Final_year_project/murdoch_policy/media/policy_documents/Learning Teaching',
 '/Users/pamelapham/Documents/Murdoch/ICT302/Final_year_project/murdoch_policy/media/policy_documents/Information Technology',
 '/Users/pamelapham/Documents/Murdoch/ICT302/Final_year_project/murdoch_policy/media/policy_documents/Community Development',
 '/Users/pamelapham/Documents/Mur

In [99]:
# List the PDFs of every category folder (folders[0], the root, is skipped).
for folder in folders[1:]:
    pdf_paths = glob.glob(folder + "/*.pdf")
    for d in pdf_paths:
        print(d)

/Users/pamelapham/Documents/Murdoch/ICT302/Final_year_project/murdoch_policy/media/policy_documents/Research/research_misconduct_policy_v1.pdf
/Users/pamelapham/Documents/Murdoch/ICT302/Final_year_project/murdoch_policy/media/policy_documents/Research/graduate_research_degrees_joint_doctoral_degree_research_policy_v3.pdf
/Users/pamelapham/Documents/Murdoch/ICT302/Final_year_project/murdoch_policy/media/policy_documents/Research/human_research_ethics_policy_v4.pdf
/Users/pamelapham/Documents/Murdoch/ICT302/Final_year_project/murdoch_policy/media/policy_documents/Research/animal_ethics_policy_v2.pdf
/Users/pamelapham/Documents/Murdoch/ICT302/Final_year_project/murdoch_policy/media/policy_documents/Research/responsible_conduct_of_research_policy_v2.pdf
/Users/pamelapham/Documents/Murdoch/ICT302/Final_year_project/murdoch_policy/media/policy_documents/Health Safety & Environment/environmental_sustainability_policy.pdf
/Users/pamelapham/Documents/Murdoch/ICT302/Final_year_project/murdoch_po

In [103]:
# Check how a PDF path splits into parent directory and document name.
for folder in folders[1:]:
    for txtDoc in findfiles("*.pdf", folder):
        slash_at = txtDoc.rindex('/')
        docname = txtDoc[slash_at + 1:txtDoc.rindex('.')]  # computed but not printed
        path = txtDoc[:slash_at]
        print(path)

//
//
//
//
//
//
//
//
//
//
//
//
//
//
//
//
//
//
//
//
//
//
//
//
//
//
//
//
//
//
//
//
//
//
//
//
//
//
//
//
//
//
//
//
//
//
//
//
//
//
//
//
//
//
//
//
//
//
//
//
//
//
//
//
//
//
//
//
//
//
//
//
//
//
//
//
//
//
//
//
//
//
//
//
//
//
//
//
//
//
//
//
//
//
//
//
//
//
//
//
//
//
//
//
//
//
//
//
//
//
//
//
//
//
//
//
//
//
//
//
//
//
//
//
//
//
//
//
//
//
//
//
//
//
//
//
//
//


In [40]:
#Find the PDF docs and process to txt
# Only the first category folder (folders[1]) is converted here.
txtDocs = []
first_category_dir = folders[1]
for d in findfiles("*.pdf", first_category_dir):
    doc = pdf_to_txt(d)
    txtDocs += [doc]

<_io.TextIOWrapper name='/Users/pamelapham/Documents/Murdoch/ICT302/Final_year_project/murdoch_policy/policy_documents/Research/Graduate Research Degrees Joint Doctoral Degree Research Policy V3.txt' mode='w+' encoding='UTF-8'>
<_io.TextIOWrapper name='/Users/pamelapham/Documents/Murdoch/ICT302/Final_year_project/murdoch_policy/policy_documents/Research/Human Research Ethics Policy V4.txt' mode='w+' encoding='UTF-8'>
<_io.TextIOWrapper name='/Users/pamelapham/Documents/Murdoch/ICT302/Final_year_project/murdoch_policy/policy_documents/Research/Animal Ethics Policy V2.txt' mode='w+' encoding='UTF-8'>
<_io.TextIOWrapper name='/Users/pamelapham/Documents/Murdoch/ICT302/Final_year_project/murdoch_policy/policy_documents/Research/Responsible Conduct Of Research Policy V2.txt' mode='w+' encoding='UTF-8'>
<_io.TextIOWrapper name='/Users/pamelapham/Documents/Murdoch/ICT302/Final_year_project/murdoch_policy/policy_documents/Research/Research Misconduct Policy V1.txt' mode='w+' encoding='UTF-8'>


# Cleanup

In [None]:
# Wipe all previously imported rows so the cells below can rebuild them.
# Links are deleted first, then documents, then the categories the documents
# point to, and finally the stop words.
# NOTE(review): no visible cell calls initStopWords() afterwards, so the
# StopWord table stays empty unless it is reloaded manually.
DocLink.objects.all().delete()
Document.objects.all().delete()
Category.objects.all().delete()
StopWord.objects.all().delete()

# Create Categories

In [None]:
# Create one Category per sub-folder, named after the folder
# (folders[0], the root, is skipped).
for folder in folders[1:]:
    catname = os.path.basename(folder)
    # get_or_create saves the row itself and makes re-running this cell safe:
    # an existing category is reused instead of being duplicated.
    cat, _created = Category.objects.get_or_create(category_name=catname)

# Create Documents

In [None]:
# Create one Document per preprocessed .txt file, in the category named after
# the file's folder (folders[0], the root, is skipped).
for folder in folders[1:]:
    catname = os.path.basename(folder)
    cat = Category.objects.get(category_name__exact=catname)
    for txtDoc in glob.glob(folder + "/*.txt"):
        docname = os.path.splitext(os.path.basename(txtDoc))[0]
        # Only the first line of the file is used as the document text.
        with open(txtDoc, 'r') as f:
            text = f.readline().rstrip()
        nodeSize = len(text)
        # feature_words must be passed as a keyword argument; a bare positional
        # name after keyword arguments is a SyntaxError.
        # NOTE(review): get_feature_words returns a list; confirm the
        # feature_words model field accepts that.
        doc = Document.objects.create(
            doc_name=docname,
            category=cat,
            node_size=nodeSize,
            doc_text=text,
            feature_words=get_feature_words(text),
        )

# Create DocLinks

In [None]:
# Link every pair of documents whose TF-IDF similarity exceeds `alpha`.
docs = Document.objects.all()
alpha = 0.3  # minimum similarity for two documents to be linked

# list(docs) evaluates the queryset once; later cells can still use `docs`.
doc_list = list(docs)
for i, first in enumerate(doc_list):
    # Only pair each document with the ones after it, so every unordered
    # pair is compared exactly once.
    for second in doc_list[i + 1:]:
        similarity = get_similarity(first.doc_text, second.doc_text)
        if similarity > alpha:
            # create() already saves the row; no extra save() is needed.
            DocLink.objects.create(source=first.id, target=second.id, value=similarity)

# Redraw the Documents Network Graph

In [None]:
# Fetch all stored DocLink rows for drawing.
docLinks = DocLink.objects.all()


In [None]:
# Draw the network of `docs` and `docLinks` and save it to HOMEPAGE_MAP.
draw_network_documents(docs,docLinks,HOMEPAGE_MAP)

In [None]:
# Draw one document with its linked documents (saved to DOCUMENT_MAP).
# NOTE(review): index 8 looks like an arbitrary sample; it raises IndexError
# when fewer than nine documents exist.
draw_doc_relationship(docs[8])