In [13]:
#Import
import os
import fnmatch
import re
import glob
from io import StringIO
import numpy as np
import sys
import networkx as nx
import matplotlib.pyplot as plt
import random
from django.core.files import File

#Import DB Models
from murdochpolicyapp.models import Category,DocLink,Document,StopWord
from django.db.models import Q

#PDF Import
from pdfminer.converter import TextConverter
from pdfminer.layout import LAParams
from pdfminer.pdfdocument import PDFDocument
from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
from pdfminer.pdfpage import PDFPage
from pdfminer.pdfparser import PDFParser
from pdfminer.pdftypes import resolve1
from pdfminer.psparser import PSLiteral, PSKeyword
from pdfminer.utils import decode_text

#NLTK
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer

# Import all of the scikit learn stuff
import sklearn
from sklearn.decomposition import TruncatedSVD
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.preprocessing import Normalizer
from sklearn import metrics
from sklearn.cluster import KMeans, MiniBatchKMeans
from sklearn.model_selection import train_test_split
from sklearn.metrics import log_loss
import scipy

from murdoch_policy import settings

# Django setting that permits synchronous ORM calls from a running event loop.
# NOTE(review): presumably needed because this code is run from a Jupyter kernel - confirm.
os.environ["DJANGO_ALLOW_ASYNC_UNSAFE"] = "true"
# Select the non-interactive Agg backend so figures are only rendered to files.
plt.switch_backend('Agg')
def refreshStopWords():
    """Rebuild the StopWord table from the comma-separated stop word file.

    All existing ``StopWord`` rows are deleted, then one row is created for
    every distinct, non-empty entry read from ``settings.STOP_WORD_FILE``.
    Each line of that file is split on commas.
    """
    # Start from an empty table so words removed from the file disappear too.
    StopWord.objects.all().delete()

    with open(settings.STOP_WORD_FILE) as f:
        for line in f:
            for raw in line.rstrip('\n').split(','):
                # Strip surrounding whitespace ("a, b" must yield "b", not " b")
                # because tokens compared against stop words never contain spaces.
                word = raw.strip()
                # Blank lines and trailing commas produce empty entries; skip them.
                if not word:
                    continue
                # get_or_create persists a new row itself, so no extra save() is needed.
                StopWord.objects.get_or_create(value=word)

#Find files in folder matching pattern case insensitive
def findfiles(which, where='.'):
    """Return the entries of directory ``where`` whose name matches ``which``.

    ``which`` is a shell-style wildcard pattern (as understood by ``fnmatch``)
    and is matched case-insensitively. Each result is ``where + '/' + name``.
    The directory is not searched recursively.
    """
    matcher = re.compile(fnmatch.translate(which), re.IGNORECASE)
    found = []
    for entry in os.listdir(where):
        if matcher.match(entry):
            found.append(where + '/' + entry)
    return found

#Preprocessing - Convert lower case
def convert_lower_case(data):
    """Return ``data`` lower-cased element-wise using ``np.char.lower``.

    The result is a NumPy array (zero-dimensional when ``data`` is a plain string).
    """
    lowered = np.char.lower(data)
    return lowered

#Preprocessing - Remove Stop words
def remove_stop_words(data):
    """Drop stop words and short tokens from ``data``.

    ``data`` is converted with ``str()`` and tokenised with NLTK's
    ``word_tokenize``. A token is kept only when it is longer than three
    characters and appears neither in NLTK's English stop word list nor in
    the ``StopWord`` table.

    Returns the kept tokens as one string in which every token is preceded by
    a single space (so a non-empty result starts with a space).
    """
    # One set gives O(1) membership tests instead of scanning two lists per token.
    excluded = set(stopwords.words('english'))
    excluded.update(word.value for word in StopWord.objects.all())

    # len(w) > 3 means tokens of three characters or fewer are removed as well.
    kept = [w for w in word_tokenize(str(data)) if len(w) > 3 and w not in excluded]
    return "".join(" " + w for w in kept)

#Preprocessing - Remove Punctuation
def remove_punctuation(data):
    """Replace punctuation in ``data`` with spaces and delete commas.

    Every character of the symbol list below (including the backslash and the
    newline) is replaced by a space. After each symbol, one pass collapses
    double spaces into single ones. Commas are removed outright rather than
    replaced, and apostrophes are left untouched.

    Returns a NumPy string array (zero-dimensional for a plain string input).
    """
    # The backslash is written as "\\": a bare "\]" is an invalid escape
    # sequence that newer Python versions warn about. The runtime value is the same.
    symbols = "!\"#$%&()*+-./:;<=>?@[\\]^_`{|}~\n"
    for symbol in symbols:
        data = np.char.replace(data, symbol, ' ')
        data = np.char.replace(data, "  ", " ")
    data = np.char.replace(data, ',', '')
    return data

#Preprocessing - Remove Apostrophe
def remove_apostrophe(data):
    """Return ``data`` with every ``'`` character deleted (via ``np.char.replace``)."""
    apostrophe, nothing = "'", ""
    return np.char.replace(data, apostrophe, nothing)


#Preprocessing - Stemming
def stemming(data):
    """Return ``data`` with each token reduced to its Porter stem.

    ``data`` is converted with ``str()`` and tokenised with ``word_tokenize``.
    The stems are concatenated with a single space in front of each one.
    """
    porter = PorterStemmer()
    stems = (porter.stem(token) for token in word_tokenize(str(data)))
    return "".join(" " + stem for stem in stems)

#Preprocessing - convert numbers
def convert_numbers(data):
    """Spell out integer tokens of ``data`` as English words.

    ``data`` is converted with ``str()`` and tokenised with ``word_tokenize``.
    Tokens that parse as integers are replaced by ``num2words(int(token))``;
    all other tokens are kept unchanged. Hyphens in the result (e.g. from
    "forty-one") are replaced by spaces.

    Returns a NumPy string array in which every token is preceded by a space.
    """
    # num2words is not imported at file level, so the name used to be undefined
    # and the resulting NameError was silently swallowed. Resolve it here; when
    # the package is not installed, numbers are left as they are.
    try:
        from num2words import num2words
    except ImportError:
        num2words = None

    pieces = []
    for token in word_tokenize(str(data)):
        if num2words is not None:
            try:
                token = num2words(int(token))
            except (ValueError, OverflowError):
                # Not an integer (or out of range): keep the token unchanged.
                pass
        pieces.append(" " + token)
    return np.char.replace("".join(pieces), "-", " ")

#Preprocessing Data
def preprocess(data):
    """Run the text-normalisation pipeline over ``data`` and return the result.

    The steps are applied strictly in the order listed; several of them are
    repeated on purpose to clean up what the number conversion produces.
    """
    pipeline = (
        convert_lower_case,
        remove_punctuation,  # commas are removed separately inside this step
        remove_apostrophe,
        remove_stop_words,
        convert_numbers,
        stemming,
        remove_punctuation,
        convert_numbers,
        stemming,            # again, so that the newly produced words are stemmed
        remove_punctuation,  # again, num2words output may hold hyphens/commas (forty-one)
        remove_stop_words,   # again, num2words output may hold stop words (101 -> one hundred and one)
    )
    for step in pipeline:
        data = step(data)
    return data

def remove_empty_lines(filename):
    """Rewrite ``filename`` in place without its blank lines.

    A line is dropped when it is empty or holds only whitespace; all other
    lines are written back unchanged and in their original order.
    """
    with open(filename, 'r+') as handle:
        kept = [row for row in handle.readlines() if row.strip()]
        # Overwrite from the start, then cut off whatever remains of the old content.
        handle.seek(0)
        handle.writelines(kept)
        handle.truncate()

def pdf_to_txt(pdfFileName):
    """Extract the text of a PDF, preprocess it and store it next to the PDF.

    The text of every page is extracted with pdfminer, passed through
    ``preprocess`` and written to a file with the same path as ``pdfFileName``
    but a ``.txt`` extension. Blank lines are then removed from that file.

    Returns the path of the text file.
    Raises ``ValueError`` when ``pdfFileName`` contains no ``.``.
    """
    txtFileName = pdfFileName[0:pdfFileName.rindex('.')] + '.txt'

    output_string = StringIO()
    with open(pdfFileName, 'rb') as pdf_file:
        parser = PDFParser(pdf_file)
        document = PDFDocument(parser)
        rsrcmgr = PDFResourceManager()
        device = TextConverter(rsrcmgr, output_string, laparams=LAParams())
        interpreter = PDFPageInterpreter(rsrcmgr, device)
        for page in PDFPage.create_pages(document):
            interpreter.process_page(page)

    # The output file is opened only after extraction and preprocessing have
    # succeeded, so a failing PDF neither truncates an existing text file nor
    # leaves an open handle behind.
    text = preprocess(output_string.getvalue())
    with open(txtFileName, "w") as txt_file:
        txt_file.write(text)

    remove_empty_lines(txtFileName)
    return txtFileName

def get_similarity(text1, text2):
    """Return the TF-IDF similarity score between ``text1`` and ``text2``.

    A ``TfidfVectorizer`` (English stop words removed) is fitted on just these
    two texts. The score is entry (0, 1) of the product of the TF-IDF matrix
    with its transpose.
    """
    documents = [text1, text2]
    model = TfidfVectorizer(min_df=1, stop_words="english")
    weights = model.fit_transform(documents)
    # Row i of the product holds the similarity of document i to every document.
    similarity_matrix = weights * weights.T
    return similarity_matrix[0, 1]

def get_feature_words(text):
    """Return the list of vocabulary terms a TF-IDF vectoriser extracts from ``text``.

    English stop words are excluded by the vectoriser.
    """
    vectorizer = TfidfVectorizer(min_df=1, stop_words="english")
    # Only the learned vocabulary is needed, so fitting is enough.
    vectorizer.fit([text])
    # get_feature_names() was removed in scikit-learn 1.2; prefer its
    # replacement and convert the returned array back to a plain list.
    if hasattr(vectorizer, "get_feature_names_out"):
        return vectorizer.get_feature_names_out().tolist()
    return vectorizer.get_feature_names()

In [25]:
def draw_network_documents(docs,docLinks,img_path):
    """Draw the document similarity network and save it as an image.

    Parameters:
        docs: iterable of Document objects; each becomes a node named after
            ``doc_name``, coloured by ``category.id`` and sized by ``node_size / 10``.
        docLinks: iterable of DocLink objects; each becomes an edge between the
            documents with primary keys ``source`` and ``target``, weighted by ``value``.
        img_path: destination passed to ``plt.savefig``.
    """
    G = nx.Graph()

    name_by_id = {}
    for doc in docs:
        G.add_node(doc.doc_name, group=doc.category.id, nodesize=doc.node_size)
        name_by_id[doc.id] = doc.doc_name

    def _name_of(doc_id):
        # Names of the documents passed in are already known. Only documents
        # missing from ``docs`` need a database lookup.
        if doc_id not in name_by_id:
            name_by_id[doc_id] = Document.objects.get(pk=doc_id).doc_name
        return name_by_id[doc_id]

    for link in docLinks:
        G.add_weighted_edges_from([(_name_of(link.source), _name_of(link.target), link.value)])

    # One colour per category: the fixed palette first, random colours once it runs out.
    # NOTE(review): '#eebcbc' appears twice in the palette, so two categories share a colour.
    colors = [  '#eebcbc', '#72bbd0', '#91f0a1', '#629fff','#bcc2f2','#eebcbc', '#caf3a6',
              '#f09494', '#f1f0c0', '#d2ffe7', '#ffdf55', '#ef77aa', '#d6dcff', '#d2f5f0']
    color_map = {}
    for i, cat in enumerate(Category.objects.all()):
        if i < len(colors):
            color_map[cat.id] = colors[i]
        else:
            color_map[cat.id] = "#"+''.join([random.choice('0123456789ABCDEF') for j in range(6)])

    options = {
        'edge_color': '#FFDEA2',
        'width': 1,
        'with_labels': True,
        'font_weight': 'regular',
    }

    fig = plt.figure(figsize=(20,20))
    try:
        nodeColors = [color_map[G.nodes[node]['group']] for node in G]
        sizes = [G.nodes[node]['nodesize']/10 for node in G]

        nx.draw(G, node_color=nodeColors, node_size=sizes, pos=nx.spring_layout(G, k=0.45, iterations=30), **options)
        ax = plt.gca()
        # Outline colour of the first drawn collection (presumably the node markers - confirm).
        ax.collections[0].set_edgecolor("#555555")
        plt.savefig(img_path)
    finally:
        # pyplot keeps every figure alive until it is closed. Without this,
        # each call would leak one 20x20 inch figure.
        plt.close(fig)

def draw_doc_relationship(doc):
    """Draw ``doc`` together with its linked documents to ``settings.DOCUMENT_MAP``.

    The drawn nodes are ``doc`` itself, then the target of every link starting
    at ``doc``, then the source of every link ending at ``doc``. The drawn
    edges are all links that have ``doc`` as source or target.
    """
    outgoing = DocLink.objects.filter(source__exact=doc.id)
    incoming = DocLink.objects.filter(target__exact=doc.id)

    neighbourhood = [doc]
    neighbourhood.extend(Document.objects.get(pk=edge.target) for edge in outgoing)
    neighbourhood.extend(Document.objects.get(pk=edge.source) for edge in incoming)

    touching = DocLink.objects.filter(Q(source__exact=doc.id) | Q(target__exact=doc.id))
    draw_network_documents(neighbourhood, touching, settings.DOCUMENT_MAP)

def getRelatedDoc(doc):
    """Return ``[doc_name, link_value]`` pairs for every document linked to ``doc``.

    Documents reached through links starting at ``doc`` come first, followed
    by documents whose links end at ``doc``.
    """
    related = [
        [Document.objects.get(pk=edge.target).doc_name, edge.value]
        for edge in DocLink.objects.filter(source__exact=doc.id)
    ]
    related += [
        [Document.objects.get(pk=edge.source).doc_name, edge.value]
        for edge in DocLink.objects.filter(target__exact=doc.id)
    ]
    return related


def refreshAll():
    """Rebuild stop words, categories, documents, links and the homepage graph.

    Steps, in order:
      1. Reload the stop word table from the stop word file.
      2. Convert every PDF below ``settings.DOC_DIR`` to a preprocessed text file.
      3. Delete all DocLink, Document and Category rows.
      4. Create one Category per folder below ``settings.DOC_DIR``.
      5. Create one Document per text file found in those folders.
      6. Create a DocLink for every document pair whose similarity exceeds 0.3.
      7. Redraw the network graph to ``settings.HOMEPAGE_MAP``.
    """
    from itertools import combinations

    # os.walk yields the root directory first; every later entry is a folder
    # that is treated as a category.
    category_dirs = [x[0] for x in os.walk(settings.DOC_DIR)][1:]

    # Stop words are refreshed before any PDF is processed: preprocessing reads
    # the StopWord table, so refreshing afterwards would filter the documents
    # with the stale (or empty) table.
    refreshStopWords()

    # Find the PDF docs and process them to txt
    for folder in category_dirs:
        for pdf_path in findfiles("*.pdf", folder):
            pdf_to_txt(pdf_path)

    # Clean up all tables
    DocLink.objects.all().delete()
    Document.objects.all().delete()
    Category.objects.all().delete()

    # Create categories based on the folder names (create() already saves the row)
    for folder in category_dirs:
        Category.objects.create(category_name=os.path.basename(folder))

    # Create Documents
    for folder in category_dirs:
        txt_paths = findfiles("*.txt", folder)
        if not txt_paths:
            continue
        catname = os.path.basename(folder)
        cat = Category.objects.get(category_name__exact=catname)
        for txt_path in txt_paths:
            # File name without its final extension.
            docname = os.path.basename(txt_path).rpartition('.')[0]
            docfile = str(os.path.join('policy_documents', catname, docname + '.pdf'))
            with open(txt_path, 'r') as f:
                # Only the first line of the text file is read.
                text = f.readline().rstrip()
            Document.objects.create(
                doc_name=docname,
                category=cat,
                node_size=len(text),
                doc_text=text,
                feature_words=get_feature_words(text),
                docfile=docfile,
            )

    # Create DocLinks: one per unordered pair that is similar enough
    docs = Document.objects.all()
    alpha = 0.3
    for first, second in combinations(docs, 2):
        similarity = get_similarity(first.doc_text, second.doc_text)
        if similarity > alpha:
            DocLink.objects.create(source=first.id, target=second.id, value=similarity)

    # Redraw the documents network graph
    docLinks = DocLink.objects.all()
    print(len(docs))
    print(len(docLinks))
    draw_network_documents(docs, docLinks, settings.HOMEPAGE_MAP)

#Create DocLinks
def createDocLinks(doc_obj):
    """Link ``doc_obj`` to every other stored document that is similar enough.

    For each Document other than ``doc_obj`` whose similarity to it exceeds
    0.3, a DocLink is created with that document as source and ``doc_obj`` as
    target.
    """
    alpha = 0.3
    for other in Document.objects.all():
        if other.id == doc_obj.id:
            continue
        similarity = get_similarity(other.doc_text, doc_obj.doc_text)
        if similarity > alpha:
            doclink = DocLink.objects.create(source=other.id, target=doc_obj.id, value=similarity)
            doclink.save()

In [5]:
#Folders
# Directory paths yielded by os.walk for settings.DOC_DIR; the root itself comes first.
folders = [x[0] for x in os.walk(settings.DOC_DIR)]
# Bare expression so the notebook displays the list.
folders

['/Users/pamelapham/Documents/Murdoch/ICT302/Final_year_project/murdoch_policy/media/policy_documents',
 '/Users/pamelapham/Documents/Murdoch/ICT302/Final_year_project/murdoch_policy/media/policy_documents/Research',
 '/Users/pamelapham/Documents/Murdoch/ICT302/Final_year_project/murdoch_policy/media/policy_documents/Health Safety & Environment',
 '/Users/pamelapham/Documents/Murdoch/ICT302/Final_year_project/murdoch_policy/media/policy_documents/Physical Facilities',
 '/Users/pamelapham/Documents/Murdoch/ICT302/Final_year_project/murdoch_policy/media/policy_documents/People Culture',
 '/Users/pamelapham/Documents/Murdoch/ICT302/Final_year_project/murdoch_policy/media/policy_documents/Learning Teaching',
 '/Users/pamelapham/Documents/Murdoch/ICT302/Final_year_project/murdoch_policy/media/policy_documents/Information Technology',
 '/Users/pamelapham/Documents/Murdoch/ICT302/Final_year_project/murdoch_policy/media/policy_documents/Student Experience',
 '/Users/pamelapham/Documents/Murdoc

In [7]:
 #Find the PDF docs and process to txt
# Convert every PDF in each folder after the first (the root) to a text file
# and remember the resulting paths.
txtDocs = []
for folder in folders[1:]:
    txtDocs.extend(pdf_to_txt(pdf_path) for pdf_path in findfiles("*.pdf", folder))
txtDocs

['/Users/pamelapham/Documents/Murdoch/ICT302/Final_year_project/murdoch_policy/media/policy_documents/Research/Graduate Research Degrees Joint Doctoral Degree Research Policy V3.txt',
 '/Users/pamelapham/Documents/Murdoch/ICT302/Final_year_project/murdoch_policy/media/policy_documents/Research/Human Research Ethics Policy V4.txt',
 '/Users/pamelapham/Documents/Murdoch/ICT302/Final_year_project/murdoch_policy/media/policy_documents/Research/Animal Ethics Policy V2.txt',
 '/Users/pamelapham/Documents/Murdoch/ICT302/Final_year_project/murdoch_policy/media/policy_documents/Research/Responsible Conduct Of Research Policy V2.txt',
 '/Users/pamelapham/Documents/Murdoch/ICT302/Final_year_project/murdoch_policy/media/policy_documents/Research/Research Misconduct Policy V1.txt',
 '/Users/pamelapham/Documents/Murdoch/ICT302/Final_year_project/murdoch_policy/media/policy_documents/Health Safety & Environment/Immunisation Policy.txt',
 '/Users/pamelapham/Documents/Murdoch/ICT302/Final_year_project/

In [8]:
#Clean up all tables
# Remove every row, links first, then documents, then categories.
# NOTE(review): presumably ordered so dependent rows go before the rows they reference - confirm.
DocLink.objects.all().delete()
Document.objects.all().delete()
Category.objects.all().delete()

(10, {'murdochpolicyapp.Category': 10})

In [9]:
#Init StopWords
# Empty the StopWord table and reload it from settings.STOP_WORD_FILE.
refreshStopWords()

In [10]:
#Create base on the folders name
# One Category per folder after the first (the root), named after the last
# path component of the folder.
for folder in folders[1:]:
    catname = folder[folder.rindex('/')+1:]
    cat = Category.objects.create(category_name=catname)
    cat.save()

In [16]:
#Create Documents
# One Document per text file found in each folder after the first (the root).
for folder in folders[1:]:
    catname = folder[folder.rindex('/')+1:]
    for txtDoc in findfiles("*.txt", folder):
        docname = txtDoc[txtDoc.rindex('/')+1:txtDoc.rindex('.')]
        # Relative path of the source PDF, stored so these records carry the
        # same docfile value that refreshAll() writes.
        docfile = str(os.path.join('policy_documents', catname, docname+'.pdf'))
        cat = Category.objects.get(category_name__exact=catname)
        with open(txtDoc, 'r') as f:
            # Only the first line of the text file is read.
            text = f.readline().rstrip()
        feature_words = get_feature_words(text)
        nodeSize = len(text)
        # create() already saves the row, so no extra save() is needed.
        doc = Document.objects.create(doc_name=docname,category=cat,node_size=nodeSize,doc_text=text,feature_words=feature_words,docfile=docfile)


In [17]:
#Create DocLinks
def createDocLinks(doc_obj):
    """Link ``doc_obj`` to every other stored document that is similar enough.

    For each Document other than ``doc_obj`` whose similarity to it exceeds
    0.3, a DocLink is created with that document as source and ``doc_obj`` as
    target.
    """
    alpha = 0.3
    for other in Document.objects.all():
        if other.id == doc_obj.id:
            continue
        similarity = get_similarity(other.doc_text, doc_obj.doc_text)
        if similarity > alpha:
            doclink = DocLink.objects.create(source=other.id, target=doc_obj.id, value=similarity)
            doclink.save()

In [18]:
#Create DocLinks
docs = Document.objects.all()
alpha = 0.3
# Compare each unordered pair of documents once (the later document is the
# target) and store a link when the similarity exceeds alpha.
for i, first in enumerate(docs):
    for j, second in enumerate(docs):
        if j <= i:
            continue
        similarity = get_similarity(first.doc_text, second.doc_text)
        if similarity > alpha:
            doclink = DocLink.objects.create(source=first.id, target=second.id, value=similarity)
            doclink.save()
# Redraw the documents network graph
docLinks = DocLink.objects.all()
print(len(docs))
print(len(docLinks))
draw_network_documents(docs, docLinks, settings.HOMEPAGE_MAP)

161
1182


In [26]:
# Redraw the homepage graph from all stored documents and links.
docs = Document.objects.all()
docLinks = DocLink.objects.all()
draw_network_documents(docs,docLinks,settings.HOMEPAGE_MAP)

In [22]:
# Fetch all categories; the bare expression makes the notebook display them.
cats = Category.objects.all()
cats

<QuerySet [<Category: Student Experience>, <Category: Research>, <Category: Physical Facilities>, <Category: People Culture>, <Category: Learning Teaching>, <Category: Information Technology>, <Category: Health Safety & Environment>, <Category: Governance>, <Category: Finance Purchasing Insurance>, <Category: Community Development>]>