In [None]:
#Import
import os as os
import glob
from io import StringIO
import numpy as np
import pandas as pd
import sys

#PDF Import
from pdfminer.converter import TextConverter
from pdfminer.layout import LAParams
from pdfminer.pdfdocument import PDFDocument
from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
from pdfminer.pdfpage import PDFPage
from pdfminer.pdfparser import PDFParser
from pdfminer.pdftypes import resolve1
from pdfminer.psparser import PSLiteral, PSKeyword
from pdfminer.utils import decode_text

#NLTK
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer


In [None]:
#Preprocessing - Convert lower case
def convert_lower_case(data):
    """Lower-case ``data`` element-wise with ``np.char.lower`` and return the result."""
    lowered = np.char.lower(data)
    return lowered

In [None]:
# Debug aid: print the current working directory of the running process.
print(os.getcwd())

In [42]:
#Preprocessing - Remove Stop words
def remove_stop_words(data, sw_filename='/Users/pamelapham/Documents/Murdoch/ICT302/CoreProgram/stop_words.txt'):
    """Remove stop words and short tokens from ``data``.

    The text is tokenized with ``word_tokenize``; a token is kept only when it
    is longer than three characters and appears neither in NLTK's English stop
    word list nor in the extra stop word file.

    Parameters
    ----------
    data : str or array-like
        Text to filter; it is converted with ``str()`` before tokenizing.
    sw_filename : str, optional
        Path of a text file holding additional stop words, comma-separated,
        on one or more lines.

    Returns
    -------
    str
        The kept tokens, each preceded by a single space (so a non-empty
        result starts with a space).

    Raises
    ------
    OSError
        If ``sw_filename`` cannot be opened.
    """
    # A set gives constant-time membership tests for every token below.
    excluded = set(stopwords.words('english'))

    # Extend with the extra stop words. Entries are stripped so that stray
    # spaces or a trailing '\r' do not prevent a match against a token.
    with open(sw_filename) as f:
        for line in f:
            excluded.update(entry.strip() for entry in line.rstrip('\n').split(','))

    words = word_tokenize(str(data))
    # Tokens of three characters or fewer are dropped along with the stop words.
    kept = [w for w in words if len(w) > 3 and w not in excluded]
    return "".join(" " + w for w in kept)


In [43]:
#Preprocessing - Remove Punctuation
def remove_punctuation(data):
    """Replace punctuation and newlines in ``data`` with spaces and delete commas.

    Each symbol is replaced by a single space, and after every symbol one pass
    of double-space collapsing is applied. Commas are handled separately: they
    are removed outright rather than turned into spaces.

    Parameters
    ----------
    data : str or array-like of str
        Text to clean.

    Returns
    -------
    numpy.ndarray
        Result of the chained ``np.char.replace`` calls (a 0-d array for a
        plain string input).
    """
    # The backslash is written as "\\": a bare "\]" is an invalid escape
    # sequence that newer Python versions warn about. The string value is the
    # same (a literal backslash followed by "]").
    symbols = "!\"#$%&()*+-./:;<=>?@[\\]^_`{|}~\n"
    for symbol in symbols:
        data = np.char.replace(data, symbol, ' ')
        # Collapse the double space the replacement may have just created.
        data = np.char.replace(data, "  ", " ")
    data = np.char.replace(data, ',', '')
    return data

In [44]:
#Preprocessing - Remove Apostrophe
def remove_apostrophe(data):
    """Delete every apostrophe character from ``data`` via ``np.char.replace``."""
    apostrophe, replacement = "'", ""
    stripped = np.char.replace(data, apostrophe, replacement)
    return stripped

In [45]:
#Preprocessing - Stemming
def stemming(data):
    """Stem every token of ``data`` with NLTK's ``PorterStemmer``.

    ``data`` is converted with ``str()`` and tokenized with ``word_tokenize``.
    The returned string holds the stemmed tokens, each preceded by one space.
    """
    porter = PorterStemmer()
    stems = [porter.stem(token) for token in word_tokenize(str(data))]
    return "".join(" " + stem for stem in stems)

In [46]:
#Preprocessing - convert numbers
def convert_numbers(data):
    """Spell out integer tokens of ``data`` as words.

    ``data`` is converted with ``str()`` and tokenized with ``word_tokenize``.
    Every token that ``int()`` accepts is replaced by ``num2words(int(token))``;
    all other tokens are kept unchanged. Hyphens in the rebuilt text are then
    replaced by spaces.

    Parameters
    ----------
    data : str or array-like
        Text whose integer tokens should be spelled out.

    Returns
    -------
    numpy.ndarray
        0-d string array returned by ``np.char.replace``; each token in it is
        preceded by a single space.

    Raises
    ------
    ImportError
        If the ``num2words`` package is not installed.
    """
    # Imported here because the notebook's import cell never brings this name
    # into scope; without it every call below raised NameError, which the old
    # bare ``except`` swallowed, so no number was ever converted.
    from num2words import num2words

    converted = []
    for token in word_tokenize(str(data)):
        try:
            token = num2words(int(token))
        except (ValueError, OverflowError):
            # Not an integer literal (or too large to spell out): keep as is.
            pass
        converted.append(token)

    new_text = "".join(" " + token for token in converted)
    return np.char.replace(new_text, "-", " ")

In [47]:
#Preprocessing Data
def preprocess(data):
    """Run the complete text-cleaning pipeline over ``data``.

    The steps are applied strictly in the order listed below, each one
    receiving the output of the previous one. Several steps are repeated on
    purpose; the notes next to them restate the original rationale.
    """
    pipeline = (
        convert_lower_case,
        remove_punctuation,  # commas are removed separately inside this step
        remove_apostrophe,
        remove_stop_words,
        convert_numbers,
        stemming,
        remove_punctuation,
        convert_numbers,
        stemming,            # second stemming pass over the converted text
        remove_punctuation,  # again: num2words may yield hyphens and commas (e.g. forty-one)
        remove_stop_words,   # again: num2words may yield stop words (101 -> one hundred and one)
    )
    for step in pipeline:
        data = step(data)
    return data

In [48]:
def remove_empty_lines(filename):
    """Rewrite ``filename`` in place without blank or whitespace-only lines."""
    with open(filename, 'r+') as handle:
        # Keep only the lines that still contain something after stripping.
        kept = [row for row in handle.readlines() if row.strip()]
        # Rewind, write the surviving lines, then cut off the leftover tail.
        handle.seek(0)
        handle.writelines(kept)
        handle.truncate()
        

In [49]:
def pdf_to_txt(pdfFileName):
    """Extract a PDF's text, preprocess it and save it next to the PDF as ``.txt``.

    The text of every page is collected with pdfminer, passed through
    ``preprocess`` and written to a file with the same path as the PDF but a
    ``.txt`` extension. Empty lines are then removed from that file.

    Parameters
    ----------
    pdfFileName : str
        Path of the PDF file to convert.

    Returns
    -------
    str
        Path of the text file that was written.
    """
    txtFileName = os.path.splitext(pdfFileName)[0] + '.txt'
    print(txtFileName)

    output_string = StringIO()
    with open(pdfFileName, 'rb') as pdf_file:
        parser = PDFParser(pdf_file)
        doc = PDFDocument(parser)
        rsrcmgr = PDFResourceManager()
        device = TextConverter(rsrcmgr, output_string, laparams=LAParams())
        interpreter = PDFPageInterpreter(rsrcmgr, device)
        for page in PDFPage.create_pages(doc):
            interpreter.process_page(page)

    # Preprocess before opening the output so that a failure during extraction
    # or cleaning does not leave an empty .txt file behind.
    text = preprocess(output_string.getvalue())
    with open(txtFileName, "w") as txt_file:
        txt_file.write(text)

    remove_empty_lines(txtFileName)
    return txtFileName

In [50]:
#Collect the folders
# os.walk yields (dirpath, dirnames, filenames) triples; only dirpath is kept.
folders = [
    dirpath
    for dirpath, _dirnames, _filenames in os.walk('/Users/pamelapham/Documents/Murdoch/ICT302/Policy_Documents')
]

In [51]:
#Collect the PDF docs
txtDocs = []
# The first entry of folders is skipped; only the remaining folders are scanned.
for folder in folders[1:]:
    pdf_paths = glob.glob(folder + "/*.pdf")
    for pdf_path in pdf_paths:
        # Keep whatever pdf_to_txt returns for each converted PDF.
        txtDocs.append(pdf_to_txt(pdf_path))

/Users/pamelapham/Documents/Murdoch/ICT302/Policy_Documents/research/research_misconduct_policy_v1.txt
/Users/pamelapham/Documents/Murdoch/ICT302/Policy_Documents/research/graduate_research_degrees_joint_doctoral_degree_research_policy_v3.txt
/Users/pamelapham/Documents/Murdoch/ICT302/Policy_Documents/research/human_research_ethics_policy_v4.txt
/Users/pamelapham/Documents/Murdoch/ICT302/Policy_Documents/research/animal_ethics_policy_v2.txt
/Users/pamelapham/Documents/Murdoch/ICT302/Policy_Documents/research/responsible_conduct_of_research_policy_v2.txt
/Users/pamelapham/Documents/Murdoch/ICT302/Policy_Documents/learning_teaching/honours_policy.txt
/Users/pamelapham/Documents/Murdoch/ICT302/Policy_Documents/learning_teaching/admissions_coursework_policy.txt
/Users/pamelapham/Documents/Murdoch/ICT302/Policy_Documents/learning_teaching/enrolments_courseworl_procedure.txt
/Users/pamelapham/Documents/Murdoch/ICT302/Policy_Documents/learning_teaching/concurrent_studies_policy.txt
/Users/pam

/Users/pamelapham/Documents/Murdoch/ICT302/Policy_Documents/people_culture/workplace_bullying_and_harassment_policy.txt
/Users/pamelapham/Documents/Murdoch/ICT302/Policy_Documents/governance/fraud_and_corruption_control_plan.txt
/Users/pamelapham/Documents/Murdoch/ICT302/Policy_Documents/governance/fee_regulations.txt
/Users/pamelapham/Documents/Murdoch/ICT302/Policy_Documents/governance/establishment_of_undergraduate_honours_postgraduate_by_coursework_and_recruitment_scholarships_policy.txt
/Users/pamelapham/Documents/Murdoch/ICT302/Policy_Documents/governance/intellectual_property_regulations.txt
/Users/pamelapham/Documents/Murdoch/ICT302/Policy_Documents/governance/award_of_honorary_degree_policy.txt
/Users/pamelapham/Documents/Murdoch/ICT302/Policy_Documents/governance/internationalisation_policy.txt
/Users/pamelapham/Documents/Murdoch/ICT302/Policy_Documents/governance/award_nomenclature_certification_and_issuance_policy.txt
/Users/pamelapham/Documents/Murdoch/ICT302/Policy_Docume

In [60]:
csvFileName = '/Users/pamelapham/Documents/Murdoch/ICT302/CoreProgram/data/train_data.csv'
docNodeFileName = '/Users/pamelapham/Documents/Murdoch/ICT302/CoreProgram/data/doc_nodes.csv'

# "w" mode truncates any export left over from a previous run, so no explicit
# delete is needed. The context manager closes both files even if reading one
# of the text documents fails part-way through.
with open(csvFileName, "w") as dataFile, open(docNodeFileName, "w") as docNodeFile:
    dataFile.write('name,category,text\n')
    docNodeFile.write('name,category,nodesize\n')

    # The first entry of folders is skipped; each remaining folder name is
    # used as the category of the documents it contains.
    for folder in folders[1:]:
        catname = os.path.basename(folder)
        for txtDoc in glob.glob(folder + "/*.txt"):
            print(txtDoc)
            # Document name = file name without directory and extension.
            docname = os.path.splitext(os.path.basename(txtDoc))[0]
            print(docname)
            print(catname)
            with open(txtDoc, 'r') as f:
                # Only the first line of the file is read.
                # NOTE(review): assumes each .txt holds its text on one line - confirm.
                text = f.readline().rstrip()
            dataFile.write(docname + ',' + catname + ',' + text + '\n')
            # nodesize is the character count of the text that was read.
            docNodeFile.write(docname + ',' + catname + ',' + str(len(text)) + '\n')

/Users/pamelapham/Documents/Murdoch/ICT302/Policy_Documents/research/graduate_research_degrees_joint_doctoral_degree_research_policy_v3.txt
graduate_research_degrees_joint_doctoral_degree_research_policy_v3
research
/Users/pamelapham/Documents/Murdoch/ICT302/Policy_Documents/research/research_misconduct_policy_v1.txt
research_misconduct_policy_v1
research
/Users/pamelapham/Documents/Murdoch/ICT302/Policy_Documents/research/responsible_conduct_of_research_policy_v2.txt
responsible_conduct_of_research_policy_v2
research
/Users/pamelapham/Documents/Murdoch/ICT302/Policy_Documents/research/animal_ethics_policy_v2.txt
animal_ethics_policy_v2
research
/Users/pamelapham/Documents/Murdoch/ICT302/Policy_Documents/research/human_research_ethics_policy_v4.txt
human_research_ethics_policy_v4
research
/Users/pamelapham/Documents/Murdoch/ICT302/Policy_Documents/learning_teaching/assessment_procedure_2016august.txt
assessment_procedure_2016august
learning_teaching
/Users/pamelapham/Documents/Murdoch/