In [1]:
import pandas as pd
import regex as re
import os, sys
import xml.etree.ElementTree as et
import PyPDF2

# Root data folder (relative path). Later cells read the transcripts from its
# "raw" subfolder and write their results into its "input" subfolder.
path = "data-transcripts" ###


In [2]:

# a function to walk through all files in a folder and its subfolders
def list_files(dir):
    """Return the paths of all files below ``dir``, including subfolders.

    Folders are visited top-down in ``os.walk`` order, so the files that sit
    directly in ``dir`` come first. Each returned path is the visited folder
    joined with the file name. A folder without files contributes nothing,
    and a missing ``dir`` yields an empty list.

    Args:
        dir: Path of the folder to scan.

    Returns:
        list[str]: One path per file found.
    """
    # One os.walk pass already yields every folder together with its file
    # names, so there is no need to list the folders first and then start a
    # second walk per folder (which doubled the directory scans and could
    # raise StopIteration if a folder vanished between the two passes).
    return [
        os.path.join(folder, name)
        for folder, _subfolders, names in os.walk(dir)
        for name in names
    ]


In [4]:
# directory = os.path.join(sys.path[0], path + "\\raw") ### INPUT FOLDER HERE ###
# files_in_dir = list_files(directory)
# files_in_dir

['c:\\Users\\hkpu\\Projects\\index-construction\\data-transcripts\\raw\\1004063_T.pdf',
 'c:\\Users\\hkpu\\Projects\\index-construction\\data-transcripts\\raw\\1004063_T.xml',
 'c:\\Users\\hkpu\\Projects\\index-construction\\data-transcripts\\raw\\10103972_T.pdf',
 'c:\\Users\\hkpu\\Projects\\index-construction\\data-transcripts\\raw\\10103972_T.xml',
 'c:\\Users\\hkpu\\Projects\\index-construction\\data-transcripts\\raw\\1012763_T.pdf',
 'c:\\Users\\hkpu\\Projects\\index-construction\\data-transcripts\\raw\\1012763_T.xml',
 'c:\\Users\\hkpu\\Projects\\index-construction\\data-transcripts\\raw\\1012782_T.pdf',
 'c:\\Users\\hkpu\\Projects\\index-construction\\data-transcripts\\raw\\1012782_T.xml',
 'c:\\Users\\hkpu\\Projects\\index-construction\\data-transcripts\\raw\\1012785_T.pdf',
 'c:\\Users\\hkpu\\Projects\\index-construction\\data-transcripts\\raw\\1012785_T.xml',
 'c:\\Users\\hkpu\\Projects\\index-construction\\data-transcripts\\raw\\1012787_T.pdf',
 'c:\\Users\\hkpu\\Projects\\i

In [3]:

# processing xml
def read_xml(document, index=0):
  """Parse one transcript XML file and store its metadata in the global ``df``.

  Row ``index`` of ``df`` receives the columns 'file' (file name without
  folder and extension), 'id' (the ``Id`` attribute of the root's first
  child), 'company', 'ticker' and 'date' (text of the root's direct
  ``companyName`` / ``companyTicker`` / ``startDate`` children). Afterwards
  the whole 'date' column is converted to datetimes and the 'year' column is
  rebuilt from it.

  A file whose serialized XML contains no ``<companyName>`` tag is reported
  on stdout and leaves ``df`` untouched.

  Parameters
  ----------
  document : str
      Path of the XML file. Backslash and forward-slash separators are both
      accepted, and the extension may be in any letter case.
  index : int, default 0
      Row label in ``df`` that receives the metadata.

  Returns
  -------
  None
  """
  # parse the XML document
  tree = et.parse(document)
  root = tree.getroot()

  # serialize the tree so the tag check also sees nested elements
  xmlstr = et.tostring(root, method='xml', encoding='unicode')
  if "<companyName>" not in xmlstr:
    print("### ERROR ### Missing data in", document)
    return

  # add filename to dataframe: take the last path component (splitting on
  # either separator so the code is not tied to Windows paths), drop stray
  # quote / '>' characters, then strip every ".suffix" part
  filename = re.split(r"[\\/]", document)[-1]
  filename = re.sub(r"['>]", "", filename)
  filename = re.sub(r"\.\w+", "", filename)
  df.loc[index, 'file'] = filename

  print("Preprocessing metadata of", filename, "...")

  # get id
  df.loc[index, 'id'] = root[0].attrib['Id']

  # get company
  for company in root.findall('companyName'):
    df.loc[index, 'company'] = str(company.text)
  # get ticker
  for ticker in root.findall('companyTicker'):
    df.loc[index, 'ticker'] = str(ticker.text)
  # get date
  for date in root.findall('startDate'):
    df.loc[index, 'date'] = str(date.text)

  # get year: the nullable Int64 dtype keeps whole-number years while
  # tolerating rows without a start date (a plain int cast fails on them)
  if 'date' in df.columns:
    df['date'] = pd.to_datetime(df['date'])
    df['year'] = df['date'].dt.year.astype('Int64')
  else:
    # no document so far carried a startDate element
    df['year'] = pd.array([pd.NA] * len(df), dtype='Int64')

  return


In [4]:

# assign relative directory
# os.path.join inserts the platform's separator, so the notebook is no longer
# tied to Windows-style "\\" paths. sys.path[0] serves as the project root.
directory = os.path.join(sys.path[0], path, "raw") ### INPUT FOLDER HERE ###

# list files in directory
files_in_dir = list_files(directory)

# count files in directory
print("Your input directory is:", directory, "\nNumber of files:", len(files_in_dir))

# initialize dataframe to hold documents
df = pd.DataFrame(columns=['file'])

# iterate over files in the directory; the row label passed to read_xml is
# the file's position in files_in_dir (PDFs are counted but skipped)
for i, f in enumerate(files_in_dir):
    if f.lower().endswith('.xml'):
        read_xml(f, i)

# Metadata from dataframe as csv
df = (
    df[['id', 'company', 'year']]
    .rename(columns={'id': 'document_id', 'company': 'firm_id', 'year': 'time'})
    .reset_index(drop=True)
)

# make sure the output folder exists before writing into it
output_dir = os.path.join(path, "input")
os.makedirs(output_dir, exist_ok=True)

df.to_csv(os.path.join(output_dir, "id2firms.csv"), index=False)

# one document id per line, in the same order as the csv rows
with open(os.path.join(output_dir, "document_ids.txt"), "w", encoding="utf-8") as f_out:
    f_out.write("\n".join(df["document_id"]))

print("Done with metadata.")


Your input directory is: c:\Users\hkpu\Projects\index-construction\data-transcripts\raw 
Number of files: 75000
Preprocessing metadata of 1004063_T ...
Preprocessing metadata of 10103972_T ...
Preprocessing metadata of 1012763_T ...
Preprocessing metadata of 1012782_T ...
Preprocessing metadata of 1012785_T ...
Preprocessing metadata of 1012787_T ...
Preprocessing metadata of 1020948_T ...
Preprocessing metadata of 10221377_T ...
Preprocessing metadata of 10229958_T ...
Preprocessing metadata of 10231130_T ...
Preprocessing metadata of 10276652_T ...
Preprocessing metadata of 10305562_T ...
Preprocessing metadata of 10341258_T ...
Preprocessing metadata of 10353489_T ...
Preprocessing metadata of 10354189_T ...
Preprocessing metadata of 1037620_T ...
Preprocessing metadata of 10382650_T ...
Preprocessing metadata of 10406938_T ...
Preprocessing metadata of 10418964_T ...
Preprocessing metadata of 10426026_T ...
Preprocessing metadata of 1048657_T ...
Preprocessing metadata of 10550671_

KeyboardInterrupt: 

In [5]:

#####################################################

# processing pdf to text
def read_pdf(document, index=0):
  """Extract the text of one PDF and store it in the global ``df``.

  Row ``index`` of ``df`` receives 'file' (file name without folder and
  extension) and 'content' (the text of all pages joined by a space, with
  every run of whitespace collapsed to a single space). A PDF without pages
  yields an empty 'content' string.

  Parameters
  ----------
  document : str
      Path of the PDF file. Backslash and forward-slash separators are both
      accepted, and the extension may be in any letter case.
  index : int, default 0
      Row label in ``df`` that receives the result.

  Returns
  -------
  None
  """
  # add filename to dataframe: take the last path component (splitting on
  # either separator so the code is not tied to Windows paths), drop stray
  # quote / '>' characters, then strip every ".suffix" part
  filename = re.split(r"[\\/]", document)[-1]
  filename = re.sub(r"['>]", "", filename)
  filename = re.sub(r"\.\w+", "", filename)
  df.loc[index, 'file'] = filename

  print("Preprocessing", filename, "...")

  # the context manager closes the file even when parsing or extraction fails
  with open(document, 'rb') as pdf:
    reader = PyPDF2.PdfReader(pdf, strict=False)

    # a page that yields no text is treated as empty so the join cannot fail
    pages_with_contents = [page.extract_text() or "" for page in reader.pages]

  # join pages into one document
  contents = " ".join(pages_with_contents)

  # cleaning: \s+ covers newlines, carriage returns and tabs as well, so one
  # substitution normalizes all whitespace
  contents = re.sub(r"\s+", " ", contents)

  # add contents to dataframe
  df.loc[index, 'content'] = contents

  return


Preprocessing 1004063_T ...
Preprocessing 10103972_T ...
Preprocessing 1012763_T ...
Preprocessing 1012782_T ...
Preprocessing 1012785_T ...
Preprocessing 1012787_T ...
Preprocessing 1020948_T ...
Preprocessing 10221377_T ...
Preprocessing 10229958_T ...
Preprocessing 10231130_T ...
Preprocessing 10276652_T ...
Preprocessing 10305562_T ...
Preprocessing 10341258_T ...
Preprocessing 10353489_T ...
Preprocessing 10354189_T ...
Preprocessing 1037620_T ...
Preprocessing 10382650_T ...
Preprocessing 10406938_T ...
Preprocessing 10418964_T ...
Preprocessing 10426026_T ...
Preprocessing 1048657_T ...
Preprocessing 10550671_T ...
Preprocessing 1065361_T ...
Preprocessing 1065377_T ...
Preprocessing 10655690_T ...
Preprocessing 10659595_T ...
Preprocessing 10679258_T ...
Preprocessing 10687439_T ...
Preprocessing 10689602_T ...
Preprocessing 10724780_T ...
Preprocessing 10733834_T ...
Preprocessing 10748343_T ...
Preprocessing 10749089_T ...
Preprocessing 10749091_T ...


KeyboardInterrupt: 

In [None]:

# initialize dataframe to hold documents
df = pd.DataFrame(columns=['file'])

# iterate over files in the directory; the row label passed to read_pdf is
# the file's position in files_in_dir, j counts the PDFs actually processed
j = 0
for i, f in enumerate(files_in_dir):
    if f.lower().endswith('.pdf'):
        read_pdf(f, i)
        j = j + 1

df.reset_index(inplace=True)

# Sanity check
# df['content'][0]

# Documents from dataframe as txt
print("Creating documents.txt...")
# every document occupies one line; a single join avoids re-copying the
# growing string once per document
contents = df['content'] if 'content' in df.columns else []
documents = "".join(document_string + '\n' for document_string in contents)

# explicit UTF-8: the platform default encoding may not be able to represent
# every character extracted from the PDFs
with open(os.path.join(path, 'input', 'documents.txt'), 'w', encoding='utf-8') as f:
    f.write(documents)
