In [21]:
import os
from PyPDF2 import PdfReader
import re
import spacy

nlp = spacy.load("en_core_web_sm")

import nltk
from nltk.util import ngrams
import json
import csv

In [6]:
def extract_text_from_pdf(pdf_path):
    """Return the text of every page of a PDF, concatenated in page order.

    Uses the ``PdfReader`` class imported at the top of the notebook. The
    previous body referenced ``PyPDF2.PdfFileReader`` (the ``PyPDF2`` name is
    never imported here) together with the legacy ``getNumPages`` /
    ``getPage`` / ``extractText`` methods.

    Args:
        pdf_path: Path of the PDF file to read.

    Returns:
        A single string holding the extracted text of all pages, with no
        separator inserted between pages.
    """
    with open(pdf_path, "rb") as file:
        pdf_reader = PdfReader(file)
        # Extraction has to finish while the file is still open.
        # "or ''" guards against a page yielding no text object at all.
        return "".join(page.extract_text() or "" for page in pdf_reader.pages)

In [7]:
def extract_text_from_pdf(pdf_path):
    """Read the PDF at ``pdf_path`` and return all of its page texts as one string.

    Pages are processed in document order and their texts are concatenated
    without any separator.
    """
    with open(pdf_path, "rb") as pdf_stream:
        reader = PdfReader(pdf_stream)
        # Pull the text out of each page while the underlying file is open.
        page_texts = [pg.extract_text() for pg in reader.pages]
    return "".join(page_texts)

ocr_not_required_dir = "../data/OCR_Not_Required"
# os.listdir() returns names in arbitrary order; sorting keeps the document order
# (and so every index-based preview and exported row) identical across runs and platforms.
pdf_files = sorted(file for file in os.listdir(ocr_not_required_dir) if file.endswith(".pdf"))

# One extracted text per PDF, in the same order as pdf_files.
pdf_texts = [
    extract_text_from_pdf(os.path.join(ocr_not_required_dir, pdf_file))
    for pdf_file in pdf_files
]

In [8]:
ocr_performed_dir = "../data/OCR_Performed"
# os.listdir() returns names in arbitrary order; sort for a reproducible document order.
text_files = sorted(file for file in os.listdir(ocr_performed_dir) if file.endswith(".txt"))

# One text per OCR output file, in the same order as text_files.
ocr_texts = []
for text_file in text_files:
    text_path = os.path.join(ocr_performed_dir, text_file)
    # Explicit encoding so decoding does not depend on the platform's locale default.
    # NOTE(review): assumes the OCR outputs were saved as UTF-8 — confirm against the OCR step.
    with open(text_path, "r", encoding="utf-8") as file:
        ocr_texts.append(file.read())

In [9]:
# Merge both sources into one corpus: directly extracted PDF texts first, OCR texts after.
all_texts = pdf_texts + ocr_texts
print(f"Total number of documents loaded: {len(all_texts)}")

Total number of documents loaded: 19


In [10]:
def clean_text(text):
    """Normalise raw document text into lowercase alphanumeric words.

    Steps, in order: drop anything that looks like an HTML tag, delete every
    character that is not an ASCII letter, digit or whitespace, lowercase the
    result, and collapse each run of whitespace into a single space.

    Args:
        text: Raw text of one document.

    Returns:
        The cleaned text, without leading or trailing whitespace.
    """
    # All patterns are raw strings so that "\s" reaches the regex engine as written
    # instead of being treated as an (invalid) Python string escape.

    # Remove HTML tags
    text = re.sub(r'<[^>]*>', '', text)

    # Remove special characters. They are deleted, not replaced by a space, so
    # text joined by punctuation is fused (e.g. "2014-2016" becomes "20142016").
    text = re.sub(r'[^a-zA-Z0-9\s]', '', text)

    # Convert to lowercase
    text = text.lower()

    # Remove extra whitespaces and line breaks
    text = re.sub(r'\s+', ' ', text)

    return text.strip()

In [11]:
# Run every loaded document through clean_text, preserving document order.
cleaned_texts = list(map(clean_text, all_texts))

In [12]:
# Preview the first 500 characters of the first cleaned document.
print(cleaned_texts[0][0:500])

1 automated speed enforcement program report 20142016 2 in 2013 the state legislature and governor cuomo enacted sec 1180b of new york states vehicle and traffic law vtl which granted new york city the authority to pilot an automated speed enforcement program to deter speeding in 20 school zones the first speed camera violation was issued in january 2014 in june 2014 the pilot was expanded to 140 school zones in or der to support the pursuit of the citys vision zero goal to eliminate traffic dea


In [13]:
def preprocess_text(text):
    """Lemmatise ``text`` with the module-level spaCy pipeline ``nlp`` and drop noise tokens.

    Args:
        text: Text of one document.

    Returns:
        The lemmas of all kept tokens, joined by single spaces.
    """
    doc = nlp(text)

    # Tokenization, stopword removal, and lemmatization.
    # Whitespace-only tokens (spaCy emits them for runs of spaces or newlines) are
    # neither stop words nor punctuation, so they are filtered explicitly; otherwise
    # they end up in the joined output as stray blanks.
    lemmas = [
        token.lemma_ for token in doc
        if not (token.is_stop or token.is_punct or token.is_space)
    ]

    return " ".join(lemmas)

In [14]:
# Lemmatise and filter every cleaned document, preserving document order.
preprocessed_texts = list(map(preprocess_text, cleaned_texts))

In [15]:
def generate_ngrams(text, n):
    """Tokenize ``text`` with NLTK and return its n-grams as a list of tuples.

    Args:
        text: Text to tokenize.
        n: Number of consecutive tokens per n-gram.

    Returns:
        A list of ``n``-length token tuples in order of appearance.

    NOTE(review): nltk.word_tokenize relies on NLTK tokenizer data that this
    notebook never downloads — confirm it is installed before calling.
    """
    words = nltk.word_tokenize(text)
    return [*ngrams(words, n)]

In [16]:
# Bigram extraction over whitespace-separated words
def extract_bigrams(text):
    """Return every pair of adjacent whitespace-separated words in ``text`` as a space-joined phrase."""
    words = text.split()
    return [' '.join(pair) for pair in ngrams(words, 2)]

# Trigram extraction over whitespace-separated words
def extract_trigrams(text):
    """Return every run of three adjacent whitespace-separated words in ``text`` as a space-joined phrase."""
    words = text.split()
    return [' '.join(triple) for triple in ngrams(words, 3)]

# Build the per-document bigram and trigram phrase lists (same order as cleaned_texts)
bigrams_list = [extract_bigrams(doc_text) for doc_text in cleaned_texts]
trigrams_list = [extract_trigrams(doc_text) for doc_text in cleaned_texts]

# Show the leading ten bigrams and trigrams of the first document as a sanity check
first_bigrams = bigrams_list[0]
first_trigrams = trigrams_list[0]
print("Bigrams for the first document:", first_bigrams[:10])
print("Trigrams for the first document:", first_trigrams[:10])


Bigrams for the first document: ['1 automated', 'automated speed', 'speed enforcement', 'enforcement program', 'program report', 'report 20142016', '20142016 2', '2 in', 'in 2013', '2013 the']
Trigrams for the first document: ['1 automated speed', 'automated speed enforcement', 'speed enforcement program', 'enforcement program report', 'program report 20142016', 'report 20142016 2', '20142016 2 in', '2 in 2013', 'in 2013 the', '2013 the state']


In [22]:
# The output directory is otherwise only created by a later cell, so a fresh
# top-to-bottom run would fail here; create it before writing.
os.makedirs("../data/Cleaned_Text", exist_ok=True)

with open("../data/Cleaned_Text/doris_cleaned_texts.csv", "w", newline="", encoding="utf-8") as csvfile:
    writer = csv.writer(csvfile)
    # One row per document, with the cleaned text as its only column.
    writer.writerows([text] for text in cleaned_texts)

In [19]:
# Make sure the output directory is present before writing into it
os.makedirs("../data/Cleaned_Text", exist_ok=True)

# Serialise the list of cleaned documents as a single JSON array
with open("../data/Cleaned_Text/doris_cleaned_texts.json", "w") as json_file:
    json_file.write(json.dumps(cleaned_texts))