In [5]:
# Process one pdf:

In [6]:
import pdfplumber
import csv
import re
import os

In [7]:
def find_sentences(text, keyword):
    """Return every sentence of ``text`` that mentions ``keyword``, one per line.

    A sentence is taken to be a run of non-period characters that ends with a
    period, so trailing text without a closing period is never returned.
    ``keyword`` is matched literally (regex metacharacters are escaped) and
    case-insensitively. The result is an empty string when nothing matches.
    """
    sentence_re = re.compile(
        r'([^.]*?' + re.escape(keyword) + r'[^.]*\.)',
        re.IGNORECASE,
    )
    hits = [found.group(1) for found in sentence_re.finditer(text)]
    return "\n".join(hits)

In [8]:
def extract_crn(text):
    """Return the first standalone 5-digit number found in ``text``.

    "Standalone" means the five digits are delimited by word boundaries, so
    they are not part of a longer digit run or glued to letters. Returns an
    empty string when there is no such number.
    """
    crn_match = re.search(r'\b\d{5}\b', text)
    if crn_match is None:
        return ''
    return crn_match.group(0)

In [9]:
def extract_course(text):
    """Return the first course code found in ``text``, or '' if there is none.

    A course code is three letters, then any mix of whitespace, dashes or
    parentheses (possibly none), then exactly four digits. Parentheses are
    dropped from the returned value and surrounding whitespace is trimmed.
    """
    course_match = re.search(r'\b[A-Za-z]{3}[\s\-()]*\d{4}\b', text)
    if not course_match:
        return ''
    code = course_match.group(0)
    for paren in '()':
        code = code.replace(paren, '')
    return code.strip()

In [10]:
def extract_date(text):
    """Placeholder for date extraction.

    No parsing is implemented: ``text`` is ignored and a fixed marker string
    is returned.
    """
    not_extracted = 'date not extracted.'
    return not_extracted

In [11]:
def extract_desc(text):
    """Return a one-line course description pulled from ``text``.

    Looks (case-insensitively) for the heading "Course Description", skips any
    non-word characters after it, requires at least one newline, and captures
    what follows up to the first blank line or the end of the text. Only the
    first three lines of that capture are kept; they are trimmed and joined
    with single spaces. Returns 'No description found.' when the heading is
    absent.
    """
    found = re.search(
        r'Course Description[^\w]*\n+(\s*.*?)(\n\s*\n|$)',
        text,
        re.IGNORECASE | re.DOTALL,
    )
    if found is None:
        # Heading not present anywhere in the text.
        return 'No description found.'

    # Keep at most the first three lines of the captured paragraph.
    leading_lines = found.group(1).strip().split('\n')[:3]
    return ' '.join(piece.strip() for piece in leading_lines)

In [12]:
def keyowords_accumulator(keyword,keyword_dict):
    """Add one to the tally for ``keyword`` in ``keyword_dict`` and return it.

    A keyword seen for the first time starts at 1. The dictionary is updated
    in place; the same object is returned for convenience.
    """
    keyword_dict[keyword] = keyword_dict.get(keyword, 0) + 1
    return keyword_dict

In [13]:
### Combine the steps above into a single function
## Input  -> path to one PDF, the keyword list, the CSV path, and the running keyword tally
## Output -> appends one row for that PDF to the CSV file

In [14]:
def process_pdf(pdf_path,keywords,csv_path, keyword_dict):
    """Scan one PDF for keywords and append a result row to a CSV file.

    Parameters
    ----------
    pdf_path : str
        Path of the PDF to read. The CRN and course code are parsed from the
        file name only (e.g. ``ALY6070.21136.202325.pdf``), so numbers or
        course-like tokens in parent directory names cannot be picked up.
    keywords : iterable of str
        Keywords to look for. Repeated entries are ignored so each keyword is
        reported and counted at most once per PDF.
    csv_path : str
        CSV file to append to. No header is written here; the file is opened
        in append mode and receives exactly one row.
    keyword_dict : dict
        Running tally of keyword -> number of PDFs it was found in. It is
        updated in place.

    Returns
    -------
    tuple
        ``(csv_path, keyword_dict)``.

    Notes
    -----
    Matching is a case-insensitive substring test on text split at periods,
    so e.g. 'Wind' also matches 'window'.
    """
    # Gather the text page by page. A page without extractable text may yield
    # None, which would break string concatenation, so treat it as empty.
    # Pages are joined with a newline so the last word of one page does not
    # fuse with the first word of the next.
    page_texts = []
    with pdfplumber.open(pdf_path) as pdf:
        for page in pdf.pages:
            page_texts.append(page.extract_text() or '')
    text = '\n'.join(page_texts)

    # Identify the course from the file name, not the whole path.
    file_name = os.path.basename(pdf_path)
    crn = extract_crn(file_name)
    course = extract_course(file_name)
    desc = extract_desc(text)

    keywords_present = []
    keywords_not_present = []
    sentences = []

    # Split the text into sentences
    text_sentences = text.split('.')

    # Check each distinct keyword (original order kept) and collect the
    # sentences that mention it.
    for keyword in dict.fromkeys(keywords):
        keyword_lower = keyword.lower()
        found_sentences = [
            sentence.strip() + '.'
            for sentence in text_sentences
            if keyword_lower in sentence.lower()
        ]

        if found_sentences:
            keywords_present.append(keyword)
            sentences.extend(found_sentences)
            keyword_dict = keyowords_accumulator(keyword, keyword_dict)
        else:
            keywords_not_present.append(keyword)

    # Join the lists into strings for CSV columns
    keywords_present_str = ' | '.join(keywords_present)
    keywords_not_present_str = ' | '.join(keywords_not_present)
    # Line breaks inside a sentence are replaced by ' | ' too, so the cell
    # stays on a single line.
    sentences_str = ' | '.join(sentences).replace('\n', ' | ')

    with open(csv_path, mode='a', newline='', encoding='utf-8') as csv_file:
        writer = csv.DictWriter(csv_file, fieldnames=['COURSE', 'CRN', 'DESCRIPTION', 'KEYWORDS_PRESENT', 'KEYWORDS_NOT_PRESENT', 'SENTENCES'])
        writer.writerow({
            'COURSE': course,
            'CRN': crn,
            'DESCRIPTION': desc,
            'KEYWORDS_PRESENT': keywords_present_str,
            'KEYWORDS_NOT_PRESENT': keywords_not_present_str,
            'SENTENCES': sentences_str
        })

    return csv_path, keyword_dict
    

In [15]:
## Process just one pdf

pdf_path = '/Users/manoh/Downloads/ALY6070.21136.202325.pdf'
csv_path = 'keywords_check.csv'
# Each keyword appears once: a repeated entry would be reported and tallied
# twice for the same PDF.
keywords = [
    'Climate', 'Ecologic', 'Ecological', 'Environment', 'Environmental', 'Environmentalism',
    'Fossil Fuel', 'Global Warming', 'Natural', 'Nature', 'Resilience', 'Resilient',
    'Sustainability', 'Sustainable', 'Energy', 'Renewable', 'Solar', 'Wind',
    'Alternative Transportation', 'Biodiversity', 'Conservation',
    'Consumption', 'Contamination', 'Deforestation', 'Eco-conscious', 'Ecoliteracy',
    'Ecosystem', 'Green building', 'Greenhouse', 'Land management', 'Marine', 'Native species',
    'Pollution', 'Preservation', 'Recycling', 'Waste', 'Water', 'Wildlife', 'Land Use',
    'Social Justice', 'Disparities', 'Equality', 'Equitable', 'Food security', 'Food system',
    'Food waste', 'Human rights', 'Hunger', 'Inequalities', 'Inequity', 'Poverty', 'Racial',
    'Racism', 'Reproductive rights', 'Social change', 'Justice'
]

# Fresh tally for this single-PDF run; process_pdf updates it in place.
keyword_dict = {}

# Left as the last expression so the cell displays the returned
# (csv_path, keyword_dict) pair.
process_pdf(pdf_path,keywords,csv_path, keyword_dict)

('keywords_check.csv',
 {'Climate': 1,
  'Environment': 1,
  'Nature': 1,
  'Social Justice': 1,
  'Equitable': 1,
  'Justice': 1})

In [20]:
import os
import pdfplumber
from pdfminer.pdfparser import PDFSyntaxError

def process_all_pdfs(folder_path, keywords, csv_path):
    """Run ``process_pdf`` on every PDF under ``folder_path``.

    Parameters
    ----------
    folder_path : str
        Directory that is walked recursively; files whose name ends in
        ``.pdf`` (any letter case) are processed.
    keywords : iterable of str
        Keywords passed through to ``process_pdf``.
    csv_path : str
        CSV file that receives one row per PDF. A header row is written first
        when the file does not exist yet or is empty.

    Returns
    -------
    dict
        Keyword tally built from the PDFs of this run only; empty when no PDF
        was processed.

    A PDF that raises ``PDFSyntaxError`` is reported and skipped; any other
    exception propagates.
    """
    fieldnames = ['COURSE','CRN','DESCRIPTION', 'KEYWORDS_PRESENT', 'KEYWORDS_NOT_PRESENT', 'SENTENCES']

    # Check if the CSV file already exists and has content
    write_header = not os.path.exists(csv_path) or os.stat(csv_path).st_size == 0
    # Tally local to this call, so the result does not depend on (or leak
    # into) any notebook-level variable.
    keywords_dict = {}
    # Write the header if the file is newly created or empty
    if write_header:
        with open(csv_path, mode='w', newline='', encoding='utf-8') as csv_file:
            writer = csv.DictWriter(csv_file, fieldnames=fieldnames)
            writer.writeheader()

    for root, _dirs, files in os.walk(folder_path):
        for filename in files:
            if not filename.lower().endswith('.pdf'):
                continue
            pdf_path = os.path.join(root, filename)
            try:
                print(f"Processing {pdf_path}...")
                # Pass the local tally, not a global of a similar name.
                csv_path, keywords_dict = process_pdf(pdf_path, keywords, csv_path, keywords_dict)
            except PDFSyntaxError as e:
                print(f"Error processing {pdf_path}: {e}. Skipping this file.")
    return keywords_dict

In [21]:
folder_path = '/Users/manoh/Downloads/PDFS_Project'
# Each keyword appears once: a repeated entry would be reported and tallied
# twice for the same PDF.
keywords = [
    'Climate', 'Ecologic', 'Ecological', 'Environment', 'Environmental', 'Environmentalism',
    'Fossil Fuel', 'Global Warming', 'Natural', 'Nature', 'Resilience', 'Resilient',
    'Sustainability', 'Sustainable', 'Energy', 'Renewable', 'Solar', 'Wind',
    'Alternative Transportation', 'Biodiversity', 'Conservation',
    'Consumption', 'Contamination', 'Deforestation', 'Eco-conscious', 'Ecoliteracy',
    'Ecosystem', 'Green building', 'Greenhouse', 'Land management', 'Marine', 'Native species',
    'Pollution', 'Preservation', 'Recycling', 'Waste', 'Water', 'Wildlife', 'Land Use',
    'Social Justice', 'Disparities', 'Equality', 'Equitable', 'Food security', 'Food system',
    'Food waste', 'Human rights', 'Hunger', 'Inequalities', 'Inequity', 'Poverty', 'Racial',
    'Racism', 'Reproductive rights', 'Social change', 'Justice'
]
csv_path = 'keywords_check.csv'

# Start from a clean results file: mode 'w' discards any rows left over from
# earlier runs before the header row is written.
with open(csv_path, mode='w', newline='', encoding='utf-8') as csv_file:
    writer = csv.DictWriter(csv_file, fieldnames=['COURSE','CRN','DESCRIPTION', 'KEYWORDS_PRESENT', 'KEYWORDS_NOT_PRESENT', 'SENTENCES'])
    writer.writeheader()

keywords_dict = process_all_pdfs(folder_path, keywords, csv_path)

Processing /Users/manoh/Downloads/PDFS_Project/Fall 2022 Undergraduate Semester  /CHM1100.90049.202214.pdf...
Processing /Users/manoh/Downloads/PDFS_Project/Fall 2022 Undergraduate Semester  /ACC 2300.30352.202214.pdf...
Processing /Users/manoh/Downloads/PDFS_Project/Fall 2022 Undergraduate Semester  /BIO2100.90130.202214.pdf...
Processing /Users/manoh/Downloads/PDFS_Project/Fall 2022 Undergraduate Semester  /ART1200.90270.202314.pdf...
Processing /Users/manoh/Downloads/PDFS_Project/Fall 2022 Undergraduate Semester  /CHM2110.90347.202214.pdf...
Processing /Users/manoh/Downloads/PDFS_Project/Fall 2022 Undergraduate Semester  /CHM1201.90372.202214.pdf...
Processing /Users/manoh/Downloads/PDFS_Project/Fall 2022 Undergraduate Semester  /ITC2050.90434.202314.pdf...
Processing /Users/manoh/Downloads/PDFS_Project/Fall 2022 Undergraduate Semester  /HMG2110.90040.202214.pdf...
Processing /Users/manoh/Downloads/PDFS_Project/Fall 2022 Undergraduate Semester  /ALY2100.90509.202314.pdf...
Processin

In [22]:
csv_file = 'keywords_count.csv'

# Write the keyword tally as a one-row CSV: the header row lists the keywords
# and the single data row holds the count stored for each of them.
# encoding='utf-8' matches the other CSV files this notebook writes instead of
# relying on the platform's default encoding.
with open(csv_file, 'w', newline='', encoding='utf-8') as csvfile:
    writer = csv.DictWriter(csvfile, fieldnames=keywords_dict.keys())

    # Write header
    writer.writeheader()

    # Write data
    writer.writerow(keywords_dict)