In [14]:
import pdfplumber
import csv
import re

# Function to extract course details
def extract_course_details(text):
    """Pull a course code and course name out of a line of catalogue text.

    A course code is 2-4 uppercase letters, one whitespace character and four
    digits. It must be followed by ". " and then the course name, which runs
    up to (but does not include) the first ". (" that follows it.

    Returns:
        A ``(course_code, course_name)`` tuple of strings for the first
        occurrence found, or ``(None, None)`` when the text has no such
        occurrence.
    """
    found = re.search(r'([A-Z]{2,4}\s\d{4})\.\s(.*?)(?=\.\s\()', text)
    # Always hand back a 2-tuple so callers can unpack unconditionally.
    return found.groups() if found else (None, None)

# Function to find sentences containing the keywords
def find_sentences(text, keyword):
    """Return every period-terminated stretch of ``text`` containing ``keyword``.

    The keyword is matched literally (regex metacharacters are escaped),
    case-insensitively, and as a substring, so it may match inside a longer
    word. A "sentence" here is a run of non-period characters around the
    keyword followed by a closing period; text after the last period is
    never returned. Leading whitespace of a sentence is kept.

    Returns:
        A list of matching sentence strings in order of appearance (empty
        when nothing matches).
    """
    sentence_regex = r'([^.]*?{}[^.]*\.)'.format(re.escape(keyword))
    hits = re.finditer(sentence_regex, text, flags=re.IGNORECASE)
    return [hit.group(1) for hit in hits]

# Function to process the PDF and write the data to a CSV file
def process_pdf(pdf_path, keywords, csv_path):
    """Scan a PDF for course lines that mention any keyword and write a CSV.

    The text of every page is split on newlines. Each line in which
    ``extract_course_details`` finds a course code and name is searched with
    ``find_sentences`` once per keyword. A course with at least one matching
    sentence becomes one CSV row with the columns ``course_code``,
    ``course_name``, ``keywords`` (joined with ``', '``) and ``sentences``
    (joined with ``' | '``).

    Matched keywords are listed in the order they appear in ``keywords`` and
    each matched sentence is listed once, in first-seen order, so the output
    is identical from run to run.

    Args:
        pdf_path: Path of the PDF to read (handed to ``pdfplumber.open``).
        keywords: Sequence of keyword strings to look for; it is iterated
            once per candidate line.
        csv_path: Path of the CSV file to create or overwrite (UTF-8).
    """
    courses = []
    with pdfplumber.open(pdf_path) as pdf:
        total_pages = len(pdf.pages)  # loop-invariant; look it up once
        for page_number, page in enumerate(pdf.pages, 1):
            print(f"Processing page {page_number}/{total_pages}...")
            text = page.extract_text() or ''
            # NOTE(review): every extracted line is treated as a "paragraph",
            # so only text on the same line as the course code and title is
            # searched; description text on following lines is not. Confirm
            # this is the intended scope.
            for paragraph in text.split('\n'):
                course_code, course_name = extract_course_details(paragraph)
                if not (course_code and course_name):
                    continue

                matched_keywords = []
                # dict used as an insertion-ordered set: overlapping keywords
                # (e.g. 'Environment' / 'Environmental') hit the same
                # sentence, which must be reported only once.
                matched_sentences = {}
                for keyword in keywords:
                    sentences = find_sentences(paragraph, keyword)
                    if not sentences:
                        continue
                    if keyword not in matched_keywords:
                        matched_keywords.append(keyword)
                    matched_sentences.update(dict.fromkeys(sentences))

                if matched_sentences:
                    courses.append({
                        'course_code': course_code,
                        'course_name': course_name,
                        'keywords': matched_keywords,
                        'sentences': list(matched_sentences),
                    })

    # Write the extracted data to a CSV file
    with open(csv_path, mode='w', newline='', encoding='utf-8') as csv_file:
        writer = csv.DictWriter(csv_file, fieldnames=['course_code', 'course_name', 'keywords', 'sentences'])
        writer.writeheader()
        for course in courses:
            writer.writerow({
                'course_code': course['course_code'],
                'course_name': course['course_name'],
                'keywords': ', '.join(course['keywords']),
                'sentences': ' | '.join(course['sentences'])
            })

# Terms to search for. The list order is kept as-is because matches are
# collected keyword by keyword in this order.
keywords = [
    # Climate, environment and energy
    'Climate',
    'Ecologic',
    'Ecological',
    'Environment',
    'Environmental',
    'Environmentalism',
    'Fossil Fuel',
    'Global Warming',
    'Natural',
    'Nature',
    'Resilience',
    'Resilient',
    'Sustainability',
    'Sustainable',
    'Energy',
    'Renewable',
    'Solar',
    'Wind',
    # Additional environmental terms
    'Alternative Transportation',
    'Biodiversity',
    'Conservation',
    'Consumption',
    'Contamination',
    'Deforestation',
    'Eco-conscious',
    'Ecoliteracy',
    'Ecosystem',
    'Green building',
    'Greenhouse',
    'Land management',
    'Marine',
    'Native species',
    'Pollution',
    'Preservation',
    'Recycling',
    'Waste',
    'Water',
    'Wildlife',
    'Land Use',
    # Social justice and equity
    'Social Justice',
    'Disparities',
    'Equality',
    'Equitable',
    'Food security',
    'Food system',
    'Food waste',
    'Human rights',
    'Hunger',
    'Inequalities',
    'Inequity',
    'Poverty',
    'Racial',
    'Racism',
    'Reproductive rights',
    'Social change',
    'Justice',
]

# Input catalogue (PDF) and output report (CSV) locations
pdf_path = '/Users/manoh/Downloads/Northeastern University 2022-2023 Course Descriptions.pdf'
csv_path = 'keywords_in_Catalogue.csv'

# Run the extraction: read the PDF and write the keyword report
process_pdf(pdf_path, keywords, csv_path)


Processing page 1/886...
Processing page 2/886...
Processing page 3/886...
Processing page 4/886...
Processing page 5/886...
Processing page 6/886...
Processing page 7/886...
Processing page 8/886...
Processing page 9/886...
Processing page 10/886...
Processing page 11/886...
Processing page 12/886...
Processing page 13/886...
Processing page 14/886...
Processing page 15/886...
Processing page 16/886...
Processing page 17/886...
Processing page 18/886...
Processing page 19/886...
Processing page 20/886...
Processing page 21/886...
Processing page 22/886...
Processing page 23/886...
Processing page 24/886...
Processing page 25/886...
Processing page 26/886...
Processing page 27/886...
Processing page 28/886...
Processing page 29/886...
Processing page 30/886...
Processing page 31/886...
Processing page 32/886...
Processing page 33/886...
Processing page 34/886...
Processing page 35/886...
Processing page 36/886...
Processing page 37/886...
Processing page 38/886...
Processing page 39/88