In [3]:
import csv

def add_category_column(input_file, output_file):
    with open(input_file, 'r') as csvfile:
        reader = csv.reader(csvfile)
        header = next(reader)  # Read the header row
        data = list(reader)   # Read the remaining rows
        
        # Add the new column header and values to each row
        header.append('Category')
        for row in data:
            row.append('Courses')
        
        # Write the updated data back to the CSV file
        with open(output_file, 'w', newline='') as csvfile:
            writer = csv.writer(csvfile)
            writer.writerow(header)
            writer.writerows(data)




In [11]:
import csv

# Specify the CSV file name
csv_file = "Commencement_QnA.csv"

# Specify the output text file names
questions_file = "Commencement_questions.txt"
answers_file = "Commencement_answers.txt"

# Read data from CSV and write questions and answers to separate text files
with open(csv_file, "r", newline="", encoding="utf-8") as f:
    reader = csv.DictReader(f)
    with open(questions_file, "w", encoding="utf-8") as q_file, open(answers_file, "w", encoding="utf-8") as a_file:
        for row in reader:
            question = row["Question"].strip()
            answer = row["Answer"].strip()
            q_file.write(question + "\n")
            a_file.write(answer + "\n")

print("Questions and answers extracted and saved to text files successfully.")


Questions and answers extracted and saved to text files successfully.


In [23]:
from bs4 import BeautifulSoup
import requests
import json
import os
import re
import fitz
# References
# https://beautiful-soup-4.readthedocs.io/en/latest/: Basic syntax
# ChatGPT 3.5
# Copy of /data_processing/scraping_and_beautify.py

"""
This function extracts the text and metadata part from a url and puts into text file.
Params:
    url = url of website
    id = document id you want assigned to the document
    topic_category = the category of the topic
    further_processing = function that performs further processing of text
Ensures:
    Creates a <topicCategory_docId.txt> document for the text of 
    the webpage and a <topicCategory_docId_metadata.txt> that contains the metadata and
    topic_category in /data/documents/.
Returns:
    Nothing
"""
def html_to_text(url, id, topic_category, further_processing):
    # Get response from url
    response = requests.get(url)

    if response.status_code==200:
        # Parse HTML content
        soup = BeautifulSoup(response.content, 'html.parser')
        base_dir = 'Academics_data/documents'
        if not os.path.exists(base_dir):
            os.makedirs(base_dir)

        # Create doc txt file
        doc_file_name = os.path.join(base_dir, f'{topic_category}_{id}.txt')
        text = soup.get_text().strip()
        text = re.sub('\s+', ' ', text)
        # print("text: ", text)

        # text_elements = soup.find_all(text=True)

        # text = ' '.join(text_elements)

        # text = re.sub('\s+', ' ', text).strip()

        processed_text = further_processing(text)
        with open(doc_file_name,"w", encoding="utf-8") as file:
            file.write(processed_text)
        
        # Get metadata
       
        metadata_json = {}
        metadata_json['title']=soup.title.text
        metadata_tags = soup.find_all('meta')

        for tag in metadata_tags:
            name = tag.get('name')
            if name:
                metadata_json[name] = tag.get('content')
            else:
                property_attr = tag.get('property')
                if property_attr:
                    metadata_json[property_attr] = tag.get('content')
                    
        # Create metadata txt file
        meta_file_name = os.path.join(base_dir, f'{topic_category}_{id}_metadata.txt')
        with open(meta_file_name, "w", encoding="utf-8") as meta_file:
            json.dump(metadata_json, meta_file, indent=2)
            # Append the category information to the metadata file
            meta_file.write('\nTopic category is ' + topic_category)
            
def html_file_to_text(webpage, id, topic_category, further_processing):
    # Get response from url
    with open(webpage, 'r', encoding='utf-8') as file:
        html_content = file.read()
    soup = BeautifulSoup(html_content, 'html.parser')
    base_dir = 'Academics_data/documents'
    if not os.path.exists(base_dir):
        os.makedirs(base_dir)

        # Create doc txt file
    doc_file_name = os.path.join(base_dir, f'{topic_category}_{id}.txt')
    # text = soup.get_text().strip()
    # text = re.sub('\s+', ' ', text)
    # print("text: ", text)

    # text_elements = soup.find_all(text=True)

    # text = ' '.join(text_elements)

    # text = re.sub('\s+', ' ', text).strip()
    
    article_body_div = soup.find('div', class_='article-body')

    if article_body_div:
        article_body_text = article_body_div.get_text(separator='\n')

    text = '\n'.join(line.strip() for line in article_body_text.splitlines() if line.strip())

    processed_text = further_processing(text)
    with open(doc_file_name,"w", encoding="utf-8") as file:
        file.write(processed_text)
        
    metadata_json = {}
    metadata_json['title']=soup.title.text
    metadata_tags = soup.find_all('meta')
    for tag in metadata_tags:
        name = tag.get('name')
        if name:
            metadata_json[name] = tag.get('content')
        else:
            property_attr = tag.get('property')
            if property_attr:
                metadata_json[property_attr] = tag.get('content')
                    
        # Create metadata txt file
    meta_file_name = os.path.join(base_dir, f'{topic_category}_{id}_metadata.txt')
    with open(meta_file_name, "w", encoding="utf-8") as meta_file:
        json.dump(metadata_json, meta_file, indent=2)
        # Append the category information to the metadata file
        meta_file.write('\nTopic category is ' + topic_category)

def pdf_to_text(page, id, topic_category, further_processing):
    try:
        # Open the PDF file
        with fitz.open(page) as pdf_doc:
            text = ""

            # Iterate over pages
            for page_num in range(pdf_doc.page_count):
                # Get the page
                page = pdf_doc[page_num]

                # Extract text from the page
                text += page.get_text()

            # Specify the base directory
            base_dir = 'Academics_data/documents'

            # Create the base directory if it doesn't exist
            if not os.path.exists(base_dir):
                os.makedirs(base_dir)

            # Create the doc txt file
            doc_file_name = os.path.join(base_dir, f'{topic_category}_{id}.txt')

            # Process the text
            processed_text = further_processing(text)

            # Write the processed text to the doc txt file
            with open(doc_file_name, "w", encoding="utf-8") as file:
                file.write(processed_text)

            # Create metadata dictionary
            metadata_json = {'title': 'CMU fact sheet'}

            # Create the metadata file
            meta_file_name = os.path.join(base_dir, f'{topic_category}_{id}_metadata.txt')

            # Write metadata to the metadata file
            with open(meta_file_name, "w", encoding="utf-8") as meta_file:
                json.dump(metadata_json, meta_file, indent=2)

                # Append the category information to the metadata file
                meta_file.write('\nTopic category is ' + topic_category)

    except Exception as e:
        print(f"Error processing PDF: {str(e)}")


"""Add further processing as needed"""
def further_process(text):
    return text

    

In [13]:
pip install PyMuPDF

Collecting PyMuPDF
  Obtaining dependency information for PyMuPDF from https://files.pythonhosted.org/packages/cf/28/a50440fd3cdb263c1843bf166d48fc68d219ff7dccff7b854f19426ef4ee/PyMuPDF-1.23.26-cp311-none-win_amd64.whl.metadata
  Downloading PyMuPDF-1.23.26-cp311-none-win_amd64.whl.metadata (3.4 kB)
Collecting PyMuPDFb==1.23.22 (from PyMuPDF)
  Obtaining dependency information for PyMuPDFb==1.23.22 from https://files.pythonhosted.org/packages/a7/79/2822a5c60909fdacaa1bc455c91e2b2dec9fc79537860b538f09ccad229d/PyMuPDFb-1.23.22-py3-none-win_amd64.whl.metadata
  Downloading PyMuPDFb-1.23.22-py3-none-win_amd64.whl.metadata (1.4 kB)
Downloading PyMuPDF-1.23.26-cp311-none-win_amd64.whl (3.4 MB)
   ---------------------------------------- 0.0/3.4 MB ? eta -:--:--
   ---------------------------------------- 0.0/3.4 MB ? eta -:--:--
   ---------------------------------------- 0.0/3.4 MB 495.5 kB/s eta 0:00:07
   --- ------------------------------------ 0.3/3.4 MB 2.5 MB/s eta 0:00:02
   --------

In [28]:
url = "https://lti.cs.cmu.edu/academics/phd-programs/files/handbook_phd_2023-2024.pdf"
id = "phd_handbook"
topic_category = "academics"
html_to_text(url, id, topic_category, further_process)

Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.


In [27]:
def html_to_text(url, id, topic_category, further_processing):
    # Get response from url
    response = requests.get(url)

    if response.status_code == 200:
        # Parse HTML content
        soup = BeautifulSoup(response.content, 'html.parser')
        base_dir = 'Academics_data/documents'
        if not os.path.exists(base_dir):
            os.makedirs(base_dir)

        # Create doc txt file
        doc_file_name = os.path.join(base_dir, f'{topic_category}_{id}.txt')
        text = soup.get_text().strip()
        text = re.sub('\s+', ' ', text)

        processed_text = further_processing(text)
        with open(doc_file_name, "w", encoding="utf-8") as file:
            file.write(processed_text)
        
        # Get metadata
        metadata_json = {}
        title_tag = soup.title
        if title_tag:
            metadata_json['title'] = title_tag.text
        metadata_tags = soup.find_all('meta')

        for tag in metadata_tags:
            name = tag.get('name')
            if name:
                metadata_json[name] = tag.get('content')
            else:
                property_attr = tag.get('property')
                if property_attr:
                    metadata_json[property_attr] = tag.get('content')
                    
        # Create metadata txt file
        meta_file_name = os.path.join(base_dir, f'{topic_category}_{id}_metadata.txt')
        with open(meta_file_name, "w", encoding="utf-8") as meta_file:
            json.dump(metadata_json, meta_file, indent=2)
            # Append the category information to the metadata file
            meta_file.write('\nTopic category is ' + topic_category)


In [30]:
import pandas as pd


def generated(qna_pairs):
    # Load existing CSV file into DataFrame
    existing_file = "Academics_QnA.csv"
    df = pd.read_csv(existing_file)
    
    # Create a new DataFrame from the new data
    new_df = pd.DataFrame(qna_pairs)
    
    # Concatenate the original DataFrame with the new DataFrame
    df = pd.concat([df, new_df], ignore_index=True)
    
    # Save the updated DataFrame back to the CSV file
    df.to_csv(existing_file, index=False)
    
    print("New data added to the CSV file successfully.")


In [38]:
generated(qna_dict)

New data added to the CSV file successfully.


# Last Append

In [81]:
import requests
import fitz  # PyMuPDF

def download_pdf(url, local_filename):
    # Send a GET request to the URL
    response = requests.get(url)
    # Save the PDF locally
    with open(local_filename, 'wb') as f:
        f.write(response.content)
    print(f"PDF downloaded as {local_filename}")

def pdf_to_text(pdf_path, txt_path):
    # Open the downloaded PDF file
    doc = fitz.open(pdf_path)
    
    # Create or overwrite the text file
    with open(txt_path, 'w') as txt_file:
        # Read each page from the PDF and extract text
        for page_num in range(len(doc)):
            page = doc.load_page(page_num)
            text = page.get_text()
            txt_file.write(text)
    
    # Close the PDF document
    doc.close()
    print(f"Text extracted and saved to {txt_path}")

# URL of the PDF file
pdf_url = 'https://lti.cs.cmu.edu/academics/phd-programs/files/handbook_phd_2023-2024.pdf'
# Local path to save the downloaded PDF
pdf_path = 'handbook_phd_2023-2024.pdf'
# The output text file path and name
txt_path = 'handbook_phd_2023-2024.txt'

# Download the PDF from the web link
download_pdf(pdf_url, pdf_path)

# Convert the downloaded PDF to text
pdf_to_text(pdf_path, txt_path)


PDF downloaded as handbook_phd_2023-2024.pdf
Text extracted and saved to handbook_phd_2023-2024.txt


In [82]:
import requests
import fitz  # PyMuPDF

def download_pdf(url, local_filename):
    # Send a GET request to the URL
    response = requests.get(url)
    # Save the PDF locally
    with open(local_filename, 'wb') as f:
        f.write(response.content)
    print(f"PDF downloaded as {local_filename}")

def pdf_to_text(pdf_path, txt_path):
    # Open the downloaded PDF file
    doc = fitz.open(pdf_path)
    
    # Create or overwrite the text file
    with open(txt_path, 'w') as txt_file:
        # Read each page from the PDF and extract text
        for page_num in range(len(doc)):
            page = doc.load_page(page_num)
            text = page.get_text()
            txt_file.write(text)
    
    # Close the PDF document
    doc.close()
    print(f"Text extracted and saved to {txt_path}")

# URL of the PDF file
pdf_url = 'https://lti.cs.cmu.edu/academics/masters-programs/files/mlt-student-handbook-2023-2024.pdf'
# Local path to save the downloaded PDF
pdf_path = 'mlt-student-handbook-2023-2024.pdf'
# The output text file path and name
txt_path = 'mlt-student-handbook-2023-2024.txt'

# Download the PDF from the web link
download_pdf(pdf_url, pdf_path)

# Convert the downloaded PDF to text
pdf_to_text(pdf_path, txt_path)


PDF downloaded as mlt-student-handbook-2023-2024.pdf
Text extracted and saved to mlt-student-handbook-2023-2024.txt


In [83]:
import requests
import fitz  # PyMuPDF

def download_pdf(url, local_filename):
    # Send a GET request to the URL
    response = requests.get(url)
    # Save the PDF locally
    with open(local_filename, 'wb') as f:
        f.write(response.content)
    print(f"PDF downloaded as {local_filename}")

def pdf_to_text(pdf_path, txt_path):
    # Open the downloaded PDF file
    doc = fitz.open(pdf_path)
    
    # Create or overwrite the text file
    with open(txt_path, 'w') as txt_file:
        # Read each page from the PDF and extract text
        for page_num in range(len(doc)):
            page = doc.load_page(page_num)
            text = page.get_text()
            txt_file.write(text)
    
    # Close the PDF document
    doc.close()
    print(f"Text extracted and saved to {txt_path}")

# URL of the PDF file
pdf_url = 'https://lti.cs.cmu.edu/academics/masters-programs/files/miis-handbook_2023-2024.pdf'
# Local path to save the downloaded PDF
pdf_path = 'miis-handbook_2023-2024.pdf'
# The output text file path and name
txt_path = 'miis-handbook_2023-2024.txt'

# Download the PDF from the web link
download_pdf(pdf_url, pdf_path)

# Convert the downloaded PDF to text
pdf_to_text(pdf_path, txt_path)


PDF downloaded as miis-handbook_2023-2024.pdf


UnicodeEncodeError: 'charmap' codec can't encode character '\uf0a7' in position 1204: character maps to <undefined>

In [84]:
def pdf_to_text(pdf_path, txt_path):
    # Open the downloaded PDF file
    doc = fitz.open(pdf_path)
    
    # Create or overwrite the text file with UTF-8 encoding
    with open(txt_path, 'w', encoding='utf-8') as txt_file:
        # Read each page from the PDF and extract text
        for page_num in range(len(doc)):
            page = doc.load_page(page_num)
            text = page.get_text()
            txt_file.write(text)
    
    # Close the PDF document
    doc.close()
    print(f"Text extracted and saved to {txt_path}")


In [85]:
# URL of the PDF file
pdf_url = 'https://lti.cs.cmu.edu/academics/masters-programs/files/miis-handbook_2023-2024.pdf'
# Local path to save the downloaded PDF
pdf_path = 'miis-handbook_2023-2024.pdf'
# The output text file path and name
txt_path = 'miis-handbook_2023-2024.txt'

# Download the PDF from the web link
download_pdf(pdf_url, pdf_path)

# Convert the downloaded PDF to text
pdf_to_text(pdf_path, txt_path)


PDF downloaded as miis-handbook_2023-2024.pdf
Text extracted and saved to miis-handbook_2023-2024.txt


In [86]:
# URL of the PDF file
pdf_url = 'https://lti.cs.cmu.edu/academics/masters-programs/files/mcds-student-handbook-2023_2024.pdf'
# Local path to save the downloaded PDF
pdf_path = 'mcds-student-handbook-2023_2024.pdf'
# The output text file path and name
txt_path = 'mcds-student-handbook-2023_2024.txt'

# Download the PDF from the web link
download_pdf(pdf_url, pdf_path)

# Convert the downloaded PDF to text
pdf_to_text(pdf_path, txt_path)


PDF downloaded as mcds-student-handbook-2023_2024.pdf
Text extracted and saved to mcds-student-handbook-2023_2024.txt


In [87]:
# URL of the PDF file
pdf_url = 'https://lti.cs.cmu.edu/academics/masters-programs/files/handbook-msaii-2022-2023.pdf'
# Local path to save the downloaded PDF
pdf_path = 'handbook-msaii-2022-2023.pdf'
# The output text file path and name
txt_path = 'handbook-msaii-2022-2023.txt'

# Download the PDF from the web link
download_pdf(pdf_url, pdf_path)

# Convert the downloaded PDF to text
pdf_to_text(pdf_path, txt_path)


PDF downloaded as handbook-msaii-2022-2023.pdf
Text extracted and saved to handbook-msaii-2022-2023.txt
