In [7]:
import os
import pandas as pd
from PyPDF2 import PdfReader
from docx import Document
import json
import re

In [8]:
def extract_text_from_pdf(file_path):
    """Return the text of every page of one PDF as a single string.

    The file is opened in binary mode and handed to ``PdfReader``. The
    per-page results of ``extract_text()`` are concatenated in page order
    with no separator inserted between pages.

    Args:
        file_path: Path of the PDF file to read.

    Returns:
        The concatenated page text; an empty string if the reader reports
        no pages.
    """
    with open(file_path, 'rb') as stream:
        reader = PdfReader(stream)
        # Join while the file is still open: pages are read from the stream.
        combined = "".join(page.extract_text() for page in reader.pages)
    return combined

def extract_texts_from_directory(directory_path):
    """Extract the text of every PDF file directly inside a directory.

    Only the top level of ``directory_path`` is scanned (no recursion).
    An entry is processed when it is a regular file whose name ends in
    ``.pdf``, compared case-insensitively so ``report.PDF`` is included.

    Args:
        directory_path: Directory to scan for PDF files.

    Returns:
        A DataFrame with one row per PDF and the columns ``'File Name'``
        (the file name without its extension) and ``'Text'`` (the result of
        ``extract_text_from_pdf``). Rows are ordered by file name. When no
        PDF is found the frame is empty but still carries both columns, so
        downstream exports keep their header.

    Raises:
        OSError: If ``directory_path`` cannot be listed.
    """
    records = []

    # os.listdir returns entries in arbitrary order; sorting makes repeated
    # runs produce identical exports.
    for file_name in sorted(os.listdir(directory_path)):
        if not file_name.lower().endswith(".pdf"):
            continue

        file_path = os.path.join(directory_path, file_name)
        # A sub-directory may also be named "*.pdf"; it cannot be opened as a file.
        if not os.path.isfile(file_path):
            continue

        records.append({
            'File Name': os.path.splitext(file_name)[0],
            'Text': extract_text_from_pdf(file_path),
        })

    # Explicit columns keep the schema stable even when `records` is empty.
    return pd.DataFrame(records, columns=['File Name', 'Text'])

def save_to_csv(df, output_path):
    """Write a DataFrame to ``output_path`` as tab-separated text.

    Despite the function name, the field delimiter is a tab, not a comma.
    The column header row is written; the DataFrame index is not.

    Args:
        df: DataFrame to export.
        output_path: Destination file path.
    """
    export_options = {'sep': '\t', 'index': False}
    df.to_csv(output_path, **export_options)

def save_to_json(data, output_path):
    """Serialize ``data`` to ``output_path`` as JSON indented by four spaces.

    The destination is opened in text write mode, so an existing file is
    overwritten.

    Args:
        data: Any object ``json.dump`` can serialize.
        output_path: Destination file path.
    """
    with open(output_path, mode='w') as sink:
        json.dump(data, sink, indent=4)

def clean_text(text):
    """Replace every character outside printable ASCII with a space.

    Anything that is not in the range 0x20-0x7E is substituted one-for-one
    with a single space. That covers non-ASCII characters and control
    characters alike, so tabs and newlines are flattened to spaces too.

    Args:
        text: String to sanitize.

    Returns:
        A string of the same length containing only printable ASCII.
    """
    outside_printable_ascii = re.compile(r'[^\x20-\x7E]')
    return outside_printable_ascii.sub(' ', text)

def save_to_word(df, output_path):
    """Write a DataFrame to a Word document containing a single table.

    The first table row holds the column names and each following row holds
    one DataFrame row. Every value is converted with ``str`` and passed
    through ``clean_text`` before it is placed in a cell.

    The table is sized from the frame's actual columns instead of a fixed
    two, so frames with any number of columns can be exported. For the
    two-column ``'File Name'`` / ``'Text'`` frame the output is unchanged.

    Args:
        df: DataFrame to export.
        output_path: Destination ``.docx`` path.
    """
    column_names = list(df.columns)

    doc = Document()
    # A frame without columns keeps the two-column table this function has
    # always produced for it, rather than a table with zero columns.
    table = doc.add_table(rows=1, cols=len(column_names) or 2)

    # Header row
    for cell, column_name in zip(table.rows[0].cells, column_names):
        cell.text = clean_text(str(column_name))

    # Data rows: plain tuples in column order, index excluded.
    for values in df.itertuples(index=False, name=None):
        cells = table.add_row().cells
        for cell, value in zip(cells, values):
            cell.text = clean_text(str(value))

    doc.save(output_path)


In [9]:
# Notebook configuration: the folder scanned for PDF files and the three export targets.
# All paths are relative, so they resolve against the kernel's working directory.
directory_path = './PDFs'
output_tsv_path = 'vacTrials.tsv'
output_json_path = 'vacTrials.json'
output_word_path = 'vacTrials.docx'



In [10]:
# Read the PDFs in directory_path: one row per file, holding its 'File Name' and extracted 'Text'.
df = extract_texts_from_directory(directory_path)


In [11]:
# Tab-separated export
save_to_csv(df, output_tsv_path)

# JSON export: the frame is first turned into a list with one dict per row
data_list = df.to_dict(orient='records')
save_to_json(data_list, output_json_path)

# Word export
save_to_word(df, output_word_path)


# Reached only when none of the exports above raised.
for exported_path in (output_tsv_path, output_json_path, output_word_path):
    print("Data exported successfully to", exported_path)

Data exported successfully to vacTrials.tsv
Data exported successfully to vacTrials.json
Data exported successfully to vacTrials.docx
