In [1]:
import os

def delete_files_in_directory(directory):
    try:
        # Iterate over all files and directories in the specified directory
        for root, dirs, files in os.walk(directory):
            for file in files:
                # Construct the file path
                file_path = os.path.join(root, file)
                
                # Remove the file
                os.remove(file_path)
    except Exception as e:
        print(f"Error deleting files in directory: {e}")


In [2]:
import os
import zipfile
import shutil
import uuid

def extract_zips_and_move(src_dir, dest_dir):
    # Iterate over all files and directories in the source directory
    for root, dirs, files in os.walk(src_dir):
        for file in files:
            # Check if the file is a zip file
            if file.endswith('.zip') or file.endswith('.ZIP'):
                zip_path = os.path.join(root, file)
                
                # Create a temporary directory to extract the files
                temp_dir = os.path.join(dest_dir, 'temp_extract')
                os.makedirs(temp_dir, exist_ok=True)
                
                try:
                    # Extract the zip file
                    with zipfile.ZipFile(zip_path, 'r') as zip_ref:
                        print(zip_path)
                        zip_ref.extractall(temp_dir)
                    
                    for extracted_file in os.listdir(temp_dir):
                        extracted_file_path = os.path.join(temp_dir, extracted_file)
                        # Check if the file already exists in the destination directory
                        dest_file_path = os.path.join(dest_dir, extracted_file)
                        if os.path.exists(dest_file_path):
                           continue
                        
                        shutil.move(extracted_file_path, dest_file_path)
                    
                    
                except Exception as e:
                    print(f"Error extracting {zip_path}: {e}")
                
                finally:
                    # Clean up the temporary directory
                    shutil.rmtree(temp_dir)

source_directory = 'data/raw_data'
destination_directory = 'data/rtf_data'

delete_files_in_directory(destination_directory)
extract_zips_and_move(source_directory, destination_directory)


data/raw_data/1995-1997/Files (100) (9).zip
data/raw_data/1995-1997/Files (100) (5).ZIP
data/raw_data/1995-1997/Files (100) (4).ZIP
data/raw_data/1995-1997/Files (100) (8).ZIP
data/raw_data/1995-1997/Files (100) (3).ZIP
data/raw_data/1995-1997/Files (100) (2).ZIP
data/raw_data/1995-1997/Files (100) (1).ZIP
data/raw_data/1995-1997/Files (100).zip
data/raw_data/1995-1997/Files (100) (7).ZIP
data/raw_data/1995-1997/Files (100) (6).ZIP
data/raw_data/2013/Files (100) (9).ZIP
data/raw_data/2013/Files (100) (5).ZIP
data/raw_data/2013/Files (100) (4).ZIP
data/raw_data/2013/Files (100) (8).ZIP
data/raw_data/2013/Files (100) (3).ZIP
data/raw_data/2013/Files (100) (2).ZIP
data/raw_data/2013/Files (100) (1).ZIP
data/raw_data/2013/Files (100).zip
data/raw_data/2013/Files (100) (7).ZIP
data/raw_data/2013/Files (100) (6).ZIP
data/raw_data/2014/Files (100) (9).ZIP
data/raw_data/2014/Files (100) (5).ZIP
data/raw_data/2014/Files (100) (4).ZIP
data/raw_data/2014/Files (100) (8).ZIP
data/raw_data/2014/Fil

In [3]:
import os
from striprtf.striprtf import rtf_to_text

def extract_text_from_rtf_files(input_directory, output_directory):
    # Create the output directory if it doesn't exist
    os.makedirs(output_directory, exist_ok=True)
    
    # Iterate over all files and directories in the input directory
    for root, dirs, files in os.walk(input_directory):
        for file_name in files:
            if file_name.endswith('.rtf'):
                input_file_path = os.path.join(root, file_name)
                output_file_path = os.path.join(output_directory, os.path.splitext(file_name)[0] + ".txt")
                try:
                    # Extract text from the RTF file
                    with open(input_file_path, 'r', encoding='utf-8', errors='ignore') as rtf_file:
                        rtf_text = rtf_file.read()
                        text = rtf_to_text(rtf_text).strip().replace('\xa0', ' ')
                    
                    # Write the extracted text to a TXT file
                    with open(output_file_path, 'w', encoding='utf-8') as output_file:
                        output_file.write(text)
                
                except Exception as e:
                    print(f"Error extracting text from {input_file_path}: {e}")


# Example usage
input_directory = 'data/rtf_data'
output_directory = 'data/txt_data'

delete_files_in_directory(output_directory)
extract_text_from_rtf_files(input_directory, output_directory)


In [13]:
import re

def delete_within_parentheses_and_after_commas(string):
    # Delete text within parentheses
    string = re.sub(r'\([^)]*\)', '', string)
    # Delete text after commas
    string = re.sub(r',.*', '', string)
    return string

publisher_file = "publishers"

with open(publisher_file, 'r', encoding='utf-8') as file:
    # Read all lines from the file and store them in the array
    publishers = file.readlines()

publishers = [delete_within_parentheses_and_after_commas(s.strip().lower()) for s in publishers if len(s) > 0 and s[0].isalpha()]
print(publishers)

['the new york times', 'university wire', 'st. louis post-dispatch ', 'pittsburgh post-gazette', 'the charleston gazette-mail', 'the christian science monitor', 'the philadelphia inquirer', 'the deseret news', 'telegraph herald ', 'usa today', 'tampa bay times', 'the atlanta journal-constitution', 'daily news ', 'chicago daily herald', 'dayton daily news ', 'the tampa tribune ', 'the bismarck tribune', 'richmond times dispatch ', 'new york sun ', 'star tribune ', 'the columbian ', 'the salt lake tribune', 'the spokesman-review', 'new york times abstracts', 'the philadelphia daily news', 'bangor daily news ', 'pittsburgh tribune review', 'the pantagraph', 'wisconsin state journal', 'lnp ', 'the forward', 'business insurance', 'the capital ', 'lancaster  newspapers', 'portland press herald', 'the will ', 'the santa fe new mexican', 'contentengine think tank newswire english', 'ce noticias financieras ', 'american banker', 'anchorage daily news', 'air force times - newspaper edition', 'th

In [14]:
import os
import re
import json
from datetime import datetime

def extract_article_info(title, article_text):
    publisher = "PUBLISHER NOT FOUND"
    for i in range(len(publishers)):
        if article_text.lower().find(publishers[i]) != -1:
            publisher = publishers[i]
            break
            
    if publisher == "PUBLISHER NOT FOUND":
        print(title)
        print(article_text.split('\n')[1])

    try:
        match = re.search(r'(?i)\b(?:Jan(?:uary)?|Feb(?:ruary)?|Mar(?:ch)?|Apr(?:il)?|May?|Jun(?:e)?|Jul(?:y)?|Aug(?:ust)?|Sep(?:tember)?|Oct(?:ober)?|Nov(?:ember)?|Dec(?:ember)?) (0?[1-9]|[12][0-9]|3[01]), [0-9]+', article_text)
        date = match.group()

        date = datetime.strptime(date, '%B %d, %Y')
        day = date.day
        month = date.month
        year = date.year
    except AttributeError:
        year = "0000"
        month = "00"
        day = "00"   
    except ValueError:
        year = "0000"
        month = "00"
        day = "00"

    full_text = "\n".join(article_text.split("\n")[article_text.split("\n").index("Body")+1:article_text.split('\n').index("End of Document")]).strip().strip('\n')

    article_info = {"title": title, 
                    "publisher": publisher,
                    "year": year, 
                    "month": month, 
                    "day": day, 
                    "full text": full_text}
    
    return article_info

def process_txt_files(input_directory, output_directory):
    # Create the output directory if it doesn't exist
    os.makedirs(output_directory, exist_ok=True)

    # Iterate over all files and directories in the input directory
    for root, dirs, files in os.walk(input_directory):
        for file_name in files:
            if file_name.endswith('.txt'):
                input_file_path = os.path.join(root, file_name)
                output_file_path = os.path.join(output_directory, os.path.splitext(file_name)[0] + ".json")
                try:
                    # Read the text file
                    with open(input_file_path, 'r', encoding='utf-8') as file:
                        article_text = file.read()
                    
                    # Extract article information
                    if "Body" in article_text:
                        article_info = extract_article_info(os.path.splitext(file_name)[0], article_text)
                    else:
                         continue

                    # Write the extracted information to a JSON file
                    with open(output_file_path, 'w', encoding='utf-8') as output_file:
                        json.dump(article_info, output_file, ensure_ascii=False, indent=4)
                
                except Exception as e:
                    print(f"Error processing {input_file_path}: {e}")

# Example usage
input_directory = 'data/txt_data'
output_directory = 'data/json_data'

delete_files_in_directory(output_directory)
process_txt_files(input_directory, output_directory)



In [15]:
import os
import json
import csv

def parse_json_files_to_csv(input_directory, output_csv):
    # Initialize a list to store all rows
    all_rows = []

    # Iterate over all files and directories in the input directory
    for root, dirs, files in os.walk(input_directory):
        for file_name in files:
            if file_name.endswith('.json'):
                input_file_path = os.path.join(root, file_name)
                try:
                    # Read the JSON file
                    with open(input_file_path, 'r', encoding='utf-8') as file:
                        data = json.load(file)
                    
                    # Append the data from the JSON file to the list of rows
                    all_rows.append(data)
                
                except Exception as e:
                    print(f"Error parsing {input_file_path}: {e}")

    print(len(all_rows))
    # Get the keys (column names) from the first JSON object
    if all_rows:
        fieldnames = list(all_rows[0].keys())
    else:
        print("No JSON files found.")
        return

    # Write the data to a CSV file
    with open(output_csv, 'w', newline='', encoding='utf-8') as csvfile:
        writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
        writer.writeheader()  # Write the header row
        for row in all_rows:
            writer.writerow(row)

# Example usage
input_directory = 'data/json_data'
output_csv = 'data/output.csv'

parse_json_files_to_csv(input_directory, output_csv)


24175
