In [1]:
pip install newspaper3k

Note: you may need to restart the kernel to use updated packages.


In [2]:
from newspaper import Article
import pandas as pd
import os

In [3]:
def get_article_info(file_location):
    """Parse a locally saved HTML file and return its main article fields.

    Parameters
    ----------
    file_location : str
        Path of the HTML file to read. It is also handed to ``Article`` as
        the URL, so it becomes the fallback value of the ``'url'`` field.

    Returns
    -------
    dict
        Keys ``'title'``, ``'text'``, ``'url'``, ``'authors'``, ``'date'``,
        ``'description'``, ``'site'`` and ``'publisher'``.

    Raises
    ------
    OSError
        If ``file_location`` cannot be opened for reading.
    """
    with open(file_location, 'rb') as fh:
        html = fh.read()

    # The markup is supplied directly through set_html(); download() is never
    # called here, so parsing works on the bytes read from disk.
    article = Article(url=file_location)
    article.set_html(html)
    article.parse()

    meta = article.meta_data

    # NOTE(review): the 'og' entry is presumably a dict of Open Graph fields,
    # but a page could leave a scalar (or nothing) under that key; fall back to
    # an empty mapping so the .get() lookups below cannot raise AttributeError.
    og = meta.get('og')
    if not isinstance(og, dict):
        og = {}

    return {
        'title': article.title,
        'text': article.text,
        'url': og.get('url', article.url),
        'authors': article.authors,
        'date': article.publish_date,
        'description': article.meta_description,
        'site': og.get('site_name', ''),
        # .get() keeps the empty-dict result for pages without publisher
        # metadata while avoiding inserting a key into meta_data.
        'publisher': meta.get('publisher', {}),
    }

In [4]:

def load_existing_data(json_file):
    """Read previously saved records from ``json_file`` into a DataFrame.

    Returns an empty DataFrame when the file is missing or when pandas
    raises ``ValueError`` while reading it.
    """
    try:
        existing = pd.read_json(json_file)
    except (FileNotFoundError, ValueError):
        # Nothing usable on disk yet: start from an empty frame.
        existing = pd.DataFrame()
    return existing

def is_file_processed(df, file_path):
    """Tell whether ``file_path`` already appears in ``df['file_location']``.

    Returns ``False`` when the frame has no ``'file_location'`` column.
    """
    # Guard clause: a frame without the tracking column has recorded nothing.
    if 'file_location' not in df.columns:
        return False

    matches = df['file_location'].isin([file_path])
    return matches.any()

def update_dataframe(df, file_path):
    """Append the parsed article for ``file_path`` as a new row of ``df``.

    The frame is modified in place and also returned. The row carries every
    field produced by ``get_article_info`` plus ``'file_location'``.
    """
    record = {**get_article_info(file_path), 'file_location': file_path}

    # An empty frame gets one object-typed column per record field so the
    # row assignment below has columns to fill.
    if df.empty:
        for column in record:
            df[column] = pd.Series(dtype='object')

    # The current row count is used as the label of the appended row.
    df.loc[len(df)] = record
    return df


def save_to_json(df, json_file):
    """Write ``df`` to ``json_file`` as a JSON list of records.

    Dates are serialized in ISO format.
    """
    df.to_json(
        path_or_buf=json_file,
        orient="records",
        date_format="iso",
    )

def process_files(folder_path, json_file):
    """Parse every not-yet-recorded file in ``folder_path`` and save the results.

    Records already stored in ``json_file`` are loaded first; files whose path
    is already listed there are skipped. Files that fail to parse are reported
    and skipped, and the combined records are written back to ``json_file``.

    Parameters
    ----------
    folder_path : str
        Directory whose entries are parsed.
    json_file : str
        Path of the JSON file used both as the existing store and the output.

    Raises
    ------
    OSError
        If ``folder_path`` cannot be listed.
    """
    df = load_existing_data(json_file)

    # Listing happens before the try block so a bad folder path fails without
    # writing anything; sorting makes the processing order deterministic.
    file_names = sorted(os.listdir(folder_path))

    try:
        for file_name in file_names:
            file_path = os.path.join(folder_path, file_name)
            if is_file_processed(df, file_path):
                continue
            try:
                df = update_dataframe(df, file_path)
            except Exception as exc:
                # Best effort: report the failing file with its reason and move
                # on. KeyboardInterrupt/SystemExit are deliberately not caught
                # so the run can still be stopped.
                print(f"{file_path}: {exc!r}")
    finally:
        # Persist whatever was parsed, even if the loop was interrupted, so a
        # later run can resume from the saved records.
        save_to_json(df, json_file)


In [5]:
# Run the pipeline: parse the files under 'HTML' and store the records
# in 'articles_info.json'.
json_file = 'articles_info.json'
folder_path = 'HTML'
process_files(folder_path=folder_path, json_file=json_file)


In [6]:
# Reload the saved records and preview a few of them.
df = pd.read_json('articles_info.json')
print(len(df))
# A fixed seed keeps the preview identical across re-runs; min() avoids the
# ValueError that sample() raises when fewer than 3 rows are available.
df.sample(min(3, len(df)), random_state=42)

3019


Unnamed: 0,title,text,url,authors,date,description,site,publisher,file_location
644,VIDEO: South Carolinians to rally this weekend...,Police: Baby died after mother mistakenly put ...,https://www.wbrc.com/video/2023/06/12/video-so...,[],2023-06-12T00:00:00.000,,https://www.wbrc.com,{},HTML/https-www-wbrc-com-video-2023-06-12-video...
2919,Telegram: Contact @WhiteLivesMatterOfficial,,:/HTML/https-t-me-whitelivesmatterofficial-880...,[],,,Telegram,{},HTML/https-t-me-whitelivesmatterofficial-8804....
2732,Ballston Spa eyes new permit requirements afte...,Belmont Stakes at Saratoga: What we'll see\n\n...,https://www.dailygazette.com/archives/ballston...,[Tyler A. Mcneil],2023-08-06T00:00:00.000,BALLSTON SPA — Ballston Spa officials have cal...,The Daily Gazette,{},HTML/https-dailygazette-com-2023-08-06-ballsto...
