# In [4]:
import json
from sklearn.feature_extraction.text import TfidfVectorizer

def calculate_tf_idf(json_file):
    """Compute TF-IDF scores for every article stored in a JSON file.

    The file is expected to hold a JSON array of objects, each with a
    ``"title"`` entry and a ``"content"`` entry. ``"content"`` may be a
    sequence of string tokens (joined with single spaces before
    vectorizing) or an already-joined string (used as is).

    Args:
        json_file: Path of the JSON file to read; it is decoded as UTF-8.

    Returns:
        A list with one dict per article, in file order, shaped like
        ``{"title": <title>, "tfidf": {<term>: <score>, ...}}``. Each
        ``"tfidf"`` dict has one entry per term of the vocabulary fitted on
        the whole corpus, so terms that do not occur in an article map to
        ``0.0``. An empty article list yields ``[]``.

    Raises:
        KeyError: If an article lacks ``"title"`` or ``"content"``.
        ValueError: Propagated from ``TfidfVectorizer`` when no usable
            term is found in a non-empty corpus.
    """
    # Decode explicitly as UTF-8 instead of relying on the platform default
    # encoding, which can reject or garble non-ASCII article text.
    with open(json_file, "r", encoding="utf-8") as f:
        data = json.load(f)

    # Fitting the vectorizer on an empty corpus raises; there is nothing to
    # score, so return an empty result instead.
    if not data:
        return []

    corpus = []
    for article in data:
        content = article["content"]
        # " ".join on a plain string would insert a space between every
        # character, leaving only one-letter tokens that the vectorizer's
        # default token pattern discards. Only token sequences are joined.
        if isinstance(content, str):
            corpus.append(content)
        else:
            corpus.append(" ".join(content))

    # Learn the vocabulary and IDF weights over the whole corpus, producing
    # one sparse row of scores per article.
    vectorizer = TfidfVectorizer()
    X = vectorizer.fit_transform(corpus)

    # Column j of X holds the scores for feature_names[j].
    feature_names = vectorizer.get_feature_names_out()

    # Densify one row at a time (never the full matrix) and convert the
    # scores to plain Python floats so the result is JSON-native.
    return [
        {
            "title": article["title"],
            "tfidf": dict(zip(feature_names, X[i].toarray()[0].tolist())),
        }
        for i, article in enumerate(data)
    ]

# Score every article found in the cleaned-up corpus file.
json_file = "cleanup.json"
tf_idf_results = calculate_tf_idf(json_file)

# Serialize the scores with a 4-space indent and write them out in one go.
with open("tf-idf.json", "w") as f:
    f.write(json.dumps(tf_idf_results, indent=4))