In [None]:
# Install pandas
!pip install pandas

In [None]:
import pandas as pd
import json
from collections import defaultdict
from itertools import combinations

In [None]:
# Cap displayed cell text at 50 characters to keep previews compact
pd.options.display.max_colwidth = 50

In [None]:
import os

# Define the path to the input-data folder
input_data_folder = '../input-data'

# List the regular, non-hidden files in the input-data folder.
# os.listdir returns entries in arbitrary order and also yields hidden files
# (e.g. .DS_Store, .ipynb_checkpoints) and sub-directories, none of which
# pd.read_json can parse; sorting keeps the row order reproducible.
files = sorted(
    name for name in os.listdir(input_data_folder)
    if not name.startswith('.')
    and os.path.isfile(os.path.join(input_data_folder, name))
)

# Fail early with a clear message rather than pd.concat's generic error
if not files:
    raise FileNotFoundError(f"No input files found in {input_data_folder!r}")

# Initialize an empty list to store dataframes
dataframes = []

# Loop through each file and read it into a dataframe
for file in files:
    file_path = os.path.join(input_data_folder, file)
    df = pd.read_json(file_path)
    dataframes.append(df)

# Concatenate all dataframes into one unique dataframe
full_year = pd.concat(dataframes, ignore_index=True)

# Display two random rows as a sanity check
full_year.sample(2)

In [None]:
# Number of rows in the combined dataframe
full_year.shape[0]

This notebook has two goals. The first is to find which categories are used most often across news coverage. The second is to find how much overlap there is between categories. Ultimately, the idea is to create a dataset where each category is a node, plus a corresponding dataset of links between categories, where two nodes sit closer together or farther apart depending on how many articles the two categories have in common.

## What categories are used the most?

First, I clean the data to remove unused features. For this task, I need only:
- `_id`, which identifies the unique article in the dataset
- `section_name`, which defines the macro-category the article belongs to
- `keywords`, a list of words that describe the article
- `pub_date`, the publication date of the article
- `pub_month`, which can be extracted from `pub_date` and will help in filtering the dataset

In [None]:
# Work on an independent copy that holds only the columns needed below
essential_columns = ["_id", "section_name", "keywords", "pub_date"]
full_year_essential = full_year[essential_columns].copy()

In [None]:
# Reduce each article's keywords (a list of dicts) to a plain list holding
# only the 'value' of every keyword.
# The values are read from the untouched `full_year` frame rather than from
# the column being overwritten, so this cell can be re-run safely; rows
# whose keywords are not a list (e.g. missing) become an empty list.
full_year_essential['keywords'] = full_year['keywords'].apply(
    lambda x: [keyword['value'] for keyword in x] if isinstance(x, (list, tuple)) else []
)

In [None]:
# Parse pub_date into datetimes once, then reuse the parsed values
parsed_pub_dates = pd.to_datetime(full_year_essential["pub_date"])
full_year_essential["pub_date"] = parsed_pub_dates
# Keep the publication month in its own column
full_year_essential["pub_month"] = parsed_pub_dates.dt.month

In [None]:
# Preview two random rows of the trimmed dataframe
full_year_essential.sample(n=2)

Now that I have all the essentials, I need to explode the dataset so that each category becomes a row. 

In [None]:
# Independent copy, so exploding does not touch full_year_essential
exploded_categories = full_year_essential.copy(deep=True)

In [None]:
# One row per (article, keyword): each list element gets its own row
exploded_categories = exploded_categories.explode(column='keywords')

In [None]:
# Each row now carries a single keyword, so use the singular column name
exploded_categories = exploded_categories.rename({"keywords": "keyword"}, axis="columns")

This is a little test to check whether or not the explode function worked properly:

In [None]:
# Remove the column-width cap so the keyword list is not truncated
pd.set_option('display.max_colwidth', None)

# Select the reference article from the un-exploded dataframe
is_test_article = full_year_essential['_id'] == 'nyt://video/89abe124-169d-590e-8db8-6eaf21316500'
article_test = full_year_essential.loc[is_test_article]
print(article_test["keywords"])

In [None]:
# Same article in the exploded dataframe: expect one row per keyword
is_test_article = exploded_categories['_id'] == 'nyt://video/89abe124-169d-590e-8db8-6eaf21316500'
categories_test = exploded_categories.loc[is_test_article]
categories_test

In [None]:
# Count the rows of the exploded dataframe that carry each keyword
rows_by_keyword = exploded_categories.groupby('keyword')
nodes = rows_by_keyword.size().rename('count').reset_index()
nodes

The resulting dataframe has only two columns. When creating the visualization, the `keyword` column will be the name of our node. The `count` column will become the size of the node.

In [None]:
# Most frequent keywords first
nodes.sort_values("count", ascending=False)

## Find overlap between categories

Taking a list of categories from one article, for example [banana, apple, carrot], I end up with three separate nodes: banana, apple, and carrot. These three nodes are related to each other, because they appear in the same list. By taking each keyword by itself and pairing it with other ones, I am able to find how many times said keywords appear together throughout the dataset. This information allows me to determine how much overlap there is between different categories of articles. 

Below, I iterate over the individual keywords for each article and create a pair of words `(word1, word2)` and count how many times said pairing repeats. Then, I save the result in a dataset that will represent the edges of my network. Each entry has a `source`, `target`, and `strength` property, with `strength` being the frequency of each pairing throughout the data.

In [None]:
# Dictionary to store pair frequencies: (keyword_a, keyword_b) -> number of
# articles in which both keywords appear
pair_counts = defaultdict(int)

# Iterate over rows to count co-occurrences
for keywords in full_year_essential["keywords"]:
    # De-duplicate first so a keyword repeated within one article cannot
    # produce a self-pair (a, a) or count the same pair twice for that
    # article; sort so (a, b) and (b, a) always map to the same key.
    for word1, word2 in combinations(sorted(set(keywords)), 2):
        pair_counts[(word1, word2)] += 1

# Convert to DataFrame: one row per keyword pair
edges = pd.DataFrame(
    [(source, target, strength) for (source, target), strength in pair_counts.items()],
    columns=["source", "target", "strength"],
)

# Rich (truncated) display instead of print
edges

In [None]:
# Strongest keyword pairings first
edges.sort_values("strength", ascending=False)

## Combine the nodes and edges datasets

Maybe there is a smarter way to do this, but basically I just need to export these data to a json with two properties: nodes and edges. This will make it easy to handle the data in D3. First, I export two separate files and then I import them as jsons and merge them together.

In [None]:
# Write the nodes as a JSON array with one object per row
nodes.to_json(path_or_buf='../data/temp-nodes.json', orient='records')

In [None]:
# Write the edges as a JSON array with one object per row
edges.to_json(path_or_buf='../data/temp-edges.json', orient='records')

In [None]:
# Reload the exported node records as plain Python objects
with open('../data/temp-nodes.json', encoding='utf-8') as f:
    d_nodes = json.load(f)

# Show a count and a short preview only: printing every record floods the
# notebook output
print(f"{len(d_nodes)} node records loaded")
d_nodes[:5]

In [None]:
# Reload the exported edge records as plain Python objects
with open('../data/temp-edges.json', encoding='utf-8') as f:
    d_edges = json.load(f)

# Show a count and a short preview only: printing every record floods the
# notebook output
print(f"{len(d_edges)} edge records loaded")
d_edges[:5]

In [None]:
# Single object with the two properties the D3 visualization reads
network = {'nodes': d_nodes, 'edges': d_edges}

In [None]:
# Serialize the combined network and write it out in one go
serialized_network = json.dumps(network)
with open('../data/network.json', 'w') as out_file:
    out_file.write(serialized_network)