# Extracting PolitiFact and GossipCop Datasets into .csv Files from Downloaded JSON Files

**Reference to FakeNewsNet GitHub Repository**: https://github.com/KaiDMML/FakeNewsNet

In [None]:
# Imports the required libraries for loading and processing the datasets
import pandas as pd
import os
import json

In [None]:
# Mounts Google Drive so the Colab runtime can read the datasets stored there
from google.colab import drive
drive.mount("/content/drive")

# Sets up the paths used to load the datasets from Google Drive

# Path to the Google Drive Colab folder that contains the datasets (keeps its trailing slash)
ds_path = "/content/drive/My Drive/Colab Notebooks/"

# Paths to the two FakeNewsNet dataset folders inside that Colab folder
gc_path = f"{ds_path}gossipcop"
pf_path = f"{ds_path}politifact"

Mounted at /content/drive


In [None]:
# Loads the "news content.json" files from GossipCop and PolitiFact to store news content in a DataFrame and save to .csv

def getTextFromJSON(json_object):
    """
    Extracts the "text" field from a PolitiFact or GossipCop JSON object representing one news article.

        Input Parameters:
        json_object (dict): Parsed JSON news content for one dataset file

        Output:
        str: The news text with surrounding whitespace stripped, or an empty string when
             json_object is not a dict or its "text" value is missing or is not a string
    """

    # Anything other than a dict (e.g. a list or None) has no "text" field to read
    if not isinstance(json_object, dict):
        return ''

    # Uses .get for robustness in case the text field is missing from the dict
    text = json_object.get("text")

    # A JSON null or other non-string value would make .strip() raise AttributeError,
    # so it is treated the same as a missing text field
    if not isinstance(text, str):
        return ''

    # Strips the whitespace from the text and returns the news content
    return text.strip()


def saveLoadingProgress(data, dataset_name, save_directory, num_articles):
    """
    Writes the news samples collected so far to a checkpoint .csv file.

    The file is saved as "<save_directory>/<dataset_name>_saved_articles/news_articles_<num_articles>.csv".

    Input Parameters:

      data (list): A list of dicts containing news texts
      dataset_name (str): The name of the dataset being collected (PolitiFact, GossipCop)
      save_directory (str): Directory under which the checkpoint folder is created
      num_articles (int): Total number of articles processed so far (used in the filename)
    """

    # Per-dataset checkpoint folder; created on first use and reused afterwards
    target_dir = os.path.join(save_directory, f"{dataset_name}_saved_articles")
    os.makedirs(target_dir, exist_ok=True)

    # The article count is part of the filename so each checkpoint gets its own file
    csv_path = os.path.join(target_dir, f"news_articles_{num_articles}.csv")

    # Turns the list of sample dicts into a DataFrame and writes it without the index column
    pd.DataFrame(data).to_csv(csv_path, index=False)

    # Logs that the save has been successful
    print(f"\nSaved {len(data)} articles to {csv_path}")


def loadNews(path, dataset, max_samples=None):
    """
    Loads either GossipCop or PolitiFact news texts from the "<dataset_name>/<fake_or_real>/<sample_id>/news content.json" path.
    E.g. from "gossipcop/fake/gossipcop-956103/news content.json"

    Progress checkpoints and the final collected data are written with saveLoadingProgress
    into the folder given by the module-level ds_path variable.

        Input Parameters:
          path (str): Path to the dataset directory (GossipCop or PolitiFact)
          dataset (str): Dataset name ("gossipcop" or "politifact")
          max_samples (int): Maximum number of news files to process per category (real / fake).
                             Default (None) means process all the news samples

        Output:
          pandas.DataFrame: DataFrame with columns 'id', 'text' and 'label' (1 = fake, 0 = real)
          OR
          None if the "real" or "fake" directory does not exist under path
    """

    # Defines a list to store all of the dicts containing news samples:
    # keys will be: id (of sample), news text and label (1 = fake, 0 = real)
    news_data = []

    # Counts how many files in each category have been processed to track the loading progress
    counter = {"real": 0, "fake": 0}

    # Keeps a count of the articles actually stored (i.e. those with non-empty text)
    total_articles_processed = 0

    # Iterates over the directories for both real and fake news
    for news_cat in ["real", "fake"]:

        # Sets up the path to the real or fake news directory for the dataset
        news_cat_path = os.path.join(path, news_cat)

        # Stops early and returns None if the category folder is missing for this dataset
        if not os.path.exists(news_cat_path):
            print(f"Error: '{news_cat_path}' path does not exist")
            return None
        print("Path exists")

        # Retrieves all the subdirectories (one per news sample) in this particular news category
        subdirectories = os.listdir(news_cat_path)
        num_dirs = len(subdirectories)

        print(f"Log: Done collecting {num_dirs} subdirectory names for category: {news_cat}")

        # The label is the same for every sample in a category (0 is real, 1 is fake news)
        label = 1 if news_cat == "fake" else 0

        # Iterates through the subdirectories in the news (fake vs real) category for this dataset
        for subdir in subdirectories:

            # Stops once max_samples files have been processed for this category (only when a limit is set)
            if max_samples is not None and counter[news_cat] >= max_samples:
                break

            # Gets the path to the JSON news text sample
            sample_path = os.path.join(news_cat_path, subdir, "news content.json")

            # Ignores samples whose news content file is missing or empty (some of them are)
            if not (os.path.exists(sample_path) and os.path.getsize(sample_path) > 0):
                continue

            # Tries to parse the sample; a file that cannot be decoded or parsed is logged and skipped
            try:
                with open(sample_path, 'r', encoding="utf-8") as f:
                    news_content = json.load(f)
            except (json.JSONDecodeError, UnicodeDecodeError) as e:
                print(f"Skipping the file {sample_path} due to this loading error: {str(e)}")
                continue

            # Extracts the news text (stored under the "text" field) using the helper function
            text_content = getTextFromJSON(news_content)

            # Only stores the sample when its text is not empty / whitespace-only
            has_text = bool(text_content and text_content.strip())
            if has_text:
                news_data.append({
                    "id": subdir,  # Stores the sample ID
                    "text": text_content,  # Stores the news text
                    "label": label,  # Stores the label (0 is real, 1 is fake news)
                })

                # Updates the progress of how many articles have been stored
                total_articles_processed += 1

            # Prints the progress of how many files have been handled in the real or fake news directory
            counter[news_cat] += 1
            print(f"{counter[news_cat]}/{num_dirs}...")

            # Saves the progress to disk cumulatively every 1000 stored articles, so do not have to restart
            # if something goes wrong. Only checked right after an article was added, so no empty checkpoint
            # is written at count 0 and the same checkpoint is not rewritten while files are being skipped.
            if has_text and total_articles_processed % 1000 == 0:
                saveLoadingProgress(
                    news_data,
                    dataset,
                    ds_path,
                    total_articles_processed
                )

    # Saves the final news data collected
    if news_data:
        saveLoadingProgress(
            news_data,
            dataset,
            ds_path,
            total_articles_processed
        )

    # Creates a DataFrame from collected data
    df = pd.DataFrame(news_data)

    print(f"Number of total articles extracted from all real and fake subfolders: {len(df)}")

    # Checks for missing values in resulting dataset
    print(f"\nMissing values: {df.isnull().sum()}")

    return df

In [None]:
# Runs the extraction over the PolitiFact folder (slow: the dataset is large, expect this to take hours)
politifact_df = loadNews(path=pf_path, dataset="politifact")

Path exists
Log: Done collecting 624 subdirectory names for category: real
1/624...
2/624...
3/624...
4/624...
5/624...
6/624...
7/624...
8/624...
9/624...
10/624...
11/624...
12/624...
13/624...
14/624...
15/624...
16/624...
17/624...
18/624...
19/624...
20/624...
21/624...
22/624...
23/624...
24/624...
25/624...
26/624...
27/624...
28/624...
29/624...
30/624...
31/624...
32/624...
33/624...
34/624...
35/624...
36/624...
37/624...
38/624...
39/624...
40/624...
41/624...
42/624...
43/624...
44/624...
45/624...
46/624...
47/624...
48/624...
49/624...
50/624...
51/624...
52/624...
53/624...
54/624...
55/624...
56/624...
57/624...
58/624...
59/624...
60/624...
61/624...
62/624...
63/624...
64/624...
65/624...
66/624...
67/624...
68/624...
69/624...
70/624...
71/624...
72/624...
73/624...
74/624...
75/624...
76/624...
77/624...
78/624...
79/624...
80/624...
81/624...
82/624...
83/624...
84/624...
85/624...
86/624...
87/624...
88/624...
89/624...
90/624...
91/624...
92/624...
93/624...
94/6

In [None]:
# Saves the final DataFrame as .csv
# loadNews returns None when the real/fake folder is missing, so fail with a clear message
# here instead of an AttributeError raised by calling .to_csv on None
if politifact_df is None:
    raise RuntimeError("PolitiFact extraction failed: loadNews returned None, so there is nothing to save")
politifact_csv_path = os.path.join(ds_path, "politifact.csv")
politifact_df.to_csv(politifact_csv_path, index=False)

In [None]:
# Runs the extraction over the GossipCop folder (slow: the dataset is large, expect this to take hours)
gossip_df = loadNews(path=gc_path, dataset="gossipcop")

Streaming output truncated to the last 5000 lines.
15265/16817...
15266/16817...
15267/16817...
15268/16817...
15269/16817...
15270/16817...
15271/16817...
15272/16817...
15273/16817...
15274/16817...
15275/16817...
15276/16817...
15277/16817...
15278/16817...
15279/16817...
15280/16817...
15281/16817...
15282/16817...
15283/16817...
15284/16817...
15285/16817...
15286/16817...
15287/16817...
15288/16817...
15289/16817...
15290/16817...
15291/16817...
15292/16817...
15293/16817...
15294/16817...
15295/16817...
15296/16817...
15297/16817...
15298/16817...
Path exists
Log: Done collecting 5323 subdirectory names for category: fake
1/5323...
2/5323...
Handled 15300 news texts in the fake news category out of 5323 so far...
3/5323...
4/5323...
5/5323...
6/5323...
7/5323...
8/5323...
9/5323...
10/5323...
11/5323...
12/5323...
13/5323...
14/5323...
15/5323...
16/5323...
17/5323...
18/5323...
19/5323...
20/5323...
21/5323...
22/5323...
23/5323...
24/5323...
25/5323...
26/5323...

In [None]:
# Saves the final DataFrame as .csv
# loadNews returns None when the real/fake folder is missing, so fail with a clear message
# here instead of an AttributeError raised by calling .to_csv on None
if gossip_df is None:
    raise RuntimeError("GossipCop extraction failed: loadNews returned None, so there is nothing to save")
gossipcop_csv_path = os.path.join(ds_path, "gossipcop.csv")
gossip_df.to_csv(gossipcop_csv_path, index=False)