# Fake News dataset exploration & preprocessing

In [1]:
import pandas as pd

from collections import Counter

In [2]:
# Load the two raw tables (paths are relative to the notebook's working directory):
#   bodies  - one row per article body
#   stances - one row per labelled headline / body pair
bodies  = pd.read_csv('data/fake_news/bodies.csv')
stances = pd.read_csv('data/fake_news/stances.csv')

In [3]:
# Check the bodies table for null values: info() lists the non-null count and dtype of every column
bodies.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1683 entries, 0 to 1682
Data columns (total 2 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   Body ID      1683 non-null   int64 
 1   articleBody  1683 non-null   object
dtypes: int64(1), object(1)
memory usage: 26.4+ KB


In [4]:
# Check the stances table for null values (non-null count and dtype per column)
stances.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 49972 entries, 0 to 49971
Data columns (total 3 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   Headline  49972 non-null  object
 1   Body ID   49972 non-null  int64 
 2   Stance    49972 non-null  object
dtypes: int64(1), object(2)
memory usage: 1.1+ MB


In [5]:
# Check for duplicates: for every distinct article text, gather the Body IDs
# that carry it ('unique') together with how many there are ('nunique')
unique_count  = bodies.groupby('articleBody').agg(['unique', 'nunique'])
body_id_stats = unique_count['Body ID']

# A text attached to more than one Body ID is a duplicated body; keep its ID arrays
duplicates = body_id_stats.loc[body_id_stats['nunique'] > 1, 'unique'].tolist()

print(f'Number of body duplicates: {len(duplicates)}')

Number of body duplicates: 14


In [6]:
# Turn the list of ID groups into a lookup: every Body ID after the first one
# in a group points to that group's first Body ID
duplicates = {
    redundant_id: same_text_ids[0]
    for same_text_ids in duplicates
    for redundant_id in same_text_ids[1:]
}

# Drop the repeated article texts, keeping the first occurrence of each (the default)
bodies = bodies.drop_duplicates(subset=['articleBody'])

print(f'Number of bodies after duplicate removal: {bodies.shape[0]}')

Number of bodies after duplicate removal: 1669


In [7]:
# Point stance rows that reference a removed duplicate body at the surviving body;
# IDs without an entry in the lookup are left unchanged
stances['Body ID'] = stances['Body ID'].map(lambda body_id: duplicates.get(body_id, body_id))

# Keep only the first occurrence of every Headline - Body pair (the default)
stances = stances.drop_duplicates(subset=['Headline', 'Body ID'])

print(f'Number of headline-body pairs after duplicate removal: {stances.shape[0]}')

Number of headline-body pairs after duplicate removal: 49538


In [8]:
# Collect the distinct headline texts that remain in the stances table
unique_heads = {*stances['Headline']}

print(f'Number of unique headlines: {len(unique_heads)}')

Number of unique headlines: 1648


In [9]:
# Replace headline by ID in Stances table.
# Headline IDs start one past the largest Body ID, so the two ID ranges never overlap.
max_body_id = max(stances['Body ID']) + 1

# Enumerate the headlines in sorted order. Iterating the set directly follows string
# hashes, which Python randomizes per process, so the headline -> ID assignment would
# change on every kernel restart and any saved IDs would not be reproducible.
head_to_id  = {headline: idx + max_body_id for idx, headline in enumerate(sorted(unique_heads))}

# NOTE: this cell is not re-runnable on its own; once the column holds IDs, the
# headline-text lookup below raises KeyError.
stances['Headline'] = stances['Headline'].apply(lambda headline: head_to_id[headline])

In [10]:
# Check for data points with conflicting labels.
# `stances` was already reduced to one row per (Headline, Body ID) pair by the earlier
# drop_duplicates(..., keep='first'), so counting label variants on it can only ever
# give 0. Run the check on the raw stances file instead, after applying the same
# duplicate-body remapping, and count the pairs that carry more than one distinct Stance.
raw_stances = pd.read_csv('data/fake_news/stances.csv')
raw_stances['Body ID'] = raw_stances['Body ID'].map(lambda body_id: duplicates.get(body_id, body_id))

num_conflicts = (raw_stances.groupby(['Headline', 'Body ID'])['Stance'].nunique() > 1).sum()

print(f'Num conflicting data points: {num_conflicts}')

Num conflicting data points: 0


In [11]:
# Check class distribution: count the rows per stance label, rarest label first
stance_counts = Counter(stances['Stance'])
dist = sorted(stance_counts.items(), key=lambda item: item[1])

total = len(stances)

print(f'Dataset size: {total}\n')
print('Class distribution:')

for label, count in dist:
    # Share of the whole dataset, in percent
    share = count * 100 / total
    print(f'    {label:<9s}: {share:>5.2f}%, ({count:>5d})')

Dataset size: 49538

Class distribution:
    disagree :  1.67%, (  829)
    agree    :  7.35%, ( 3643)
    discuss  : 17.77%, ( 8804)
    unrelated: 73.20%, (36262)


## Dataset partitioning problem

