# Exploration of 995k Fake News Corpus

## Raw dataset

In [None]:
import pandas as pd
import seaborn as sns

# load raw dataset
# index_col=0 tells pandas to use the first CSV column as the row index
src = 'data/995,000_rows.csv'
raw_data = pd.read_csv(src, index_col=0)

### Empty cells in dataset

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

# boolean mask of missing values, computed once and reused below
null_mask = raw_data.isnull()

# heatmap with one cell per dataframe cell, showing where values are missing
sns.heatmap(null_mask, cbar=False)
plt.title('Empty cells in raw dataset')

# number of missing values per column (shown as the cell output)
null_mask.sum()

### Distribution of types

In [None]:
# percentage distribution of types (relative frequency scaled to 0-100)
type_dist = raw_data['type'].value_counts(normalize=True).mul(100)
print(type_dist)

### Observations of domains

In [None]:
# Unique domains seen under the 'reliable' label and under the 'fake' label
reliable_domains = set(raw_data.loc[raw_data['type'] == 'reliable', 'domain'].unique())
fake_domains = set(raw_data.loc[raw_data['type'] == 'fake', 'domain'].unique())

# Set algebra on the two groups:
#   only reliable, only fake, and the overlap between them
reliable_not_fake_domains = reliable_domains.difference(fake_domains)
fake_not_reliable_domains = fake_domains.difference(reliable_domains)
common_domains = fake_domains & reliable_domains

# Report the three groups
print("Domains in 'reliable' but not in 'fake':")
print(reliable_not_fake_domains)

print("\nDomains in 'fake' but not in 'reliable':")
print(fake_not_reliable_domains)

print("\nDomains in both 'fake' and 'reliable':")
print(common_domains)

### Amount of '!' (exclamations) in fake news vs. reliable news

There seem to be many more '!' characters in articles labelled 'fake' than in articles labelled 'reliable'.

In [None]:
import matplotlib.pyplot as plt

char = '!'

# duplicate the raw data so the helper column is not added to raw_data
exclm_data = raw_data.copy(deep=True)

# number of '!' characters in each article
exclm_data['exclm_count'] = exclm_data['content'].str.count(char)

# mean number of '!' per article for the two labels being compared
# (the variables keep their historical "_sum" names but hold means)
is_fake = exclm_data['type'] == 'fake'
is_reliable = exclm_data['type'] == 'reliable'
fake_exclm_sum = exclm_data.loc[is_fake, 'exclm_count'].mean()
reliable_exclm_sum = exclm_data.loc[is_reliable, 'exclm_count'].mean()

# bar chart with one bar per label
fig, ax = plt.subplots()
ax.bar(['fake', 'reliable'], [fake_exclm_sum, reliable_exclm_sum])
ax.set_title('\'!\' characters in fake vs. reliable')
ax.set_ylabel('mean')

plt.show()

### Amount of '!' (exclamations) in each type of labels

'political' has the most exclamation points. 'fake' comes second.

In [None]:
import matplotlib.pyplot as plt

char = '!'

# work on a copy so the helper column is not added to raw_data
exclm_data = raw_data.copy(deep=True)

# number of '!' characters in each article
# (Series.str.count interprets its argument as a regular expression;
#  '!' has no special meaning there, so it is matched literally)
exclm_data['exclm_count'] = exclm_data['content'].str.count(char)

# article types (labels) to compare
types = ['reliable',
         'political',
         'bias',
         'fake',
         'conspiracy',
         'rumor',
         'unknown',
         'unreliable',
         'clickbait',
         'junksci',
         'satire',
         'hate'
         ]

# mean number of '!' per article for each type.
# A comprehension keeps the loop variable local, so the builtins `type` and
# `sum` are no longer overwritten in the notebook namespace.
exclm_means = [
    exclm_data.loc[exclm_data['type'] == label, 'exclm_count'].mean()
    for label in types
]

# plot data
fig, ax = plt.subplots()
plt.xticks(rotation='vertical')

ax.set_ylabel('mean')
ax.set_title('\'!\' characters in all article types')

ax.bar(types, exclm_means)

plt.show()

### Amount of unique words in reliable news vs. fake news

Exploring the median number of unique words for each article type. The results show that 'reliable' has more unique words than 'fake'.

In [None]:
import matplotlib.pyplot as plt
import pandas as pd

# Data is loaded from a separate file 'word_freq' to keep the cleaned dataset
# file small. The 'word_freq' file itself is small.

src = 'data/word_freq.csv'
word_freq = pd.read_csv(src)

# article types (labels) to compare
types = ['reliable',
         'political',
         'bias',
         'fake',
         'conspiracy',
         'rumor',
         'unknown',
         'unreliable',
         'clickbait',
         'junksci',
         'satire',
         'hate'
         ]

# median of 'content_word_freq' for each type
# NOTE(review): presumably 'content_word_freq' is the number of unique words
# per article - confirm against the script that produces word_freq.csv.
# A comprehension keeps the loop variable local, so the builtin `type` is no
# longer overwritten in the notebook namespace.
medians = [
    word_freq.loc[word_freq['type'] == label, 'content_word_freq'].median()
    for label in types
]

# plot data
fig, ax = plt.subplots()
plt.xticks(rotation='vertical')

ax.set_ylabel('median')
ax.set_title('Unique words in articles by type')

ax.bar(types, medians)

plt.show()


### Do fake news have fewer author names than reliable news?

From the barplot, it seems that 'reliable' news have more missing authors than 'fake' news. So it is actually the opposite of our hypothesis.

In [None]:
import matplotlib.pyplot as plt
import pandas as pd

# article types (labels) to compare
types = ['reliable',
         'political',
         'bias',
         'fake',
         'conspiracy',
         'rumor',
         'unknown',
         'unreliable',
         'clickbait',
         'junksci',
         'satire',
         'hate'
         ]

# number of rows without an author value, for each type.
# A comprehension keeps the loop variable local, so the builtins `type` and
# `sum` are no longer overwritten in the notebook namespace.
# NOTE(review): these are absolute counts, so types with more articles can
# show more missing authors simply because they are larger - consider also
# comparing the share (isnull().mean()) before drawing conclusions.
missing_author_counts = [
    raw_data.loc[raw_data['type'] == label, 'authors'].isnull().sum()
    for label in types
]

# plot data
fig, ax = plt.subplots()
plt.xticks(rotation='vertical')

ax.set_ylabel('Missing authors sum')
ax.set_title('Missing author values by type')

ax.bar(types, missing_author_counts)

plt.show()

In [None]:
import matplotlib.pyplot as plt

# rows labelled 'fake' and rows labelled 'reliable'
fake_data = raw_data.loc[raw_data['type'] == 'fake']
reliable_data = raw_data.loc[raw_data['type'] == 'reliable']

# number of rows without an author value in each group
fake_auth_isNull_sum = fake_data['authors'].isnull().sum()
reliable_auth_isNull_sum = reliable_data['authors'].isnull().sum()

# bar chart comparing the two counts
fig, ax = plt.subplots()
ax.bar(['fake', 'reliable'], [fake_auth_isNull_sum, reliable_auth_isNull_sum])
ax.set_title('Missing author values: \'fake\' vs. \'reliable\' news')
ax.set_ylabel('Missing authors sum')

plt.show()

### Do word reduction rates differ between reliable and fake news?

Exploring whether 'fake' has a higher average reduction rate than 'reliable'. The results show that 'fake' has a higher reduction rate than 'reliable'.

In [None]:
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np

# Data is loaded from a separate file 'word_freq' to keep the cleaned dataset
# file small. The 'word_freq' file itself is small.

src = 'data/word_freq.csv'
word_freq = pd.read_csv(src)

# article types (labels) to compare
types = ['reliable',
         'political',
         'bias',
         'fake',
         'conspiracy',
         'rumor',
         'unknown',
         'unreliable',
         'clickbait',
         'junksci',
         'satire',
         'hate'
         ]

# median stopword / stemming reduction rate per type, rounded to one decimal.
# Comprehensions keep the loop variable local, so the builtin `type` is no
# longer overwritten in the notebook namespace.
stop_medians = [
    round(word_freq.loc[word_freq['type'] == label, 'stop_reduction_rate'].median(), 1)
    for label in types
]
stem_medians = [
    round(word_freq.loc[word_freq['type'] == label, 'stem_reduction_rate'].median(), 1)
    for label in types
]

# plot data: two bars per type, placed side by side around each tick
fig, ax = plt.subplots(figsize=(8,5))

x = np.arange(len(types))
width = 0.4  # the width of the bars
offset = width/2

stop_bar = ax.bar(x-offset, stop_medians, width, color='lightsteelblue')
# negative padding moves the value label inside the bar
ax.bar_label(stop_bar, stop_medians, padding=-25, rotation=90)

stem_bar = ax.bar(x+offset, stem_medians, width, color='orange')
ax.bar_label(stem_bar, stem_medians, padding=-25, rotation=90)

ax.set_ylabel('Median')
ax.set_title('Reduction rates')
plt.xticks(x, types, rotation='vertical')
plt.legend(["Stopword", "Stemming"])

plt.show()


## Cleaned dataset

In [None]:
import pandas as pd
import seaborn as sns

# load cleaned dataset
# index_col=0 tells pandas to use the first CSV column as the row index
src = 'data/995,000_rows_cleaned.csv'
clean_data = pd.read_csv(src, index_col=0)

### Empty cells

In [None]:
# boolean mask of missing values, computed once and reused below
clean_null_mask = clean_data.isnull()

# heatmap showing where values are missing in the cleaned dataset
sns.heatmap(clean_null_mask, cbar=False)

# number of missing values per column (shown as the cell output)
clean_null_mask.sum()

### Distribution of types

In [None]:
# percentage distribution of types (relative frequency scaled to 0-100)
type_dist = clean_data['type'].value_counts(normalize=True).mul(100)
print(type_dist)

### Count of URLs in fake and reliable articles

In [None]:
import re

# Function to count URLs with <URL> tag
def count_urls_with_tag(text):
    """Return how many times the placeholder '_url_' occurs in *text*.

    Non-string values (for example the NaN that pandas uses for an empty
    cell) contain no placeholders, so 0 is returned for them instead of
    raising.
    """
    if not isinstance(text, str):
        return 0
    # the placeholder is a literal substring, so str.count is sufficient
    return text.count('_url_')

# Number of '_url_' placeholders per article, stored next to the cleaned text
clean_data['url_count'] = clean_data['content_clean'].apply(count_urls_with_tag)

# Corpus-wide total
total_urls = clean_data['url_count'].sum()
print("Total URLs in Content:", total_urls)

# Subsets for the two labels being compared
fake_articles = clean_data.loc[clean_data['type'] == 'fake']
reliable_articles = clean_data.loc[clean_data['type'] == 'reliable']
fake_url_counts = fake_articles['url_count']
reliable_url_counts = reliable_articles['url_count']

# Totals per label
total_fake_urls = fake_url_counts.sum()
print("Total URLs in 'fake' content:", total_fake_urls)

total_reliable_urls = reliable_url_counts.sum()
print("Total URLs in 'reliable' content:", total_reliable_urls)

# Min / max / mean per article for 'fake'
min_fake_urls = fake_url_counts.min()
print("Minimum number of URLs in 'fake' content:", min_fake_urls)

max_fake_urls = fake_url_counts.max()
print("Maximum number of URLs in 'fake' content:", max_fake_urls)

mean_fake_urls = fake_url_counts.mean()
print("Mean number of URLs in 'fake' content:", mean_fake_urls)

# Min / max / mean per article for 'reliable'
min_reliable_urls = reliable_url_counts.min()
print("Minimum number of URLs in 'reliable' content:", min_reliable_urls)

max_reliable_urls = reliable_url_counts.max()
print("Maximum number of URLs in 'reliable' content:", max_reliable_urls)

mean_reliable_urls = reliable_url_counts.mean()
print("Mean number of URLs in 'reliable' content:", mean_reliable_urls)


### Count of dates in fake and reliable articles

In [None]:
# Function to count DATEs with <DATE> tag
def count_date_with_tag(text):
    """Return how many times the placeholder '_date_' occurs in *text*.

    Non-string values (for example the NaN that pandas uses for an empty
    cell) contain no placeholders, so 0 is returned for them instead of
    raising.
    """
    if not isinstance(text, str):
        return 0
    # literal substring count; this also removes the dependency on the `re`
    # module having been imported by another notebook cell
    return text.count('_date_')

# Number of '_date_' placeholders per article, stored next to the cleaned text
clean_data['date_count'] = clean_data['content_clean'].apply(count_date_with_tag)

# Corpus-wide total
total_dates = clean_data['date_count'].sum()
print("Total DATEs in Content:", total_dates)

# Subsets for the two labels being compared
fake_articles = clean_data.loc[clean_data['type'] == 'fake']
reliable_articles = clean_data.loc[clean_data['type'] == 'reliable']
fake_date_counts = fake_articles['date_count']
reliable_date_counts = reliable_articles['date_count']

# Totals per label
total_fake_dates = fake_date_counts.sum()
print("Total DATEs in 'fake' content:", total_fake_dates)

total_reliable_dates = reliable_date_counts.sum()
print("Total DATEs in 'reliable' content:", total_reliable_dates)

# Min / max / mean number of DATEs per article for 'fake'
min_fake_dates = fake_date_counts.min()
print("Minimum number of DATEs in 'fake' content:", min_fake_dates)

max_fake_dates = fake_date_counts.max()
print("Maximum number of DATEs in 'fake' content:", max_fake_dates)

mean_fake_dates = fake_date_counts.mean()
print("Mean number of DATEs in 'fake' content:", mean_fake_dates)

# Min / max / mean number of DATEs per article for 'reliable'
min_reliable_dates = reliable_date_counts.min()
print("Minimum number of DATEs in 'reliable' content:", min_reliable_dates)

max_reliable_dates = reliable_date_counts.max()
print("Maximum number of DATEs in 'reliable' content:", max_reliable_dates)

mean_reliable_dates = reliable_date_counts.mean()
print("Mean number of DATEs in 'reliable' content:", mean_reliable_dates)

### Count of NUMs in fake and reliable articles

In [None]:

# Function to count NUMs with <NUM> tag
def count_num_with_tag(text):
    """Return how many times the placeholder '_num_' occurs in *text*.

    Non-string values (for example the NaN that pandas uses for an empty
    cell) contain no placeholders, so 0 is returned for them instead of
    raising.
    """
    if not isinstance(text, str):
        return 0
    # literal substring count; this also removes the dependency on the `re`
    # module having been imported by another notebook cell
    return text.count('_num_')

# Number of '_num_' placeholders per article, stored next to the cleaned text
clean_data['num_count'] = clean_data['content_clean'].apply(count_num_with_tag)

# Corpus-wide total
total_nums = clean_data['num_count'].sum()
print("Total NUMs in Content:", total_nums)

# Subsets for the two labels being compared.
# To pool further labels into the 'fake' group, combine masks with `|`, e.g.
# (clean_data['type'] == 'fake') | (clean_data['type'] == '<other label>')
fake_articles = clean_data.loc[clean_data['type'] == 'fake']
reliable_articles = clean_data.loc[clean_data['type'] == 'reliable']
fake_num_counts = fake_articles['num_count']
reliable_num_counts = reliable_articles['num_count']

# Totals per label
total_fake_nums = fake_num_counts.sum()
print("Total NUMs in 'fake' content:", total_fake_nums)

total_reliable_nums = reliable_num_counts.sum()
print("Total NUMs in 'reliable' content:", total_reliable_nums)

# Min / max / mean number of NUMs per article for 'fake'
min_fake_nums = fake_num_counts.min()
print("Minimum number of NUMs in 'fake' content:", min_fake_nums)

max_fake_nums = fake_num_counts.max()
print("Maximum number of NUMs in 'fake' content:", max_fake_nums)

mean_fake_nums = fake_num_counts.mean()
print("Mean number of NUMs in 'fake' content:", mean_fake_nums)

# Min / max / mean number of NUMs per article for 'reliable'
min_reliable_nums = reliable_num_counts.min()
print("Minimum number of NUMs in 'reliable' content:", min_reliable_nums)

max_reliable_nums = reliable_num_counts.max()
print("Maximum number of NUMs in 'reliable' content:", max_reliable_nums)

mean_reliable_nums = reliable_num_counts.mean()
print("Mean number of NUMs in 'reliable' content:", mean_reliable_nums)


### Barplot for 10000 most frequent words, after cleaning

In [None]:
import re
import matplotlib.pyplot as plt
from collections import Counter

# `re` is imported here so the cell does not depend on another cell having
# imported it; the pattern is compiled once, outside the loop
word_pattern = re.compile(r'\b\w+\b')

# Counter holding the frequency of every word in 'content_clean'
clean_word_freq = Counter()

# Iterate over the column directly (much cheaper than DataFrame.iterrows) and
# skip empty cells, which would otherwise make the regex call fail.
# The regex matches are already the individual words, so they are counted
# as-is instead of being joined into a string and split again.
for clean_text in clean_data['content_clean'].dropna():
    clean_word_freq.update(word_pattern.findall(clean_text))

# (word, count) pairs ordered from most to least frequent
sorted_clean_word_freq = clean_word_freq.most_common()

# Extract and print the 100 most frequent words
top_100_clean_words = sorted_clean_word_freq[:100]
print(top_100_clean_words)

# Extract the 10000 most frequent words
top_10000_clean_words = sorted_clean_word_freq[:10000]

# Barplot for top 10000 most frequent clean words (log-scaled y axis;
# tick labels are hidden because 10000 words cannot be displayed legibly)
plt.figure(figsize=(15, 6))
words, frequencies = zip(*top_10000_clean_words)
plt.bar(words, frequencies)
plt.yscale('log')
plt.xlabel('Words')
plt.ylabel('Frequency')
plt.title('Top 10000 Most Frequent Clean Words')
plt.xticks([])
plt.show()

### Barplot for 10000 most frequent words after stemming

In [None]:
import re
import matplotlib.pyplot as plt
from collections import Counter

# `re` is imported here so the cell does not depend on another cell having
# imported it; the pattern is compiled once, outside the loop
word_pattern = re.compile(r'\b\w+\b')

# Counter holding the frequency of every word in 'content_stem'
clean_word_freq_after = Counter()

# Iterate over the column directly (much cheaper than DataFrame.iterrows) and
# skip empty cells, which would otherwise make the regex call fail.
# The regex matches are already the individual words, so they are counted
# as-is instead of being joined into a string and split again.
for clean_text_after in clean_data['content_stem'].dropna():
    clean_word_freq_after.update(word_pattern.findall(clean_text_after))

# (word, count) pairs ordered from most to least frequent
sorted_clean_word_freq_after = clean_word_freq_after.most_common()

# Extract and print the 100 most frequent words
top_100_clean_words_after = sorted_clean_word_freq_after[:100]

for word, frequency in top_100_clean_words_after:
    print(f"{word}: {frequency}")

# Extract the 10000 most frequent words
top_10000_clean_words_after = sorted_clean_word_freq_after[:10000]

# Barplot for top 10000 most frequent stemmed words (log-scaled y axis;
# tick labels are hidden because 10000 words cannot be displayed legibly)
plt.figure(figsize=(15, 6))
words, frequencies = zip(*top_10000_clean_words_after)
plt.bar(words, frequencies)
plt.yscale('log')
plt.xlabel('Words')
plt.ylabel('Frequency')
plt.title('Top 10000 Most Frequent Clean Words After Preprocessing')
plt.xticks([])
plt.show()

### Amount of ',' (commas) in fake news vs. reliable news

In [None]:
# Function to count comma 
def count_comma(text):
    """Return the number of ',' characters in *text*.

    Non-string values (for example the NaN that pandas uses for an empty
    cell) contain no commas, so 0 is returned for them instead of raising.
    """
    if not isinstance(text, str):
        return 0
    return text.count(',')

# Apply: store the number of commas in each article's cleaned text in a new column
clean_data['comma_count'] = clean_data['content_clean'].apply(count_comma)


In [None]:
# Subsets for the two labels being compared
fake_articles = clean_data[clean_data['type'] == 'fake']
reliable_articles = clean_data[clean_data['type'] == 'reliable']

# The results get comma-specific names: the previous `*_nums` names belong to
# the NUM statistics and were silently overwritten by this cell.

# Total commas in 'fake' content
total_fake_commas = fake_articles['comma_count'].sum()
print("Total commas in 'fake' content:", total_fake_commas)

# Total commas in 'reliable' content
total_reliable_commas = reliable_articles['comma_count'].sum()
print("Total commas in 'reliable' content:", total_reliable_commas)

# Minimum number of commas in 'fake' content
min_fake_commas = fake_articles['comma_count'].min()
print("Minimum number of commas in 'fake' content:", min_fake_commas)

# Maximum number of commas in 'fake' content
max_fake_commas = fake_articles['comma_count'].max()
print("Maximum number of commas in 'fake' content:", max_fake_commas)

# Mean number of commas in 'fake' content
mean_fake_commas = fake_articles['comma_count'].mean()
print("Mean number of commas in 'fake' content:", mean_fake_commas)

# Minimum number of commas in 'reliable' content
min_reliable_commas = reliable_articles['comma_count'].min()
print("Minimum number of commas in 'reliable' content:", min_reliable_commas)

# Maximum number of commas in 'reliable' content
max_reliable_commas = reliable_articles['comma_count'].max()
print("Maximum number of commas in 'reliable' content:", max_reliable_commas)

# Mean number of commas in 'reliable' content
mean_reliable_commas = reliable_articles['comma_count'].mean()
print("Mean number of commas in 'reliable' content:", mean_reliable_commas)

In [None]:
import matplotlib.pyplot as plt

# article types (labels) to compare
types = ['reliable',
         'political',
         'bias',
         'fake',
         'conspiracy',
         'rumor',
         'unknown',
         'unreliable',
         'clickbait',
         'junksci',
         'satire',
         'hate'
         ]

# median number of commas per article for each type.
# A comprehension keeps the loop variable local, so the builtins `type` and
# `sum` are no longer overwritten in the notebook namespace.
comma_medians = [
    clean_data.loc[clean_data['type'] == label, 'comma_count'].median()
    for label in types
]

# plot data
fig, ax = plt.subplots()
plt.xticks(rotation='vertical')

ax.set_ylabel('median')
ax.set_title('\',\' characters in all article types')

ax.bar(types, comma_medians)

plt.show()

### Length of sentences in reliable news vs. fake news

In [None]:
import swifter

def average_sentence_length(text):
    """Return the mean number of words per sentence in *text*.

    Sentences are the pieces obtained by splitting on '.'; words are the
    whitespace-separated tokens of each piece. Pieces without any word are
    left out of the average. When no piece contains a word, 0 is returned.
    """
    # word count of every '.'-delimited piece that has at least one word
    words_per_sentence = [
        len(piece.split()) for piece in text.split('.') if piece.split()
    ]

    if not words_per_sentence:
        return 0

    return sum(words_per_sentence) / len(words_per_sentence)

# Apply to the 'content' column (not 'content_clean') through swifter's apply accessor
# NOTE(review): presumably the uncleaned text is used because it still contains the '.' separators - confirm
clean_data['average_sentence_length'] = clean_data['content'].swifter.apply(average_sentence_length)

In [None]:
# Filter DataFrame for articles labeled as 'fake'
fake_articles = clean_data[clean_data['type'] == 'fake']

# Filter DataFrame for articles labeled as 'reliable'
reliable_articles = clean_data[clean_data['type'] == 'reliable']

# The results get sentence-length-specific names: the previous `*_nums` names
# belong to the NUM statistics and were silently overwritten by this cell.

# Minimum average sentence length in 'fake' content
min_fake_sentence_len = fake_articles['average_sentence_length'].min()
print("Minimum average length in 'fake' content:", min_fake_sentence_len)

# Maximum average sentence length in 'fake' content
max_fake_sentence_len = fake_articles['average_sentence_length'].max()
print("Maximum average length in 'fake' content:", max_fake_sentence_len)

# Mean average sentence length in 'fake' content
mean_fake_sentence_len = fake_articles['average_sentence_length'].mean()
print("Mean average length in 'fake' content:", mean_fake_sentence_len)

# Minimum average sentence length in 'reliable' content
min_reliable_sentence_len = reliable_articles['average_sentence_length'].min()
print("Minimum average length in 'reliable' content:", min_reliable_sentence_len)

# Maximum average sentence length in 'reliable' content
max_reliable_sentence_len = reliable_articles['average_sentence_length'].max()
print("Maximum average length in 'reliable' content:", max_reliable_sentence_len)

# Mean average sentence length in 'reliable' content
mean_reliable_sentence_len = reliable_articles['average_sentence_length'].mean()
print("Mean average length in 'reliable' content:", mean_reliable_sentence_len)

In [None]:
# median of the per-article average sentence length for 'reliable' articles (shown as the cell output)
reliable_articles['average_sentence_length'].median()

In [None]:
import matplotlib.pyplot as plt

# article types (labels) to compare
types = ['reliable',
         'political',
         'bias',
         'fake',
         'conspiracy',
         'rumor',
         'unknown',
         'unreliable',
         'clickbait',
         'junksci',
         'satire',
         'hate'
         ]

# median of the per-article average sentence length for each type.
# A comprehension keeps the loop variable local, so the builtins `type` and
# `sum` are no longer overwritten in the notebook namespace.
sentence_length_medians = [
    clean_data.loc[clean_data['type'] == label, 'average_sentence_length'].median()
    for label in types
]

# plot data
fig, ax = plt.subplots()
plt.xticks(rotation='vertical')

ax.set_ylabel('median')
ax.set_title('Avarage sentence length')

ax.bar(types, sentence_length_medians)

plt.show()

### Do fake news have fewer titles than reliable news?

In [None]:
# share of articles that have a title, per article type
# (the mean of a boolean "has title" flag within each type)
has_title = clean_data['title'].notnull()
title_counts = has_title.groupby(clean_data['type']).mean()
print(title_counts)

### Compare FakeNews datasets and LIAR test dataset

In [None]:
import pandas as pd


# Load datasets: the three FakeNews feature files (training, validation, test)
# and the LIAR test feature file

src = 'data/training_data_features.csv'
training_data = pd.read_csv(src)

src_validation = 'data/validation_data_features.csv'
validation_data = pd.read_csv(src_validation)

src_test = "data/test_data_features.csv"
test_data_fake_news = pd.read_csv(src_test)

# note: src_test is reused here and now points at the LIAR file
src_test = "data/liar_dataset/test_features.csv"
test_data_liar = pd.read_csv(src_test)

In [None]:
import swifter

# Function to count words in 'content'

def word_count(text):
    """Return the number of whitespace-separated tokens in *text*."""
    return len(text.split())

# add a 'word_count' column to each dataset, in the same order as before:
# training, FakeNews test, LIAR test, validation
for frame in (training_data, test_data_fake_news, test_data_liar, validation_data):
    frame['word_count'] = frame['content'].swifter.apply(word_count)

In [None]:
import matplotlib.pyplot as plt

# Median number of words per article in each dataset
training_median = training_data['word_count'].median()
validation_median = validation_data['word_count'].median()
test_median_fake_news = test_data_fake_news['word_count'].median()
test_median_liar = test_data_liar['word_count'].median()

# Horizontal bar chart with one bar per dataset
dataset_labels = ['Training - FakeNews', 'Validation - FakeNews', 'Test - FakeNews' , 'Test - LIAR']
dataset_medians = [training_median, validation_median, test_median_fake_news, test_median_liar]

plt.figure(figsize=(5, 1))
bars = plt.barh(dataset_labels, dataset_medians)
plt.title('Median Word Count')
plt.show()