In [1]:
from datasets import load_dataset, load_from_disk, Dataset, DatasetDict, concatenate_datasets
import numpy as np
import pandas as pd
from scipy.stats import wasserstein_distance


import json
import matplotlib.pyplot as plt

In [2]:
# label: true = 0, fake = 1

In [3]:
experiment_name = "zephyr_10k"
fake_train_dataset_df = pd.read_json(f"fake_true_datasets/fake_true_dataset_{experiment_name}_train.json")

In [4]:
fake_train_dataset_df

Unnamed: 0,text,label
0,[Four groups that advocate for immigrant right...,1
1,[Four groups that advocate for immigrant right...,0
2,[Former Vice President Dick Cheney on Sunday d...,0
3,[Former Vice President Dick Cheney on Sunday d...,1
4,[Space shuttle Discovery launched just before ...,0
...,...,...
15827,[President Obama said Thursday that watching t...,0
15828,[President Obama said Thursday that watching t...,1
15829,[Pope Benedict XVI completed his eight-day tou...,1
15830,[Pope Benedict XVI completed his eight-day tou...,0


In [5]:
fake_train_dataset_df.iloc[0]["text"][0]

'Four groups that advocate for immigrant rights said Thursday they will file a lawsuit against the Trump administration\'s decision to end temporary protected status (TPS) for thousands of Hondurans living in the United States. The groups, including the American Civil Liberties Union (ACLU), the National Network for Immigrant and Refugee Rights, the Center for Constitutional Rights, and Catholic Legal Immigration Network, Inc., claim that the administration\'s decision is unlawful and will cause "s'

In [6]:
fake_train_dataset_df["text_sentences"] = fake_train_dataset_df["text"].apply(lambda x: x[0].split("."))

fake_texts_df = fake_train_dataset_df[fake_train_dataset_df["label"] == 1]
true_texts_df = fake_train_dataset_df[fake_train_dataset_df["label"] == 0]

In [7]:
fake_train_dataset_df["text_sentences"][0]

["Four groups that advocate for immigrant rights said Thursday they will file a lawsuit against the Trump administration's decision to end temporary protected status (TPS) for thousands of Hondurans living in the United States",
 ' The groups, including the American Civil Liberties Union (ACLU), the National Network for Immigrant and Refugee Rights, the Center for Constitutional Rights, and Catholic Legal Immigration Network, Inc',
 ', claim that the administration\'s decision is unlawful and will cause "s']

In [8]:
print(f"Average number of sentences in fake texts: {np.mean(fake_texts_df['text_sentences'].apply(len))}")
print(f"Average number of sentences in true texts: {np.mean(true_texts_df['text_sentences'].apply(len))}")

Average number of sentences in fake texts: 4.193784739767559
Average number of sentences in true texts: 5.235851440121273


In [9]:
# add column: number of "the" in text
fake_texts_df["the_count"] = fake_texts_df["text"].apply(lambda x: x[0].count("the"))
true_texts_df["the_count"] = true_texts_df["text"].apply(lambda x: x[0].count("the"))

print(f"Average number of 'the' in fake texts: {np.mean(fake_texts_df['the_count'])}")
print(f"Average number of 'the' in true texts: {np.mean(true_texts_df['the_count'])}")

Average number of 'the' in fake texts: 5.431783729156139
Average number of 'the' in true texts: 5.1379484588175846


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  fake_texts_df["the_count"] = fake_texts_df["text"].apply(lambda x: x[0].count("the"))
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  true_texts_df["the_count"] = true_texts_df["text"].apply(lambda x: x[0].count("the"))


In [10]:
fake_train_dataset_full_text = " ".join([text for text in fake_train_dataset_df["text"].apply(lambda x: x[0])])

In [11]:
len(fake_train_dataset_full_text)

7931831

In [12]:
# run through all the texts and count occurence of each characters in unicode representation

def count_chars(texts):
    char_counts = {}
    for text in texts:
        for char in text:
            if char in char_counts:
                char_counts[char] += 1
            else:
                char_counts[char] = 1
    return char_counts

fake_char_counts = count_chars(fake_train_dataset_full_text)

In [13]:
fake_char_counts_sorted = dict(sorted(fake_char_counts.items(), key=lambda item: item[1], reverse=True))
fake_char_counts_sorted

{' ': 1302604,
 'e': 747661,
 'a': 557568,
 't': 508687,
 'i': 472271,
 'n': 465500,
 'o': 449433,
 'r': 408071,
 's': 399941,
 'h': 288142,
 'd': 254000,
 'l': 245222,
 'c': 196420,
 'u': 160033,
 'm': 142476,
 'f': 133386,
 'g': 127610,
 'p': 116290,
 'y': 101791,
 'w': 96959,
 'b': 79695,
 ',': 77356,
 'v': 62244,
 '.': 58813,
 'k': 44521,
 'T': 31990,
 'S': 25570,
 'A': 24307,
 '-': 21981,
 'C': 19939,
 "'": 19292,
 '"': 17607,
 'M': 17157,
 '0': 15723,
 'I': 14906,
 'B': 13230,
 'P': 12957,
 '1': 11526,
 'N': 11295,
 'x': 10451,
 'F': 10206,
 'H': 10196,
 'W': 9757,
 '2': 9443,
 'D': 9414,
 'R': 8260,
 'j': 7856,
 'z': 7789,
 'L': 7648,
 'J': 7256,
 'O': 7205,
 'G': 7105,
 'U': 6601,
 'E': 5635,
 'q': 5105,
 'K': 5062,
 '3': 4253,
 '9': 4204,
 '5': 4009,
 '4': 3539,
 '6': 3207,
 '7': 3173,
 'V': 3038,
 '8': 2942,
 '(': 2295,
 ')': 2248,
 'Y': 2065,
 ':': 1876,
 '$': 1211,
 'Z': 983,
 '?': 710,
 'Q': 660,
 '%': 341,
 'X': 269,
 ';': 255,
 '!': 205,
 '&': 187,
 '/': 184,
 '[': 144,


In [14]:
# convert keys in char_counts to unicode
fake_char_counts_unicode = {ord(k): v for k, v in fake_char_counts_sorted.items()}
fake_char_counts_unicode

{32: 1302604,
 101: 747661,
 97: 557568,
 116: 508687,
 105: 472271,
 110: 465500,
 111: 449433,
 114: 408071,
 115: 399941,
 104: 288142,
 100: 254000,
 108: 245222,
 99: 196420,
 117: 160033,
 109: 142476,
 102: 133386,
 103: 127610,
 112: 116290,
 121: 101791,
 119: 96959,
 98: 79695,
 44: 77356,
 118: 62244,
 46: 58813,
 107: 44521,
 84: 31990,
 83: 25570,
 65: 24307,
 45: 21981,
 67: 19939,
 39: 19292,
 34: 17607,
 77: 17157,
 48: 15723,
 73: 14906,
 66: 13230,
 80: 12957,
 49: 11526,
 78: 11295,
 120: 10451,
 70: 10206,
 72: 10196,
 87: 9757,
 50: 9443,
 68: 9414,
 82: 8260,
 106: 7856,
 122: 7789,
 76: 7648,
 74: 7256,
 79: 7205,
 71: 7105,
 85: 6601,
 69: 5635,
 113: 5105,
 75: 5062,
 51: 4253,
 57: 4204,
 53: 4009,
 52: 3539,
 54: 3207,
 55: 3173,
 86: 3038,
 56: 2942,
 40: 2295,
 41: 2248,
 89: 2065,
 58: 1876,
 36: 1211,
 90: 983,
 63: 710,
 81: 660,
 37: 341,
 88: 269,
 59: 255,
 33: 205,
 38: 187,
 47: 184,
 91: 144,
 93: 136,
 233: 94,
 163: 57,
 225: 43,
 8226: 40,
 237:

In [15]:
# exclude from count all ascii characters, ie. all keys above 128
fake_char_counts_special = {k: v for k, v in fake_char_counts_sorted.items() if ord(k) > 128}
fake_char_counts_special

{'é': 94,
 '£': 57,
 'á': 43,
 '•': 40,
 'í': 29,
 '€': 27,
 'ñ': 24,
 'ó': 23,
 '½': 15,
 'ü': 14,
 'è': 10,
 '»': 10,
 'ö': 9,
 'ã': 7,
 'ú': 7,
 '°': 5,
 'ï': 5,
 'ä': 5,
 'ô': 4,
 '\xad': 3,
 'ø': 3,
 'ý': 3,
 'č': 2,
 'â': 2,
 'ð': 2,
 'Á': 2,
 'š': 2,
 'ç': 2,
 '¥': 2,
 'ć': 2,
 'ā': 1,
 'æ': 1,
 'à': 1,
 'ë': 1,
 'ê': 1,
 '®': 1,
 'ğ': 1,
 'Ş': 1,
 'É': 1,
 '’': 1,
 '–': 1,
 'å': 1,
 'Ø': 1}

In [19]:
# count different kind of apostrophes
count_apostrophe_type_1 = 0
count_apostrophe_type_2 = 0

# iterate over fake texts and count occurence of different apostrophes
for text in fake_texts_df["text"].apply(lambda x: x[0]):
    count_apostrophe_type_1 += text.count("'")
    count_apostrophe_type_2 += text.count("’")

print(f"Number of normal apostrophes: {count_apostrophe_type_1}")
print(f"Number of special apostrophes: {count_apostrophe_type_2}")

Number of normal apostrophes: 8444
Number of special apostrophes: 1


In [36]:
# convert form unicode to character
chr(8212)

'—'

In [38]:
chr(8211)

'–'