In [1]:
from datasets import load_dataset, load_from_disk, Dataset, DatasetDict, concatenate_datasets
import numpy as np
import pandas as pd
from scipy.stats import wasserstein_distance


import json
import matplotlib.pyplot as plt

In [2]:
# label: true = 0, fake = 1

In [3]:
experiment_name = "phi_10k"
fake_train_dataset_df = pd.read_json(f"fake_true_datasets/fake_true_dataset_{experiment_name}_train.json")

In [4]:
fake_train_dataset_df

Unnamed: 0,text,label
0,[Four groups that advocate for immigrant right...,0
1,[Four groups that advocate for immigrant right...,1
2,[Former Vice President Dick Cheney on Sunday d...,1
3,[Former Vice President Dick Cheney on Sunday d...,0
4,[Space shuttle Discovery launched just before ...,0
...,...,...
15717,[The Cyrus family is ready to rock and always ...,1
15718,[The Cyrus family is ready to rock and always ...,0
15719,"[Over the past month, we've watched from dista...",1
15720,"[Over the past month, we've watched from dista...",0


In [5]:
fake_train_dataset_df.iloc[0]["text"][0]

'Four groups that advocate for immigrant rights said Thursday they will challenge Arizona\'s new immigration law, which allows police to ask anyone for proof of legal U.S. residency. The Mexican American Legal Defense and Educational Fund, the American Civil Liberties Union, the ACLU of Arizona and the National Immigration Law Center held a news conference Thursday in Phoenix to announce the legal challenge. "The Arizona community can be assured that a vigorous and sophisticated legal challenge wi'

In [6]:
fake_train_dataset_df["text_sentences"] = fake_train_dataset_df["text"].apply(lambda x: x[0].split("."))

fake_texts_df = fake_train_dataset_df[fake_train_dataset_df["label"] == 1]
true_texts_df = fake_train_dataset_df[fake_train_dataset_df["label"] == 0]

In [7]:
fake_train_dataset_df["text_sentences"][0]

["Four groups that advocate for immigrant rights said Thursday they will challenge Arizona's new immigration law, which allows police to ask anyone for proof of legal U",
 'S',
 ' residency',
 ' The Mexican American Legal Defense and Educational Fund, the American Civil Liberties Union, the ACLU of Arizona and the National Immigration Law Center held a news conference Thursday in Phoenix to announce the legal challenge',
 ' "The Arizona community can be assured that a vigorous and sophisticated legal challenge wi']

In [8]:
print(f"Average number of sentences in fake texts: {np.mean(fake_texts_df['text_sentences'].apply(len))}")
print(f"Average number of sentences in true texts: {np.mean(true_texts_df['text_sentences'].apply(len))}")

Average number of sentences in fake texts: 5.479908443540183
Average number of sentences in true texts: 5.238101298040214


In [9]:
# add column: number of "the" in text
fake_texts_df["the_count"] = fake_texts_df["text"].apply(lambda x: x[0].count("the"))
true_texts_df["the_count"] = true_texts_df["text"].apply(lambda x: x[0].count("the"))

print(f"Average number of 'the' in fake texts: {np.mean(fake_texts_df['the_count'])}")
print(f"Average number of 'the' in true texts: {np.mean(true_texts_df['the_count'])}")

Average number of 'the' in fake texts: 6.546286876907426
Average number of 'the' in true texts: 5.1333672690251975


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  fake_texts_df["the_count"] = fake_texts_df["text"].apply(lambda x: x[0].count("the"))
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  true_texts_df["the_count"] = true_texts_df["text"].apply(lambda x: x[0].count("the"))


In [10]:
fake_train_dataset_full_text = " ".join([text for text in fake_train_dataset_df["text"].apply(lambda x: x[0])])

In [11]:
len(fake_train_dataset_full_text)

7876721

In [12]:
# run through all the texts and count occurence of each characters in unicode representation

def count_chars(texts):
    char_counts = {}
    for text in texts:
        for char in text:
            if char in char_counts:
                char_counts[char] += 1
            else:
                char_counts[char] = 1
    return char_counts

fake_char_counts = count_chars(fake_train_dataset_full_text)

In [13]:
fake_char_counts_sorted = dict(sorted(fake_char_counts.items(), key=lambda item: item[1], reverse=True))
fake_char_counts_sorted

{' ': 1339973,
 'e': 744005,
 'a': 549388,
 't': 517748,
 'i': 451459,
 'o': 447215,
 'n': 444813,
 's': 393300,
 'r': 391136,
 'h': 301428,
 'd': 243676,
 'l': 240133,
 'c': 182298,
 'u': 154220,
 'm': 139909,
 'f': 132473,
 'g': 119184,
 'p': 112705,
 'w': 104346,
 'y': 103395,
 'b': 82524,
 ',': 71625,
 '.': 68533,
 'v': 58538,
 'k': 45139,
 'T': 34206,
 'S': 25665,
 'A': 24519,
 '-': 19558,
 'I': 18898,
 'C': 18644,
 '"': 18453,
 "'": 18003,
 '0': 16648,
 'M': 15816,
 'B': 13079,
 'P': 12450,
 '1': 11122,
 'N': 10992,
 'H': 10716,
 'W': 10163,
 'x': 10057,
 'F': 9919,
 '2': 9057,
 'D': 9014,
 'U': 8186,
 'j': 7790,
 'R': 7615,
 'O': 7101,
 'J': 6893,
 'L': 6883,
 'z': 6728,
 'G': 6427,
 'E': 5438,
 'q': 4854,
 'K': 4627,
 '9': 4076,
 '5': 4013,
 '3': 3845,
 '’': 3548,
 '4': 3135,
 '8': 2804,
 '7': 2797,
 '6': 2769,
 'V': 2739,
 'Y': 2271,
 ':': 2105,
 '“': 1964,
 '(': 1786,
 ')': 1746,
 '”': 1588,
 '$': 1502,
 '?': 1203,
 'Z': 911,
 'Q': 649,
 ';': 264,
 '/': 257,
 'X': 221,
 '!': 

In [14]:
# convert keys in char_counts to unicode
fake_char_counts_unicode = {ord(k): v for k, v in fake_char_counts_sorted.items()}
fake_char_counts_unicode

{32: 1339973,
 101: 744005,
 97: 549388,
 116: 517748,
 105: 451459,
 111: 447215,
 110: 444813,
 115: 393300,
 114: 391136,
 104: 301428,
 100: 243676,
 108: 240133,
 99: 182298,
 117: 154220,
 109: 139909,
 102: 132473,
 103: 119184,
 112: 112705,
 119: 104346,
 121: 103395,
 98: 82524,
 44: 71625,
 46: 68533,
 118: 58538,
 107: 45139,
 84: 34206,
 83: 25665,
 65: 24519,
 45: 19558,
 73: 18898,
 67: 18644,
 34: 18453,
 39: 18003,
 48: 16648,
 77: 15816,
 66: 13079,
 80: 12450,
 49: 11122,
 78: 10992,
 72: 10716,
 87: 10163,
 120: 10057,
 70: 9919,
 50: 9057,
 68: 9014,
 85: 8186,
 106: 7790,
 82: 7615,
 79: 7101,
 74: 6893,
 76: 6883,
 122: 6728,
 71: 6427,
 69: 5438,
 113: 4854,
 75: 4627,
 57: 4076,
 53: 4013,
 51: 3845,
 8217: 3548,
 52: 3135,
 56: 2804,
 55: 2797,
 54: 2769,
 86: 2739,
 89: 2271,
 58: 2105,
 8220: 1964,
 40: 1786,
 41: 1746,
 8221: 1588,
 36: 1502,
 63: 1203,
 90: 911,
 81: 649,
 59: 264,
 47: 257,
 88: 221,
 33: 212,
 38: 171,
 37: 155,
 8211: 134,
 91: 106,
 93

In [15]:
# exclude from count all ascii characters, ie. all keys above 128
fake_char_counts_special = {k: v for k, v in fake_char_counts_sorted.items() if ord(k) > 128}
fake_char_counts_special

{'’': 3548,
 '“': 1964,
 '”': 1588,
 '–': 134,
 'é': 99,
 '£': 85,
 '—': 50,
 '•': 50,
 '‘': 48,
 'ó': 34,
 'á': 33,
 '€': 25,
 'ñ': 22,
 '½': 17,
 'í': 17,
 '»': 13,
 'ü': 11,
 'ã': 10,
 'ö': 8,
 'ä': 8,
 '°': 7,
 'è': 7,
 'ć': 7,
 'Á': 7,
 'ç': 4,
 'à': 4,
 'ð': 4,
 'ë': 4,
 '…': 4,
 'ō': 4,
 '\xad': 3,
 'É': 3,
 'ú': 3,
 'ŏ': 3,
 'â': 2,
 'ń': 2,
 'ư': 2,
 'ờ': 2,
 'ï': 2,
 'ž': 2,
 '¥': 2,
 'ô': 1,
 'Č': 1,
 'æ': 1,
 'î': 1,
 'Þ': 1,
 '®': 1,
 'Ó': 1,
 '春': 1,
 '日': 1,
 '昭': 1,
 '和': 1,
 '平': 1,
 '成': 1,
 'ý': 1,
 'š': 1,
 'ě': 1,
 'ğ': 1,
 'Ł': 1,
 '₨': 1,
 '¡': 1,
 'ø': 1,
 'å': 1,
 'ş': 1,
 'ă': 1,
 'ţ': 1,
 'ê': 1}

In [19]:
# count different kind of apostrophes
count_apostrophe_type_1 = 0
count_apostrophe_type_2 = 0

# iterate over fake texts and count occurence of different apostrophes
for text in fake_texts_df["text"].apply(lambda x: x[0]):
    count_apostrophe_type_1 += text.count("'")
    count_apostrophe_type_2 += text.count("’")

print(f"Number of normal apostrophes: {count_apostrophe_type_1}")
print(f"Number of special apostrophes: {count_apostrophe_type_2}")

Number of normal apostrophes: 7234
Number of special apostrophes: 3548


In [36]:
# convert form unicode to character
chr(8212)

'—'

In [38]:
chr(8211)

'–'