In [1]:
from datasets import load_dataset, load_from_disk, Dataset, DatasetDict, concatenate_datasets
import numpy as np
import pandas as pd
from scipy.stats import wasserstein_distance


import json
import matplotlib.pyplot as plt

In [2]:
# label: true = 0, fake = 1

In [3]:
experiment_name = "gemma_10k"
fake_train_dataset_df = pd.read_json(f"fake_true_datasets/fake_true_dataset_{experiment_name}_train.json")

In [4]:
fake_train_dataset_df

Unnamed: 0,text,label
0,[Four groups that advocate for immigrant right...,1
1,[Four groups that advocate for immigrant right...,0
2,[Former Vice President Dick Cheney on Sunday d...,1
3,[Former Vice President Dick Cheney on Sunday d...,0
4,[Space shuttle Discovery launched just before ...,0
...,...,...
15555,[South Africa pace bowler Dale Steyn ripped th...,0
15556,"[In a bustling room full of computers, giant w...",1
15557,"[In a bustling room full of computers, giant w...",0
15558,[President Obama said Thursday that watching t...,0


In [5]:
fake_train_dataset_df.iloc[0]["text"][0]

'Four groups that advocate for immigrant rights said Thursday they are suing the Trump administration over its new policy that blocks federal funding for legal representation to immigrants facing deportation. The groups, including the American Civil Liberties Union and the National Immigration Law Center, said they are filing a lawsuit in the U.S. District Court in Los Angeles. The lawsuit alleges that the Trump administration’s policy violates the 14th Amendment, which guarantees equal protectio'

In [6]:
fake_train_dataset_df["text_sentences"] = fake_train_dataset_df["text"].apply(lambda x: x[0].split("."))

fake_texts_df = fake_train_dataset_df[fake_train_dataset_df["label"] == 1]
true_texts_df = fake_train_dataset_df[fake_train_dataset_df["label"] == 0]

In [7]:
fake_train_dataset_df["text_sentences"][0]

['Four groups that advocate for immigrant rights said Thursday they are suing the Trump administration over its new policy that blocks federal funding for legal representation to immigrants facing deportation',
 ' The groups, including the American Civil Liberties Union and the National Immigration Law Center, said they are filing a lawsuit in the U',
 'S',
 ' District Court in Los Angeles',
 ' The lawsuit alleges that the Trump administration’s policy violates the 14th Amendment, which guarantees equal protectio']

In [8]:
print(f"Average number of sentences in fake texts: {np.mean(fake_texts_df['text_sentences'].apply(len))}")
print(f"Average number of sentences in true texts: {np.mean(true_texts_df['text_sentences'].apply(len))}")

Average number of sentences in fake texts: 5.834340059118365
Average number of sentences in true texts: 5.230492351201954


In [9]:
# add column: number of "the" in text
fake_texts_df["the_count"] = fake_texts_df["text"].apply(lambda x: x[0].count("the"))
true_texts_df["the_count"] = true_texts_df["text"].apply(lambda x: x[0].count("the"))

print(f"Average number of 'the' in fake texts: {np.mean(fake_texts_df['the_count'])}")
print(f"Average number of 'the' in true texts: {np.mean(true_texts_df['the_count'])}")

Average number of 'the' in fake texts: 6.550186351368719
Average number of 'the' in true texts: 5.141149247975318


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  fake_texts_df["the_count"] = fake_texts_df["text"].apply(lambda x: x[0].count("the"))
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  true_texts_df["the_count"] = true_texts_df["text"].apply(lambda x: x[0].count("the"))


In [10]:
fake_train_dataset_full_text = " ".join([text for text in fake_train_dataset_df["text"].apply(lambda x: x[0])])

In [11]:
len(fake_train_dataset_full_text)

7795559

In [12]:
# run through all the texts and count occurence of each characters in unicode representation

def count_chars(texts):
    char_counts = {}
    for text in texts:
        for char in text:
            if char in char_counts:
                char_counts[char] += 1
            else:
                char_counts[char] = 1
    return char_counts

fake_char_counts = count_chars(fake_train_dataset_full_text)

In [13]:
fake_char_counts_sorted = dict(sorted(fake_char_counts.items(), key=lambda item: item[1], reverse=True))
fake_char_counts_sorted

{' ': 1324564,
 'e': 730749,
 'a': 544007,
 't': 502372,
 'i': 441080,
 'o': 439503,
 'n': 434506,
 'r': 385742,
 's': 384450,
 'h': 296529,
 'd': 243353,
 'l': 235028,
 'c': 173397,
 'u': 148387,
 'm': 137884,
 'f': 129327,
 'g': 117094,
 'p': 108261,
 'w': 105319,
 'y': 103132,
 'b': 80649,
 ',': 72646,
 '.': 70525,
 'v': 56267,
 'k': 44836,
 'T': 35619,
 'S': 27170,
 'A': 24461,
 '-': 23031,
 '"': 20058,
 'I': 19869,
 'C': 19799,
 '0': 19684,
 "'": 18914,
 'M': 17583,
 '1': 15139,
 'B': 13677,
 'P': 13363,
 '2': 12242,
 'N': 11335,
 'H': 11250,
 'W': 10941,
 'F': 10207,
 'D': 9551,
 'x': 9324,
 'R': 8289,
 'L': 7812,
 'J': 7761,
 'U': 7572,
 'j': 7570,
 'O': 7258,
 'G': 7127,
 'z': 6850,
 'E': 5611,
 '3': 5411,
 '9': 5318,
 '5': 5106,
 'K': 5070,
 'q': 4692,
 '4': 4453,
 '<': 4300,
 '>': 4250,
 '’': 3795,
 '6': 3670,
 '7': 3608,
 '8': 3410,
 'V': 2918,
 'Y': 2529,
 '/': 2355,
 ':': 2270,
 '“': 2099,
 '”': 1680,
 '(': 1662,
 ')': 1607,
 '$': 1365,
 '?': 991,
 'Z': 973,
 'Q': 667,
 '*

In [14]:
# convert keys in char_counts to unicode
fake_char_counts_unicode = {ord(k): v for k, v in fake_char_counts_sorted.items()}
fake_char_counts_unicode

{32: 1324564,
 101: 730749,
 97: 544007,
 116: 502372,
 105: 441080,
 111: 439503,
 110: 434506,
 114: 385742,
 115: 384450,
 104: 296529,
 100: 243353,
 108: 235028,
 99: 173397,
 117: 148387,
 109: 137884,
 102: 129327,
 103: 117094,
 112: 108261,
 119: 105319,
 121: 103132,
 98: 80649,
 44: 72646,
 46: 70525,
 118: 56267,
 107: 44836,
 84: 35619,
 83: 27170,
 65: 24461,
 45: 23031,
 34: 20058,
 73: 19869,
 67: 19799,
 48: 19684,
 39: 18914,
 77: 17583,
 49: 15139,
 66: 13677,
 80: 13363,
 50: 12242,
 78: 11335,
 72: 11250,
 87: 10941,
 70: 10207,
 68: 9551,
 120: 9324,
 82: 8289,
 76: 7812,
 74: 7761,
 85: 7572,
 106: 7570,
 79: 7258,
 71: 7127,
 122: 6850,
 69: 5611,
 51: 5411,
 57: 5318,
 53: 5106,
 75: 5070,
 113: 4692,
 52: 4453,
 60: 4300,
 62: 4250,
 8217: 3795,
 54: 3670,
 55: 3608,
 56: 3410,
 86: 2918,
 89: 2529,
 47: 2355,
 58: 2270,
 8220: 2099,
 8221: 1680,
 40: 1662,
 41: 1607,
 36: 1365,
 63: 991,
 90: 973,
 81: 667,
 42: 370,
 59: 283,
 88: 260,
 33: 252,
 38: 169,
 9

In [15]:
# exclude from count all ascii characters, ie. all keys above 128
fake_char_counts_special = {k: v for k, v in fake_char_counts_sorted.items() if ord(k) > 128}
fake_char_counts_special

{'’': 3795,
 '“': 2099,
 '”': 1680,
 'é': 76,
 '‘': 75,
 '£': 62,
 '•': 55,
 '–': 28,
 'â': 22,
 '€': 21,
 'á': 19,
 'ñ': 19,
 '—': 18,
 '½': 14,
 'ó': 11,
 'ö': 11,
 '\u200b': 11,
 'í': 10,
 '»': 9,
 'ä': 7,
 '°': 6,
 'ã': 6,
 'ü': 5,
 'ú': 4,
 'è': 4,
 '¥': 4,
 '↑': 4,
 '™': 3,
 'ô': 3,
 '\xad': 3,
 'µ': 2,
 'Á': 2,
 'č': 2,
 'ï': 2,
 'ø': 2,
 'Č': 1,
 'ě': 1,
 'ę': 1,
 'ł': 1,
 'à': 1,
 'ć': 1,
 'ë': 1,
 '®': 1,
 'ž': 1,
 'ș': 1,
 'É': 1,
 '高': 1,
 '橋': 1,
 '秀': 1,
 '行': 1,
 'å': 1,
 'æ': 1,
 '￡': 1,
 '″': 1,
 'ř': 1}

In [19]:
# count different kind of apostrophes
count_apostrophe_type_1 = 0
count_apostrophe_type_2 = 0

# iterate over fake texts and count occurence of different apostrophes
for text in fake_texts_df["text"].apply(lambda x: x[0]):
    count_apostrophe_type_1 += text.count("'")
    count_apostrophe_type_2 += text.count("’")

print(f"Number of normal apostrophes: {count_apostrophe_type_1}")
print(f"Number of special apostrophes: {count_apostrophe_type_2}")

Number of normal apostrophes: 8252
Number of special apostrophes: 3795


In [36]:
# convert form unicode to character
chr(8212)

'—'

In [38]:
chr(8211)

'–'