In [29]:
from datasets import load_dataset, load_from_disk, Dataset, DatasetDict, concatenate_datasets
import numpy as np
import pandas as pd
from scipy.stats import wasserstein_distance


import json
import matplotlib.pyplot as plt

In [30]:
# label: true = 0, fake = 1

In [31]:
experiment_name = "mistral_10k"
fake_train_dataset_df = pd.read_json(f"fake_true_datasets/fake_true_dataset_{experiment_name}_train.json")

In [32]:
fake_train_dataset_df

Unnamed: 0,text,label
0,[Four groups that advocate for immigrant right...,1
1,[Four groups that advocate for immigrant right...,0
2,[Former Vice President Dick Cheney on Sunday d...,1
3,[Former Vice President Dick Cheney on Sunday d...,0
4,[Space shuttle Discovery launched just before ...,0
...,...,...
15753,[A Pablo Picasso sketchbook with 33 pencil dra...,1
15754,[A Pablo Picasso sketchbook with 33 pencil dra...,0
15755,"[At a time when she really needed a miracle, A...",0
15756,"[At a time when she really needed a miracle, A...",1


In [33]:
fake_train_dataset_df.iloc[0]["text"][0]

'Four groups that advocate for immigrant rights said Thursday they will not attend a planned rally against the construction of a mosque in the city of Murfreesboro, Tenn., because the organizers are not willing to denounce a group that is trying to stop the building of the mosque. “We’re going to stand with our friends and neighbors in Murfreesboro and be there with them on Saturday,” said a statement issued by the Tennessee Immigrant and Refugee Rights Coalition, the Tennessee Immigrant and Refu'

In [34]:
fake_train_dataset_df["text_sentences"] = fake_train_dataset_df["text"].apply(lambda x: x[0].split("."))

fake_texts_df = fake_train_dataset_df[fake_train_dataset_df["label"] == 1]
true_texts_df = fake_train_dataset_df[fake_train_dataset_df["label"] == 0]

In [35]:
fake_train_dataset_df["text_sentences"][0]

['Four groups that advocate for immigrant rights said Thursday they will not attend a planned rally against the construction of a mosque in the city of Murfreesboro, Tenn',
 ', because the organizers are not willing to denounce a group that is trying to stop the building of the mosque',
 ' “We’re going to stand with our friends and neighbors in Murfreesboro and be there with them on Saturday,” said a statement issued by the Tennessee Immigrant and Refugee Rights Coalition, the Tennessee Immigrant and Refu']

In [36]:
print(f"Average number of sentences in fake texts: {np.mean(fake_texts_df['text_sentences'].apply(len))}")
print(f"Average number of sentences in true texts: {np.mean(true_texts_df['text_sentences'].apply(len))}")

Average number of sentences in fake texts: 5.9761451592437504
Average number of sentences in true texts: 5.234860987685667


In [37]:
# add column: number of "the" in text
fake_texts_df["the_count"] = fake_texts_df["text"].apply(lambda x: x[0].count("the"))
true_texts_df["the_count"] = true_texts_df["text"].apply(lambda x: x[0].count("the"))

print(f"Average number of 'the' in fake texts: {np.mean(fake_texts_df['the_count'])}")
print(f"Average number of 'the' in true texts: {np.mean(true_texts_df['the_count'])}")

Average number of 'the' in fake texts: 6.4211394493084635
Average number of 'the' in true texts: 5.136854132283864


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  fake_texts_df["the_count"] = fake_texts_df["text"].apply(lambda x: x[0].count("the"))
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  true_texts_df["the_count"] = true_texts_df["text"].apply(lambda x: x[0].count("the"))


In [38]:
fake_train_dataset_full_text = " ".join([text for text in fake_train_dataset_df["text"].apply(lambda x: x[0])])

In [39]:
len(fake_train_dataset_full_text)

7894757

In [40]:
# run through all the texts and count occurence of each characters in unicode representation

def count_chars(texts):
    char_counts = {}
    for text in texts:
        for char in text:
            if char in char_counts:
                char_counts[char] += 1
            else:
                char_counts[char] = 1
    return char_counts

fake_char_counts = count_chars(fake_train_dataset_full_text)

In [41]:
fake_char_counts_sorted = dict(sorted(fake_char_counts.items(), key=lambda item: item[1], reverse=True))
fake_char_counts_sorted

{' ': 1348618,
 'e': 739172,
 'a': 551880,
 't': 508465,
 'i': 448472,
 'o': 444178,
 'n': 441168,
 's': 389375,
 'r': 389325,
 'h': 300765,
 'd': 246826,
 'l': 238681,
 'c': 174237,
 'u': 151188,
 'm': 138377,
 'f': 129985,
 'g': 116896,
 'p': 110137,
 'w': 107191,
 'y': 103724,
 'b': 82004,
 ',': 72967,
 '.': 72575,
 'v': 57199,
 'k': 46381,
 'T': 35435,
 'S': 27507,
 'A': 25617,
 '-': 22738,
 'I': 20806,
 'C': 20415,
 '0': 20131,
 '"': 18907,
 "'": 17884,
 'M': 17780,
 '1': 15168,
 'B': 14120,
 'P': 13620,
 'H': 11952,
 '2': 11724,
 'N': 11677,
 'W': 11063,
 'F': 10453,
 'D': 9632,
 'x': 9603,
 'R': 8667,
 'L': 7926,
 'j': 7782,
 'U': 7602,
 'J': 7536,
 'G': 7330,
 'O': 7186,
 'z': 7163,
 'E': 5817,
 '3': 5537,
 '9': 5401,
 'K': 5371,
 '5': 5340,
 'q': 4803,
 '4': 4528,
 '’': 4527,
 '8': 3740,
 '7': 3734,
 '6': 3702,
 'V': 3081,
 '“': 2540,
 'Y': 2479,
 ':': 2294,
 '#': 1931,
 '”': 1908,
 '(': 1775,
 ')': 1737,
 '$': 1400,
 '?': 1105,
 'Z': 1011,
 'Q': 737,
 '>': 396,
 '/': 317,
 ';

In [42]:
# convert keys in char_counts to unicode
fake_char_counts_unicode = {ord(k): v for k, v in fake_char_counts_sorted.items()}
fake_char_counts_unicode

{32: 1348618,
 101: 739172,
 97: 551880,
 116: 508465,
 105: 448472,
 111: 444178,
 110: 441168,
 115: 389375,
 114: 389325,
 104: 300765,
 100: 246826,
 108: 238681,
 99: 174237,
 117: 151188,
 109: 138377,
 102: 129985,
 103: 116896,
 112: 110137,
 119: 107191,
 121: 103724,
 98: 82004,
 44: 72967,
 46: 72575,
 118: 57199,
 107: 46381,
 84: 35435,
 83: 27507,
 65: 25617,
 45: 22738,
 73: 20806,
 67: 20415,
 48: 20131,
 34: 18907,
 39: 17884,
 77: 17780,
 49: 15168,
 66: 14120,
 80: 13620,
 72: 11952,
 50: 11724,
 78: 11677,
 87: 11063,
 70: 10453,
 68: 9632,
 120: 9603,
 82: 8667,
 76: 7926,
 106: 7782,
 85: 7602,
 74: 7536,
 71: 7330,
 79: 7186,
 122: 7163,
 69: 5817,
 51: 5537,
 57: 5401,
 75: 5371,
 53: 5340,
 113: 4803,
 52: 4528,
 8217: 4527,
 56: 3740,
 55: 3734,
 54: 3702,
 86: 3081,
 8220: 2540,
 89: 2479,
 58: 2294,
 35: 1931,
 8221: 1908,
 40: 1775,
 41: 1737,
 36: 1400,
 63: 1105,
 90: 1011,
 81: 737,
 62: 396,
 47: 317,
 59: 286,
 42: 277,
 88: 267,
 33: 249,
 38: 174,
 3

In [43]:
# exclude from count all ascii characters, ie. all keys above 128
fake_char_counts_special = {k: v for k, v in fake_char_counts_sorted.items() if ord(k) > 128}
fake_char_counts_special

{'’': 4527,
 '“': 2540,
 '”': 1908,
 'é': 85,
 '£': 82,
 '‘': 65,
 '—': 62,
 '–': 51,
 '•': 46,
 'ñ': 23,
 '½': 21,
 'á': 17,
 '€': 15,
 'í': 14,
 'ü': 13,
 '»': 10,
 'ö': 10,
 'ó': 10,
 '°': 9,
 '…': 8,
 'ã': 8,
 '\u200b': 8,
 'è': 7,
 'ı': 6,
 '■': 6,
 'İ': 4,
 'ï': 4,
 'ä': 4,
 'ş': 3,
 '\xad': 3,
 'ğ': 3,
 'ú': 3,
 'å': 3,
 'ç': 3,
 '¥': 3,
 'ø': 3,
 'ë': 2,
 '®': 2,
 'Ş': 2,
 '年': 2,
 '月': 2,
 '日': 2,
 '′': 2,
 '×': 2,
 '·': 2,
 'â': 1,
 'ô': 1,
 'ń': 1,
 '⚠': 1,
 '️': 1,
 '″': 1,
 'ė': 1,
 'Ü': 1,
 'à': 1,
 'ð': 1,
 'Á': 1,
 'ê': 1,
 'É': 1,
 'Ó': 1,
 'Ø': 1,
 '♦': 1,
 'ř': 1}

In [46]:
# count different kind of apostrophes
count_apostrophe_type_1 = 0
count_apostrophe_type_2 = 0

# iterate over fake texts and count occurence of different apostrophes
for text in fake_texts_df["text"].apply(lambda x: x[0]):
    count_apostrophe_type_1 += text.count("'")
    count_apostrophe_type_2 += text.count("’")

print(f"Number of normal apostrophes: {count_apostrophe_type_1}")
print(f"Number of special apostrophes: {count_apostrophe_type_2}")

Number of normal apostrophes: 7100
Number of special apostrophes: 4527


In [36]:
# convert form unicode to character
chr(8212)

'—'

In [38]:
chr(8211)

'–'