In [2]:
import pandas as pd
import numpy as np
from deep_translator import GoogleTranslator
from tqdm import tqdm

In [3]:
# Read the preprocessed claims; the first CSV column is used as the row index.
claims_csv = "../../../data/preprocessed_claims.csv"
claims = pd.read_csv(claims_csv, index_col=0)
claims.shape

(18897, 5)

Augmentation is applied only to the training data (all claims not dated 2022), never to the validation or test data.

In [4]:
# Boolean mask: True for every claim whose year (the text before the first "-") is not 2022.
train_idx = claims["date"].map(lambda date_str: date_str.partition("-")[0]) != "2022"

In [4]:
# Label distribution over the rows selected by train_idx.
claims.loc[train_idx, "truth_rating"].value_counts()

FALSE    9887
OTHER    4312
TRUE     3003
Name: truth_rating, dtype: int64

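To roughly balance the classes we double the "OTHER" claims (one back-translation: en → zh → en) and triple the "TRUE" claims (two back-translations: en → zh → en and en → hi → en).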

In [5]:
# Keep only the rows flagged by the train_idx mask.
training_claims = claims.loc[train_idx]

In [6]:
# Training rows whose truth_rating equals "TRUE".
true_claims = training_claims.loc[lambda frame: frame["truth_rating"] == "TRUE"]

In [7]:
# Show the TRUE-labelled training claims selected above.
true_claims

Unnamed: 0,ID,claim,date,truth_rating,n_token
13,http://data.gesis.org/claimskg/creative_work/0...,The producers of the Broadway musical 'Hamilto...,2016-11-21,TRUE,16
14,http://data.gesis.org/claimskg/creative_work/0...,Coca-Cola recalled an advertising poster due t...,2004-07-28,TRUE,14
15,http://data.gesis.org/claimskg/creative_work/0...,The number of votes held in the nine months si...,2018-03-15,TRUE,26
20,http://data.gesis.org/claimskg/creative_work/0...,A newborn child declared dead revived after be...,2013-06-06,TRUE,13
28,http://data.gesis.org/claimskg/creative_work/0...,The government is cutting the number of hospit...,2017-02-22,TRUE,9
...,...,...,...,...,...
18763,http://data.gesis.org/claimskg/creative_work/e...,A TikTok video shows a Starbucks order being d...,2021-07-26,TRUE,11
18804,http://data.gesis.org/claimskg/creative_work/e...,'Iron glasses belonging to a Yemeni child sold...,2020-05-24,TRUE,13
18843,http://data.gesis.org/claimskg/creative_work/f...,'The death of the artist Youssef Shaaban - tru...,2021-03-01,TRUE,10
18844,http://data.gesis.org/claimskg/creative_work/f...,Lac Vert in Soultzeren contains cyanobacteria ...,2020-07-20,TRUE,12


In [8]:
# Claim texts of the TRUE rows; these are the inputs for back-translation.
tc = true_claims["claim"].values

In [6]:
# Translator pair for the English -> Chinese -> English round trip.
pivot_zh = 'zh-CN'
en_to_zh = GoogleTranslator(source='en', target=pivot_zh)
zh_to_en = GoogleTranslator(source=pivot_zh, target='en')

In [20]:
# Back-translate every TRUE claim: English -> Chinese -> English.
translations_for_true_claims = []
for source_claim in tqdm(tc):
    chinese_text = en_to_zh.translate(source_claim)
    translations_for_true_claims.append(zh_to_en.translate(chinese_text))

100%|██████████| 3075/3075 [1:02:13<00:00,  1.21s/it]


In [21]:
# Persist the en -> zh -> en round trips of the TRUE claims. The "Combine"
# section reloads them with np.load("translations_for_true_claims_zh"), so the
# file name must match exactly (the disabled draft of this cell wrote
# "translations_for_true_claim_zh", which that load would never find).
with open("translations_for_true_claims_zh", "wb") as f:
    np.save(f, np.array(translations_for_true_claims))

In [10]:
# Translator pair for the English -> Hindi -> English round trip.
pivot_hi = 'hi'
en_to_hi = GoogleTranslator(source='en', target=pivot_hi)
hi_to_en = GoogleTranslator(source=pivot_hi, target='en')

In [11]:
# Back-translate every TRUE claim: English -> Hindi -> English.
translations_for_true_claims_hi = list(
    map(lambda text: hi_to_en.translate(en_to_hi.translate(text)), tqdm(tc))
)

100%|██████████| 3003/3003 [1:03:11<00:00,  1.26s/it]


In [12]:
# Persist the en -> hi -> en round trips as a NumPy array for later reloading.
with open("translations_for_true_claims_hi", "wb") as out_file:
    hi_round_trips = np.array(translations_for_true_claims_hi)
    np.save(out_file, hi_round_trips)

In [7]:
# Training rows whose truth_rating equals "OTHER".
other_claims = training_claims.loc[lambda frame: frame["truth_rating"] == "OTHER"]

In [8]:
# Claim texts of the OTHER rows; display how many there are.
oc = other_claims["claim"].values
oc.shape

(4312,)

In [9]:
# Back-translate every OTHER claim: English -> Chinese -> English.
translations_for_other_claims_zh = []
for source_claim in tqdm(oc):
    chinese_text = en_to_zh.translate(source_claim)
    translations_for_other_claims_zh.append(zh_to_en.translate(chinese_text))

 78%|███████▊  | 3366/4312 [2:00:03<37:14,  2.36s/it]  

In [None]:
# Persist the en -> zh -> en round trips of the OTHER claims under the exact
# name the "Combine" section reloads with
# np.load("translations_for_other_claims_zh_1"). The previous name lacked the
# "_1" suffix, so that load read a file this notebook never wrote.
with open("translations_for_other_claims_zh_1", "wb") as f:
    np.save(f, np.array(translations_for_other_claims_zh))

### Combine the augmented data with the original data

In [13]:
# Independent copy of the TRUE training rows; its claim text is replaced later.
tr_1 = training_claims.loc[lambda frame: frame["truth_rating"] == "TRUE"].copy()

In [14]:
# first the en -> zh -> en translations of the true claims
zh_true_path = "translations_for_true_claims_zh"
true_trans_1 = np.load(zh_true_path)
true_trans_1.shape

(3003,)

In [15]:
# Preview the first five rows of tr_1 before its claim column is replaced.
tr_1.head(n=5)

Unnamed: 0,ID,claim,date,truth_rating,n_token
13,http://data.gesis.org/claimskg/creative_work/0...,The producers of the Broadway musical 'Hamilto...,2016-11-21,True,16
14,http://data.gesis.org/claimskg/creative_work/0...,Coca-Cola recalled an advertising poster due t...,2004-07-28,True,14
15,http://data.gesis.org/claimskg/creative_work/0...,The number of votes held in the nine months si...,2018-03-15,True,26
20,http://data.gesis.org/claimskg/creative_work/0...,A newborn child declared dead revived after be...,2013-06-06,True,13
28,http://data.gesis.org/claimskg/creative_work/0...,The government is cutting the number of hospit...,2017-02-22,True,9


In [16]:
# Replace the claim text with the en -> zh -> en round trips, row by row in order.
tr_1 = tr_1.assign(claim=true_trans_1)

In [18]:
# Preview the first five rows of tr_1 after the claim column was replaced.
tr_1.head(n=5)

Unnamed: 0,ID,claim,date,truth_rating,n_token
13,http://data.gesis.org/claimskg/creative_work/0...,"Producers of the Broadway musical ""Hamilton"" h...",2016-11-21,True,16
14,http://data.gesis.org/claimskg/creative_work/0...,Coca-Cola has recalled an advertising poster b...,2004-07-28,True,14
15,http://data.gesis.org/claimskg/creative_work/0...,The turnout in the nine months since the elect...,2018-03-15,True,26
20,http://data.gesis.org/claimskg/creative_work/0...,A newborn baby who was declared dead has been ...,2013-06-06,True,13
28,http://data.gesis.org/claimskg/creative_work/0...,The government is cutting the number of hospit...,2017-02-22,True,9


In [19]:
# now for the en -> hi -> en translations
hi_true_path = "translations_for_true_claims_hi"
true_trans_2 = np.load(hi_true_path)
true_trans_2.shape

(3003,)

In [20]:
# Second independent copy of the TRUE training rows, for the Hindi round trips.
tr_2 = training_claims.loc[lambda frame: frame["truth_rating"] == "TRUE"].copy()

In [21]:
# Preview the first five rows of tr_2 before its claim column is replaced.
tr_2.head(n=5)

Unnamed: 0,ID,claim,date,truth_rating,n_token
13,http://data.gesis.org/claimskg/creative_work/0...,The producers of the Broadway musical 'Hamilto...,2016-11-21,True,16
14,http://data.gesis.org/claimskg/creative_work/0...,Coca-Cola recalled an advertising poster due t...,2004-07-28,True,14
15,http://data.gesis.org/claimskg/creative_work/0...,The number of votes held in the nine months si...,2018-03-15,True,26
20,http://data.gesis.org/claimskg/creative_work/0...,A newborn child declared dead revived after be...,2013-06-06,True,13
28,http://data.gesis.org/claimskg/creative_work/0...,The government is cutting the number of hospit...,2017-02-22,True,9


In [22]:
# Replace the claim text with the en -> hi -> en round trips, row by row in order.
tr_2 = tr_2.assign(claim=true_trans_2)

In [23]:
# Preview the first five rows of tr_2 after the claim column was replaced.
tr_2.head(n=5)

Unnamed: 0,ID,claim,date,truth_rating,n_token
13,http://data.gesis.org/claimskg/creative_work/0...,The producers of the Broadway musical 'Hamilto...,2016-11-21,True,16
14,http://data.gesis.org/claimskg/creative_work/0...,Coca-Cola has recalled an advertising poster b...,2004-07-28,True,14
15,http://data.gesis.org/claimskg/creative_work/0...,The number of votes polled in the nine months ...,2018-03-15,True,26
20,http://data.gesis.org/claimskg/creative_work/0...,A newborn baby who was declared dead came to l...,2013-06-06,True,13
28,http://data.gesis.org/claimskg/creative_work/0...,The government is reducing the number of beds ...,2017-02-22,True,9


In [24]:
# load for other claims from en -> zh -> en
zh_other_path = "translations_for_other_claims_zh_1"
other_trans = np.load(zh_other_path)
other_trans.shape

(4312,)

In [25]:
# Independent copy of the OTHER training rows; its claim text is replaced later.
ot = training_claims.loc[lambda frame: frame["truth_rating"] == "OTHER"].copy()
ot.shape

(4312, 5)

In [26]:
# Preview the first five rows of ot before its claim column is replaced.
ot.head(n=5)

Unnamed: 0,ID,claim,date,truth_rating,n_token
2,http://data.gesis.org/claimskg/creative_work/0...,Another man was responsible for the assault th...,2018-09-28,OTHER,19
5,http://data.gesis.org/claimskg/creative_work/0...,Nigel Farage once flirted with the idea of the...,2018-03-02,OTHER,14
6,http://data.gesis.org/claimskg/creative_work/0...,The government has given the NHS the money tha...,2017-09-29,OTHER,12
11,http://data.gesis.org/claimskg/creative_work/0...,The EU wants the UK to pay £60 billion before ...,2017-01-20,OTHER,16
22,http://data.gesis.org/claimskg/creative_work/0...,Businesses are celebrating their anniversaries...,2016-02-08,OTHER,19


In [27]:
# Replace the claim text with the en -> zh -> en round trips, row by row in order.
ot = ot.assign(claim=other_trans)

In [28]:
# Preview the first five rows of ot after the claim column was replaced.
ot.head(n=5)

Unnamed: 0,ID,claim,date,truth_rating,n_token
2,http://data.gesis.org/claimskg/creative_work/0...,Another man is responsible for an attack that ...,2018-09-28,OTHER,19
5,http://data.gesis.org/claimskg/creative_work/0...,Nigel Farage once dismissed the idea of ​​a se...,2018-03-02,OTHER,14
6,http://data.gesis.org/claimskg/creative_work/0...,The government has provided the NHS with the f...,2017-09-29,OTHER,12
11,http://data.gesis.org/claimskg/creative_work/0...,The EU wants Britain to pay £60bn before negot...,2017-01-20,OTHER,16
22,http://data.gesis.org/claimskg/creative_work/0...,Businesses celebrate their anniversaries by gi...,2016-02-08,OTHER,19


### Combine everything into a single DataFrame

In [29]:
# Show the un-augmented training rows that the back-translated copies are appended to.
training_claims

Unnamed: 0,ID,claim,date,truth_rating,n_token
0,http://data.gesis.org/claimskg/creative_work/0...,Actual video of Iraqi soldier saying goodbye t...,2021-07-07,FALSE,10
1,http://data.gesis.org/claimskg/creative_work/0...,Bus launched in August 2020 in Pakistan falls ...,2020-08-25,FALSE,11
2,http://data.gesis.org/claimskg/creative_work/0...,Another man was responsible for the assault th...,2018-09-28,OTHER,19
3,http://data.gesis.org/claimskg/creative_work/0...,U.S. President Joe Biden visited the Tree of L...,2021-09-03,FALSE,16
4,http://data.gesis.org/claimskg/creative_work/0...,Turkey legs sold in Disney's theme are actuall...,2017-03-13,FALSE,10
...,...,...,...,...,...
18892,http://data.gesis.org/claimskg/creative_work/f...,The boss of BioNTech does not want to be vacci...,2021-09-20,OTHER,10
18893,http://data.gesis.org/claimskg/creative_work/f...,opening dates of airports around the world by ...,2020-06-11,OTHER,14
18894,http://data.gesis.org/claimskg/creative_work/f...,A Togolese witch doctor was arrested on July 7...,2020-08-03,OTHER,27
18895,http://data.gesis.org/claimskg/creative_work/f...,The mask loses 20% of oxygen in the blood and ...,2020-09-04,OTHER,20


In [34]:
# Stack the original training rows and the three back-translated copies row-wise.
frames_to_stack = [training_claims, tr_1, tr_2, ot]
augmented_claims = pd.concat(frames_to_stack, axis=0)

Now remove rows with duplicate claim text and recompute `n_token` for the remaining claims.

In [36]:
# Row/column count of the concatenated frame, before duplicates are dropped.
augmented_claims.shape

(27520, 5)

In [38]:
# Keep only the first row for each distinct claim text.
augmented_claims = augmented_claims.drop_duplicates(subset=["claim"], keep="first")

In [39]:
def count_words(claim):
    """Return the number of space-separated tokens in ``claim``.

    Only the single space character acts as a separator, so consecutive
    spaces produce empty tokens that are counted too, and an empty string
    yields 1.
    """
    # Splitting on " " always produces (number of spaces + 1) pieces.
    return claim.count(" ") + 1

In [40]:
# Recompute the token count of every claim with count_words.
augmented_claims["n_token"] = augmented_claims["claim"].map(count_words)

In [41]:
# Show the de-duplicated frame with the recomputed n_token column.
augmented_claims

Unnamed: 0,ID,claim,date,truth_rating,n_token
0,http://data.gesis.org/claimskg/creative_work/0...,Actual video of Iraqi soldier saying goodbye t...,2021-07-07,FALSE,10
1,http://data.gesis.org/claimskg/creative_work/0...,Bus launched in August 2020 in Pakistan falls ...,2020-08-25,FALSE,11
2,http://data.gesis.org/claimskg/creative_work/0...,Another man was responsible for the assault th...,2018-09-28,OTHER,19
3,http://data.gesis.org/claimskg/creative_work/0...,U.S. President Joe Biden visited the Tree of L...,2021-09-03,FALSE,16
4,http://data.gesis.org/claimskg/creative_work/0...,Turkey legs sold in Disney's theme are actuall...,2017-03-13,FALSE,10
...,...,...,...,...,...
18892,http://data.gesis.org/claimskg/creative_work/f...,BioNTech boss doesn't want a vaccine,2021-09-20,OTHER,6
18893,http://data.gesis.org/claimskg/creative_work/f...,Airport Opening Dates Announced by the Interna...,2020-06-11,OTHER,14
18894,http://data.gesis.org/claimskg/creative_work/f...,"On July 7, a Togolese witch doctor was arreste...",2020-08-03,OTHER,26
18895,http://data.gesis.org/claimskg/creative_work/f...,As carbon dioxide is reinfused with each inhal...,2020-09-04,OTHER,21


In [42]:
# Shuffle the rows and keep the result. DataFrame.sample returns a new frame,
# so without the assignment the CSV written in the save cell stays unshuffled.
# The fixed random_state makes the permutation reproducible on a fresh run.
augmented_claims = augmented_claims.sample(frac=1.0, random_state=417, replace=False)
augmented_claims

Unnamed: 0,ID,claim,date,truth_rating,n_token
16347,http://data.gesis.org/claimskg/creative_work/e...,Paper is stuck to President Donald Trump's sho...,2018-10-05,TRUE,21
13016,http://data.gesis.org/claimskg/creative_work/b...,Melania Trump once posted a photo of herself w...,2020-07-17,FALSE,22
7672,http://data.gesis.org/claimskg/creative_work/7...,Image of bomb blast in Pakistan's Lahore in 2021,2021-09-24,FALSE,9
9143,http://data.gesis.org/claimskg/creative_work/8...,The former Florida representative and 49ers qu...,2016-08-31,TRUE,18
16326,http://data.gesis.org/claimskg/creative_work/e...,Image shows police fleeing attack on headquart...,2021-03-23,FALSE,10
...,...,...,...,...,...
3460,http://data.gesis.org/claimskg/creative_work/3...,The protective ghost of toddler who died at a ...,1998-12-07,OTHER,18
10991,http://data.gesis.org/claimskg/creative_work/a...,"""America has lost a giant,"" former US Vice Pre...",2021-02-17,TRUE,20
16170,http://data.gesis.org/claimskg/creative_work/e...,Drinking a mixture of milk and cough syrup is ...,2020-02-27,FALSE,10
14911,http://data.gesis.org/claimskg/creative_work/d...,Photograph shows a newborn baby clutching an I...,2017-05-03,FALSE,11


### Save augmented data

In [45]:
# Write the augmented training set to disk with a fresh 0..n-1 index.
augmented_out = augmented_claims.reset_index(drop=True)
augmented_out.to_csv("../../../data/augmented_claims.csv")