In [7]:
import pandas as pd
from SPARQLWrapper import SPARQLWrapper, JSON
from tqdm import tqdm

In [8]:
test_IDs = pd.read_csv("../../../test_data/test_ids.csv", header=None)[0].values

In [9]:
test_IDs.shape

(1680,)

### SPARQL queries

In [33]:
old_base_query = """
	PREFIX itsrdf:<https://www.w3.org/2005/11/its/rdf#>
    PREFIX schema:<http://schema.org/>
    PREFIX dbr:<http://dbpedia.org/resource/> 
    SELECT ?claim ?author ?inst ?text ?date ?ratval ?url  
    WHERE { 
		    {
			    SELECT ?claim ?author ?inst ?text ?date ?ratval ?url 
			    WHERE {
			    ?review a schema:ClaimReview .
            	?review schema:url ?url .
			    ?review schema:reviewRating ?rating .
			    ?rating schema:alternateName ?ratval .
			    ?review schema:itemReviewed ?claim .
			    ?claim schema:text ?text .
			    ?claim schema:author ?oauthor .
			    ?oauthor schema:name ?author .
   			    ?review schema:author ?revauthor .
                ?claim schema:datePublished ?date .
                ?revauthor schema:name ?inst FILTER regex(?ratval , "(^FALSE|TRUE|OTHER)") 
			    } ORDER BY ?claim 
		    }
	    }  
	LIMIT 10000 OFFSET """

In [38]:
base_query = """
	PREFIX itsrdf:<https://www.w3.org/2005/11/its/rdf#>
    PREFIX schema:<http://schema.org/>
    PREFIX dbr:<http://dbpedia.org/resource/> 
    SELECT ?claim ?text ?date ?ratval 
    WHERE { 
		    {
			    SELECT ?claim ?text ?date ?ratval 
			    WHERE {
			    ?review a schema:ClaimReview .
			    ?review schema:reviewRating ?rating .
			    ?rating schema:alternateName ?ratval .
			    ?review schema:itemReviewed ?claim .
			    ?claim schema:text ?text .
                ?claim schema:datePublished ?date .
                FILTER regex(?ratval , "(^FALSE|TRUE|OTHER)") 
			    } ORDER BY ?claim 
		    }
	    }  
	LIMIT 10000 OFFSET """

In [11]:
test_query = """
	PREFIX itsrdf:<https://www.w3.org/2005/11/its/rdf#>
    PREFIX schema:<http://schema.org/>
    PREFIX dbr:<http://dbpedia.org/resource/> 
    SELECT ?claim ?text 
    WHERE{
		    {
			    SELECT ?claim ?text 
			    WHERE {
			    ?claim a schema:CreativeWork .
                ?claim schema:text ?text .
			    } 
		    } FILTER regex(?claim ,
"""

### Query data for training

In [39]:

sparql = SPARQLWrapper("https://data.gesis.org/claimskg/sparql")
query_results = []

for i in range(6): # max 60k claims
    offset = i * 10000
    query = base_query + str(offset)
    
    sparql.setQuery(query)
    
    sparql.setReturnFormat(JSON)

    results = sparql.query().convert()

    results = pd.json_normalize(results["results"]["bindings"])
    
    res = pd.DataFrame(results)
    
    query_results.append(res)

In [40]:
claims = pd.concat(query_results, axis= 0)

In [42]:
claims.shape
# 20957 with some duplicates for old_base_query
# 21209 with some duplicates for base_query

(21209, 11)

### Query data for testing

In [43]:
# runs approximately 10min
sparql = SPARQLWrapper("https://data.gesis.org/claimskg/sparql")
test_query_results = []

for c in tqdm(test_IDs):
    
    query = test_query + f"\"{c}\"" + ")}"
    
    sparql.setQuery(query)
    
    sparql.setReturnFormat(JSON)

    results = sparql.query().convert()

    results = pd.json_normalize(results["results"]["bindings"])
    
    res = pd.DataFrame(results)
    
    test_query_results.append(res)

100%|██████████| 1680/1680 [09:47<00:00,  2.86it/s]


In [44]:
test_queries = pd.concat(test_query_results, axis=0)

In [45]:
test_queries.shape

(1680, 5)

### Some reshaping

In [46]:
claims = claims[["claim.value", "text.value", "date.value", "ratval.value"]]

In [48]:
claims = claims.rename(columns={"claim.value" : "ID",
                        "ratval.value" : "truth_rating",
                        "text.value" : "claim",
                        "date.value":"date"})

In [49]:
claims.head()

Unnamed: 0,ID,claim,date,truth_rating
0,http://data.gesis.org/claimskg/creative_work/0...,Actual video of Iraqi soldier saying goodbye t...,2021-07-07,FALSE
1,http://data.gesis.org/claimskg/creative_work/0...,Bus launched in August 2020 in Pakistan falls ...,2020-08-25,FALSE
2,http://data.gesis.org/claimskg/creative_work/0...,Another man was responsible for the assault th...,2018-09-28,OTHER
3,http://data.gesis.org/claimskg/creative_work/0...,U.S. President Joe Biden visited the Tree of L...,2021-09-03,FALSE
4,http://data.gesis.org/claimskg/creative_work/0...,Turkey legs sold in Disney's theme are actuall...,2017-03-13,FALSE


In [50]:
test_queries = test_queries[["claim.value", "text.value"]]

In [51]:
test_queries = test_queries.rename(columns={"claim.value" : "ID", "text.value" : "claim"})

In [52]:
test_queries.head()

Unnamed: 0,ID,claim
0,http://data.gesis.org/claimskg/creative_work/d...,A TikTok video shows a March 2022 school walko...
0,http://data.gesis.org/claimskg/creative_work/3...,Mattel sent Barbie dolls to the International ...
0,http://data.gesis.org/claimskg/creative_work/2...,'No one visiting Disney can get in” because of...
0,http://data.gesis.org/claimskg/creative_work/f...,Two years ago we were “drilling our own oil fo...
0,http://data.gesis.org/claimskg/creative_work/5...,"Families could suffer a £2,000-a-year average ..."


### Remove all test queries from claims (ensure they are not used for training)

In [54]:
claims.shape

(21209, 4)

In [58]:
claims = claims.loc[~claims.ID.isin(test_IDs)]

### Save the queried data

In [59]:
claims.reset_index(drop=True).to_csv("../../../data/raw_claims.csv")

In [60]:
test_queries.reset_index(drop=True).to_csv("../../../data/raw_test_claims.csv")