In [1]:
import json , rdflib , hashlib , requests , csv
from rdflib import URIRef, Literal, Namespace, Graph, XSD
from rdflib.namespace import RDF , RDFS, DC
from rdflib.serializer import Serializer
from datetime import datetime
import os.path
from langdetect import detect

In [2]:
# Load the JSON-LD context document once at module level; it is passed as
# `context=` to `Graph.serialize` whenever a graph is written as JSON-LD.
with open('rdf_transform/context.json') as context_arco:
    context_doc = json.load(context_arco)
    
def rdf_format(serializ):
    """Return the file extension to use for an RDF serialization name.

    Parameters
    ----------
    serializ: str
        One of: xml, n3, turtle, nt, pretty-xml, json-ld.

    Returns
    -------
    str
        The file extension (without the leading dot). Unknown serialization
        names fall back to "xml".
    """
    extensions = {
        "xml": "xml",
        # The original chain tested "xml" twice, so "n3" was unreachable.
        "n3": "n3",
        "turtle": "ttl",
        "nt": "nt",
        "pretty-xml": "xml",
        "json-ld": "json",
    }
    return extensions.get(serializ, "xml")

def unique_id(text):
    """Return the hexadecimal MD5 digest of `text` (encoded with the default UTF-8 codec)."""
    digest = hashlib.md5()
    digest.update(text.encode())
    return digest.hexdigest()

def get_emotion(txt, filename, lang='it'):
    """Call the CELI emotion annotation API and cache the response on disk.

    Parameters
    ----------
    txt: str
        The text to annotate.
    filename: str
        Base name (no extension) of the cache file written to
        'emotions/<filename>.json'.
    lang: str
        Language segment of the endpoint URL. It is only used when language
        detection fails; otherwise the detected language decides
        ('en' when English is detected, 'it' in every other case).

    Returns
    -------
    dict or None
        The parsed JSON annotations, or None when credentials are missing,
        the request fails, or the API does not answer with HTTP 200.

    Notes
    -----
    Credentials are read from the environment variables CELI_API_USER and
    CELI_API_PASSWORD instead of being hard-coded in the notebook.
    """
    try:
        # Detect once: langdetect is not guaranteed to be stable across calls.
        lang = 'en' if detect(txt) == 'en' else 'it'
    except Exception:
        # Detection is best-effort; keep the caller-supplied language.
        pass

    user = os.environ.get('CELI_API_USER')
    password = os.environ.get('CELI_API_PASSWORD')
    if not user or not password:
        print(txt, "missing CELI credentials: set CELI_API_USER and CELI_API_PASSWORD")
        return None

    try:
        # json.dumps escapes quotes, backslashes and newlines that would
        # otherwise produce an invalid request body.
        data = json.dumps({"content": txt}, ensure_ascii=False)
        annotations = requests.post(
            'https://sophia-cluster-dev.aws.celi.it/'+lang+'/spice/analysis',
            auth=(user, password),
            data=data.encode('utf-8'),
            timeout=60,
        )
        if annotations.status_code != 200:
            print(txt, "CELI API returned status", annotations.status_code)
            return None
        annotations_json = annotations.json()
        os.makedirs('emotions', exist_ok=True)
        with open('emotions/'+filename+'.json', 'w') as f:
            json.dump(annotations_json, f)
        print(txt, annotations_json)
        return annotations_json
    except Exception as e:
        # Best-effort call: report the failure and let the caller carry on.
        print(txt, e)
        return None

# Instagram

In [3]:
def instagram_to_rdf(input_json_file, 
                    output_rdf_file, 
                    serialization_format,
                    prefix,
                    social_media,
                    post_id,
                    date= None,
                    img_url = None,
                    text=None,
                    likes=None,
                    author=None,
                    artefact_id=None,
                    artefact_prefix=None
                   ):
    """ 
    Transform a JSON file about social media posts into RDF according to the Schema.org ontology. 

    Only posts that carry a non-empty artefact identifier are transformed.
    Emotion annotations are read from cached files named
    'emotions/ig_<artefact>_<post id>.json' when they exist; posts without a
    cached file simply get no emotion triples.

    Parameters
    ----------

    input_json_file: str . 
        The path to the JSON file. It must be a list of dictionaries. 
        Every dictionary must represent a social media post.
    output_rdf_file: str . 
        The path and name of the output RDF file, format excluded.
    serialization_format: str . 
        The RDF serialization and file format. 
        Choose between: xml, n3, turtle, nt, pretty-xml, json-ld
    prefix: str .
        The short name to be associated to the dataset. 
        Must be lower case, no spaces and special characters.
    social_media: str.
        The short name to be associated to the social media platform.
        Values must be: instagram, twitter
    post_id: str.
        The name of the key including the post short ID.
        The value of the key must be a string.
    date: str.
        The name of the key including the timestamp of the post.
        The value must be a timestamp (seconds since the epoch, as a string or a number).
    img_url: str.
        The name of the key including the URL of an image associated to the post.
        The value must be a URL.
    text: str.
        The name of the key including the text associated to the post.
        The value must be a string.
    likes: str.
        The name of the key including the number of likes associated to the post.
        The value must be a string or an integer.
    author: str.
        The name of the key including the anonymised identifier of the user that created the post.
        The value must be a string.
    artefact_id: str.
        The name of the key including the identifier of the artefact described in the post.
        The value must be a string.
    artefact_prefix: str.
        The partial URI to be associated to the artefact ID. 
        The value must be a dereferencable URI.

    Raises
    ------
    ValueError
        If `social_media` is not one of the supported platforms.
    """
    # Local import: the file-level import only brings `datetime` into scope.
    from datetime import timezone

    valid = {"instagram","twitter"}
    if social_media not in valid:
        raise ValueError("results: social_media must be one of %r." % valid)

    # The twitter prefix mirrors the URL pattern used by twitter_to_rdf, so
    # that every value accepted above has a URL prefix.
    sm_prefixes = {"instagram": "https://www.instagram.com/p/",
                   "twitter": "https://twitter.com/whatever/status/"}

    def _value(post, key):
        """Return post[key], or None when the key is unset, missing or holds an empty value."""
        if key is None or key not in post:
            return None
        value = post[key]
        if value is None:
            return None
        # Numbers (e.g. integer timestamps or like counts) have no len().
        if isinstance(value, (str, list, tuple, dict)) and len(value) == 0:
            return None
        return value

    def _add_emotion_triples(annotations, text_post, artefact_uri, shortcode):
        """Add one pointer range plus emotion triples per emotion annotation."""
        graph_nodes = annotations.get("@graph", []) if isinstance(annotations, dict) else []
        if len(graph_nodes) <= 1:
            return
        # The counter lives outside the loop so each annotation gets its own
        # pointer range instead of every one collapsing onto pointer_range_1.
        pointer_no = 0
        for ann in graph_nodes:
            if not isinstance(ann, dict) or ann.get("@type") != "earmark:PointerRange":
                continue
            denoted = ann.get("semiotics:denotes")
            if not isinstance(denoted, dict) or "@id" not in denoted:
                continue
            emotion_type = denoted.get("@type")
            if not isinstance(emotion_type, str) or "emotion:" not in emotion_type:
                continue
            cur_emotion = denoted["@id"][3:]
            cur_emotion_type = emotion_type.split(":")[1]
            pointer_no += 1
            # Single slash, consistent with the URIs minted by twitter_to_rdf.
            pointer_uri = text_post+'/pointer_range_'+str(pointer_no)
            emotion_uri = pointer_uri+'/'+cur_emotion
            g.add(( URIRef(pointer_uri), earmark.refersTo , URIRef(text_post) ))
            g.add(( URIRef(pointer_uri), semiotics.denotes , URIRef(emotion_uri) ))
            g.add(( URIRef(emotion_uri), RDF.type , URIRef(EMOTION+cur_emotion_type) ))
            g.add(( URIRef(artefact_uri+'/'+shortcode+'/emotion_relation'), EMOTION.emotion, URIRef(emotion_uri) ))
            g.add(( URIRef(artefact_uri), EMOTION.triggers, URIRef(emotion_uri) ))

    g = Graph()

    # namespaces
    spice = "https://w3id.org/spice/"
    base = spice + prefix + '/'
    schema = Namespace("http://schema.org/")
    earmark = Namespace("http://www.essepuntato.it/2008/12/earmark#")
    ARCOCD = Namespace("https://w3id.org/arco/ontology/denotative-description/")
    semiotics = Namespace("http://ontologydesignpatterns.org/cp/owl/semiotics.owl#")
    EMOTION = Namespace("https://w3id.org/spice/SON/emotion/")
    g.bind( prefix , base)
    g.bind( "dc" , DC)
    g.bind( "arco-cd" , ARCOCD)
    g.bind( "earmark" , earmark)
    g.bind( "schema" , schema)
    g.bind('xsd', XSD)
    g.bind( "semiotics" , semiotics)
    g.bind("emo", EMOTION)

    # parse JSON
    with open(input_json_file) as json_file:
        data = json.load(json_file)

    for post in data:
        # Only posts linked to an artefact are transformed.
        raw_artefact = _value(post, artefact_id)
        if raw_artefact is None:
            continue
        artefact = str(raw_artefact).replace(" ", "")
        if not artefact:
            continue

        shortcode = str(post[post_id])
        post_URI = spice+social_media+'_post/'+shortcode
        artefact_uri = artefact_prefix+artefact if artefact_prefix else base+artefact

        raw_date = _value(post, date)
        # The "Z" suffix denotes UTC, so the timestamp is converted in UTC
        # rather than in the machine's local time zone.
        date_val = datetime.fromtimestamp(int(raw_date), tz=timezone.utc).strftime("%Y-%m-%dT%H:%M:%SZ") if raw_date is not None else None
        img = _value(post, img_url)
        txt = _value(post, text)
        raw_likes = _value(post, likes)
        likes_num = int(raw_likes) if raw_likes is not None else None
        raw_author = _value(post, author)
        author_uri = base+social_media+'_user/'+str(raw_author) if raw_author is not None else None

        # add triples
        g.add(( URIRef(post_URI), RDF.type , schema.SocialMediaPosting ))
        g.add(( URIRef(post_URI), schema.url , URIRef(sm_prefixes[social_media]+shortcode) ))
        g.add(( URIRef(post_URI), schema.about , URIRef(artefact_uri) ))

        if date_val:
            g.add(( URIRef(post_URI), schema.datePublished , Literal(date_val, datatype=XSD.dateTime) ))
        if img:
            g.add(( URIRef(post_URI), schema.sharedContent , URIRef(img) ))

        if txt:
            # The text node is created here (not in the image branch) so that
            # posts with text but no image are handled as well.
            text_post = post_URI+'/text'
            g.add(( URIRef(post_URI), schema.sharedContent , URIRef(text_post) ))
            g.add(( URIRef(text_post), RDF.type , earmark.StringDocuverse ))
            g.add(( URIRef(text_post), earmark.hasContent , Literal(txt) ))
            g.add(( URIRef(artefact_uri), ARCOCD.hasSubject , URIRef(text_post) ))

            # Emotion annotations come from previously cached CELI responses.
            # Reset per post so a missing file never reuses another post's data.
            annotations_json = None
            emotions_file = 'emotions/ig_'+artefact+'_'+shortcode+'.json'
            if os.path.isfile(emotions_file):
                print("it exists: ", emotions_file)
                with open(emotions_file) as f:
                    annotations_json = json.load(f)
            #else:
            #    annotations_json = get_emotion(txt, 'ig_'+artefact+'_'+shortcode)

            if annotations_json:
                _add_emotion_triples(annotations_json, text_post, artefact_uri, shortcode)

        # `is not None` keeps posts with zero likes.
        if likes_num is not None:
            likes_uri = base+'likes_counter/'+shortcode
            g.add(( URIRef(post_URI), schema.interactionStatistic , URIRef(likes_uri) ))
            g.add(( URIRef(likes_uri), RDF.type , schema.InteractionCounter ))
            g.add(( URIRef(likes_uri), schema.userInteractionCount , Literal(likes_num, datatype=XSD.integer) ))
        if author_uri:
            g.add(( URIRef(post_URI), schema.author , URIRef(author_uri) ))

    destination = output_rdf_file+'.'+rdf_format(serialization_format)
    if serialization_format == 'json-ld':
        g.serialize(destination=destination, format=serialization_format, context=context_doc, encoding='utf-8')
    else:
        g.serialize(destination=destination, format=serialization_format, encoding='utf-8')
        

### GAM instagram to Schema

In [4]:
# Run the Instagram transformation on the GAM test dump.
# The first six arguments are the required ones, in signature order:
# input file, output path, serialization, dataset prefix, platform, post-id key.
instagram_to_rdf(
    "2068445810127772.json",
    "rdf_transform/GAM_test_instagram",
    "json-ld",
    "gam",
    "instagram",
    "shortcode",
    date="timestamp",
    img_url="img_url",
    text="text",
    likes="likes",
    author="creator",
    artefact_id="artefact",
    artefact_prefix="https://w3id.org/spice/gam/artefact/",
)

it exists:  emotions/ig_42_CLhAYryJtRx.json
it exists:  emotions/ig_45_CLgq3VHprrd.json
it exists:  emotions/ig_38_CLgqH97pwBD.json
it exists:  emotions/ig_38_CLSFebTDZXC.json
it exists:  emotions/ig_30_CLRES8ZFH_c.json
it exists:  emotions/ig_38_CK8ulp7jOws.json
it exists:  emotions/ig_62_CK3GupeBsq-.json
it exists:  emotions/ig_38_CK1GxgYjKoV.json
it exists:  emotions/ig_38_CJb4oV6rAqK.json
it exists:  emotions/ig_38_CIn7ghQAxLo.json
it exists:  emotions/ig_47_CHtJMIGgYDU.json
it exists:  emotions/ig_37_CG7DtEMpisk.json
it exists:  emotions/ig_38_CGxtnBHgLAU.json
it exists:  emotions/ig_37_CF4WGQSDNYR.json
it exists:  emotions/ig_62_CF1ZY-jiaYW.json
it exists:  emotions/ig_41_CFpiv5phvEH.json
it exists:  emotions/ig_38_CEg9HQAqgtZ.json
it exists:  emotions/ig_38_CD9fNVdAD42.json
it exists:  emotions/ig_38_CC3vsP_q7ac.json
it exists:  emotions/ig_59_CC1JGvro3RJ.json
it exists:  emotions/ig_38_CCSw5sHoTVM.json
it exists:  emotions/ig_45_B-oZoxzqRAy.json
it exists:  emotions/ig_45_B9v8N

In [None]:
# Ad-hoc check of get_emotion on a sample sentence.
# NOTE: this first sample (a tweet containing a double quote) is overwritten
# by the next assignment before it is ever used.
st = """RT @emilnolde1: "@francesco1263: Demetrio #Cosola Il dettato, 1891 #pastellosutela #GAMC #Torino #artweet #Art http://t.co/oVSz6hod44"""
st ="Mi piace la Mona Lisa"
# 'tw_test' is the cache file base name; 'it' is the requested language.
res = get_emotion(st, 'tw_test', 'it') 
print(res)

# Twitter

Currently this works with GAM data only (TODO: apply the same logic in the `else` branch so that other datasets are supported).

In [5]:
def twitter_to_rdf(input_json_files, 
                    output_rdf_file, 
                    serialization_format,
                    prefix
                   ):
    """ 
    Transform JSON files about tweets into RDF according to the Schema.org ontology. 

    Only the 'gam' dataset is currently handled: tweets are kept when they are
    linked to an artefact in "GAM_test_catalogue.json", and authors/dates are
    taken from the tweet itself when present, otherwise from
    "GAM_tweets_additional_info.json". For any other prefix the input files
    are read but no triples are produced.
    Emotion annotations are read from cached files named
    'emotions/tw_<artefact id>_<tweet id>.json' when they exist.

    Parameters
    ----------

    input_json_files: str . 
        a comma separated list of JSON files. JSON files are results of Twitter Academic API calls. 
        Every dictionary must represent a social media post.
    output_rdf_file: str . 
        The path and name of the output RDF file, format excluded.
    serialization_format: str . 
        The RDF serialization and file format. 
        Choose between: xml, n3, turtle, nt, pretty-xml, json-ld
    prefix: str .
        The short name to be associated to the dataset. 
        Must be lower case, no spaces and special characters.
    """
    g = Graph()

    # namespaces
    spice = "https://w3id.org/spice/"
    base = spice + prefix + '/'
    artefact_prefix = base+"artefact/"
    schema = Namespace("http://schema.org/")
    earmark = Namespace("http://www.essepuntato.it/2008/12/earmark#")
    ARCOCD = Namespace("https://w3id.org/arco/ontology/denotative-description/")
    semiotics = Namespace("http://ontologydesignpatterns.org/cp/owl/semiotics.owl#")
    EMOTION = Namespace("https://w3id.org/spice/SON/emotion/")
    g.bind( prefix , base)
    g.bind( "dc" , DC)
    g.bind( "schema" , schema)
    g.bind( "earmark" , earmark)
    g.bind('xsd', XSD)
    g.bind( "arco-cd" , ARCOCD)
    g.bind( "semiotics" , semiotics)
    g.bind("emo", EMOTION)

    def _add_emotion_triples(annotations, text_post, artefact, post_id):
        """Add one pointer range plus emotion triples per emotion annotation."""
        graph_nodes = annotations.get("@graph", []) if isinstance(annotations, dict) else []
        if len(graph_nodes) <= 1:
            return
        # The counter lives outside the loop so each annotation gets its own
        # pointer range instead of every one collapsing onto pointer_range_1.
        pointer_no = 0
        for ann in graph_nodes:
            if not isinstance(ann, dict) or ann.get("@type") != "earmark:PointerRange":
                continue
            denoted = ann.get("semiotics:denotes")
            if not isinstance(denoted, dict) or "@id" not in denoted:
                continue
            emotion_type = denoted.get("@type")
            if not isinstance(emotion_type, str) or "emotion:" not in emotion_type:
                continue
            cur_emotion = denoted["@id"][3:]
            cur_emotion_type = emotion_type.split(":")[1]
            pointer_no += 1
            pointer_uri = text_post+'/pointer_range_'+str(pointer_no)
            emotion_uri = pointer_uri+'/'+cur_emotion
            g.add(( URIRef(pointer_uri), earmark.refersTo , URIRef(text_post) ))
            g.add(( URIRef(pointer_uri), semiotics.denotes , URIRef(emotion_uri) ))
            g.add(( URIRef(emotion_uri), RDF.type , URIRef(EMOTION+cur_emotion_type) ))
            g.add(( URIRef(artefact+'/'+post_id+'/emotion_relation'), EMOTION.emotion, URIRef(emotion_uri) ))
            g.add(( URIRef(artefact), EMOTION.triggers, URIRef(emotion_uri) ))

    tweet_ids = {}      # tweet id -> artefact id
    author_dates = {}   # tweet id -> [author id, creation date]
    if prefix == 'gam':
        # select only the tweets that have been matched to an artefact
        with open("GAM_test_catalogue.json") as gam_file:
            gam_data = json.load(gam_file)
        for catalogue_entry in gam_data:
            for tweet in catalogue_entry.get("twitter") or []:
                tweet_ids[tweet.split('/')[-1]] = catalogue_entry["ID"]

        # dates and authors were requested separately and live in another file
        with open("GAM_tweets_additional_info.json") as gamm_file:
            info = json.load(gamm_file)
        for group_dict in info:
            for post_dict in group_dict.get("data", []):
                author_dates[post_dict["id"]] = [post_dict.get("author_id"), post_dict.get("created_at")]

    # parse JSON
    for input_json_file in input_json_files.split(','):
        input_json_file = input_json_file.strip()
        if not input_json_file:
            continue
        with open(input_json_file) as json_file:
            data = json.load(json_file)

        for group_dict in data:
            if "data" not in group_dict:
                continue

            # Index the photos of this response group once; responses without
            # "includes"/"media", and media without a "url", are tolerated.
            photo_urls = {
                media["media_key"]: media["url"]
                for media in group_dict.get("includes", {}).get("media", [])
                if media.get("type") == 'photo' and "media_key" in media and "url" in media
            }

            for post_dict in group_dict["data"]:
                if prefix != 'gam':
                    # TODO do same stuff with other twitter files
                    continue
                post_id = post_dict["id"]
                if post_id not in tweet_ids:
                    continue

                post_URI = spice+'twitter_post/'+post_id
                artefact_id = str(tweet_ids[post_id])
                artefact = artefact_prefix+artefact_id
                text_post = post_URI+'/text'

                # get the list of images (attachments may hold no media keys, e.g. polls)
                media_keys = post_dict.get("attachments", {}).get("media_keys", [])
                imgs = [photo_urls[media_key] for media_key in media_keys if media_key in photo_urls]
                txt = post_dict["text"] if ("text" in post_dict and len(post_dict["text"]) > 0 ) else None

                # Prefer fields carried by the tweet itself and fall back on
                # the additional-info file.
                fallback_author, fallback_date = author_dates.get(post_id, [None, None])
                date_val = post_dict.get("created_at") or fallback_date
                author_id = post_dict.get("author_id") or fallback_author
                author_uri = base+'twitter_user/'+str(author_id) if author_id else None
                # Like counts are only available when public_metrics was requested.
                likes_num = (post_dict.get("public_metrics") or {}).get("like_count")

                # add triples
                g.add(( URIRef(post_URI), RDF.type , schema.SocialMediaPosting ))
                g.add(( URIRef(post_URI), schema.url , URIRef('https://twitter.com/whatever/status/'+post_id) ))
                g.add(( URIRef(post_URI), schema.about , URIRef(artefact) ))

                if txt:
                    g.add(( URIRef(post_URI), schema.sharedContent , URIRef(text_post) ))
                    g.add(( URIRef(text_post), RDF.type , earmark.StringDocuverse ))
                    g.add(( URIRef(text_post), earmark.hasContent , Literal(txt) ))
                    g.add(( URIRef(artefact), ARCOCD.hasSubject , URIRef(text_post) ))

                    # Emotion annotations come from previously cached CELI responses.
                    # Reset per tweet so a missing file never reuses another tweet's data.
                    annotations_json = None
                    emotions_file = 'emotions/tw_'+artefact_id+'_'+post_id+'.json'
                    if os.path.isfile(emotions_file):
                        print("it exists: ", emotions_file)
                        with open(emotions_file) as f:
                            annotations_json = json.load(f)
                    #else:
                    #    annotations_json = get_emotion(txt.replace("\n",' ').replace("\r",' ').replace('"', '-'), 'tw_'+artefact_id+'_'+post_id)

                    if annotations_json:
                        _add_emotion_triples(annotations_json, text_post, artefact, post_id)

                for img in imgs:
                    g.add(( URIRef(post_URI), schema.sharedContent , URIRef(img) ))

                if likes_num is not None:
                    # The original branch referenced an undefined `post` variable.
                    likes_uri = base+'likes_counter/'+post_id
                    g.add(( URIRef(post_URI), schema.interactionStatistic , URIRef(likes_uri) ))
                    g.add(( URIRef(likes_uri), RDF.type , schema.InteractionCounter ))
                    g.add(( URIRef(likes_uri), schema.userInteractionCount , Literal(likes_num, datatype=XSD.integer) ))

                if date_val:
                    g.add(( URIRef(post_URI), schema.datePublished , Literal(date_val, datatype=XSD.dateTime) ))

                if author_uri:
                    g.add(( URIRef(post_URI), schema.author , URIRef(author_uri) ))

    destination = output_rdf_file+'.'+rdf_format(serialization_format)
    if serialization_format == 'json-ld':
        g.serialize(destination=destination, format=serialization_format, context=context_doc, encoding='utf-8')
    else:
        g.serialize(destination=destination, format=serialization_format, encoding='utf-8')
        

### GAM Twitter to Schema

In [6]:
# Run the Twitter transformation on the three GAM tweet dumps.
# Arguments, in signature order: input files (comma separated), output path,
# serialization, dataset prefix.
twitter_to_rdf(
    "GAM_tweets_hashtag.json,GAM_tweets_images.json,GAM_tweets_search_artefacts.json",
    "rdf_transform/GAM_test_twitter",
    "json-ld",
    "gam",
)

it exists:  emotions/tw_36_1346811876798320640.json
it exists:  emotions/tw_55_1309515998853201920.json
it exists:  emotions/tw_49_1223562614183202816.json
it exists:  emotions/tw_55_1213953225516371968.json
it exists:  emotions/tw_55_1154983558815129600.json
it exists:  emotions/tw_38_1116665993227534336.json
it exists:  emotions/tw_30_1116268002469662720.json
it exists:  emotions/tw_39_1099978097284341760.json
it exists:  emotions/tw_67_1095637542823776256.json
it exists:  emotions/tw_33_1082589925797756928.json
it exists:  emotions/tw_49_1068508907482595328.json
it exists:  emotions/tw_30_1061988484821536768.json
it exists:  emotions/tw_47_1060086386685108224.json
it exists:  emotions/tw_27_1326632082391642114.json
it exists:  emotions/tw_27_1250860493125943296.json
it exists:  emotions/tw_27_958755412953698305.json
it exists:  emotions/tw_27_534771349785702402.json
it exists:  emotions/tw_27_494178350370652163.json
it exists:  emotions/tw_27_459464511812935680.json
it exists:  emot

it exists:  emotions/tw_47_1178431160973578240.json
it exists:  emotions/tw_47_1178431143445749764.json
it exists:  emotions/tw_47_1149933610910154752.json
it exists:  emotions/tw_47_1149928953827561472.json
it exists:  emotions/tw_47_1149792633881055232.json
it exists:  emotions/tw_47_1134178527950057474.json
it exists:  emotions/tw_47_1134166140752674824.json
it exists:  emotions/tw_47_1132258726151434240.json
it exists:  emotions/tw_47_1131750487181733889.json
it exists:  emotions/tw_47_1131724968885215233.json
it exists:  emotions/tw_47_1131724541502349312.json
it exists:  emotions/tw_47_1131720631559413760.json
it exists:  emotions/tw_47_1131709932129202176.json
it exists:  emotions/tw_47_1131706886607331334.json
it exists:  emotions/tw_47_1131706359899197442.json
it exists:  emotions/tw_47_1108389819468693507.json
it exists:  emotions/tw_47_1097215464533569536.json
it exists:  emotions/tw_47_1093616682910785542.json
it exists:  emotions/tw_47_1087379666569027586.json
it exists:  

it exists:  emotions/tw_30_432436553231900672.json
it exists:  emotions/tw_30_118402784352673793.json
it exists:  emotions/tw_33_657163175230636032.json
it exists:  emotions/tw_38_461964493099388928.json


### Utils

In [None]:
# because I forgot to request dates and authors
def connect_to_tweets_endpoint(tweets_ids, bearer_token):
    """Fetch creation date and author of tweets from the Twitter API v2.

    The IDs are requested in batches of 100 and every raw JSON response is
    collected. The collected responses are also written to
    'GAM_tweets_additional_info.json'.

    Parameters
    ----------
    tweets_ids: iterable of str
        The tweet IDs to look up.
    bearer_token: str
        The bearer token used in the Authorization header.

    Returns
    -------
    list
        One parsed JSON response per batch of IDs (empty when no ID is given).

    Raises
    ------
    Exception
        If the API answers with a status code other than 200.
    """

    def chunks(lst, n):
        """Yield successive n-sized chunks from lst."""
        for i in range(0, len(lst), n):
            yield lst[i:i + n]

    # Accept any iterable of IDs (e.g. dict keys) and non-string IDs.
    all_ids = [str(tweet_id) for tweet_id in tweets_ids]

    # Loop-invariant parts of the request. The query string no longer starts
    # with '&', which used to produce a malformed '&&' in the URL.
    tweet_fields = "tweet.fields=created_at&expansions=author_id"
    headers = {"Authorization": "Bearer {}".format(bearer_token)}
    results = []

    for ids in chunks(all_ids, 100):
        url = "https://api.twitter.com/2/tweets?ids={}&{}".format(','.join(ids), tweet_fields)
        # A timeout prevents the notebook from hanging on a stalled connection.
        response = requests.get(url, headers=headers, timeout=30)
        print(response.status_code)
        if response.status_code != 200:
            raise Exception(
                "Request returned an error: {} {}".format(
                    response.status_code, response.text
                )
            )
        result = response.json()
        print(result)
        results.append(result)

    with open('GAM_tweets_additional_info.json', 'w', encoding='utf-8') as outfile:
        json.dump(results, outfile, ensure_ascii=False, indent=1)

    return results

# Map every tweet ID referenced in the GAM catalogue to its artefact ID.
tweet_ids = {}
with open("GAM_test_catalogue.json") as gam_file:
    gam_data = json.load(gam_file)
for artefact in gam_data:
    # Entries without a "twitter" key, or with an empty list, contribute nothing.
    for tweet in artefact.get("twitter") or []:
        tweet_id = tweet.split('/')[-1]
        tweet_ids[tweet_id] = artefact["ID"]

# The bearer token is a secret: read it from the environment instead of
# committing it to the notebook (set TWITTER_BEARER_TOKEN before running).
bearer_token = os.environ.get("TWITTER_BEARER_TOKEN", "")

list_ids = [*tweet_ids]
#dates_authors = connect_to_tweets_endpoint(list_ids, bearer_token)