## Populate the LosAngelesCovid Ontology with the crimes data

In [1]:
# required libraries
import pandas as pd
import os
from pathlib import Path
# Load the required libraries
from rdflib import Graph, Literal, RDF, URIRef, Namespace
# rdflib knows about some namespaces, like FOAF
from rdflib.namespace import FOAF, XSD
# CHECK DATE 
from datetime import datetime
import urllib.parse



In [10]:
# parameters and URLs
path = str(Path(os.path.abspath(os.getcwd())).parent.absolute())
activeBusinessesData = '../datasets/ACTIVE BUSINESSES/Listing_of_Active_Businesses_parsed.csv'
closedBusinessesData = '../datasets/CLOSED BUSINESSES/All_Closed_Businesses_20231101.csv'
laCovidData = '../datasets/COVID DATA/sorted_los_angeles_covid_data.csv'
crimeData1 = '../datasets/CRIME DATA/Crime_Data_from_2020_to_Present_1.csv'
crimeData2 = '../datasets/CRIME DATA/Crime_Data_from_2020_to_Present_2.csv'
crimeData3 = '../datasets/CRIME DATA/Crime_Data_from_2020_to_Present_3.csv'
crimeCodesDescData = '../datasets/CRIME DATA/CrimesCodesAndDesc_listed.csv'
moCodesData = '../datasets/CRIME DATA/MO_CODES_Numerical_20191119.csv'
weaponData = '../datasets/CRIME DATA/weapon_ds.csv'
premisCodesData = '../datasets/CRIME DATA/premisCodesData.csv'

# saving folder
savePath =  path

In [11]:
# Construct the country and the movie ontology namespaces not known by RDFlib
LAO = Namespace("http://www.bitsei.it/losAngelesOntology/")

## Crime Victims

In [12]:
# Load the CSV files in memory
laVictims1 = pd.read_csv(crimeData1, sep=',', usecols=['Vict Age', 'Vict Sex', 'Vict Descent']) 
laVictims2 = pd.read_csv(crimeData2, sep=',', usecols=['Vict Age', 'Vict Sex', 'Vict Descent'])
laVictims3 = pd.read_csv(crimeData3, sep=',', usecols=['Vict Age', 'Vict Sex', 'Vict Descent'])

print(laVictims1.head())
print(laVictims2.head())
print(laVictims3.head())

#create the graph
g = Graph()

# Bind the namespaces to a prefix for more readable output
g.bind("foaf", FOAF)
g.bind("xsd", XSD)
#g.bind("countries", CNS)
g.bind("lao", LAO)

   Vict Age Vict Sex Vict Descent
0        36        F            B
1        25        M            H
2         0        X            X
3        76        F            W
4        31        X            X
   Vict Age Vict Sex Vict Descent
0         0      NaN          NaN
1        35        F            H
2        62        F            B
3        33        F            B
4        58        M            B
   Vict Age Vict Sex Vict Descent
0        82        M            B
1        38        F            H
2        27        F            H
3         0      NaN          NaN
4        29        M            H


In [13]:
%%time 
#measure execution time

# Create a set to store unique combinations of sex, age, and descent
unique_entries = set()

#iterate over the laVictims1 dataframe
for index, row in laVictims1.iterrows():
    
    # Extract information from the current row
    victim_age = int(row['Vict Age'])
    if(victim_age < 0):
        victim_age = 0
    
    victim_sex = str(row['Vict Sex'])
    if(victim_sex == 'nan' or victim_sex == '-'):
        victim_sex = 'X'

    victim_descent = str(row['Vict Descent'])
    if(victim_descent == 'nan' or victim_descent == '-'):
        victim_descent = 'X'
    
    # Create the node to add to the Graph
    # the node has the namespace + the victim id as URI
    idU = "victim-"+str(victim_sex)+'-'+str(victim_age)+'-'+str(victim_descent)
    Victim = URIRef(LAO[idU])

    # If the current combination doesn't already exist
    if ((victim_sex, victim_age, victim_descent) not in unique_entries) and ((victim_age != 0) or (victim_sex != 'X') or (victim_descent != 'X')):
        # Add triples using store's add() method.
        g.add((Victim, RDF.type, LAO.Victim))
        g.add((Victim, LAO['victimSex'], Literal(str(victim_sex), datatype=XSD.string)))
        g.add((Victim, LAO['victimAge'], Literal((victim_age), datatype=XSD.int)))    
        g.add((Victim, LAO['victimDescent'], Literal(str(victim_descent), datatype=XSD.string)))    

        unique_entries.add((victim_sex, victim_age, victim_descent))
    
    
#iterate over the laVictims2 dataframe
for index, row in laVictims2.iterrows():
    
    # Extract information from the current row
    victim_age = int(row['Vict Age'])
    if(victim_age < 0):
        victim_age = 0
    
    victim_sex = str(row['Vict Sex'])
    if(victim_sex == 'nan'):
        victim_sex = 'X'

    victim_descent = str(row['Vict Descent'])
    if(victim_descent == 'nan'):
        victim_descent = 'X'
    
    # Create the node to add to the Graph
    # the node has the namespace + the victim id as URI
    idU = "victim-"+str(victim_sex)+'-'+str(victim_age)+'-'+str(victim_descent)
    Victim = URIRef(LAO[idU])

    # If the current combination doesn't already exist
    if ((victim_sex, victim_age, victim_descent) not in unique_entries) and ((victim_age != 0) or (victim_sex != 'X') or (victim_descent != 'X')):
        # Add triples using store's add() method.
        g.add((Victim, RDF.type, LAO.Victim))
        g.add((Victim, LAO['victimSex'], Literal(str(victim_sex), datatype=XSD.string)))
        g.add((Victim, LAO['victimAge'], Literal((victim_age), datatype=XSD.int)))    
        g.add((Victim, LAO['victimDescent'], Literal(str(victim_descent), datatype=XSD.string)))    

        unique_entries.add((victim_sex, victim_age, victim_descent))


#iterate over the laVictims3 dataframe
for index, row in laVictims3.iterrows():
    
    # Extract information from the current row
    victim_age = int(row['Vict Age'])
    if(victim_age < 0):
        victim_age = 0
    
    victim_sex = str(row['Vict Sex'])
    if(victim_sex == 'nan'):
        victim_sex = 'X'

    victim_descent = str(row['Vict Descent'])
    if(victim_descent == 'nan'):
        victim_descent = 'X'
    
    # Create the node to add to the Graph
    # the node has the namespace + the victim id as URI
    idU = "victim-"+str(victim_sex)+'-'+str(victim_age)+'-'+str(victim_descent)
    Victim = URIRef(LAO[idU])

    # If the current combination doesn't already exist
    if ((victim_sex, victim_age, victim_descent) not in unique_entries) and ((victim_age != 0) or (victim_sex != 'X') or (victim_descent != 'X')):
        # Add triples using store's add() method.
        g.add((Victim, RDF.type, LAO.Victim))
        g.add((Victim, LAO['victimSex'], Literal(str(victim_sex), datatype=XSD.string)))
        g.add((Victim, LAO['victimAge'], Literal((victim_age), datatype=XSD.int)))    
        g.add((Victim, LAO['victimDescent'], Literal(str(victim_descent), datatype=XSD.string)))    

        unique_entries.add((victim_sex, victim_age, victim_descent))

print("--- saving serialization ---")
with open('crimeVictims.ttl', 'w') as file:
        file.write(g.serialize(format='turtle'))

--- saving serialization ---
CPU times: total: 55.1 s
Wall time: 1min 9s


## Modus Operandi

In [16]:
with open(moCodesData, 'r') as file:
    lines = file.readlines()

modified_lines = [line.replace(',', '>', 1) for line in lines]

with open(moCodesData, 'w') as file:
    file.writelines(modified_lines)

In [21]:
# Load the CSV files in memory
modusOperandi = pd.read_csv(moCodesData, sep='>')
print(modusOperandi.head())

#create the graph
g = Graph()

# Bind the namespaces to a prefix for more readable output
g.bind("foaf", FOAF)
g.bind("xsd", XSD)
g.bind("lao", LAO)

   Code          Description
0   100  Suspect Impersonate
1   101           Aid victim
2   102                Blind
3   103  Physically disabled
4   104             Customer


In [22]:
%%time 
#measure execution time

#iterate over the laVictims1 dataframe
for index, row in modusOperandi.iterrows():
    
    # Extract information from the current row
    print(f"{row['Code']} - {row['Description']}")

    moCode = int(row['Code'])
    moDesc = str(row['Description'])
    
    # Create the node to add to the Graph
    # the node has the namespace + the modusOperandi Code as URI
    idU = "modusOperandi"+str(moCode)
    Mo = URIRef(LAO[idU])

    g.add((Mo, RDF.type, LAO.ModusOperandi))
    g.add((Mo, LAO['moCode'], Literal((moCode), datatype=XSD.int)))    
    g.add((Mo, LAO['moDesc'], Literal(str(moDesc), datatype=XSD.string)))    
    
print("--- saving serialization ---")
with open('crimeModusOperandi.ttl', 'w') as file:
        file.write(g.serialize(format='turtle'))

100 - Suspect Impersonate
101 - Aid victim
102 - Blind
103 - Physically disabled
104 - Customer
105 - Delivery
106 - Doctor
107 - God
108 - Infirm
109 - Inspector
110 - Involved in traffic/accident
112 - Police
113 - Renting
114 - Repair Person
115 - Returning stolen property
116 - Satan
117 - Salesman
118 - Seeking someone
119 - Sent by owner
120 - Social Security/Medicare
121 - DWP/Gas Company/Utility worker
122 - Contractor
123 - Gardener/Tree Trimmer
200 - Suspect wore disguise
201 - Bag
202 - Cap/hat
203 - Cloth (with eyeholes)
204 - Clothes of opposite sex
205 - Earring
206 - Gloves
207 - Handkerchief
208 - Halloween mask
209 - Mask
210 - Make up (males only)
211 - Shoes
212 - Nude/partly nude
213 - Ski mask
214 - Stocking
215 - Unusual clothes
216 - Suspect wore hood/hoodie
217 - Uniform
218 - Wig
219 - Mustache-Fake
220 - Suspect wore motorcycle helmet
301 - Escaped on (used) transit train
302 - Aimed gun
303 - Ambushed
304 - Ate/drank on premises
305 - Attacks from rear
306 - 

## Weapons

In [None]:
with open(weaponData, 'r') as file:
    lines = file.readlines()

modified_lines = [line.replace(',', '>', 1) for line in lines]

with open(weaponData, 'w') as file:
    file.writelines(modified_lines)

In [23]:
# Load the CSV files in memory
weapons = pd.read_csv(weaponData, sep='>')
print(weapons.head())

#create the graph
g = Graph()

# Bind the namespaces to a prefix for more readable output
g.bind("foaf", FOAF)
g.bind("xsd", XSD)
g.bind("lao", LAO)

   Weapon Used Cd                                     Weapon Desc
0             400  STRONG-ARM (HANDS, FIST, FEET OR BODILY FORCE)
1             212                                          BOTTLE
2             102                                        HAND GUN
3             500                     UNKNOWN WEAPON/OTHER WEAPON
4             101                                        REVOLVER


In [24]:
%%time 
#measure execution time

#iterate over the laVictims1 dataframe
for index, row in weapons.iterrows():
    
    # Extract information from the current row
    print(f"{row['Weapon Used Cd']} - {row['Weapon Desc']}")

    weaponCode = int(row['Weapon Used Cd'])
    weaponDesc = str(row['Weapon Desc'])
    
    # Create the node to add to the Graph
    # the node has the namespace + the modusOperandi Code as URI
    idU = "weapon"+str(weaponCode)
    Weapon = URIRef(LAO[idU])

    g.add((Weapon, RDF.type, LAO.ModusOperandi))
    g.add((Weapon, LAO['weaponCode'], Literal((weaponCode), datatype=XSD.int)))    
    g.add((Weapon, LAO['weaponDesc'], Literal(str(weaponDesc), datatype=XSD.string)))    
    
print("--- saving serialization ---")
with open('crimeWeapons.ttl', 'w') as file:
        file.write(g.serialize(format='turtle'))

400 - STRONG-ARM (HANDS, FIST, FEET OR BODILY FORCE)
212 - BOTTLE
102 - HAND GUN
500 - UNKNOWN WEAPON/OTHER WEAPON
101 - REVOLVER
312 - PIPE/METAL PIPE
221 - GLASS
511 - VERBAL THREAT
200 - KNIFE WITH BLADE 6INCHES OR LESS
311 - HAMMER
307 - VEHICLE
501 - BOMB THREAT
304 - CLUB/BAT
512 - MACE/PEPPER SPRAY
510 - SCALDING LIQUID
219 - SCREWDRIVER
114 - AIR PISTOL/REVOLVER/RIFLE/BB GUN
201 - KNIFE WITH BLADE OVER 6 INCHES IN LENGTH
223 - UNKNOWN TYPE CUTTING INSTRUMENT
109 - SEMI-AUTOMATIC PISTOL
305 - FIXED OBJECT
204 - FOLDING KNIFE
106 - UNKNOWN FIREARM
503 - CAUSTIC CHEMICAL/POISON
205 - KITCHEN KNIFE
218 - OTHER CUTTING INSTRUMENT
306 - ROCK/THROWN OBJECT
113 - SIMULATED GUN
107 - OTHER FIREARM
515 - PHYSICAL PRESENCE
203 - DIRK/DAGGER
308 - STICK
103 - RIFLE
210 - RAZOR BLADE
207 - OTHER KNIFE
217 - SWORD
302 - BLUNT INSTRUMENT
215 - MACHETE
104 - SHOTGUN
211 - AXE
216 - SCISSORS
301 - BELT FLAILING INSTRUMENT/CHAIN
514 - TIRE IRON
509 - ROPE/LIGATURE
213 - CLEAVER
206 - SWITCH BLAD

## Premis Codes

In [25]:
with open(premisCodesData, 'r') as file:
    lines = file.readlines()

modified_lines = [line.replace(',', '>', 1) for line in lines]

with open(premisCodesData, 'w') as file:
    file.writelines(modified_lines)

In [28]:
# Load the CSV files in memory
premisCodes = pd.read_csv(premisCodesData, sep='>')
print(premisCodes.head())

#create the graph
g = Graph()

# Bind the namespaces to a prefix for more readable output
g.bind("foaf", FOAF)
g.bind("xsd", XSD)
g.bind("lao", LAO)

   Premis Cd                                   Premis Desc
0        221                                PUBLIC STORAGE
1        101                                        STREET
2        834          LA UNION STATION (NOT LINE SPECIFIC)
3        502  MULTI-UNIT DWELLING (APARTMENT, DUPLEX, ETC)
4        108                                   PARKING LOT


In [29]:
%%time 
#measure execution time

#iterate over the laVictims1 dataframe
for index, row in premisCodes.iterrows():
    
    # Extract information from the current row
    print(f"{row['Premis Cd']} - {row['Premis Desc']}")

    premisCode = int(row['Premis Cd'])
    premisDesc = str(row['Premis Desc'])
    
    # Create the node to add to the Graph
    # the node has the namespace + the modusOperandi Code as URI
    idU = "premis"+str(premisCode)
    Premis = URIRef(LAO[idU])

    g.add((Premis, RDF.type, LAO.Premis))
    g.add((Premis, LAO['premisCode'], Literal((premisCode), datatype=XSD.int)))    
    g.add((Premis, LAO['premisDesc'], Literal(str(premisDesc), datatype=XSD.string)))    
    
print("--- saving serialization ---")
with open('crimePremis.ttl', 'w') as file:
        file.write(g.serialize(format='turtle'))

221 - PUBLIC STORAGE
101 - STREET
834 - LA UNION STATION (NOT LINE SPECIFIC)
502 - MULTI-UNIT DWELLING (APARTMENT, DUPLEX, ETC)
108 - PARKING LOT
102 - SIDEWALK
501 - SINGLE FAMILY DWELLING
735 - NIGHT CLUB (OPEN EVENINGS ONLY)
203 - OTHER BUSINESS
401 - MINI-MART
750 - CYBERSPACE
404 - DEPARTMENT STORE
602 - BANK
504 - OTHER RESIDENCE
519 - SHORT-TERM VACATION RENTAL
118 - CONSTRUCTION SITE
109 - PARK/PLAYGROUND
120 - STORAGE SHED
801 - MTA BUS
509 - MOBILE HOME/TRAILERS/CONSTRUCTION TRAILERS/RV'S/MOTORHOME
210 - RESTAURANT/FAST FOOD
710 - OTHER PREMISE
124 - BUS STOP
301 - GAS STATION
103 - ALLEY
518 - TRANSITIONAL HOUSING/HALFWAY HOUSE
122 - VEHICLE, PASSENGER/TRUCK
248 - CELL PHONE STORE
704 - ELEMENTARY SCHOOL
202 - LIQUOR STORE
117 - BEACH
146 - PATIO*
402 - MARKET
721 - HIGH SCHOOL
207 - BAR/COCKTAIL/NIGHTCLUB
725 - GOVERNMENT FACILITY (FEDERAL,STATE, COUNTY & CITY)
503 - HOTEL
605 - AUTOMATED TELLER MACHINE (ATM)
123 - PARKING UNDERGROUND/BUILDING
104 - DRIVEWAY
403 - DRUG STOR

In [None]:
# Load the CSV files in memory
clubs = pd.read_csv(clubsUrl, sep=',', index_col='club_id')
#create the graph
g = Graph()

# Bind the namespaces to a prefix for more readable output
g.bind("foaf", FOAF)
g.bind("xsd", XSD)
g.bind("countries", CNS)
g.bind("so", SO)

In [None]:
%%time 
#measure execution time

#iterate over the club dataframe
for index, row in clubs.iterrows():
    # Create the node to add to the Graph
    # the node has the namespace + the club id as URI
    idU = "club"+str(index)
    Club = URIRef(SO[idU])
    # Add triples using store's add() method.
    g.add((Club, RDF.type, SO.Club))
    g.add((Club, SO['name'], Literal(row['name'], datatype=XSD.string)))
    idL = "league"+str(row['league_id'])
    g.add((Club, SO['competeIn'], URIRef(SO[idL])))
    
    try:
        # get the nationality of the club
        nationality = leagues.loc[row['league_id'], 'nationality' ]
        # create the RDF node
        Country = URIRef(CNS[row['nationality']])
        # add the edge connecting the Movie and the Country 
        g.add((Club, SO['nationality'], Country))    
    except KeyError:
        continue

    
    

In [None]:
%%time
# print all the data in the Turtle format
print("--- saving serialization ---")
with open(savePath + 'clubs.ttl', 'w') as file:
    file.write(g.serialize(format='turtle').decode("utf-8"))

## Players

In [None]:
# Load the CSV files in memory
players = pd.read_csv(playersUrl, sep=',', index_col='player_id', keep_default_na=False, na_values=['_'])
playersFifa = pd.read_csv(playersFifaUrl, sep=',', index_col='sofifa_id', keep_default_na=False, na_values=['_'])

#create the graph
g = Graph()

# Bind the namespaces to a prefix for more readable output
g.bind("foaf", FOAF)
g.bind("xsd", XSD)
g.bind("countries", CNS)
g.bind("so", SO)

In [None]:
#load the country codes
# we need to convert NaN values to something else otherwise NA strings are converted to NaN -> problem with Namibia
countries = pd.read_csv(countriesURL, sep=',', index_col='Name', keep_default_na=False, na_values=['_'])


In [None]:
from difflib import SequenceMatcher
import numpy as np

import unicodedata
def strip_accents(s):
   return ''.join(c for c in unicodedata.normalize('NFD', s)
                  if unicodedata.category(c) != 'Mn')

import re

In [None]:
%%time 
#measure execution time

#iterate over the players dataframe
for index, row in players.iterrows():
    # Create the node to add to the Graph
    # the node has the namespace + the player id as URI
    idU = "player"+str(index)
    Player = URIRef(SO[idU])
    # the transferMarkt profile has as URI, the URL of the profile in the website
    TransfermarktProfile = URIRef(row['url'])
    
    # Add triples using store's add() method.
    g.add((Player, RDF.type, SO.Player))
    g.add((TransfermarktProfile, RDF.type, SO.TransfermarktProfile))
    g.add((TransfermarktProfile, SO['isAbout'], Player))
    
    #process player name
    name = row['name'].split('-')

    if (len(name)>1):
        g.add((Player, SO['firstName'], Literal(name[0], datatype=XSD.string)))
        g.add((Player, SO['lastName'], Literal(name[1], datatype=XSD.string)))
    else:
        g.add((Player, SO['lastName'], Literal(name[0], datatype=XSD.string)))
        
    #there can be more than one position per player
    for pos in row['position'].split(' - '):
        g.add((Player, SO['position'], Literal(pos.lower(), datatype=XSD.string)))
    
    if not(row['club_id']==''):
        idC = "club"+str(row['club_id'])
        g.add((Player, SO['playFor'], URIRef(SO[idC])))

#iterate over the fifa dataframe
for index, row in playersFifa.iterrows():
    pname = row['short_name'].lower()
    if ('.' in pname):
        # get last name
        # in the fifa dataset we have short names as L. Messi so we delete the L. 
        # we need to check if the last name contains a space
        pname = row['short_name'].split('.')[1].lower().strip()
        if ' ' in pname:
            i = 0
            for t in pname.split(' '):
                if i == 0:
                    pname = t.lower()
                else:
                    pname = pname + "-" + t.lower()
                i += 1           
    elif(' ' in pname):
        # here we have to handle Cristiano Ronaldo mapping it to cristiano-ronaldo to maximize the match in the players dataframe 
        i = 0
        for t in row['short_name'].split(' '):
            if i == 0:
                pname = t.lower()
            else:
                pname = pname + "-" + t.lower()
            i += 1
    pname = strip_accents(pname)
    
    # find sim with the full name 
    fullname = row['long_name'].lower()
    i = 0
    for t in fullname.split(' '):
        if i == 0:
            fullname = t.lower()
        else:
            fullname = fullname + "-" + t.lower()
        i += 1 
    fullname = strip_accents(fullname)
    # check the players with that last name
    names =  players[players['name'].str.contains(pname)]['name']
    #find max similarity    
    maxN = 0
    playerId = ''
    for n in names:
        sim = SequenceMatcher(None, fullname, n).ratio()
        if (maxN < sim):
            maxN = sim
            playerId = players.loc[players['name'] == n].index[0]
        
    #if we get a valid playerId we can connect the Fifa stats to the transfermrkt player
    if (playerId != ''):
        #remove the row from the player dataframe to avoid futher matchings (we know data will contain errors)
        players = players.drop(index=playerId)
        idU = "player"+str(playerId)
        Player = URIRef(SO[idU])
        g.add((Player, SO['overallFifaValue'], Literal(row['overall'], datatype=XSD.int)))
        g.add((Player, SO['growthFifaPotential'], Literal(row['potential'], datatype=XSD.int)))
        g.add((Player, SO['economicValue'], Literal(row['value_eur'], datatype=XSD.int)))
        g.add((Player, SO['annualWage'], Literal(row['wage_eur'], datatype=XSD.int))) 
        
        pFeatures = str(row['player_tags'])
        if pFeatures != '_' and pFeatures != '':
            pFeatures = pFeatures.split(',')
            for feature in pFeatures:
                feature = feature.strip()
                feature = re.sub('#', '', feature)
                g.add((Player, SO['playerFeature'], Literal(feature, datatype=XSD.string)))
        
        if row['contract_valid_until'] != '_' and row['contract_valid_until'] != '':
            g.add((Player, SO['contractValidTo'], Literal(int(row['contract_valid_until']), datatype=XSD.gYear)))        

        g.add((Player, SO['birthday'], Literal(row['dob'], datatype=XSD.date)))
        g.add((Player, SO['height'], Literal(row['height_cm'], datatype=XSD.int)))
        g.add((Player, SO['weight'], Literal(row['weight_kg'], datatype=XSD.int)))
        
        
        nationality = row['nationality'] 
        nationality = nationality.replace(" ", "_")
        # create the RDF node
        Country = URIRef(CNS[nationality])
        # add the edge connecting the Movie and the Country 
        g.add((Player, SO['nationality'], Country))   

        # Homework: extend the code to populate the 'propertyOf' edge
        

In [None]:
%%time
# print all the data in the Turtle format
print("--- saving serialization ---")
with open(savePath + 'players.ttl', 'w') as file:
    file.write(g.serialize(format='turtle').decode("utf-8"))

## Games

In [None]:
# Load the CSV files in memory
apps = pd.read_csv(appearancesUrl, sep=',', index_col='appearance_id', keep_default_na=False, na_values=['_'])
games = pd.read_csv(gamesUrl, sep=',', index_col='game_id', keep_default_na=False, na_values=['_'])

#create the graph
g = Graph()

# Bind the namespaces to a prefix for more readable output
g.bind("foaf", FOAF)
g.bind("xsd", XSD)
g.bind("countries", CNS)
g.bind("so", SO)

In [None]:
#iterate over the games dataframe
for index, row in games.iterrows():
    # we use the transfermrket URL as URI
    Game = URIRef(row['url'])
    g.add((Game, RDF.type, SO.Game))
    idU1 = "club"+str(row['home_club_id'])
    idU2 = "club"+str(row['away_club_id'])
    HomeClub = URIRef(SO[idU1])
    AwayClub = URIRef(SO[idU2])
    g.add((Game, SO['homeClub'], HomeClub))
    g.add((Game, SO['awayClub'], AwayClub))    
    g.add((Game, SO['matchDay'], Literal(row['date'], datatype=XSD.date)))
    g.add((Game, SO['homeClubGoals'], Literal(row['home_club_goals'], datatype=XSD.int)))
    g.add((Game, SO['awayClubGoals'], Literal(row['away_club_goals'], datatype=XSD.int)))

In [None]:
%%time
# print all the data in the Turtle format
print("--- saving serialization ---")
with open(savePath + 'games.ttl', 'w') as file:
    file.write(g.serialize(format='turtle').decode("utf-8"))

In [None]:
# reload the full players dataframe
players = pd.read_csv(playersUrl, sep=',', index_col='player_id', keep_default_na=False, na_values=['_'])
#create the graph
g = Graph()

# Bind the namespaces to a prefix for more readable output
g.bind("foaf", FOAF)
g.bind("xsd", XSD)
g.bind("countries", CNS)
g.bind("so", SO)

In [None]:
oldgameid = ''
for index, row in apps.iterrows():
    idA = "appearance"+str(index)
    idP = "player"+str(row['player_id'])
    idG = "game"+currgameid
    Appearance = URIRef(SO[idA])
    Player = URIRef(SO[idP])
    currgameid = str(row['game_id'])
    Game = URIRef(SO[idG])
    g.add((Appearance, RDF.type, SO.Appearance))
    g.add((Player, SO['appearIn'], Appearance))
    g.add((Appearance, SO['playIn'], Game))

    g.add((Appearance, SO['goals'], Literal(row['goals'], datatype=XSD.int)))
    g.add((Appearance, SO['assists'], Literal(row['assists'], datatype=XSD.int)))
    g.add((Appearance, SO['minutesPlayed'], Literal(row['minutes_played'], datatype=XSD.int)))
    g.add((Appearance, SO['yellowCard'], Literal(row['yellow_cards'], datatype=XSD.int)))
    g.add((Appearance, SO['redCard'], Literal(row['red_cards'], datatype=XSD.int)))

    #add this triple only once per game
    if (currgameid != oldgameid):
        idL = "league"+str(row['league_id'])
        g.add((Game, SO['belongTo'], URIRef(SO[idL])))
        oldgameid = currgameid

In [None]:
%%time
# print all the data in the Turtle format
print("--- saving serialization ---")
with open(savePath + 'appearances.ttl', 'w') as file:
    file.write(g.serialize(format='turtle').decode("utf-8"))