## Populate the Loa Angeles Ontology

In [1]:
# required libraries
import pandas as pd
import os
from pathlib import Path
# Load the required libraries
from rdflib import Graph, Literal, RDF, URIRef, Namespace
# rdflib knows about some namespaces, like FOAF
from rdflib.namespace import FOAF, XSD
# CHECK DATE 
from datetime import datetime
import urllib



In [2]:
# parameters and URLs
path = str(Path(os.path.abspath(os.getcwd())).parent.absolute())
activeBusinessesData = 'datasets/ACTIVE BUSINESSES/Listing_of_Active_Businesses_parsed.csv'
closedBusinessesData = 'datasets/CLOSED BUSINESSES/All_Closed_Businesses_20231101.csv'
laCovidData = 'datasets/COVID DATA/sorted_los_angeles_covid_data.csv'
crimeData1 = 'datasets/CRIME DATA/Crime_Data_from_2020_to_Present_1.csv'
crimeData2 = 'datasets/CRIME DATA/Crime_Data_from_2020_to_Present_2.csv'
crimeData3 = 'datasets/CRIME DATA/Crime_Data_from_2020_to_Present_3.csv'
crimeCodesDescData = 'datasets/CRIME DATA/CrimesCodesAndDesc_listed.csv'
moCodesData = 'datasets/CRIME DATA/CrimesCodesAndDesc_listed.csv'
naicsData = 'datasets/CLOSED BUSINESSES/2022_NAICS_Descriptions.csv'
weaponData = 'datasets/CRIME DATA/weapon_ds.csv'


# saving folder
savePath =  path

In [3]:
# Construct the country and the movie ontology namespaces not known by RDFlib
#CNS = Namespace("http://eulersharp.sourceforge.net/2003/03swap/countries#")
LAO = Namespace("http://www.bitsei.it/losAngelesOntology/")



## COVID Days

In [4]:
# Load the CSV files in memory
laCovid = pd.read_csv(laCovidData, sep=',') #, dtype={'Active':int, 'Deaths':int}

laCovid["Last_Update"] = pd.to_datetime(laCovid['Last_Update'])
laCovid['solodata'] = laCovid['Last_Update'].dt.date
laCovid['Active'] = laCovid['Active'].astype('Int64')
laCovid['Deaths'] = laCovid['Deaths'].astype('Int64')

laCovid.set_index("solodata", inplace=True)

print(laCovid.head())


#create the graph
g = Graph()

# Bind the namespaces to a prefix for more readable output
g.bind("foaf", FOAF)
g.bind("xsd", XSD)
#g.bind("countries", CNS)
g.bind("lao", LAO)

              FIPS       Admin2 Province_State Country_Region  \
solodata                                                        
2020-02-04  6037.0  Los Angeles     California             US   
2020-03-22  6037.0  Los Angeles     California             US   
2020-03-23  6037.0  Los Angeles     California             US   
2020-03-24  6037.0  Los Angeles     California             US   
2020-03-25  6037.0  Los Angeles     California             US   

                   Last_Update        Lat       Long_  Confirmed  Deaths  \
solodata                                                                   
2020-02-04 2020-02-04 23:25:00  34.308284 -118.228241       4045      78   
2020-03-22 2020-03-22 23:45:00  34.308284 -118.228241        407       5   
2020-03-23 2020-03-23 23:19:34  34.308284 -118.228241        536       7   
2020-03-24 2020-03-24 23:37:31  34.308284 -118.228241        662      11   
2020-03-25 2020-03-25 23:33:19  34.308284 -118.228241        812      13   

           

In [5]:
%%time 
#measure execution time

#iterate over the league dataframe
for index, row in laCovid.iterrows():
    # Create the node to add to the Graph
    # the node has the namespace + the league id as URI
    idU = str(index)
    Day = URIRef(LAO[idU])
    # Add triples using store's add() method.
    g.add((Day, RDF.type, LAO.Day))
    g.add((Day, LAO['hasDate'], Literal(str(row['Last_Update']), datatype=XSD.datetime)))    
    g.add((Day, LAO['hasActiveCases'], Literal(row['Active'], datatype=XSD.int)))    
    g.add((Day, LAO['hasNOfDeaths'], Literal(row['Deaths'], datatype=XSD.int)))    
    # create the RDF node
    # Country = URIRef(CNS[row['nationality']])
    # add the edge connecting the Movie and the Country 
    #g.add((League, SO['nationality'], Country))    
print("--- saving serialization ---")
with open('covidDays.ttl', 'w') as file:
        file.write(g.serialize(format='turtle'))

--- saving serialization ---
CPU times: total: 93.8 ms
Wall time: 195 ms




## Cities

In [6]:
# Load the CSV files in memory
cities1 = pd.read_csv(closedBusinessesData, sep=',', index_col='LOCATION ACCOUNT #')
cities1 = cities1[['CITY','ZIP CODE']]


cities2 = pd.read_csv(activeBusinessesData, sep=',', index_col='LOCATION ACCOUNT #')
cities2 = cities2[['CITY','ZIP CODE']]


cities = pd.merge(cities1, cities2)
cities["ZIP CODE"] = cities["ZIP CODE"].str.split("-", expand=True).get(0)
cities = cities[cities['ZIP CODE'] != '']
cities = cities.drop_duplicates()
cities.set_index("ZIP CODE", inplace=True)

print(cities.head(100))
print(len(cities))

cities.to_csv('cities.csv', index=True)


#create the graph
g = Graph()

# Bind the namespaces to a prefix for more readable output
g.bind("foaf", FOAF)
g.bind("xsd", XSD)
#g.bind("countries", CNS)
g.bind("lao", LAO)

                    CITY
ZIP CODE                
91344      GRANADA HILLS
90015        LOS ANGELES
91306           WINNETKA
90020        LOS ANGELES
91303        CANOGA PARK
...                  ...
90292     MARINA DEL REY
90710        HARBOR CITY
90247            GARDENA
91436             ENCINO
91411           VAN NUYS

[100 rows x 1 columns]
763


In [7]:
%%time 
#measure execution time

#iterate over the league dataframe
for index, row in cities.iterrows():
    # Create the node to add to the Graph
    # the node has the namespace + the league id as URI
    idU = str(index)
    City = URIRef(LAO[idU])
    # Add triples using store's add() method.
    g.add((City, RDF.type, LAO.City))
    g.add((City, LAO['cityZipCode'], Literal(str(index), datatype=XSD.string)))    
    g.add((City, LAO['cityName'], Literal(row['CITY'], datatype=XSD.string)))    
    # create the RDF node
    # Country = URIRef(CNS[row['nationality']])
    # add the edge connecting the Movie and the Country 
    #g.add((League, SO['nationality'], Country))    
print("--- saving serialization ---")
with open('cities.ttl', 'w') as file:
        file.write(g.serialize(format='turtle'))

--- saving serialization ---
CPU times: total: 15.6 ms
Wall time: 94.4 ms


## Naics

In [8]:
# Load the CSV files in memory
naics = pd.read_csv(naicsData, sep=',', index_col='Code')
naics = naics[['Title']]
naics["Title"] = naics["Title"].replace("T$", "", regex=True)
print(naics.head(20))
print(len(naics))
#create the graph
g = Graph()

# Bind the namespaces to a prefix for more readable output
g.bind("foaf", FOAF)
g.bind("xsd", XSD)
#g.bind("countries", CNS)
g.bind("lao", LAO)


                                             Title
Code                                              
11      Agriculture, Forestry, Fishing and Hunting
111                                Crop Production
1111                     Oilseed and Grain Farming
11111                              Soybean Farming
111110                             Soybean Farming
11112             Oilseed (except Soybean) Farming
111120            Oilseed (except Soybean) Farming
11113                     Dry Pea and Bean Farming
111130                    Dry Pea and Bean Farming
11114                                Wheat Farming
111140                               Wheat Farming
11115                                 Corn Farming
111150                                Corn Farming
11116                                 Rice Farming
111160                                Rice Farming
11119                          Other Grain Farming
111191       Oilseed and Grain Combination Farming
111199                     All 

In [9]:
%%time 
#measure execution time

#iterate over the league dataframe
for index, row in naics.iterrows():
    # Create the node to add to the Graph
    # the node has the namespace + the league id as URI
    idU = index
    Naics = URIRef(LAO[idU])
    # Add triples using store's add() method.
    g.add((Naics, RDF.type, LAO.Naics))
    g.add((Naics, LAO['naicsCode'], Literal(index, datatype=XSD.string)))    
    g.add((Naics, LAO['naicsDescription'], Literal(row['Title'], datatype=XSD.string)))    
    # create the RDF node
    # Country = URIRef(CNS[row['nationality']])
    # add the edge connecting the Movie and the Country 
    #g.add((League, SO['nationality'], Country))    
print("--- saving serialization ---")
with open('naics.ttl', 'w') as file:
    file.write(g.serialize(format='turtle'))

--- saving serialization ---
CPU times: total: 93.8 ms
Wall time: 259 ms


## Active Businesses

In [11]:
# Load the CSV files in memory
activeBusinesses = pd.read_csv(activeBusinessesData, sep=',', index_col='LOCATION ACCOUNT #')

#create the graph
g = Graph()

# Bind the namespaces to a prefix for more readable output
g.bind("foaf", FOAF)
g.bind("xsd", XSD)
#g.bind("countries", CNS)
g.bind("lao", LAO)


In [13]:
%%time 
#measure execution time

activeBusinesses["FILTERED ZIP"] = activeBusinesses["ZIP CODE"].str.split("-", expand=True).get(0)
activeBusinesses["NAICS"] = activeBusinesses["NAICS"].astype("Int64")
#iterate over the league dataframe
for index, row in activeBusinesses.iterrows():
    # Create the node to add to the Graph
    # the node has the namespace + the league id as URI
    idU = str(index)
    Business = URIRef(LAO[idU])
    # Add triples using store's add() method.

    #TYPE
    g.add((Business, RDF.type, LAO.Business))
    
    #DATA PROPERTIES
    g.add((Business, LAO['businessId'], Literal(str(index), datatype=XSD.string)))    
    g.add((Business, LAO['businessName'], Literal(row['BUSINESS NAME'], datatype=XSD.string)))    
    g.add((Business, LAO['doingBusinessName'], Literal(row['DBA NAME'], datatype=XSD.string)))

    #OBJECT PROPERTIES
    if (row["LOCATION START DATE"] != ''):
        g.add((Business, LAO['openedOnDate'], LAO[row['LOCATION START DATE']]))
    if (row["FILTERED ZIP"] != ''):
        g.add((Business, LAO['locatedInCity'], LAO[urllib.parse.quote(row["FILTERED ZIP"])]))
    if (pd.isna(row["NAICS"])):
        g.add((Business, LAO['hasNaics'], LAO[row['NAICS']]))
        
    
    

    # create the RDF node
    # Country = URIRef(CNS[row['nationality']])
    # add the edge connecting the Movie and the Country 
    #g.add((League, SO['nationality'], Country))    
print("--- saving serialization ---")
with open('activeBusinesses.ttl', 'w') as file:
    file.write(g.serialize(format='turtle'))

--- saving serialization ---
CPU times: total: 10.1 s
Wall time: 31.5 s


## Clubs

In [64]:
# Load the CSV files in memory
clubs = pd.read_csv(clubsUrl, sep=',', index_col='club_id')
#create the graph
g = Graph()

# Bind the namespaces to a prefix for more readable output
g.bind("foaf", FOAF)
g.bind("xsd", XSD)
g.bind("countries", CNS)
g.bind("so", SO)

NameError: name 'clubsUrl' is not defined

In [269]:
%%time 
#measure execution time

#iterate over the club dataframe
for index, row in clubs.iterrows():
    # Create the node to add to the Graph
    # the node has the namespace + the club id as URI
    idU = "club"+str(index)
    Club = URIRef(SO[idU])
    # Add triples using store's add() method.
    g.add((Club, RDF.type, SO.Club))
    g.add((Club, SO['name'], Literal(row['name'], datatype=XSD.string)))
    idL = "league"+str(row['league_id'])
    g.add((Club, SO['competeIn'], URIRef(SO[idL])))
    
    try:
        # get the nationality of the club
        nationality = leagues.loc[row['league_id'], 'nationality' ]
        # create the RDF node
        Country = URIRef(CNS[row['nationality']])
        # add the edge connecting the Movie and the Country 
        g.add((Club, SO['nationality'], Country))    
    except KeyError:
        continue

    
    

CPU times: user 96.4 ms, sys: 3.49 ms, total: 99.9 ms
Wall time: 102 ms


In [270]:
%%time
# print all the data in the Turtle format
print("--- saving serialization ---")
with open(savePath + 'clubs.ttl', 'w') as file:
    file.write(g.serialize(format='turtle').decode("utf-8"))

--- saving serialization ---
CPU times: user 74 ms, sys: 3.26 ms, total: 77.3 ms
Wall time: 78.1 ms


## Players

In [20]:
# Load the CSV files in memory
players = pd.read_csv(playersUrl, sep=',', index_col='player_id', keep_default_na=False, na_values=['_'])
playersFifa = pd.read_csv(playersFifaUrl, sep=',', index_col='sofifa_id', keep_default_na=False, na_values=['_'])

#create the graph
g = Graph()

# Bind the namespaces to a prefix for more readable output
g.bind("foaf", FOAF)
g.bind("xsd", XSD)
g.bind("countries", CNS)
g.bind("so", SO)

In [21]:
#load the country codes
# we need to convert NaN values to something else otherwise NA strings are converted to NaN -> problem with Namibia
countries = pd.read_csv(countriesURL, sep=',', index_col='Name', keep_default_na=False, na_values=['_'])


In [22]:
from difflib import SequenceMatcher
import numpy as np

import unicodedata
def strip_accents(s):
   return ''.join(c for c in unicodedata.normalize('NFD', s)
                  if unicodedata.category(c) != 'Mn')

import re

In [None]:
%%time 
#measure execution time

#iterate over the players dataframe
for index, row in players.iterrows():
    # Create the node to add to the Graph
    # the node has the namespace + the player id as URI
    idU = "player"+str(index)
    Player = URIRef(SO[idU])
    # the transferMarkt profile has as URI, the URL of the profile in the website
    TransfermarktProfile = URIRef(row['url'])
    
    # Add triples using store's add() method.
    g.add((Player, RDF.type, SO.Player))
    g.add((TransfermarktProfile, RDF.type, SO.TransfermarktProfile))
    g.add((TransfermarktProfile, SO['isAbout'], Player))
    
    #process player name
    name = row['name'].split('-')

    if (len(name)>1):
        g.add((Player, SO['firstName'], Literal(name[0], datatype=XSD.string)))
        g.add((Player, SO['lastName'], Literal(name[1], datatype=XSD.string)))
    else:
        g.add((Player, SO['lastName'], Literal(name[0], datatype=XSD.string)))
        
    #there can be more than one position per player
    for pos in row['position'].split(' - '):
        g.add((Player, SO['position'], Literal(pos.lower(), datatype=XSD.string)))
    
    if not(row['club_id']==''):
        idC = "club"+str(row['club_id'])
        g.add((Player, SO['playFor'], URIRef(SO[idC])))

#iterate over the fifa dataframe
for index, row in playersFifa.iterrows():
    pname = row['short_name'].lower()
    if ('.' in pname):
        # get last name
        # in the fifa dataset we have short names as L. Messi so we delete the L. 
        # we need to check if the last name contains a space
        pname = row['short_name'].split('.')[1].lower().strip()
        if ' ' in pname:
            i = 0
            for t in pname.split(' '):
                if i == 0:
                    pname = t.lower()
                else:
                    pname = pname + "-" + t.lower()
                i += 1           
    elif(' ' in pname):
        # here we have to handle Cristiano Ronaldo mapping it to cristiano-ronaldo to maximize the match in the players dataframe 
        i = 0
        for t in row['short_name'].split(' '):
            if i == 0:
                pname = t.lower()
            else:
                pname = pname + "-" + t.lower()
            i += 1
    pname = strip_accents(pname)
    
    # find sim with the full name 
    fullname = row['long_name'].lower()
    i = 0
    for t in fullname.split(' '):
        if i == 0:
            fullname = t.lower()
        else:
            fullname = fullname + "-" + t.lower()
        i += 1 
    fullname = strip_accents(fullname)
    # check the players with that last name
    names =  players[players['name'].str.contains(pname)]['name']
    #find max similarity    
    maxN = 0
    playerId = ''
    for n in names:
        sim = SequenceMatcher(None, fullname, n).ratio()
        if (maxN < sim):
            maxN = sim
            playerId = players.loc[players['name'] == n].index[0]
        
    #if we get a valid playerId we can connect the Fifa stats to the transfermrkt player
    if (playerId != ''):
        #remove the row from the player dataframe to avoid futher matchings (we know data will contain errors)
        players = players.drop(index=playerId)
        idU = "player"+str(playerId)
        Player = URIRef(SO[idU])
        g.add((Player, SO['overallFifaValue'], Literal(row['overall'], datatype=XSD.int)))
        g.add((Player, SO['growthFifaPotential'], Literal(row['potential'], datatype=XSD.int)))
        g.add((Player, SO['economicValue'], Literal(row['value_eur'], datatype=XSD.int)))
        g.add((Player, SO['annualWage'], Literal(row['wage_eur'], datatype=XSD.int))) 
        
        pFeatures = str(row['player_tags'])
        if pFeatures != '_' and pFeatures != '':
            pFeatures = pFeatures.split(',')
            for feature in pFeatures:
                feature = feature.strip()
                feature = re.sub('#', '', feature)
                g.add((Player, SO['playerFeature'], Literal(feature, datatype=XSD.string)))
        
        if row['contract_valid_until'] != '_' and row['contract_valid_until'] != '':
            g.add((Player, SO['contractValidTo'], Literal(int(row['contract_valid_until']), datatype=XSD.gYear)))        

        g.add((Player, SO['birthday'], Literal(row['dob'], datatype=XSD.date)))
        g.add((Player, SO['height'], Literal(row['height_cm'], datatype=XSD.int)))
        g.add((Player, SO['weight'], Literal(row['weight_kg'], datatype=XSD.int)))
        
        
        nationality = row['nationality'] 
        nationality = nationality.replace(" ", "_")
        # create the RDF node
        Country = URIRef(CNS[nationality])
        # add the edge connecting the Movie and the Country 
        g.add((Player, SO['nationality'], Country))   

        # Homework: extend the code to populate the 'propertyOf' edge
        

In [None]:
%%time
# print all the data in the Turtle format
print("--- saving serialization ---")
with open(savePath + 'players.ttl', 'w') as file:
    file.write(g.serialize(format='turtle').decode("utf-8"))

## Games

In [276]:
# Load the CSV files in memory
apps = pd.read_csv(appearancesUrl, sep=',', index_col='appearance_id', keep_default_na=False, na_values=['_'])
games = pd.read_csv(gamesUrl, sep=',', index_col='game_id', keep_default_na=False, na_values=['_'])

#create the graph
g = Graph()

# Bind the namespaces to a prefix for more readable output
g.bind("foaf", FOAF)
g.bind("xsd", XSD)
g.bind("countries", CNS)
g.bind("so", SO)

In [277]:
#iterate over the games dataframe
for index, row in games.iterrows():
    # we use the transfermrket URL as URI
    Game = URIRef(row['url'])
    g.add((Game, RDF.type, SO.Game))
    idU1 = "club"+str(row['home_club_id'])
    idU2 = "club"+str(row['away_club_id'])
    HomeClub = URIRef(SO[idU1])
    AwayClub = URIRef(SO[idU2])
    g.add((Game, SO['homeClub'], HomeClub))
    g.add((Game, SO['awayClub'], AwayClub))    
    g.add((Game, SO['matchDay'], Literal(row['date'], datatype=XSD.date)))
    g.add((Game, SO['homeClubGoals'], Literal(row['home_club_goals'], datatype=XSD.int)))
    g.add((Game, SO['awayClubGoals'], Literal(row['away_club_goals'], datatype=XSD.int)))

In [278]:
%%time
# print all the data in the Turtle format
print("--- saving serialization ---")
with open(savePath + 'games.ttl', 'w') as file:
    file.write(g.serialize(format='turtle').decode("utf-8"))

--- saving serialization ---
CPU times: user 4.7 s, sys: 55.8 ms, total: 4.75 s
Wall time: 4.81 s


In [279]:
# reload the full players dataframe
players = pd.read_csv(playersUrl, sep=',', index_col='player_id', keep_default_na=False, na_values=['_'])
#create the graph
g = Graph()

# Bind the namespaces to a prefix for more readable output
g.bind("foaf", FOAF)
g.bind("xsd", XSD)
g.bind("countries", CNS)
g.bind("so", SO)

In [280]:
oldgameid = ''
for index, row in apps.iterrows():
    idA = "appearance"+str(index)
    idP = "player"+str(row['player_id'])
    idG = "game"+currgameid
    Appearance = URIRef(SO[idA])
    Player = URIRef(SO[idP])
    currgameid = str(row['game_id'])
    Game = URIRef(SO[idG])
    g.add((Appearance, RDF.type, SO.Appearance))
    g.add((Player, SO['appearIn'], Appearance))
    g.add((Appearance, SO['playIn'], Game))

    g.add((Appearance, SO['goals'], Literal(row['goals'], datatype=XSD.int)))
    g.add((Appearance, SO['assists'], Literal(row['assists'], datatype=XSD.int)))
    g.add((Appearance, SO['minutesPlayed'], Literal(row['minutes_played'], datatype=XSD.int)))
    g.add((Appearance, SO['yellowCard'], Literal(row['yellow_cards'], datatype=XSD.int)))
    g.add((Appearance, SO['redCard'], Literal(row['red_cards'], datatype=XSD.int)))

    #add this triple only once per game
    if (currgameid != oldgameid):
        idL = "league"+str(row['league_id'])
        g.add((Game, SO['belongTo'], URIRef(SO[idL])))
        oldgameid = currgameid

In [281]:
%%time
# print all the data in the Turtle format
print("--- saving serialization ---")
with open(savePath + 'appearances.ttl', 'w') as file:
    file.write(g.serialize(format='turtle').decode("utf-8"))

--- saving serialization ---
CPU times: user 1min 22s, sys: 1.32 s, total: 1min 23s
Wall time: 1min 27s
