## Populate the LosAngelesCovid Ontology with the crimes data

In [2]:
# required libraries
import pandas as pd
import os
from pathlib import Path
# Load the required libraries
from rdflib import Graph, Literal, RDF, URIRef, Namespace
# rdflib knows about some namespaces, like FOAF
from rdflib.namespace import FOAF, XSD
# CHECK DATE 
from datetime import datetime
import urllib.parse



In [3]:
# parameters and URLs
path = str(Path(os.path.abspath(os.getcwd())).parent.absolute())

activeBusinessesData = '../datasets/ACTIVE BUSINESSES/Listing_of_Active_Businesses_parsed.csv'
closedBusinessesData = '../datasets/CLOSED BUSINESSES/All_Closed_Businesses_20231101.csv'
laCovidData = '../datasets/COVID DATA/sorted_los_angeles_covid_data.csv'

crimeData1 = '../datasets/CRIME DATA/Crime_Data_from_2020_to_Present_1.csv'
crimeData2 = '../datasets/CRIME DATA/Crime_Data_from_2020_to_Present_2.csv'
crimeData3 = '../datasets/CRIME DATA/Crime_Data_from_2020_to_Present_3.csv'

crimeCodesDescData = '../datasets/CRIME DATA/CrimesCodesAndDesc_listed.csv'
crimeCodesDescData_parsed = '../datasets/CRIME DATA/CrimesCodesAndDesc_listed_parsed.csv'

moCodesData = '../datasets/CRIME DATA/MO_CODES_Numerical_20191119.csv'
moCodesData_parsed = '../datasets/CRIME DATA/MO_CODES_Numerical_20191119_parsed.csv'

weaponData = '../datasets/CRIME DATA/weapon_ds.csv'
weaponData_parsed = '../datasets/CRIME DATA/weapon_ds_parsed.csv'

premisCodesData = '../datasets/CRIME DATA/premisCodesData.csv'
premisCodesData_parsed = '../datasets/CRIME DATA/premisCodesData_parsed.csv'

# saving folder
savePath =  path

In [4]:
# Construct the country and the movie ontology namespaces not known by RDFlib
LAO = Namespace("http://www.bitsei.it/losAngelesOntology/")

## Crime Victims

In [5]:
# Load the CSV files in memory
laVictims1 = pd.read_csv(crimeData1, sep=',', usecols=['Vict Age', 'Vict Sex', 'Vict Descent']) 
laVictims2 = pd.read_csv(crimeData2, sep=',', usecols=['Vict Age', 'Vict Sex', 'Vict Descent'])
laVictims3 = pd.read_csv(crimeData3, sep=',', usecols=['Vict Age', 'Vict Sex', 'Vict Descent'])

laVictims = pd.concat([laVictims1, laVictims2, laVictims3])

print(laVictims.head())

#create the graph
g = Graph()

# Bind the namespaces to a prefix for more readable output
g.bind("foaf", FOAF)
g.bind("xsd", XSD)
#g.bind("countries", CNS)
g.bind("lao", LAO)

   Vict Age Vict Sex Vict Descent
0        36        F            B
1        25        M            H
2         0        X            X
3        76        F            W
4        31        X            X


In [6]:
%%time 
#measure execution time

# Create a set to store unique combinations of sex, age, and descent
unique_entries = set()

#iterate over the laVictims1 dataframe
for index, row in laVictims.iterrows():
    
    # Extract information from the current row
    victim_age = int(row['Vict Age'])
    if(victim_age < 0):
        victim_age = 0
    
    victim_sex = str(row['Vict Sex'])
    if(victim_sex == 'nan' or victim_sex == '-'):
        victim_sex = 'X'

    victim_descent = str(row['Vict Descent'])
    if(victim_descent == 'nan' or victim_descent == '-'):
        victim_descent = 'X'
    
    # Create the node to add to the Graph
    # the node has the namespace + the victim id as URI
    idU = "victim-"+str(victim_sex)+'-'+str(victim_age)+'-'+str(victim_descent)
    Victim = URIRef(LAO[idU])

    # If the current combination doesn't already exist
    if ((victim_sex, victim_age, victim_descent) not in unique_entries) and ((victim_age != 0) or (victim_sex != 'X') or (victim_descent != 'X')):
        # Add triples using store's add() method.
        g.add((Victim, RDF.type, LAO.Victim))
        g.add((Victim, LAO['victimSex'], Literal(str(victim_sex), datatype=XSD.string)))
        g.add((Victim, LAO['victimAge'], Literal((victim_age), datatype=XSD.int)))    
        g.add((Victim, LAO['victimDescent'], Literal(str(victim_descent), datatype=XSD.string)))    

        unique_entries.add((victim_sex, victim_age, victim_descent))
    

print("--- saving serialization ---")
with open('crimeVictims.ttl', 'w') as file:
        file.write(g.serialize(format='turtle'))

--- saving serialization ---
CPU times: total: 5.23 s
Wall time: 18.4 s


## Modus Operandi

with open(moCodesData, 'r') as file:
    lines = file.readlines()

modified_lines = [line.replace(',', '>', 1) for line in lines]

with open(moCodesData_parsed, 'w') as file:
    file.writelines(modified_lines)

In [7]:
# Load the CSV files in memory
modusOperandi = pd.read_csv(moCodesData_parsed, sep='>')
print(modusOperandi.head())

#create the graph
g = Graph()

# Bind the namespaces to a prefix for more readable output
g.bind("foaf", FOAF)
g.bind("xsd", XSD)
g.bind("lao", LAO)

   Code          Description
0   100  Suspect Impersonate
1   101           Aid victim
2   102                Blind
3   103  Physically disabled
4   104             Customer


In [8]:
%%time 
#measure execution time

#iterate over the laVictims1 dataframe
for index, row in modusOperandi.iterrows():
    
    # Extract information from the current row
    #print(f"{row['Code']} - {row['Description']}")

    moCode = int(row['Code'])
    moDesc = str(row['Description'])
    
    # Create the node to add to the Graph
    # the node has the namespace + the modusOperandi Code as URI
    idU = "modusOperandi"+str(moCode)
    Mo = URIRef(LAO[idU])

    g.add((Mo, RDF.type, LAO.ModusOperandi))
    g.add((Mo, LAO['moCode'], Literal((moCode), datatype=XSD.int)))    
    g.add((Mo, LAO['moDesc'], Literal(str(moDesc), datatype=XSD.string)))    
    
print("--- saving serialization ---")
with open('crimeModusOperandi.ttl', 'w') as file:
        file.write(g.serialize(format='turtle'))

--- saving serialization ---
CPU times: total: 0 ns
Wall time: 130 ms


## Weapons

with open(weaponData, 'r') as file:
    lines = file.readlines()

modified_lines = [line.replace(',', '>', 1) for line in lines]

with open(weaponData_parsed, 'w') as file:
    file.writelines(modified_lines)

In [9]:
# Load the CSV files in memory
weapons = pd.read_csv(weaponData_parsed, sep='>')
print(weapons.head())

#create the graph
g = Graph()

# Bind the namespaces to a prefix for more readable output
g.bind("foaf", FOAF)
g.bind("xsd", XSD)
g.bind("lao", LAO)

   Weapon Used Cd                                     Weapon Desc
0             400  STRONG-ARM (HANDS, FIST, FEET OR BODILY FORCE)
1             212                                          BOTTLE
2             102                                        HAND GUN
3             500                     UNKNOWN WEAPON/OTHER WEAPON
4             101                                        REVOLVER


In [10]:
%%time 
#measure execution time

#iterate over the laVictims1 dataframe
for index, row in weapons.iterrows():
    
    # Extract information from the current row
    #print(f"{row['Weapon Used Cd']} - {row['Weapon Desc']}")

    weaponCode = int(row['Weapon Used Cd'])
    weaponDesc = str(row['Weapon Desc'])
    
    # Create the node to add to the Graph
    # the node has the namespace + the modusOperandi Code as URI
    idU = "weapon"+str(weaponCode)
    Weapon = URIRef(LAO[idU])

    g.add((Weapon, RDF.type, LAO.ModusOperandi))
    g.add((Weapon, LAO['weaponCode'], Literal((weaponCode), datatype=XSD.int)))    
    g.add((Weapon, LAO['weaponDesc'], Literal(str(weaponDesc), datatype=XSD.string)))    
    
print("--- saving serialization ---")
with open('crimeWeapons.ttl', 'w') as file:
        file.write(g.serialize(format='turtle'))

--- saving serialization ---
CPU times: total: 0 ns
Wall time: 10.6 ms


## Premis Codes

with open(premisCodesData, 'r') as file:
    lines = file.readlines()

modified_lines = [line.replace(',', '>', 1) for line in lines]

with open(premisCodesData_parsed, 'w') as file:
    file.writelines(modified_lines)

In [11]:
# Load the CSV files in memory
premisCodes = pd.read_csv(premisCodesData_parsed, sep='>')
print(premisCodes.head())

#create the graph
g = Graph()

# Bind the namespaces to a prefix for more readable output
g.bind("foaf", FOAF)
g.bind("xsd", XSD)
g.bind("lao", LAO)

   Premis Cd                                   Premis Desc
0        221                                PUBLIC STORAGE
1        101                                        STREET
2        834          LA UNION STATION (NOT LINE SPECIFIC)
3        502  MULTI-UNIT DWELLING (APARTMENT, DUPLEX, ETC)
4        108                                   PARKING LOT


In [12]:
%%time 
#measure execution time

#iterate over the laVictims1 dataframe
for index, row in premisCodes.iterrows():
    
    # Extract information from the current row
    #print(f"{row['Premis Cd']} - {row['Premis Desc']}")

    premisCode = int(row['Premis Cd'])
    premisDesc = str(row['Premis Desc'])
    
    # Create the node to add to the Graph
    # the node has the namespace + the modusOperandi Code as URI
    idU = "premis"+str(premisCode)
    Premis = URIRef(LAO[idU])

    g.add((Premis, RDF.type, LAO.Premis))
    g.add((Premis, LAO['premisCode'], Literal((premisCode), datatype=XSD.int)))    
    g.add((Premis, LAO['premisDesc'], Literal(str(premisDesc), datatype=XSD.string)))    
    
print("--- saving serialization ---")
with open('crimePremis.ttl', 'w') as file:
        file.write(g.serialize(format='turtle'))

--- saving serialization ---
CPU times: total: 0 ns
Wall time: 80.6 ms


## Crimes

with open(crimeCodesDescData, 'r') as file:
    lines = file.readlines()

modified_lines = [line.replace(',', '>', 1) for line in lines]

with open(crimeCodesDescData_parsed, 'w') as file:
    file.writelines(modified_lines)

In [13]:
# Load the CSV files in memory
crimes = pd.read_csv(crimeCodesDescData_parsed, sep='>')
print(crimes.head())

#create the graph
g = Graph()

# Bind the namespaces to a prefix for more readable output
g.bind("foaf", FOAF)
g.bind("xsd", XSD)
g.bind("lao", LAO)

   Code              Description
0   110        CRIMINAL HOMICIDE
1   113  MANSLAUGHTER, NEGLIGENT
2   121           RAPE, FORCIBLE
3   122          RAPE, ATTEMPTED
4   210                  ROBBERY


In [14]:
%%time 
#measure execution time

#iterate over the laVictims1 dataframe
for index, row in crimes.iterrows():
    
    # Extract information from the current row
    #print(f"{row['Code']} - {row['Description']}")

    crimeCode = int(row['Code'])
    crimeDesc = str(row['Description'])
    
    # Create the node to add to the Graph
    # the node has the namespace + the modusOperandi Code as URI
    idU = "crime"+str(crimeCode)
    Crime = URIRef(LAO[idU])

    g.add((Crime, RDF.type, LAO.Crime))
    g.add((Crime, LAO['crimeCode'], Literal((crimeCode), datatype=XSD.int)))    
    g.add((Crime, LAO['crimeDesc'], Literal(str(crimeDesc), datatype=XSD.string)))    
    
print("--- saving serialization ---")
with open('crimeCrimesTypology.ttl', 'w') as file:
        file.write(g.serialize(format='turtle'))

--- saving serialization ---
CPU times: total: 0 ns
Wall time: 24.8 ms


## Crime Events

In [15]:
# Load the CSV files in memory
crimeEvents1 = pd.read_csv(crimeData1, sep=',', index_col='DR_NO')
crimeEvents2 = pd.read_csv(crimeData2, sep=',', index_col='DR_NO')
crimeEvents3 = pd.read_csv(crimeData3, sep=',', index_col='DR_NO')

crimeEvents = pd.concat([crimeEvents1, crimeEvents2, crimeEvents3])
crimeEvents['Date Rptd'] = pd.to_datetime(crimeEvents['Date Rptd'], format='%m/%d/%Y').dt.strftime('%Y-%m-%d')
crimeEvents['DATE OCC'] = pd.to_datetime(crimeEvents['DATE OCC'], format='%m/%d/%Y').dt.strftime('%Y-%m-%d')

print(crimeEvents.head())

#create the graph
g = Graph()

# Bind the namespaces to a prefix for more readable output
g.bind("foaf", FOAF)
g.bind("xsd", XSD)
g.bind("lao", LAO)

            Date Rptd    DATE OCC  TIME OCC  AREA    AREA NAME  Rpt Dist No  \
DR_NO                                                                         
10304468   2020-01-08  2020-01-08      2230     3    Southwest          377   
190101086  2020-01-02  2020-01-01       330     1      Central          163   
200110444  2020-04-14  2020-02-13      1200     1      Central          155   
191501505  2020-01-01  2020-01-01      1730    15  N Hollywood         1543   
191921269  2020-01-01  2020-01-01       415    19      Mission         1998   

           Part 1-2  Crm Cd  \
DR_NO                         
10304468          2     624   
190101086         2     624   
200110444         2     845   
191501505         2     745   
191921269         2     740   

                                                 Crm Cd Desc         Mocodes  \
DR_NO                                                                          
10304468                            BATTERY - SIMPLE ASSAULT       0

In [16]:
%%time 
#measure execution time

#iterate over the club dataframe
for index, row in crimeEvents.iterrows():
    # Create the node to add to the Graph
    # the node has the namespace + the club id as URI
    idU = "crimeEvent"+str(index)
    CrimeEvent = URIRef(LAO[idU])
    
    # Add triples using store's add() method.
    
    # TYPE
    g.add((CrimeEvent, RDF.type, LAO.CrimeEvent))

    # DATA PROPERTIES
    g.add((CrimeEvent, LAO['crimeId'], Literal(str(index), datatype=XSD.string)))

    if(pd.isna(row['TIME OCC']) == False):
        time_occ = str(row['TIME OCC'])
        if(len(time_occ) < 4):
            zeros_needed = 4 - len(time_occ)
            time_occ = '0' * zeros_needed + time_occ 
        # Extract hours and minutes
        hours = time_occ[:2]
        minutes = time_occ[2:]
        if(int(hours) > 23):
            print(f'hours format error > 23: {hours}')
            hours = 24
        if(int(minutes) > 59):
            print(f'minutes format error > 59: {minutes}')
        store_time = '1970-01-01T' + hours + ':' + minutes + ':00'
        #print(f'time_occ: {time_occ} - hours: {hours} - minutes: {minutes} - store_time: {store_time} \n')
        g.add((CrimeEvent, LAO['timeOccurred'], Literal(store_time, datatype=XSD.dateTime)))

    part = int(row['Part 1-2'])
    if (part == 2):
        g.add((CrimeEvent, LAO['reportedToFbi'], Literal(True, datatype=XSD.boolean)))
    else:    
        g.add((CrimeEvent, LAO['reportedToFbi'], Literal(False, datatype=XSD.boolean)))

    # OBJECT PROPERTIES
    if(pd.isna(row["Date Rptd"]) == False):
        g.add((CrimeEvent, LAO['reportedOnDate'], LAO['day' + str(row['Date Rptd'])]))
    
    if(pd.isna(row["DATE OCC"]) == False):
        g.add((CrimeEvent, LAO['occurredOnDate'], LAO['day' + str(row['DATE OCC'])]))
    
    if(pd.isna(row["Crm Cd"]) == False):
        g.add((CrimeEvent, LAO['isOfType'], LAO['crime' + str(row['Crm Cd'])]))
    
    if(pd.isna(row["Mocodes"]) == False):
        mocodes_list = row['Mocodes'].split()
        mocodes_integers = [int(code) for code in mocodes_list]
        for code in mocodes_integers:
            g.add((CrimeEvent, LAO['hasModusOperandi'], LAO['modusOperandi' + str(code)]))
    
    # Extract information about the victim from the current row
    victim_age = int(row['Vict Age'])
    if(victim_age < 0 or pd.isna((row['Vict Age']))):
        victim_age = 0
    
    victim_sex = str(row['Vict Sex'])
    if(victim_sex == '-' or pd.isna(row['Vict Sex'])):
        victim_sex = 'X'

    victim_descent = str(row['Vict Descent'])
    if(victim_descent == '-' or pd.isna(row['Vict Descent'])):
        victim_descent = 'X'
    
    g.add((CrimeEvent, LAO['occurredOnVictim'], LAO['victim-' + victim_sex + '-' + str(victim_age) + '-' + victim_descent]))
    
    if(pd.isna(row['Premis Cd']) == False):
        premis_code = int(row['Premis Cd'])
        g.add((CrimeEvent, LAO['hasPremis'], LAO['premis' + str(premis_code)]))

    if(pd.isna(row['Weapon Used Cd']) == False):
        weapon_code = int(row['Weapon Used Cd'])
        g.add((CrimeEvent, LAO['usedWeapon'], LAO['weapon' + str(weapon_code)]))

    if(pd.isna(row['LAT']) == False and pd.isna(row['LON']) == False):
        coordsURI = 'lat' + str(row['LAT']) + 'lon' + str(row['LON'])
        g.add((CrimeEvent, LAO['occurredInLocation'], LAO[coordsURI]))


print("--- saving serialization ---")
with open('crimeCrimeEvents.ttl', 'w') as file:
    file.write(g.serialize(format='turtle'))

--- saving serialization ---
CPU times: total: 2min 30s
Wall time: 9min 10s
