# Project in Applied Data Science - Criminal Data
*Laura Kunz, Natalie Vintonjak, Yannick Sanner*

**SS 2024 - MSc Wirtschaftsinformatik** 


**Overview**



**Project Components**



Connect to the Swiss Open Data API to retrieve the burglary data for the Canton of Zurich. Then load the data into a DataFrame and save the data exactly as retrieved to a CSV file, before any cleaning is applied.

In [1]:
import requests    
import json         
import pandas as pd 
import os

package = 'anzahl-einbruche-nach-gemeinden-des-kantons-zurich'

# Base url for the open data swiss API
base_url = 'https://opendata.swiss/api/3/action/package_show?id='

# Construct the url including the package name
package_information_url = base_url + package

# HTTP request. The timeout keeps the cell from hanging forever if the server
# does not answer, and raise_for_status() turns an HTTP error (4xx/5xx) into an
# exception instead of letting an error page reach the JSON parser.
package_information = requests.get(package_information_url, timeout=30)
package_information.raise_for_status()

# Decode CKAN's JSON response into a dictionary
package_dict = package_information.json()

# Check the contents of the response. An explicit raise is used instead of
# `assert` because assertions are removed when Python runs with -O.
if package_dict.get('success') is not True:
    raise RuntimeError(
        'CKAN request was not successful for package ' + repr(package)
    )
package_dict = package_dict['result']   # we only need the 'result' part from the dictionary
print(package_dict)

{'license_title': None, 'maintainer': 'Kantonspolizei des Kantons Zürich, Abteilung Kriminalpolizeiliches Datenmanagement', 'issued': '2023-03-22T10:32:20+01:00', 'title_for_slug': 'anzahl-einbruche-nach-gemeinden-des-kantons-zurich', 'qualified_relations': [], 'private': False, 'maintainer_email': 'kdm-kla@kapo.zh.ch', 'num_tags': 6, 'contact_points': [{'email': 'kdm-kla@kapo.zh.ch', 'name': 'Kantonspolizei des Kantons Zürich, Abteilung Kriminalpolizeiliches Datenmanagement'}], 'keywords': {'fr': [], 'de': ['strafbarehandlungen', 'kriminalitaet', 'straftaten', 'gemeinden', 'einbrueche', 'strafdelikte'], 'en': [], 'it': []}, 'temporals': [{'start_date': '2009-01-01T00:00:00', 'end_date': '2023-12-31T23:59:59.999999'}], 'id': '7df9b9e1-67ba-4402-881c-1bea8ab732be', 'metadata_created': '2023-03-23T01:45:39.242565', 'documentation': [], 'conforms_to': [], 'metadata_modified': '2024-04-30T03:27:28.148545', 'author': None, 'author_email': None, 'isopen': False, 'relations': [{'url': 'https:

In [2]:
# The first entry of the package's resource list describes the data file
first_resource = package_dict['resources'][0]

# Download location of the data
data_url = first_resource['url']
print('Data url:' + data_url)

# Format label of the data
data_format = first_resource['format']
print('Data format:' + data_format)

Data url:https://www.web.statistik.zh.ch/ogd/daten/ressourcen/KTZH_00002042_00004083.csv
Data format:CSV


In [3]:
# Format labels that are accepted as CSV
csv = ['comma-separated-values', 'CSV', 'csv']

# Stop with a clear error when the format is not CSV. Only printing a message
# would leave Einbrueche_df undefined (or stale from an earlier run), so the
# following lines would fail later or silently work on old data.
if not any(s in data_format for s in csv):
    raise ValueError('Sorry, the data format is not supported: ' + repr(data_format))

# Load the CSV straight from the resource url into a DataFrame
Einbrueche_df = pd.read_csv(data_url)
Einbrueche_df

Unnamed: 0,Ausgangsjahr,Gemeinde_BFS_Nr,Gemeindename,Stadtkreis_BFS_Nr,Stadtkreis_Name,Gesetz_Nummer,Gesetz_Abk,Tatbestand,Straftaten_total,Straftaten_vollendet,Straftaten_versucht,Einwohner,Häufigkeitszahl
0,2009,131,Adliswil,,,311.0,StGB,Einbruchdiebstahl,159,114,45,16052.0,9.9
1,2009,131,Adliswil,,,311.0,StGB,Einschleichdiebstahl,33,32,1,16052.0,2.1
2,2009,131,Adliswil,,,311.0,StGB,Einbrüche insgesamt,192,146,46,16052.0,12.0
3,2009,241,Aesch,,,311.0,StGB,Einbruchdiebstahl,10,7,3,987.0,10.1
4,2009,241,Aesch,,,311.0,StGB,Einschleichdiebstahl,2,2,0,987.0,2.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
7777,2023,261,Zürich,,unbekannt,311.0,StGB,Einschleichdiebstahl,0,0,0,1601.0,
7778,2023,261,Zürich,,unbekannt,311.0,StGB,Einbrüche insgesamt,0,0,0,1601.0,
7779,2023,7601,unbekannt ZH,,,311.0,StGB,Einbruchdiebstahl,1,1,0,,
7780,2023,7601,unbekannt ZH,,,311.0,StGB,Einschleichdiebstahl,1,1,0,,


In [4]:
# Save the original data to CSV; the target folder is created if it does not exist yet
directory = 'Original_CSV_files'
current_directory = os.getcwd()

output_directory = os.path.join(current_directory, directory)
csv_path = os.path.join(output_directory, 'Einbrueche.csv')

# exist_ok=True makes this safe to re-run and avoids the race between
# checking for the folder and creating it
os.makedirs(output_directory, exist_ok=True)

# index=False: the DataFrame's row index is not written to the file
Einbrueche_df.to_csv(csv_path, index=False)
print(Einbrueche_df)

      Ausgangsjahr  Gemeinde_BFS_Nr  Gemeindename  Stadtkreis_BFS_Nr  \
0             2009              131      Adliswil                NaN   
1             2009              131      Adliswil                NaN   
2             2009              131      Adliswil                NaN   
3             2009              241         Aesch                NaN   
4             2009              241         Aesch                NaN   
...            ...              ...           ...                ...   
7777          2023              261        Zürich                NaN   
7778          2023              261        Zürich                NaN   
7779          2023             7601  unbekannt ZH                NaN   
7780          2023             7601  unbekannt ZH                NaN   
7781          2023             7601  unbekannt ZH                NaN   

     Stadtkreis_Name  Gesetz_Nummer Gesetz_Abk            Tatbestand  \
0                NaN          311.0       StGB     Einbruchdieb

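Drop the columns that are not needed, delete the summary rows ('Einbrüche insgesamt') and the rows of the unknown Gemeinde ('unbekannt ZH'), and drop the rows with a missing Einwohner value (NA). Then sum Straftaten_total for each Gemeinde for the year 2023 to check that the retrieval was done correctly. Afterwards, save the cleaned data as a CSV file in the Cleaned_CSV_files folder.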

In [5]:
# Columns that are removed from the DataFrame before further processing
unused_columns = [
    'Stadtkreis_BFS_Nr',
    'Stadtkreis_Name',
    'Gesetz_Nummer',
    'Gesetz_Abk',
    'Häufigkeitszahl',
]
# Remove them in place (columns= is the keyword form of axis=1)
Einbrueche_df.drop(columns=unused_columns, inplace=True)
Einbrueche_df

Unnamed: 0,Ausgangsjahr,Gemeinde_BFS_Nr,Gemeindename,Tatbestand,Straftaten_total,Straftaten_vollendet,Straftaten_versucht,Einwohner
0,2009,131,Adliswil,Einbruchdiebstahl,159,114,45,16052.0
1,2009,131,Adliswil,Einschleichdiebstahl,33,32,1,16052.0
2,2009,131,Adliswil,Einbrüche insgesamt,192,146,46,16052.0
3,2009,241,Aesch,Einbruchdiebstahl,10,7,3,987.0
4,2009,241,Aesch,Einschleichdiebstahl,2,2,0,987.0
...,...,...,...,...,...,...,...,...
7777,2023,261,Zürich,Einschleichdiebstahl,0,0,0,1601.0
7778,2023,261,Zürich,Einbrüche insgesamt,0,0,0,1601.0
7779,2023,7601,unbekannt ZH,Einbruchdiebstahl,1,1,0,
7780,2023,7601,unbekannt ZH,Einschleichdiebstahl,1,1,0,


In [6]:
# Boolean mask marking the rows whose Tatbestand is 'Einbrüche insgesamt'
is_summary_row = Einbrueche_df['Tatbestand'] == 'Einbrüche insgesamt'

# Index labels of the marked rows, then remove those rows in place
indizes_zu_loeschen = Einbrueche_df.index[is_summary_row]
Einbrueche_df.drop(index=indizes_zu_loeschen, inplace=True)
Einbrueche_df

Unnamed: 0,Ausgangsjahr,Gemeinde_BFS_Nr,Gemeindename,Tatbestand,Straftaten_total,Straftaten_vollendet,Straftaten_versucht,Einwohner
0,2009,131,Adliswil,Einbruchdiebstahl,159,114,45,16052.0
1,2009,131,Adliswil,Einschleichdiebstahl,33,32,1,16052.0
3,2009,241,Aesch,Einbruchdiebstahl,10,7,3,987.0
4,2009,241,Aesch,Einschleichdiebstahl,2,2,0,987.0
6,2009,1,Aeugst,Einbruchdiebstahl,3,3,0,1700.0
...,...,...,...,...,...,...,...,...
7774,2023,261,Zürich,Einschleichdiebstahl,18,17,1,31471.0
7776,2023,261,Zürich,Einbruchdiebstahl,0,0,0,1601.0
7777,2023,261,Zürich,Einschleichdiebstahl,0,0,0,1601.0
7779,2023,7601,unbekannt ZH,Einbruchdiebstahl,1,1,0,


In [7]:
# Boolean mask marking the rows whose Gemeindename is 'unbekannt ZH'
is_unknown_gemeinde = Einbrueche_df['Gemeindename'] == 'unbekannt ZH'

# Index labels of the marked rows, then remove those rows in place
indizes_zu_loeschen = Einbrueche_df.index[is_unknown_gemeinde]
Einbrueche_df.drop(index=indizes_zu_loeschen, inplace=True)
Einbrueche_df

Unnamed: 0,Ausgangsjahr,Gemeinde_BFS_Nr,Gemeindename,Tatbestand,Straftaten_total,Straftaten_vollendet,Straftaten_versucht,Einwohner
0,2009,131,Adliswil,Einbruchdiebstahl,159,114,45,16052.0
1,2009,131,Adliswil,Einschleichdiebstahl,33,32,1,16052.0
3,2009,241,Aesch,Einbruchdiebstahl,10,7,3,987.0
4,2009,241,Aesch,Einschleichdiebstahl,2,2,0,987.0
6,2009,1,Aeugst,Einbruchdiebstahl,3,3,0,1700.0
...,...,...,...,...,...,...,...,...
7771,2023,261,Zürich,Einschleichdiebstahl,99,94,5,73993.0
7773,2023,261,Zürich,Einbruchdiebstahl,63,49,14,31471.0
7774,2023,261,Zürich,Einschleichdiebstahl,18,17,1,31471.0
7776,2023,261,Zürich,Einbruchdiebstahl,0,0,0,1601.0


In [8]:
# Rows with a missing value in any of these columns are removed in place
required_columns = ['Einwohner']
Einbrueche_df.dropna(subset=required_columns, inplace=True)
Einbrueche_df

Unnamed: 0,Ausgangsjahr,Gemeinde_BFS_Nr,Gemeindename,Tatbestand,Straftaten_total,Straftaten_vollendet,Straftaten_versucht,Einwohner
0,2009,131,Adliswil,Einbruchdiebstahl,159,114,45,16052.0
1,2009,131,Adliswil,Einschleichdiebstahl,33,32,1,16052.0
3,2009,241,Aesch,Einbruchdiebstahl,10,7,3,987.0
4,2009,241,Aesch,Einschleichdiebstahl,2,2,0,987.0
6,2009,1,Aeugst,Einbruchdiebstahl,3,3,0,1700.0
...,...,...,...,...,...,...,...,...
7771,2023,261,Zürich,Einschleichdiebstahl,99,94,5,73993.0
7773,2023,261,Zürich,Einbruchdiebstahl,63,49,14,31471.0
7774,2023,261,Zürich,Einschleichdiebstahl,18,17,1,31471.0
7776,2023,261,Zürich,Einbruchdiebstahl,0,0,0,1601.0


In [9]:
# Keep only the rows of the year 2023
is_2023 = Einbrueche_df['Ausgangsjahr'] == 2023
df_2023 = Einbrueche_df[is_2023]

# Sum Straftaten_total per Gemeinde (number and name) for 2023, then turn the
# grouping keys back into regular columns
group_keys = ['Gemeinde_BFS_Nr', 'Gemeindename']
totals_per_gemeinde = df_2023.groupby(group_keys)['Straftaten_total'].sum()
summary_df_2023 = totals_per_gemeinde.reset_index()
summary_df_2023

Unnamed: 0,Gemeinde_BFS_Nr,Gemeindename,Straftaten_total
0,1,Aeugst,7
1,2,Affoltern,54
2,3,Bonstetten,11
3,4,Hausen,12
4,5,Hedingen,19
...,...,...,...
154,294,Elgg,16
155,295,Horgen,50
156,296,Illnau-Effretikon,57
157,297,Bauma,9


In [10]:
# Save the cleaned data to CSV; the target folder is created if it does not exist yet
directory = 'Cleaned_CSV_files'
current_directory = os.getcwd()

output_directory = os.path.join(current_directory, directory)
csv_path = os.path.join(output_directory, 'Einbrueche_Cleaned.csv')

# exist_ok=True makes this safe to re-run and avoids the race between
# checking for the folder and creating it
os.makedirs(output_directory, exist_ok=True)

# index=False: the DataFrame's row index is not written to the file
Einbrueche_df.to_csv(csv_path, index=False)
print(Einbrueche_df)

      Ausgangsjahr  Gemeinde_BFS_Nr Gemeindename            Tatbestand  \
0             2009              131     Adliswil     Einbruchdiebstahl   
1             2009              131     Adliswil  Einschleichdiebstahl   
3             2009              241        Aesch     Einbruchdiebstahl   
4             2009              241        Aesch  Einschleichdiebstahl   
6             2009                1       Aeugst     Einbruchdiebstahl   
...            ...              ...          ...                   ...   
7771          2023              261       Zürich  Einschleichdiebstahl   
7773          2023              261       Zürich     Einbruchdiebstahl   
7774          2023              261       Zürich  Einschleichdiebstahl   
7776          2023              261       Zürich     Einbruchdiebstahl   
7777          2023              261       Zürich  Einschleichdiebstahl   

      Straftaten_total  Straftaten_vollendet  Straftaten_versucht  Einwohner  
0                  159          

Connect to the MySQL server and create a new database called CriminalDataDB, if it does not exist already.

In [11]:
import mysql.connector
from mysql.connector import Error

# NOTE(review): the credentials are hard-coded in the notebook; consider
# reading them from environment variables or a config file kept out of version control.

# Bind the name before the try block so the cleanup in `finally` never reads an
# undefined name (or a stale object from an earlier cell) when connect() fails.
connection = None
try:
    connection = mysql.connector.connect(host='localhost',
                                         user='admin',
                                         password='Criminal1234')

    if connection.is_connected():
        print("Connected to MySQL server")

        # Create a cursor object to execute SQL queries
        cursor = connection.cursor()
        try:
            # Execute SQL statement to create a new database
            cursor.execute("CREATE DATABASE IF NOT EXISTS CriminalDataDB")
        finally:
            # Release the cursor even if the statement fails
            cursor.close()

        print("Database created successfully")

except Error as e:
    print("Error connecting to MySQL:", e)

finally:
    if connection is not None and connection.is_connected():
        connection.close()
        print("MySQL connection closed")


Connected to MySQL server
Database created successfully
MySQL connection closed


Print the DataFrame information so that the database table can be created with matching columns and data types.

In [12]:
# Print the column names, non-null counts and dtypes of the cleaned DataFrame
Einbrueche_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 5152 entries, 0 to 7777
Data columns (total 8 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   Ausgangsjahr          5152 non-null   int64  
 1   Gemeinde_BFS_Nr       5152 non-null   int64  
 2   Gemeindename          5152 non-null   object 
 3   Tatbestand            5152 non-null   object 
 4   Straftaten_total      5152 non-null   int64  
 5   Straftaten_vollendet  5152 non-null   int64  
 6   Straftaten_versucht   5152 non-null   int64  
 7   Einwohner             5152 non-null   float64
dtypes: float64(1), int64(5), object(2)
memory usage: 362.2+ KB


Connect to the database and create a new table called Einbrueche. If the table already exists, drop it first.

In [13]:
import mysql.connector
from mysql.connector import Error
import pandas as pd

# MySQL connection parameters
# NOTE(review): the credentials are hard-coded in the notebook; consider
# reading them from environment variables or a config file kept out of version control.
host = 'localhost'
user = 'admin'
password = 'Criminal1234'
database = 'CriminalDataDB'

# Bind both handles before the try block so the cleanup in `finally` never
# reads an undefined name (or a stale object from an earlier cell) when
# connect() fails or the cursor was never created.
connection = None
cursor = None
try:
    connection = mysql.connector.connect(host=host,
                                         user=user,
                                         password=password,
                                         database=database)

    if connection.is_connected():
        print("Connected to MySQL server")
        cursor = connection.cursor()

        # Drop the table if it already exists, because in earlier runs the
        # data was inserted into it multiple times
        drop_table_query = "DROP TABLE IF EXISTS Einbrueche;"
        cursor.execute(drop_table_query)
        print("Table 'Einbrueche' dropped if it existed alreadyy")

        # Column list follows the columns of the cleaned DataFrame
        create_table_query = """
        CREATE TABLE IF NOT EXISTS Einbrueche (
            Ausgangsjahr INT,
            Gemeinde_BFS_Nr INT,
            Gemeindename VARCHAR(255),
            Tatbestand VARCHAR(255),
            Straftaten_total INT,
            Straftaten_vollendet INT,
            Straftaten_versucht INT,
            Einwohner FLOAT
        )
        """

        cursor.execute(create_table_query)
        print("Table 'Einbrueche' created successfully")

except Error as e:
    print("Error connecting to MySQL:", e)

finally:
    if connection is not None and connection.is_connected():
        # Close cursor and connection
        if cursor is not None:
            cursor.close()
        connection.close()
        print("MySQL connection closed")


Connected to MySQL server
Table 'Einbrueche' dropped if it existed alreadyy
Table 'Einbrueche' created successfully
MySQL connection closed


Insert the cleaned data from the cleaned CSV file that was created earlier.

In [14]:
import mysql.connector 
from mysql.connector import Error
import pandas as pd

# MySQL connection parameters
# NOTE(review): the credentials are hard-coded in the notebook; consider
# reading them from environment variables or a config file kept out of version control.
connection_params = {
    'host': 'localhost',
    'user': 'admin',
    'password': 'Criminal1234',
    'database': 'CriminalDataDB',
    'allow_local_infile': True   # client-side switch needed for LOAD DATA LOCAL INFILE
}

directory = 'Cleaned_CSV_files'
current_directory = os.getcwd()
csv_file_path = os.path.join(current_directory, directory, 'Einbrueche_Cleaned.csv')

# The connection is opened and closed explicitly, like in the other database
# cells. Previously a `with` block closed the connection and the `finally`
# clause then tried to close it a second time, so the "closed" message was
# never printed and a failed connect() left `finally` reading unbound names.
connection = None
cursor = None
try:
    connection = mysql.connector.connect(**connection_params)
    print("Connected to MySQL server")

    # Read the cleaned CSV back into a DataFrame; the bulk load below works
    # on the file itself, not on this DataFrame
    Einbrueche_df_to_sql = pd.read_csv(csv_file_path)

    cursor = connection.cursor()
    # The file path is passed as a query parameter; IGNORE 1 LINES skips the header row
    insert_query = """
        LOAD DATA LOCAL INFILE %s 
        INTO TABLE Einbrueche 
        FIELDS TERMINATED BY ',' 
        ENCLOSED BY '"' 
        LINES TERMINATED BY '\n' 
        IGNORE 1 LINES
        """

    cursor.execute(insert_query, (csv_file_path,))
    connection.commit()

    print("Data from CSV file successfully inserted into MySQL table 'Einbrueche'")

except Error as e:
    print("Error connecting to MySQL:", e)

finally:
    if connection is not None and connection.is_connected():
        # Close cursor and connection
        if cursor is not None:
            cursor.close()
        connection.close()
        print("MySQL connection closed")


Connected to MySQL server
Data from CSV file successfully inserted into MySQL table 'Einbrueche'


Check whether the data was uploaded to the table successfully by querying the SQL table.

In [15]:
import mysql.connector
from mysql.connector import Error

# MySQL connection parameters
# NOTE(review): the credentials are hard-coded in the notebook; consider
# reading them from environment variables or a config file kept out of version control.
connection_params = {
    'host': 'localhost',
    'user': 'admin',
    'password': 'Criminal1234',
    'database': 'CriminalDataDB'
}

# Bind the name before the try block. A "'connection' in locals()" check is
# not reliable in a notebook, because earlier cells already leave a
# `connection` object behind; starting from None makes the cleanup act only
# on the connection opened here.
connection = None
try:
    connection = mysql.connector.connect(**connection_params)
    print("Connected to MySQL server")

    # The cursor is closed automatically when the with-block ends
    with connection.cursor() as cursor:
        select_query = "SELECT * FROM Einbrueche"

        cursor.execute(select_query)
        rows = cursor.fetchall()

        # Print every stored row for a visual check of the upload
        for row in rows:
            print(row)

except Error as e:
    print("Error connecting to MySQL:", e)

finally:
    if connection is not None and connection.is_connected():
        # Close connection
        connection.close()
        print("MySQL connection closed")

Connected to MySQL server
(2009, 131, 'Adliswil', 'Einbruchdiebstahl', 159, 114, 45, 16052.0)
(2009, 131, 'Adliswil', 'Einschleichdiebstahl', 33, 32, 1, 16052.0)
(2009, 241, 'Aesch', 'Einbruchdiebstahl', 10, 7, 3, 987.0)
(2009, 241, 'Aesch', 'Einschleichdiebstahl', 2, 2, 0, 987.0)
(2009, 1, 'Aeugst', 'Einbruchdiebstahl', 3, 3, 0, 1700.0)
(2009, 1, 'Aeugst', 'Einschleichdiebstahl', 5, 5, 0, 1700.0)
(2009, 2, 'Affoltern', 'Einbruchdiebstahl', 101, 67, 34, 10630.0)
(2009, 2, 'Affoltern', 'Einschleichdiebstahl', 20, 18, 2, 10630.0)
(2009, 211, 'Altikon', 'Einbruchdiebstahl', 0, 0, 0, 615.0)
(2009, 211, 'Altikon', 'Einschleichdiebstahl', 1, 1, 0, 615.0)
(2009, 291, 'Andelfingen', 'Einbruchdiebstahl', 12, 10, 2, 2775.0)
(2009, 291, 'Andelfingen', 'Einschleichdiebstahl', 1, 1, 0, 2775.0)
(2009, 51, 'Bachenbülach', 'Einbruchdiebstahl', 27, 19, 8, 3796.0)
(2009, 51, 'Bachenbülach', 'Einschleichdiebstahl', 10, 9, 1, 3796.0)
(2009, 81, 'Bachs', 'Einbruchdiebstahl', 1, 0, 1, 566.0)
(2009, 81, 'Bac