In [1]:
import pandas as pd

In [2]:
# Crime categories; these match the keys of the label table used by
# get_population_and_crime.
crime = ["burglary", "theft"]
# Inclusive span of years: 2012, 2013, 2014, 2015.
years = range(2012, 2016)

def get_data_crime(crime_type, year=2013):
    """Sum the crime counts per territory for the given crime types and year.

    Args:
        crime_type: list-like of labels matched against the 'Type of crime'
            column (passed to ``Series.isin``).
        year: value compared for equality with the 'Select time' column.

    Returns:
        DataFrame indexed by 'Territory' with a single 'count' column holding
        the summed values of the selected rows.
    """
    raw = pd.read_csv("./raw/crime.tar.gz", encoding="ISO-8859-1")
    trimmed = raw.drop(
        columns=['Data type', 'Reference period of crime', 'Known offender identity']
    )

    # The column at position 3 (after the drops above) is renamed by position;
    # presumably it holds the counts -- TODO confirm against the raw file.
    labels = trimmed.columns.tolist()
    labels[3] = 'count'
    trimmed.columns = labels

    matches_type = trimmed['Type of crime'].isin(crime_type)
    matches_year = trimmed['Select time'] == year
    selected = trimmed.loc[matches_type & matches_year, ["Territory", "count"]]
    return selected.groupby("Territory").sum()

def get_population(year):
    """Return the 'population' column for ``year`` as a plain Python list.

    Thin wrapper over ``get_population_``; the values keep the row order of
    the frame that function returns.
    """
    population_frame = get_population_(year)
    return list(population_frame['population'])

def get_population_(year=2013):
    """Load the population table and keep only the overall totals for a year.

    Args:
        year: value compared for equality with the 'Select time' column.

    Returns:
        DataFrame with columns 'Territory' and 'population', restricted to
        rows where 'Marital status', 'Age' and 'Gender' all equal 'total'
        and 'Select time' equals ``year``. The original row index is kept.
    """
    census = pd.read_csv("./raw/population.tar.gz", encoding="ISO-8859-1")

    # The column at position 6 is renamed by position; presumably it holds
    # the head count -- TODO confirm against the raw file.
    names = census.columns.tolist()
    names[6] = 'population'
    census.columns = names

    keep = census['Marital status'] == 'total'
    for field in ('Age', 'Gender'):
        keep = keep & (census[field] == 'total')
    keep = keep & (census['Select time'] == year)

    return census.loc[keep, ["Territory", "population"]]

def get_population_and_crime(crime_type, year):
    """Combine population and crime counts per territory for one year.

    Args:
        crime_type: either 'theft' or 'burglary'; selects which raw
            'Type of crime' labels are summed. Any other key raises KeyError.
        year: year forwarded to both the population and the crime loaders.

    Returns:
        DataFrame with columns 'region', 'population' and 'crime', one row
        per row of the population table for ``year``.
    """
    labels_by_kind = {
        'burglary': ["burglary"],
        'theft': [
            "pickpocketing",
            "moped theft",
            "theft from vehicle",
            "theft of art objets",
            "theft of cargo trucks carrying freights",
            "thefts",
        ],
    }
    by_territory = get_population_(year).set_index('Territory')
    # The crime frame is indexed by territory, so the assignment lines the
    # counts up with the population rows through the shared index.
    by_territory['crime'] = get_data_crime(labels_by_kind[crime_type], year)
    combined = by_territory.reset_index()
    combined.columns = ["region", "population", "crime"]
    return combined

In [3]:
# Build the 2014 population + crime tables for each crime category.
df_theft = get_population_and_crime(crime_type="theft", year=2014)
df_burglary = get_population_and_crime(crime_type="burglary", year=2014)

In [4]:
# Preview the first five rows of the theft table.
df_theft.head(n=5)

Unnamed: 0,region,population,crime
0,Novara,371686,10565.0
1,Torino,2297917,120124.0
2,Como,598810,13673.0
3,Lecco,340814,7530.0
4,Trento,536237,11652.0


In [5]:
# Preview the first five rows of the burglary table.
df_burglary.head(n=5)

Unnamed: 0,region,population,crime
0,Novara,371686,1942.0
1,Torino,2297917,16500.0
2,Como,598810,3768.0
3,Lecco,340814,2082.0
4,Trento,536237,1832.0


Let's export the data, but first convert the region names to pure ASCII to avoid potential issues later.

In [6]:
import anyascii

In [7]:
# Transliterate every region name to ASCII in both tables.
df_theft["region"] = df_theft["region"].apply(anyascii.anyascii)
df_burglary["region"] = df_burglary["region"].apply(anyascii.anyascii)

In [8]:
# Row count vs. number of distinct region names; equal values mean the
# transliteration did not make two regions share a name.
(len(df_burglary["region"]), len(df_burglary["region"].unique()))

(111, 111)

In [9]:
# Write the burglary table without the positional index column.
df_burglary.to_csv(path_or_buf="italy_burglary.csv", index=False)

In [10]:
# Write the theft table without the positional index column.
df_theft.to_csv(path_or_buf="italy_theft.csv", index=False)