## Import libraries 

In [None]:
import pandas as pd
import numpy as np
import missingno as msno
import dash 
import dash_core_components as dcc
import dash_html_components as html
from dash.dependencies import Input, Output, State
import plotly.graph_objs as go
import plotly.offline as pyo
import dash_bootstrap_components as dbc
import re
import json

## Read data 

In [None]:
# Load the merged dataset; the path is resolved relative to the current working directory.
df_total = pd.read_csv('merged_data.csv')

In [None]:
# # # Keep only the records that carry person information,
# # # i.e. rows whose 'Id_person' value is not missing.
df = df_total.dropna(subset=['Id_person'])

# Data cleaning and statistical analysis

In [None]:
# # # Check missing values: draw missingno's matrix plot for the person records
msno.matrix(df)

### Data pre-processing for punishment 

#### Overview of punishment 

In [None]:
# # # Check missing values: count records with (True) and without (False) a punishment entry
df['punishment(fand)/Punishmandts/Pandalties'].notna().value_counts()

In [None]:
# NOTE(review): hard-coded ratio; presumably copied from the counts printed by the
# previous cell — confirm which counts these are and consider computing it from df.
x = 413/2122
x

In [None]:
# List the distinct punishment descriptions in sorted order (missing values excluded).
np.sort(df['punishment(fand)/Punishmandts/Pandalties'].dropna().unique())

In [None]:
# # # Keyword -> main punishment category.
# A record belongs to a category when its punishment text contains the keyword
# as a (case-sensitive) substring.
# NOTE(review): 'floggging' is spelled with three g's both as keyword and as
# category name — confirm this matches the spelling used in the data.
key_words = {
    'dukats': 'fine',
    'fine': 'fine',
    'disciplinary': 'prison',
    'God': 'ask forgiveness from God and the Justice',
    'forgiving': 'ask forgiveness from God and the Justice',
    'ban': 'banishment',
    'absent basket': 'branding (with a hot iron mark)',
    '; absent': 'branding (with a hot iron mark)',
    'absent;': 'branding (with a hot iron mark)',
    'absent,': 'branding (with a hot iron mark)',
    'brand': 'branding (with a hot iron mark)',
    'floggging': 'floggging',
    'acquit': 'acquittal',
    'out of': 'out of the city',
    'office': 'expiration of office',
    'cost': 'costs',
    'confiscation of property': 'confiscation of property',
    'teased': 'pillory',
    'pillory': 'pillory',
    'tooth': 'pillory',
    'Lean Soup': 'prison',
    'city running': 'city running',
    'composible': 'diverse',
    'conditional': 'conditional release',
    'confine': 'confinement',
    'death': 'death penalty',
    'straightened': 'death penalty',
    'cut off hand': 'cut off hand',
    'baked': 'baked',
    'hanging': 'hanging',
    'decapitation': 'decapitation',
    'discharged': 'discharged',
    'dismiss': 'dismissed',
    'drag': 'dragged',
    'display': 'on display',
    'exhibit': 'on display',
    'show': 'on display',
    'pilgrimage': 'pilgrimage',
    'prison': 'prison',
    'water and bread': 'prison',
    'remand': 'remand',
    'spin house': 'prison',
    'Gewandhuis': 'around the Gewandhuis',
}

In [None]:
# Main punishment categories; each one becomes an indicator column.
punish_categories = [
    'fine',
    'acquittal',
    'death penalty',
    'banishment',
    'branding (with a hot iron mark)',
    'on display',
    'costs',
    'decapitation',
    'prison',
    'dismissed',
    'discharged',
    'hanging',
    'floggging',
    'baked',
    'dragged',
    'pillory',
    'out of the city',
    'expiration of office',
    'confinement',
    'remand',
    'confiscation of property',
    'cut off hand',
    'diverse',
    'around the Gewandhuis',
    'ask forgiveness from God and the Justice',
    'city running',
    'conditional release',
    'pilgrimage',
]

In [None]:
# # # Create a new dataframe, named "df1", with one indicator column per
# # # punishment category: 1 = keyword found, 0 = not found, NaN = no punishment text.
punish_col = 'punishment(fand)/Punishmandts/Pandalties'
df1 = df.copy()
for category in punish_categories:
    # Float from the start, because missing records are marked with NaN below.
    df1[category] = 0.0

# Work on whole columns with boolean masks instead of positional .loc lookups:
# df is a filtered frame, so its index labels are not guaranteed to be 0..len-1.
punishment_text = df1[punish_col].astype(str)
for keyword, category in key_words.items():
    # Plain (non-regex) substring test; several keywords contain ';' or ','.
    keyword_hit = punishment_text.str.contains(keyword, regex=False)
    df1.loc[keyword_hit, category] = 1

# Records without any punishment text carry no information for any category.
df1.loc[df1[punish_col].isna(), punish_categories] = float("nan")

In [None]:
# # # Count the records in each punishment category, most frequent first
df1[punish_categories].sum().sort_values(ascending=False)

#### Fine 

In [None]:
# Extract the amount of money from every punishment classified as a fine.
punish_col = 'punishment(fand)/Punishmandts/Pandalties'
fine = df1[df1['fine'] == 1].reset_index()[['Id_person', punish_col]]

# 'money' keeps every "fine - <digits> <2 chars>" match found in the text.
fine['money'] = fine[punish_col].apply(lambda text: re.findall(r'fine\s-\s(\d+\s..)', text))
# 'amount' is the first match; records without a match stay NaN.
fine['amount'] = (
    fine['money']
    .apply(lambda found: found[0] if found else float('nan'))
    .astype(object)
)

# Report the records whose text did not match, so they can be reviewed by hand.
for row_label, text in fine.loc[fine['amount'].isna(), punish_col].items():
    print(row_label)
    print(text)

# NOTE(review): row 36 is patched by hand; presumably its text does not follow the
# "fine - <amount>" pattern — confirm the position is still valid if the data changes.
# The membership test stops .loc from silently appending a new row.
if 36 in fine.index:
    fine.loc[36, 'amount'] = '200 µl'

### Data pre-processing for Crimes 

In [None]:
# # # Keyword -> main crime category.
# A record belongs to a category when its crime text contains the keyword
# as a (case-sensitive) substring.
key_words_crimes = {
    'killing': 'Killing',
    'burglary': 'Burglary',
    'threat': 'Threat',
    'maltreatment': 'Ill-treatment',
    'criminal': 'Assault',
    'extortion': 'Extortion',
    'outlawry': 'Outlawry',  # not obeying the ban, sentence of outlawry
    'begging': 'Begging',
    'fraud': 'Fraud',
    'forgery': 'Fraud',
    'forged': 'Fraud',
    'theft': 'Theft',
    'violence': 'Violence',  # 3, (street) vandalism and violence
    'trading ': 'Trading with enemy',
    'enemy': 'Trading with enemy',
    'prostitution': 'Prostitution',  # 2
    # evasion impost, evasion impost on beer, tax evasion, tax-dodging
    'evasion': 'Evasion',
    # letting escape of a prisoner, illegal freeing pounded sheep
    'escape': 'Escape',
    # disabled: 'freeing' -> 'Escape'
    'kidnapping': 'Kidnapping',
    # cheating (cardplay, dices), swindle, cheating (in case of changing money)
    'cheating': 'Cheating',
    'violating': 'Violating',
    'looting': 'Looting',  # ??? ip_crime: 14, 55
    'destruction': 'Destruction',
    'destroying': 'Destruction',
    'digging': 'Destruction',
    'forbidden': 'Forbidden',
    'unallowd': 'Forbidden',
    # disabled: 'possession' -> 'Forbidden'
    # disabled: 'embezzle' -> 'Embezzle'
    'resistance': 'Resistance',
    # 4: "niet reinigen Blok" (Dutch, roughly "not cleaning the block"),
    # refusing to billet troops in a house, refusing to clean the public street,
    # refusing to make a(n incriminating / implicating)
    'refusing': 'Refusing',
    'suicide': 'Suicide',
    'adultery': 'Adultery',
    'bigamy': 'Bigamy',
    'robbery': 'Robbery',
    'pocketing': 'Robbery',
    'barrage': 'Barrage',  # 2
    'disturb': 'Disturb public order',
    'revolt': 'Disturb public order',
    'mob': 'Disturb public order',
    "one's": 'Disturb public order',
    'hooliganism': 'Robbery',
    'offending': 'Offending',
    'illegal': 'Illegal',
}

In [None]:
# Main crime categories; each one becomes an indicator column.
# ('Embezzle' is currently disabled.)
crime_categories = [
    'Illegal',
    'Offending',
    'Disturb public order',
    'Robbery',
    'Barrage',
    'Bigamy',
    'Adultery',
    'Suicide',
    'Refusing',
    'Resistance',
    'Forbidden',
    'Destruction',
    'Looting',
    'Violating',
    'Cheating',
    'Kidnapping',
    'Escape',
    'Evasion',
    'Prostitution',
    'Trading with enemy',
    'Violence',
    'Theft',
    'Fraud',
    'Begging',
    'Outlawry',
    'Extortion',
    'Assault',
    'Ill-treatment',
    'Threat',
    'Burglary',
    'Killing',
]

In [None]:
# Add one indicator column per crime category to df1:
# 1 = keyword found, 0 = not found, NaN = no crime text.
crime_col = 'Misdrijven/Crimes'
for category in crime_categories:
    # Float from the start, because missing records are marked with NaN below.
    df1[category] = 0.0

# Work on whole columns with boolean masks instead of positional .loc lookups:
# df1 derives from a filtered frame, so its index labels need not be 0..len-1.
crime_text = df1[crime_col].astype(str)
for keyword, category in key_words_crimes.items():
    # Plain (non-regex) substring test, matching the keyword exactly as written.
    keyword_hit = crime_text.str.contains(keyword, regex=False)
    df1.loc[keyword_hit, category] = 1

# Records without any crime text carry no information for any category.
df1.loc[df1[crime_col].isna(), crime_categories] = float("nan")

In [None]:
# # # Count the records in each crime category, most frequent first
df1[crime_categories].sum().sort_values(ascending=False)

In [None]:
# Inspect the column names now present in df1.
df1.columns

In [None]:
# Label records with no torture entry as 'untortured' instead of leaving them missing.
df1['torture/Tortured'] = df1['torture/Tortured'].fillna('untortured')

## Data visualization 

### Dashboard

In [None]:
# # # # graphs

# # # The first graph
# # The overview of punishment : bar chart

# Content for the first graph
def fig1():
    """Return the figure dict for the horizontal bar chart of punishment counts.

    The indicator columns listed in ``punish_categories`` are summed over
    ``df1`` and sorted in ascending order before being handed to the bar trace.
    """
    totals = df1[punish_categories].sum()
    totals = totals.sort_values(ascending=True).apply(int)

    bars = go.Bar(y=totals.index, x=totals.values, orientation='h')
    chart_layout = go.Layout(
        title='Bar chart of punishment',
        xaxis={'title': 'counts', "tickformat": ",d"},
        height=800,
    )
    return {'data': [bars], 'layout': chart_layout}

# Row holding the punishment overview chart (component id 'graph_1').
first_graph = dbc.Row(
    [
        dbc.Col(dcc.Graph(id='graph_1', figure=fig1())),
    ]
)


# # # The second graph
# # What kinds of crimes lead to a given punishment?
# No figure is set here; one is supplied through the 'figure' property of 'graph_2'.
second_graph = dbc.Row(
    [
        dbc.Col(dcc.Graph(id='graph_2')),
    ]
)




# # # The third graph
# # The distribution of the amounts of money in fine
def fig3():
    """Return the figure dict for the bar chart of fine amounts.

    Each distinct value of ``fine['amount']`` is one bar; bars are ordered by
    how often the amount occurs, most frequent first.
    """
    amount_counts = fine['amount'].value_counts().sort_values(ascending=False)

    bars = go.Bar(x=amount_counts.index, y=amount_counts.values)
    chart_layout = go.Layout(
        title='Bar chart of fine',
        xaxis={'title': 'fine'},
        yaxis={'title': 'counts', "tickformat": ",d"},
    )
    return {'data': [bars], 'layout': chart_layout}

# Row holding the fine-amount chart (component id 'graph_3').
third_graph = dbc.Row(
    [
        dbc.Col(dcc.Graph(id='graph_3', figure=fig3())),
    ]
)



# # # The fourth graph
# # Which crimes may lead to torture?
def fig4():
    """Return the figure dict for the stacked bar chart of crimes by torture status.

    For every crime category the number of records is counted separately for
    the 'untortured' and 'Subject to torture' groups of ``df1['torture/Tortured']``.
    Categories are ordered by their combined count, largest first.
    """
    # Select the indicator columns before summing, so text columns of df1 are
    # never aggregated (summing them is slow and can fail on mixed str/NaN data).
    per_group = df1.groupby('torture/Tortured')[crime_categories].sum()
    # Guarantee both groups exist, even when one of them has no records.
    per_group = per_group.reindex(['untortured', 'Subject to torture'], fill_value=0)

    crimes = per_group.T
    crimes['total'] = crimes['Subject to torture'] + crimes['untortured']
    crimes = crimes.sort_values('total', ascending=False)
    # The crime names become an ordinary column called "index".
    crimes = crimes.reset_index()

    data = [
        go.Bar(
            x=crimes["index"],
            y=crimes["untortured"],
            name='Untortured',
            offsetgroup=0,
        ),
        # Same offsetgroup plus a base equal to the untortured count stacks
        # this trace on top of the first one.
        go.Bar(
            x=crimes["index"],
            y=crimes["Subject to torture"],
            name='Tortured',
            offsetgroup=0,
            base=crimes["untortured"],
        ),
    ]
    layout = go.Layout(title='Bar chart of crimes and whether tortured',
                       xaxis={'title': 'crimes'},
                       yaxis={'title': 'counts', "tickformat": ",d"})
    return {'data': data, 'layout': layout}

# Row holding the crimes-versus-torture chart (component id 'graph_4').
fourth_graph = dbc.Row(
    [
        dbc.Col(dcc.Graph(id='graph_4', figure=fig4())),
    ]
)




# # # # dashboard
# # # Create the dashboard
# The app loads the Bootstrap theme stylesheet shipped with dash_bootstrap_components.
app = dash.Dash(external_stylesheets=[dbc.themes.BOOTSTRAP])
# Page layout: a dcc.Location component followed by a container with the four graph rows.
# NOTE(review): first_graph and second_graph are themselves dbc.Row objects and are
# nested directly inside another dbc.Row — confirm this gives the intended arrangement.
app.layout = html.Div([dcc.Location(id="url"), 
                      html.Div(
                            [
                              dbc.Row([first_graph,
                               second_graph,]),
                               third_graph,
                               fourth_graph,
                            ],

                            )
                      ])


# # # # callback
# # # for the second graph


@app.callback(Output('graph_2', 'figure'),
             [Input('graph_1', 'clickData')])
def update_second_graph(clickData):
    """Return the figure for 'graph_2' based on the bar clicked in 'graph_1'.

    Args:
        clickData: Value of the 'clickData' property of 'graph_1'. It is None
            until the user has clicked a bar, which includes the call made
            when the page first loads.

    Returns:
        A figure dict. Before any click it is an empty chart whose title asks
        the user to click a bar; afterwards it is a horizontal bar chart of
        crime counts among the records that received the clicked punishment.
    """
    # Without this guard the initial call fails on clickData["points"].
    if not clickData or not clickData.get("points"):
        placeholder = go.Layout(
            title='Click a punishment bar to see which crimes lead to it',
            xaxis={'title': 'counts', "tickformat": ",d"},
            height=800)
        return {'data': [], 'layout': placeholder}

    # For the horizontal bars of graph_1 the label is the punishment category.
    feature = clickData["points"][0]["label"]
    data_df = df1[df1[feature] == 1][crime_categories].sum().sort_values(ascending=True).apply(lambda x: int(x))
    data = [go.Bar(
            y = data_df.index,
            x = data_df.values,
            orientation = 'h',
    )]
    layout = go.Layout(title = 'What kinds of crimes lead to {}'.format(feature),
                       xaxis = {'title': 'counts', "tickformat": ",d"},
                       height=800)
    return {'data': data, 'layout': layout}
                       


# Start the Dash server only when this file is executed directly.
if __name__ == '__main__':
    # NOTE(review): the port is passed as a string — confirm the installed Dash version accepts that.
    app.run_server(port='8000')
