# Variables to use
## The complete set of variables can be found in the 'EDA' notebook, where we also explain why we use this subset of variables

## Inputs

<table>
    <tr>
        <th style="text-align: center;">Variable</th>
        <th style="text-align: center;">Description</th>
    </tr>
    <tr>
        <td style="text-align: center;">Agency Code</td>
        <td style="text-align: center;">6 digits alphanumeric code of the agency</td>
    </tr>
    <tr>
        <td style="text-align: center;">Agency Type</td>
        <td style="text-align: center;">The type of agency</td>
    </tr>
    <tr>
        <td style="text-align: center;">State</td>
        <td style="text-align: center;">The name of the state where the homicide happened</td>
    </tr>
    <tr>
        <td style="text-align: center;">Year</td>
        <td style="text-align: center;">The year of the homicide</td>
    </tr>
    <tr>
        <td style="text-align: center;">Month</td>
        <td style="text-align: center;">The month of the homicide</td>
    </tr>
    <tr>
        <td style="text-align: center;">Crime Type</td>
        <td style="text-align: center;">The type of crime</td>
    </tr>
    <tr>
        <td style="text-align: center;">Victim Sex</td>
        <td style="text-align: center;">The sex of the victim</td>
    </tr>
    <tr>
        <td style="text-align: center;">Victim Age</td>
        <td style="text-align: center;">The age of the victim</td>
    </tr>
    <tr>
        <td style="text-align: center;">Victim Race</td>
        <td style="text-align: center;">The race of the victim</td>
    </tr>
    <tr>
        <td style="text-align: center;">Victim Ethnicity</td>
        <td style="text-align: center;">The ethnicity of the victim</td>
    </tr>
    <tr>
        <td style="text-align: center;">Weapon</td>
        <td style="text-align: center;">The weapon used</td>
    </tr>
</table>

## Output

<table>
    <tr>
        <td style="text-align: center;"><b>Crime Solved</b></td>
        <td style="text-align: center;">Indicates whether the crime has been solved or not</td>
    </tr>
</table>

# Preprocessing

In [None]:
import numpy as np
import pandas as pd
import zipfile

# Extract the dataset. The context manager closes the archive handle once the
# files have been written (the bare ZipFile(...) call left it open).
with zipfile.ZipFile(file='dataset.zip', mode='r') as archive:
    archive.extractall()

# Columns that are neither model inputs nor the target
UNUSED_COLUMNS = ['Record ID', 'Agency Name', 'City', 'Incident',
        'Perpetrator Sex', 'Perpetrator Age', 'Perpetrator Race', 'Perpetrator Ethnicity',
        'Relationship', 'Victim Count', 'Perpetrator Count', 'Record Source']

# Column rename
BETTER_COLUMNS_NAMES = {
    'Agency Code': 'agency_code',
    'Agency Type': 'agency_type',
    'State': 'state',
    'Year': 'year',
    'Month': 'month',
    'Crime Type': 'crime_type',
    'Crime Solved': 'crime_solved',
    'Victim Sex': 'victim_sex',
    'Victim Age': 'victim_age',
    'Victim Race': 'victim_race',
    'Victim Ethnicity': 'victim_ethnicity',
    'Weapon': 'weapon',
}

# Load, drop and rename in a single chain instead of mutating in place, so
# re-running this cell always rebuilds `ds` from the CSV.
ds = (
    pd.read_csv('dataset.csv')
    .drop(columns=UNUSED_COLUMNS)
    .rename(columns=BETTER_COLUMNS_NAMES)
)

ds.columns

# Splitting the dataset into train, test and validation

In [None]:
from sklearn.model_selection import train_test_split

# Fixed seed so the split, and every score computed from it, is reproducible
# after a kernel restart.
RANDOM_STATE = 42

# 60% train, 30% test, 10% validation
# First hold out 40% of the rows, then divide that part 25% / 75%.
train, not_train = train_test_split(ds, test_size=0.4, random_state=RANDOM_STATE)
validation, test = train_test_split(not_train, test_size=0.75, random_state=RANDOM_STATE)

# First we define a generic function to evaluate the different models

In [None]:
from sklearn.metrics import precision_score, confusion_matrix
from matplotlib import pyplot as plt
%matplotlib inline
def evaluate_model(model, extract_inputs_outputs, include_validation):
    """Print the precision and plot the confusion matrix of `model` per split.

    The splits are read from the notebook-level variables `train`, `test`
    and, optionally, `validation`.

    Parameters
    ----------
    model : object
        Fitted estimator exposing `predict(inputs)`.
    extract_inputs_outputs : callable
        Receives one split and returns an `(inputs, outputs)` pair; `inputs`
        is passed to `model.predict` and `outputs` holds the true labels.
    include_validation : bool
        When truthy, the `validation` split is evaluated as well.

    Returns
    -------
    None
        Results are printed and shown as matplotlib figures.
    """
    sets = [('train', train), ('test', test)]
    if include_validation:
        sets.append(('validation', validation))

    for set_name, set_data in sets:
        inputs, outputs = extract_inputs_outputs(set_data)

        # Cast to float64 so predictions use the same dtype as the labels
        # produced by the extractor.
        predictions = model.predict(inputs).astype(np.float64)

        print('Model evaluation on dataset:' + set_name)

        # precision_score returns a float; it must be converted before being
        # concatenated to a string (the previous code raised TypeError here).
        print('Precision score:' + str(precision_score(outputs, predictions)))

        plt.figure(figsize=(3,4))

        plt.xticks([0, 1], ['not solved', 'solved'], rotation=45)
        plt.yticks([0, 1], ['not solved', 'solved'])
        plt.xlabel('Predicted class')
        plt.ylabel('True class')

        plt.title(set_name)

        # Rows of the matrix are true classes, columns are predicted classes.
        plt.imshow(
            confusion_matrix(outputs, predictions),
            cmap=plt.cm.Blues,
            interpolation='nearest',
        )

        plt.show()
        

# First attempt: Logistic Regression

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline

In [None]:
# Display the column names of `ds` as a reference for the feature extraction below
ds.columns

## Transforming categorical variables

In [None]:
import datetime as d

def lr_extract_inputs_outputs(dataset):
    """Turn a split of the homicide dataset into numeric model inputs/outputs.

    The input frame is not modified. Processing steps:

    * `month` names are converted to their month number (1-12).
    * `agency_code` and `victim_ethnicity` are discarded.
    * Rows whose `victim_sex`, `victim_race` or `weapon` is 'Unknown', or
      whose `victim_age` is 998, are removed.
      NOTE(review): 998 is presumably the placeholder for an unknown age;
      confirm against the EDA notebook.
    * 'Handgun', 'Firearm' and 'Shotgun' are merged into a single 'Gun'
      weapon category.
    * `agency_type`, `state`, `crime_type`, `victim_sex`, `victim_race` and
      `weapon` are one-hot encoded.

    The indicator columns are derived from the categories present in
    `dataset` and are created in sorted order, so two splits that contain the
    same categories always produce the same column layout regardless of row
    order. A split that lacks a category yields fewer columns.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Frame with the renamed columns `agency_code`, `agency_type`, `state`,
        `year`, `month`, `crime_type`, `crime_solved`, `victim_sex`,
        `victim_age`, `victim_race`, `victim_ethnicity` and `weapon`.

    Returns
    -------
    list
        `[inputs, outputs]`: a 2-D float64 array of features and a 1-D
        float64 array with 1.0 where `crime_solved` is 'Yes' and 0.0 otherwise.
    """
    def _one_hot(frame, column):
        """Replace `column` with one boolean indicator column per category."""
        # Sorted categories keep the column order independent of row order;
        # the column prefix prevents name clashes between different variables.
        for category in sorted(frame[column].unique()):
            indicator = column + '_' + category.lower().replace(' ', '_')
            frame[indicator] = (frame[column] == category)
        return frame.drop(columns=[column])

    ds = dataset.copy()

    # month format: only the first three letters are parsed ('%b')
    ds['month'] = ds['month'].apply(lambda x: d.datetime.strptime(x[:3], '%b').month)

    # agency code
    ds = ds.drop(columns=['agency_code'])

    # agency type, state, crime type
    ds = _one_hot(ds, 'agency_type')
    ds = _one_hot(ds, 'state')
    ds = _one_hot(ds, 'crime_type')

    # victim sex (.copy() so that new columns are added to an independent
    # frame rather than to a slice of the previous one)
    ds = ds[ds['victim_sex'] != 'Unknown'].copy()
    ds = _one_hot(ds, 'victim_sex')

    # victim age
    ds = ds[ds['victim_age'] != 998].copy()

    # victim race
    ds = ds[ds['victim_race'] != 'Unknown'].copy()
    ds = _one_hot(ds, 'victim_race')

    # victim ethnicity
    ds = ds.drop(columns=['victim_ethnicity'])

    # weapon
    ds = ds[ds['weapon'] != 'Unknown'].copy()
    # join various weapon types into one; .loc writes into `ds` itself, whereas
    # the former ds[mask].weapon = ... only modified a temporary copy
    guns = ['Handgun', 'Firearm', 'Shotgun']
    ds.loc[ds['weapon'].isin(guns), 'weapon'] = 'Gun'
    ds = _one_hot(ds, 'weapon')

    # crime_solved
    ds['crime_solved'] = (ds['crime_solved'] == 'Yes')

    inputs = ds.drop(columns=['crime_solved'])
    outputs = ds['crime_solved']

    return [inputs.values.astype(np.float64), outputs.values.astype(np.float64)]

## Model

In [None]:
# Two-step pipeline: the 'scaler' step (StandardScaler) transforms the features
# before they reach the 'classifier' step (LogisticRegression, default settings).
lr_model = Pipeline([
    ('scaler', StandardScaler()),
    ('classifier', LogisticRegression())
])

In [None]:
# Build the feature matrix and the label vector from the training split only
inputs, outputs = lr_extract_inputs_outputs(train)

# Fit both pipeline steps on the training data
lr_model.fit(inputs, outputs)

In [None]:
# Evaluate the fitted pipeline on the train, test and validation splits
evaluate_model(lr_model, lr_extract_inputs_outputs, include_validation=True)

In [None]:
# Germi & Nacho
# Print lr_extract_inputs[0] and lr_extract_inputs[1] to see what comes out..
# Try making predictions (outside of evaluate_model) and print them..
# Find out what data types precision_score() expects