In [3]:
import pandas as pd
import numpy as np
np.random.seed(0)
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.ensemble import AdaBoostClassifier, GradientBoostingClassifier
from sklearn.metrics import accuracy_score, f1_score, confusion_matrix, classification_report

# building model 'josh' with gradient boosting

### considerations:
- heavily subset: 364,000 observations from 8 million merged
    - the subsetting... probably isn't random
    - a lot of times, cops can't get accurate demographic data on criminals (they can get it more often on victims, though)
        - so our data is 'when cops were able to record demographic data (age, sex, ethnicity) of both parties, and when they recorded data about weapons'
            - another consideration: **does NaN in the weapons category mean they were unarmed** or that no weapon could be determined properly?
                - it would be irresponsible of me to assume that cops erroneously recorded unarmed as NaN, especially because the #1 value in the column is 'unarmed'
- 'uniqueness' of row is a victim
    - so essentially, we're predicting what sort of crime someone was the victim of, rather than just predicting some crime that occurred
        - this is a tradeoff of merging victim data with crime & criminal data
        - we'll see if it's worth it
    
    

In [49]:
# load data
df = pd.read_csv('josh.csv')
df = df.drop(['Unnamed: 0', 'state_y', 'state'], axis=1)
df = df.rename(columns={
    'state_x':'state',
})



In [30]:
df.columns

Index(['state', 'agency', 'case_number', 'UCR_code', 'attempt_or_complete',
       'DAC_1', 'location_type', 'weapon_type_1', 'bias_1', 'perp_seq_num',
       'perp_age', 'perp_sex', 'perp_race', 'perp_ethn', 'num_ori_matches_x',
       'victim_seq_num', 'ucr_1', 'vic_type', 'vic_age', 'vic_sex', 'vic_race',
       'vic_ethn', 'num_ori_matches_y'],
      dtype='object')

In [14]:
df.head(5)

Unnamed: 0,state,agency,case_number,UCR_code,attempt_or_complete,DAC_1,location_type,weapon_type_1,bias_1,perp_seq_num,...,perp_ethn,num_ori_matches_x,victim_seq_num,ucr_1,vic_type,vic_age,vic_sex,vic_race,vic_ethn,num_ori_matches_y
0,3,AR0010000,2-0QVS1MD I,13B,C,A,25,99,88,1,...,N,2.0,1,13B,I,30,F,W,N,1.0
1,3,AR0010000,2-BQ-51MD I,13B,C,N,20,99,88,1,...,N,1.0,1,13B,I,7,F,W,N,1.0
2,3,AR0010000,2-BQ-91MD I,13B,C,N,46,99,88,1,...,N,1.0,1,13B,I,30,M,W,N,1.0
3,3,AR0010000,1X-JP-H0HG1A,13B,C,N,15,99,88,1,...,U,2.0,1,13B,I,22,M,W,U,1.0
4,3,AR0010000,1X-MP-80HG1A,13B,C,N,25,99,88,1,...,H,1.0,1,13B,I,42,M,B,N,1.0


## variable questions


- is the victim's UCR (ucr_1) different from the crime's UCR_code?

In [20]:
t = df[df['UCR_code']==df['ucr_1']]
print(len(df),len(t))

364778 343600


- answer: maybe, yeah. so we'll keep it in
    - multiple victims for same crime, or multiple crimes for same victim

In [22]:
# target = df.UCR_code
# df.drop('UCR_code', axis=1, inplace=True)

In [50]:
# target.hist()

In [None]:
# d.head()

## Too many classification categories. let's bin

In [None]:

Assault = ['13A', '13B', '13C',]
Sex_Offense = ['11A', '11B', '11C', '11D', '36A', '36B',]
Financial_Fraud_Gambling = ['510', '250', '270', '210', '26A', '26B','26C', '26D', '26E', '39A', '39B', '39C', '39D']
Theft = ['220','23A','23B','23C','23D', '23E', '23F', '23G','23H', '240','120', '280']
Arson = ['200',]
Drugs = ['35A', '35B',]
Murder = ['09A']
Negligent_Manslaughter = ['09B']
Justifiable_Homicide = ['09C']
Abduction = ['100']
Obscene_Prostitution = ['370','40A','40B','40C',]
Weapon_Violation = ['520']

In [25]:
bins = {
    "Assault" : ['13A', '13B', '13C',],
    "Sex_Offense" : ['11A', '11B', '11C', '11D', '36A', '36B',],
    "Financial_Fraud_Gambling" : ['510', '250', '270', '210', '26A', '26B','26C', '26D', '26E', '39A', '39B', '39C', '39D'],
    "Theft" : ['220','23A','23B','23C','23D', '23E', '23F', '23G','23H', '240','120', '280'],
    "Arson" : ['200',],
    "Drugs" : ['35A', '35B',],
    "Murder" : ['09A'],
    "Negligent_Manslaughter" : ['09B'],
    "Justifiable_Homicide" : ['09C'],
    "Abduction" : ['100'],
    "Obscene_Prostitution" : ['370','40A','40B','40C',],
    "Weapon_Violation" : ['520'],
}

In [51]:
def cleaner(row): # if value in list, change value to that list's label
    entry = row['UCR_code']
    for key, value in bins.items():
        if entry in value:
            row['UCR_code'] = key
    return row


In [27]:
df.head()

Unnamed: 0,state,agency,case_number,attempt_or_complete,DAC_1,location_type,weapon_type_1,bias_1,perp_seq_num,perp_age,...,perp_ethn,num_ori_matches_x,victim_seq_num,ucr_1,vic_type,vic_age,vic_sex,vic_race,vic_ethn,num_ori_matches_y
0,3,AR0010000,2-0QVS1MD I,C,A,25,99,88,1,35.0,...,N,2.0,1,13B,I,30,F,W,N,1.0
1,3,AR0010000,2-BQ-51MD I,C,N,20,99,88,1,31.0,...,N,1.0,1,13B,I,7,F,W,N,1.0
2,3,AR0010000,2-BQ-91MD I,C,N,46,99,88,1,76.0,...,N,1.0,1,13B,I,30,M,W,N,1.0
3,3,AR0010000,1X-JP-H0HG1A,C,N,15,99,88,1,30.0,...,U,2.0,1,13B,I,22,M,W,U,1.0
4,3,AR0010000,1X-MP-80HG1A,C,N,25,99,88,1,65.0,...,H,1.0,1,13B,I,42,M,B,N,1.0


In [38]:
df['UCR_code'] = df['UCR_code'].apply(str)

In [39]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 364778 entries, 0 to 364777
Data columns (total 23 columns):
state                  364778 non-null int64
agency                 364778 non-null object
case_number            364778 non-null object
UCR_code               364778 non-null object
attempt_or_complete    364778 non-null object
DAC_1                  364778 non-null object
location_type          364778 non-null int64
weapon_type_1          364778 non-null object
bias_1                 364778 non-null int64
perp_seq_num           364778 non-null int64
perp_age               364778 non-null float64
perp_sex               364778 non-null object
perp_race              364778 non-null object
perp_ethn              364778 non-null object
num_ori_matches_x      364778 non-null float64
victim_seq_num         364778 non-null int64
ucr_1                  364778 non-null object
vic_type               364778 non-null object
vic_age                364778 non-null object
vic_sex           

In [52]:
df.apply(lambda row: cleaner(row), axis=1)

Unnamed: 0,state,agency,case_number,UCR_code,attempt_or_complete,DAC_1,location_type,weapon_type_1,bias_1,perp_seq_num,...,perp_ethn,num_ori_matches_x,victim_seq_num,ucr_1,vic_type,vic_age,vic_sex,vic_race,vic_ethn,num_ori_matches_y
0,3,AR0010000,2-0QVS1MD I,Assault,C,A,25,99,88,1,...,N,2.0,1,13B,I,30,F,W,N,1.0
1,3,AR0010000,2-BQ-51MD I,Assault,C,N,20,99,88,1,...,N,1.0,1,13B,I,07,F,W,N,1.0
2,3,AR0010000,2-BQ-91MD I,Assault,C,N,46,99,88,1,...,N,1.0,1,13B,I,30,M,W,N,1.0
3,3,AR0010000,1X-JP-H0HG1A,Assault,C,N,15,99,88,1,...,U,2.0,1,13B,I,22,M,W,U,1.0
4,3,AR0010000,1X-MP-80HG1A,Assault,C,N,25,99,88,1,...,H,1.0,1,13B,I,42,M,B,N,1.0
5,3,AR0010000,3S-04KO2J739,Assault,C,N,25,99,88,1,...,N,1.0,1,13A,I,39,M,W,N,2.0
6,3,AR0010000,7--NYB16IL72,Assault,C,N,20,12,88,1,...,N,1.0,1,13A,I,21,M,W,N,1.0
7,3,AR0010000,9C- KH48429G,Assault,C,A,20,90,88,1,...,U,1.0,1,13B,I,38,M,W,U,1.0
8,3,AR0010000,9C-WKO48429G,Assault,C,N,25,12,88,1,...,N,2.0,1,13A,I,24,M,B,N,2.0
9,3,AR0010000,9C-WKOE8429G,Assault,C,N,20,99,88,1,...,U,1.0,1,13B,I,55,F,W,U,1.0


In [53]:
df.head()

Unnamed: 0,state,agency,case_number,UCR_code,attempt_or_complete,DAC_1,location_type,weapon_type_1,bias_1,perp_seq_num,...,perp_ethn,num_ori_matches_x,victim_seq_num,ucr_1,vic_type,vic_age,vic_sex,vic_race,vic_ethn,num_ori_matches_y
0,3,AR0010000,2-0QVS1MD I,13B,C,A,25,99,88,1,...,N,2.0,1,13B,I,30,F,W,N,1.0
1,3,AR0010000,2-BQ-51MD I,13B,C,N,20,99,88,1,...,N,1.0,1,13B,I,7,F,W,N,1.0
2,3,AR0010000,2-BQ-91MD I,13B,C,N,46,99,88,1,...,N,1.0,1,13B,I,30,M,W,N,1.0
3,3,AR0010000,1X-JP-H0HG1A,13B,C,N,15,99,88,1,...,U,2.0,1,13B,I,22,M,W,U,1.0
4,3,AR0010000,1X-MP-80HG1A,13B,C,N,25,99,88,1,...,H,1.0,1,13B,I,42,M,B,N,1.0


In [54]:
df.to_csv('josh_translated.csv') # saving progress is good

In [55]:
target = df['UCR_code']

In [None]:
# samp = df.sample(1000, random_state = 1)
# tar = samp['UCR_code']
# tar.hist()

In [None]:
# target.hist() 
# okay, hist is taking a while, we'll just run some stuff
# and when our R is bad we'll realize the data isn't normal at all and we should have ran the hist

## wait, my data isn't dummies yet
### that's going to take a looooong time to run. maybe kill my kernel.
- i'll test it tonight. maybe want to subsample down to n=5000 or so to make sure it works.
    - i don't like cutting data, but it'll be easy to plug it back in when it's done


In [57]:
X_train, X_test, y_train, y_test = train_test_split(df, target, test_size = 0.25)

In [58]:
# create classifiers
adaboost_clf = AdaBoostClassifier()
gbt_clf = GradientBoostingClassifier()

In [59]:
# training time
adaboost_clf.fit(X_train, y_train)

AdaBoostClassifier(algorithm='SAMME.R', base_estimator=None,
          learning_rate=1.0, n_estimators=50, random_state=None)

gbt_clf.fit(X_train, y_train)

GradientBoostingClassifier(criterion='friedman_mse', init=None,
              learning_rate=0.1, loss='deviance', max_depth=3,
              max_features=None, max_leaf_nodes=None,
              min_impurity_decrease=0.0, min_impurity_split=None,
              min_samples_leaf=1, min_samples_split=2,
              min_weight_fraction_leaf=0.0, n_estimators=100,
              n_iter_no_change=None, presort='auto', random_state=None,
              subsample=1.0, tol=0.0001, validation_fraction=0.1,
              verbose=0, warm_start=False)

ValueError: could not convert string to float: 'WI0450100'