## New York Police Department Crime Records Analysis 

### Exploratory Data Analysis

We import the dataset to clean and perform our exploratory data analysis - 

In [124]:
#Import the dataset containing 5 million records
import pandas as pd
import numpy as np

nRowsRead = 5012956
data = pd.read_csv('C:/Users/SwetaMankala/OneDrive - Northeastern University/Data Management and Big Data/NYPD_Arrests_Data__Historic_.csv', delimiter=',', nrows = nRowsRead, encoding='utf-8')

In [125]:
data.tail(10)
data.head(10)

Unnamed: 0,ARREST_KEY,ARREST_DATE,PD_CD,PD_DESC,KY_CD,OFNS_DESC,LAW_CODE,LAW_CAT_CD,ARREST_BORO,ARREST_PRECINCT,JURISDICTION_CODE,AGE_GROUP,PERP_SEX,PERP_RACE,X_COORD_CD,Y_COORD_CD,Latitude,Longitude,Lon_Lat
0,144026181,06/26/2015,639.0,AGGRAVATED HARASSMENT 2,361.0,OFF. AGNST PUB ORD SENSBLTY & RGHTS TO PRIV,PL 2403002,M,Q,102,0.0,45-64,M,WHITE HISPANIC,1031076.0,193779.0,40.69844,-73.83113,POINT (-73.83112953899997 40.69843969400005)
1,144507595,07/14/2015,969.0,"TRAFFIC,UNCLASSIFIED INFRACTION",881.0,OTHER TRAFFIC INFRACTION,VTL051101A,M,M,10,3.0,25-44,M,WHITE HISPANIC,984791.0,209846.0,40.742664,-73.998049,POINT (-73.99804910799998 40.74266360800004)
2,144565062,07/16/2015,101.0,ASSAULT 3,344.0,ASSAULT 3 & RELATED OFFENSES,PL 1200001,M,K,90,0.0,18-24,F,WHITE HISPANIC,994026.0,195548.0,40.703414,-73.964743,POINT (-73.96474295699994 40.70341366900004)
3,144500188,07/14/2015,879.0,"ADM.CODE,UNCLASSIFIED VIOLATION",675.0,ADMINISTRATIVE CODE,AC 010125B,V,Q,103,0.0,25-44,M,WHITE HISPANIC,1037132.0,196129.0,40.704856,-73.809271,POINT (-73.809270971 40.70485576300007)
4,144216044,07/03/2015,478.0,"THEFT OF SERVICES, UNCLASSIFIED",343.0,OTHER OFFENSES RELATED TO THEFT,PL 1651503,M,M,10,1.0,18-24,M,WHITE HISPANIC,984602.0,210686.0,40.744969,-73.998731,POINT (-73.99873112099993 40.74496920800005)
5,144925030,07/30/2015,339.0,"LARCENY,PETIT FROM OPEN AREAS,UNCLASSIFIED",341.0,PETIT LARCENY,PL 1552500,M,B,45,0.0,18-24,M,BLACK,1030990.0,255310.0,40.867326,-73.831012,POINT (-73.83101160699994 40.86732605200007)
6,143984249,06/25/2015,849.0,"NY STATE LAWS,UNCLASSIFIED VIOLATION",677.0,OTHER STATE LAWS,LOC000000V,V,K,78,1.0,18-24,M,BLACK,991330.0,187303.0,40.680786,-73.974475,POINT (-73.97447511599997 40.68078561300007)
7,144541169,07/15/2015,203.0,"TRESPASS 3, CRIMINAL",352.0,CRIMINAL TRESPASS,PL 1401000,M,B,47,2.0,25-44,M,BLACK,1028017.0,262766.0,40.887806,-73.841712,POINT (-73.84171183999997 40.88780567700008)
8,144662834,07/20/2015,511.0,"CONTROLLED SUBSTANCE, POSSESSION 7",235.0,DANGEROUS DRUGS,PL 2200300,M,B,52,0.0,18-24,F,BLACK HISPANIC,1012786.0,254319.0,40.864684,-73.896833,POINT (-73.89683284499995 40.86468367600002)
9,144534233,07/15/2015,511.0,"CONTROLLED SUBSTANCE, POSSESSION 7",235.0,DANGEROUS DRUGS,PL 2200300,M,Q,115,0.0,25-44,M,BLACK,1018162.0,214190.0,40.754522,-73.877599,POINT (-73.87759928199995 40.754521787000044)


In [126]:
#to find the shape of the dataset
data.shape

(5012956, 19)

In [128]:
df = pd.DataFrame(data)

#list the number of null values present under each column
print(df.isnull().sum())

ARREST_KEY               0
ARREST_DATE              0
PD_CD                  261
PD_DESC               9029
KY_CD                 9029
OFNS_DESC             9029
LAW_CODE               196
LAW_CAT_CD           17472
ARREST_BORO              8
ARREST_PRECINCT          0
JURISDICTION_CODE       10
AGE_GROUP               17
PERP_SEX                 0
PERP_RACE                0
X_COORD_CD               1
Y_COORD_CD               1
Latitude                 1
Longitude                1
Lon_Lat                  1
dtype: int64


In [68]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5012956 entries, 0 to 5012955
Data columns (total 19 columns):
ARREST_KEY           int64
ARREST_DATE          object
PD_CD                float64
PD_DESC              object
KY_CD                float64
OFNS_DESC            object
LAW_CODE             object
LAW_CAT_CD           object
ARREST_BORO          object
ARREST_PRECINCT      int64
JURISDICTION_CODE    float64
AGE_GROUP            object
PERP_SEX             object
PERP_RACE            object
X_COORD_CD           float64
Y_COORD_CD           float64
Latitude             float64
Longitude            float64
Lon_Lat              object
dtypes: float64(7), int64(2), object(10)
memory usage: 726.7+ MB


### Data Preparation and Cleaning - 

In [69]:
#convert the column to datetime
df['ARREST_DATE'].dtype
df['ARREST_DATE'] = pd.to_datetime(df['ARREST_DATE'])

In [70]:
#Imputing values using median
median1 = df['PD_CD'].median()
df['PD_CD'].fillna(median1, inplace = True)

median2 = df['KY_CD'].median()
df['KY_CD'].fillna(median2, inplace = True)

In [71]:
#Segregate the values based on the categories, remove the nulls and normalize the data column
df['race'] = pd.Series(len(df['PERP_RACE']), index=df.index)
df['race'] = 0

#To assign null values
df.loc[(df['PERP_RACE'] != 'BLACK') | 
           (df['PERP_RACE'] != 'WHITE HISPANIC') |
           (df['PERP_RACE'] != 'WHITE') |
           (df['PERP_RACE'] != 'BLACK HISPANIC') |
           (df['PERP_RACE'] != 'ASIAN/PAC.ISL') |
           (df['PERP_RACE'] != 'AMER IND') |
           (df['PERP_RACE'].isnull() == True), 'race'] = np.nan

#To assign the categorical values to the dataframe 'race'
df.loc[(df['PERP_RACE'] == 'BLACK') | 
           (df['PERP_RACE'] == 'WHITE HISPANIC') |
           (df['PERP_RACE'] == 'WHITE') |
           (df['PERP_RACE'] == 'BLACK HISPANIC') |
           (df['PERP_RACE'] == 'ASIAN/PAC.ISL') |
           (df['PERP_RACE'] == 'AMER IND'), 'race'] = df['PERP_RACE']

race_copy = df['race'].copy(deep = True)

# Fill NaN values.
df['race'].fillna(value = 1, inplace = True)

# Obtain values for every race.Axis=0 for rows
race_copy.dropna(axis = 0, inplace = True)
sorted_race = race_copy.value_counts(normalize = True).sort_index()

# Fill one values for individual person with randomly picked from random choice.
df['race'] = df['race'].apply(lambda x: np.random.choice([x for x in sorted_race.index],
                                replace = True, p = sorted_race) if (x == 1) else x).astype(str)

#Normalize=True prints the relative frequency of the values
print("\nFilled NaNs normalized:\n", df['race'].value_counts(normalize = True))

df['PERP_RACE'] = df['race']
df['PERP_RACE'].value_counts()


Filled NaNs normalized:
 BLACK             0.512880
WHITE HISPANIC    0.274659
WHITE             0.127521
BLACK HISPANIC    0.084940
Name: race, dtype: float64


BLACK             2571047
WHITE HISPANIC    1376854
WHITE              639256
BLACK HISPANIC     425799
Name: PERP_RACE, dtype: int64

In [72]:
#Segregate the values based on the categories, remove the nulls and normalize the data column
df['sex'] = pd.Series(len(df['PERP_SEX']), index = df.index)
df['sex'] = 0

# Randomly stick sex to every user with NaN value.
df.loc[(df['PERP_SEX'] != 'M') | 
           (df['PERP_SEX'] != 'F') |
           (df['PERP_SEX'].isnull() == True), 'sex'] = np.nan
df.loc[(df['PERP_SEX'] == 'M') | 
           (df['PERP_SEX'] == 'F'), 'sex'] = df['PERP_SEX']


# Create a copy to calculate proportions.
sex_copy = df['sex'].copy(deep = True)

# Fill NaN values.
df['sex'].fillna(value = 1, inplace = True)

# Obtain values for every sex.
sex_copy.dropna(axis = 0, inplace = True)
sorted_sex = sex_copy.value_counts(normalize = True).sort_index()

# Fill one values in suspector_sex_rand with randomly picked from random choice.
df['sex'] = df['sex'].apply(lambda x: np.random.choice([x for x in sorted_sex.index],
                                replace = True, p = sorted_sex) if (x == 1) else x).astype(str)
print("Gender proportions after filled NaNs: \n", df['sex'].value_counts(normalize = True))

df['PERP_SEX'] = df['sex']
df['PERP_SEX'].value_counts()

Gender proportions after filled NaNs: 
 M    0.832157
F    0.167843
Name: sex, dtype: float64


M    4171564
F     841392
Name: PERP_SEX, dtype: int64

In [73]:
#Convert the object type variables to string
df['subject_sex'] = df['subject_sex'].astype(str)
df['subject_race'] = df['subject_race'].astype(str)
df['type'] = df['type'].astype(str)
df['LAW_CODE'] = df['LAW_CODE'].astype(str)
df['ARREST_BORO'] = df['ARREST_BORO'].astype(str)

In [74]:
#Segregate the values based on the categories, remove the nulls and normalize the data column
df['age'] = pd.Series(len(df['AGE_GROUP']), index = df.index)
df['age'] = 0

df.loc[(df['AGE_GROUP'] != '25-44') | 
           (df['AGE_GROUP'] != '18-24') |
           (df['AGE_GROUP'] != '45-64') |
           (df['AGE_GROUP'] != '65+') |
           (df['AGE_GROUP'] != '<18') |
           (df['AGE_GROUP'].isnull()), 'age'] = np.nan
df.loc[(df['AGE_GROUP'] == '25-44') | 
           (df['AGE_GROUP'] == '18-24') |
           (df['AGE_GROUP'] == '45-64') |
           (df['AGE_GROUP'] == '65+') |
           (df['AGE_GROUP'] == '<18'), 'age'] = df['AGE_GROUP']
age_copy = df['age'].copy(deep = True)

df['age'].fillna(value = 1, inplace = True)
age_copy.dropna(axis = 0, inplace = True)
sorted_age = age_copy.value_counts(normalize = True).sort_index()

df['age'] = df['age'].apply(lambda x: np.random.choice([x for x in sorted_age.index],
                               replace = True, p = sorted_age) if (x == 1) else x).astype(str)
print("Suspector age with filled NaNs normalized:\n", df['age'].value_counts(normalize = True))

df['AGE_GROUP'] = df['age']
df['AGE_GROUP'].value_counts()

Suspector age with filled NaNs normalized:
 25-44    0.462095
18-24    0.262886
45-64    0.184387
<18      0.082246
65+      0.008386
Name: age, dtype: float64


25-44    2316463
18-24    1317836
45-64     924324
<18       412294
65+        42039
Name: AGE_GROUP, dtype: int64

In [75]:
#Replace null values with code=5 to state non-NYPD jurisdictions
df['JURISDICTION_CODE'].fillna(5, inplace = True)

In [76]:
!pip install utm

Collecting utm
  Using cached utm-0.6.0.tar.gz (8.6 kB)
Building wheels for collected packages: utm
  Building wheel for utm (setup.py): started
  Building wheel for utm (setup.py): finished with status 'done'
  Created wheel for utm: filename=utm-0.6.0-py3-none-any.whl size=6099 sha256=7cabdca6a54f25db34991bc15be6d87e2d6438a3abb8e9999cee3adb864540fb
  Stored in directory: c:\users\swetamankala\appdata\local\pip\cache\wheels\70\ab\80\87b7abb2752e3c3fbadc8403fda7beb387d7ac6e55cbdf3c16
Successfully built utm
Installing collected packages: utm
Successfully installed utm-0.6.0


In [77]:
df['Latitude'].fillna(40.821054, inplace = True) #bronx
df['Longitude'].fillna(-73.893848, inplace = True)

lat_lon = '(' + df['Longitude'].astype(str) + ', ' + df['Latitude'].astype(str) + ')'   # It`s important to apply "(...).astype(str)" not "str(...)" below - I made this mistake
df['Lon_Lat'].fillna(value = lat_lon, axis = 0, inplace = True)

In [78]:
print(df.isnull().sum())

ARREST_KEY           0
ARREST_DATE          0
PD_CD                0
PD_DESC              0
KY_CD                0
OFNS_DESC            0
LAW_CODE             0
LAW_CAT_CD           0
ARREST_BORO          0
ARREST_PRECINCT      0
JURISDICTION_CODE    0
AGE_GROUP            0
PERP_SEX             0
PERP_RACE            0
X_COORD_CD           1
Y_COORD_CD           1
Latitude             0
Longitude            0
Lon_Lat              0
race                 0
sex                  0
age                  0
dtype: int64


#### Correlation Matrix

In [82]:
rs = np.random.RandomState(0)
#df = pd.DataFrame(rs.rand(10, 10))
corr = df.corr()
corr.style.background_gradient(cmap='coolwarm')

Unnamed: 0,ARREST_KEY,PD_CD,KY_CD,ARREST_PRECINCT,JURISDICTION_CODE,X_COORD_CD,Y_COORD_CD,Latitude,Longitude
ARREST_KEY,1.0,-0.065639,-0.064885,0.014322,0.0125276,-0.00253095,-0.0181696,-0.0182903,-0.00275782
PD_CD,-0.065639,1.0,0.345509,0.00615705,0.007506,-0.0016599,0.0152557,0.0151866,-0.00136975
KY_CD,-0.064885,0.345509,1.0,-0.00271442,-0.0107495,-0.00203244,0.0103788,0.0103381,-0.00184734
ARREST_PRECINCT,0.014322,0.00615705,-0.00271442,1.0,-0.0329397,0.308263,-0.071481,-0.073009,0.30815
JURISDICTION_CODE,0.0125276,0.007506,-0.0107495,-0.0329397,1.0,-0.00131306,0.00040862,0.000449122,-0.001325
X_COORD_CD,-0.00253095,-0.0016599,-0.00203244,0.308263,-0.00131306,1.0,0.0689651,0.0695922,0.99984
Y_COORD_CD,-0.0181696,0.0152557,0.0103788,-0.071481,0.00040862,0.0689651,1.0,0.999964,0.0854333
Latitude,-0.0182903,0.0151866,0.0103381,-0.073009,0.000449122,0.0695922,0.999964,1.0,0.0860372
Longitude,-0.00275782,-0.00136975,-0.00184734,0.30815,-0.001325,0.99984,0.0854333,0.0860372,1.0


In [None]:
#sav the dataset to perform tableau analysis
df.to_csv(r'C:/Users/SwetaMankala/Desktop/NYPD_Arrests_Data_Clean.csv', index = False)

## Data Modeling

In [1]:
import pandas as pd

nRowsRead = 1000
data_clean = pd.read_csv('C:/Users/SwetaMankala/Desktop/Assignments/NYPD_Arrests_Data_Clean.csv', delimiter=',', nrows = nRowsRead, encoding='utf-8')

In [136]:
print(data_clean.isnull().sum())

ARREST_KEY           0
ARREST_DATE          0
PD_CD                0
PD_DESC              0
KY_CD                0
OFNS_DESC            0
LAW_CODE             1
LAW_CAT_CD           5
ARREST_BORO          0
ARREST_PRECINCT      0
JURISDICTION_CODE    0
AGE_GROUP            0
PERP_SEX             0
PERP_RACE            0
X_COORD_CD           0
Y_COORD_CD           0
Latitude             0
Longitude            0
Lon_Lat              0
race                 0
sex                  0
age                  0
Arrest               0
dtype: int64


In [2]:
data_clean.head(5)

Unnamed: 0,ARREST_KEY,ARREST_DATE,PD_CD,PD_DESC,KY_CD,OFNS_DESC,LAW_CODE,LAW_CAT_CD,ARREST_BORO,ARREST_PRECINCT,...,PERP_RACE,X_COORD_CD,Y_COORD_CD,Latitude,Longitude,Lon_Lat,race,sex,age,Arrest
0,144026181,6/26/2015,639,AGGRAVATED HARASSMENT 2,361,OFF. AGNST PUB ORD SENSBLTY & RGHTS TO PRIV,PL 2403002,M,Q,102,...,WHITE HISPANIC,1031076,193779,40.69844,-73.83113,POINT (-73.83112953899997 40.69843969400005),WHITE HISPANIC,M,45-64,0
1,144507595,7/14/2015,969,"TRAFFIC,UNCLASSIFIED INFRACTION",881,OTHER TRAFFIC INFRACTION,VTL051101A,M,M,10,...,WHITE HISPANIC,984791,209846,40.742664,-73.998049,POINT (-73.99804910799998 40.74266360800004),WHITE HISPANIC,M,25-44,0
2,144565062,7/16/2015,101,ASSAULT 3,344,ASSAULT 3 & RELATED OFFENSES,PL 1200001,M,K,90,...,WHITE HISPANIC,994026,195548,40.703414,-73.964743,POINT (-73.96474295699994 40.70341366900004),WHITE HISPANIC,F,18-24,1
3,144500188,7/14/2015,879,"ADM.CODE,UNCLASSIFIED VIOLATION",675,ADMINISTRATIVE CODE,AC 010125B,V,Q,103,...,WHITE HISPANIC,1037132,196129,40.704856,-73.809271,POINT (-73.809270971 40.70485576300007),WHITE HISPANIC,M,25-44,1
4,144216044,7/3/2015,478,"THEFT OF SERVICES, UNCLASSIFIED",343,OTHER OFFENSES RELATED TO THEFT,PL 1651503,M,M,10,...,WHITE HISPANIC,984602,210686,40.744969,-73.998731,POINT (-73.99873112099993 40.74496920800005),WHITE HISPANIC,M,18-24,1


### 1. Spark Analysis

In [84]:
import numpy as np
import timeit                                                               
import csv
import pandas

from pyspark.context import SparkContext, SparkConf
from pyspark import SQLContext
from pyspark.sql.session import SparkSession

In [85]:
sc = SparkContext('local')

spark = SparkSession(sc)
sqlContext = SQLContext(sc)

In [86]:
file = sc.textFile('C:/Users/SwetaMankala/Desktop/Assignments/NYPD_Arrests_Data_Clean.csv')
crime_data = file.map(lambda line: next(csv.reader(line.splitlines(), skipinitialspace=True)))

In [87]:
header = crime_data.first()
data = crime_data.filter(lambda line: line != header)
crime_df = sqlContext.createDataFrame(data, header)
crime_df.registerTempTable("crime_data")

In [142]:
# For each borough, count each kind of law category (e.g. felony)
sqlContext.sql("SELECT LAW_CAT_CD, COUNT(*) AS CNT FROM crime_data GROUP BY LAW_CAT_CD ORDER BY CNT desc").show()

+----------+------+
|LAW_CAT_CD|   CNT|
+----------+------+
|         M|709836|
|         F|252717|
|         V| 77725|
|         I|  5342|
|       nan|  2955|
+----------+------+



In [134]:
# For each borough, count each type of crime
sqlContext.sql("SELECT DISTINCT ARREST_BORO, COUNT(*) AS CNT FROM crime_data GROUP BY ARREST_BORO ORDER BY CNT desc").show()

+-----------+------+
|ARREST_BORO|   CNT|
+-----------+------+
|          K|294348|
|          M|290046|
|          B|233175|
|          Q|196556|
|          S| 34449|
|        nan|     1|
+-----------+------+



### 2. Clustering crimes based on demographics

In [91]:
latlnt = sqlContext.sql("SELECT ARREST_KEY,Latitude,Longitude FROM crime_data WHERE Latitude != '' AND Longitude != '' ")  

In [92]:
X = np.array(latlnt.select(latlnt.Latitude,latlnt.Longitude).collect())

In [93]:
from sklearn.cluster import KMeans

def k_means():
     start = timeit.default_timer()
     kmeans = KMeans(n_clusters=100, random_state=0).fit(X)
     # print(kmeans.cluster_centers_)
     stop = timeit.default_timer()
     print(stop - start)
     return kmeans

In [94]:
kmeans = k_means()
kmeans_labels = kmeans.labels_
kmeans_cluster_centers = kmeans.cluster_centers_
np.savetxt('kmeans_labels.out', kmeans_labels, delimiter=',')   # X is an array
np.savetxt('kmeans_cluster_centers.out', kmeans_cluster_centers, delimiter=',')   # X is an array

150.32559910000418


In [95]:
kmeans_cluster_centers.shape

(100, 2)

### 3. Data Processing with Pipeline

In [96]:
cutoff = round(len(data_clean)*9/10, 0)

train = data_clean.loc[:cutoff]
test = data_clean.loc[cutoff:]

In [97]:
from sklearn.model_selection import train_test_split

train_data, val_data = train_test_split(train, test_size=0.1, shuffle=True, random_state=23)

In [113]:
numerical_features = ['PD_CD', 'KY_CD', 'Latitude', 'Longitude']
text_features = ['PD_DESC', 'OFNS_DESC', 'PERP_RACE']
model_features = numerical_features + text_features

model_target = 'Arrest'

In [114]:
train_data[text_features] = train_data[text_features].astype('str')
test[text_features] = test[text_features].astype('str')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self[k1] = value[k2]


In [115]:
# Prepare cleaning functions
import re, string
import nltk
from nltk.stem import SnowballStemmer

stop_words = ["a", "an", "the", "this", "that", "is", "it", "to", "and"]

stemmer = SnowballStemmer('english')

def preProcessText(text):
    # lowercase and strip leading/trailing white space
    text = text.lower().strip()
    
    # remove HTML tags
    text = re.compile('<.*?>').sub('', text)
    
    # remove punctuation
    text = re.compile('[%s]' % re.escape(string.punctuation)).sub(' ', text)
    
    # remove extra white space
    text = re.sub('\s+', ' ', text)
    
    return text

def lexiconProcess(text, stop_words, stemmer):
    filtered_sentence = []
    words = text.split(" ")
    for w in words:
        if w not in stop_words:
            filtered_sentence.append(stemmer.stem(w))
    text = " ".join(filtered_sentence)
    
    return text

def cleanSentence(text, stop_words, stemmer):
    return lexiconProcess(preProcessText(text), stop_words, stemmer)

# Clean the text features
for c in text_features:
    print('Text cleaning: ', c)
    train[c] = [cleanSentence(item, stop_words, stemmer) for item in train[c].values]
    test[c] = [cleanSentence(item, stop_words, stemmer) for item in test[c].values]

Text cleaning:  PD_DESC
Text cleaning:  OFNS_DESC
Text cleaning:  PERP_RACE


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


In [101]:
!pip install --upgrade scikit-learn

Requirement already up-to-date: scikit-learn in c:\users\swetamankala\anaconda3\lib\site-packages (0.23.2)


In [102]:
from sklearn.pipeline import Pipeline

In [116]:
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, MinMaxScaler
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn import metrics
from sklearn.metrics import classification_report
from sklearn.metrics import accuracy_score

numerical_processor = Pipeline([
    ('num_imputer', SimpleImputer(strategy='mean'))
#    ('num_scaler', MinMaxScaler()) # Shown in case is needed, not a must with Decision Trees
                                ])
# Preprocess 1st text feature
text_processor_0 = Pipeline([
    ('text_vect_0', CountVectorizer(binary=True, max_features=50))
                                ])

# Preprocess 2nd text feature (larger vocabulary)
text_precessor_1 = Pipeline([
    ('text_vect_1', CountVectorizer(binary=True, max_features=50))
                                ])

# Preprocess 3rd text feature (larger vocabulary)
text_precessor_2 = Pipeline([
    ('text_vect_2', CountVectorizer(binary=True, max_features=50))
                                ])

data_preprocessor = ColumnTransformer([
    ('numerical_pre', numerical_processor, numerical_features),
    ('text_pre_0', text_processor_0, text_features[0]),
    ('text_pre_1', text_precessor_1, text_features[1]),
    ('text_pre_2', text_precessor_2, text_features[2])
                                    ]) 
    
pipeline = Pipeline([
    ('data_preprocessing', data_preprocessor),
    ('dt', RandomForestClassifier())
                    ])
    
from sklearn import set_config
set_config()#display='diagram')
pipeline

Pipeline(steps=[('data_preprocessing',
                 ColumnTransformer(transformers=[('numerical_pre',
                                                  Pipeline(steps=[('num_imputer',
                                                                   SimpleImputer())]),
                                                  ['PD_CD', 'KY_CD', 'Latitude',
                                                   'Longitude']),
                                                 ('text_pre_0',
                                                  Pipeline(steps=[('text_vect_0',
                                                                   CountVectorizer(binary=True,
                                                                                   max_features=50))]),
                                                  'PD_DESC'),
                                                 ('text_pre_1',
                                                  Pipeline(steps=[('text_vect_1',
                        

In [117]:
X_train = train[model_features]
y_train = train[model_target]

pipeline.fit(X_train, y_train)

Pipeline(steps=[('data_preprocessing',
                 ColumnTransformer(transformers=[('numerical_pre',
                                                  Pipeline(steps=[('num_imputer',
                                                                   SimpleImputer())]),
                                                  ['PD_CD', 'KY_CD', 'Latitude',
                                                   'Longitude']),
                                                 ('text_pre_0',
                                                  Pipeline(steps=[('text_vect_0',
                                                                   CountVectorizer(binary=True,
                                                                                   max_features=50))]),
                                                  'PD_DESC'),
                                                 ('text_pre_1',
                                                  Pipeline(steps=[('text_vect_1',
                        

In [118]:
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score

# Use the fitted model to make predictions on the train dataset
# Train data going through the Pipeline it's first imputed (with means from the train), scaled (with the min/max 
#from the train data), and finally used to make predictions
train_predictions = pipeline.predict(X_train)

print('Model performance on the train set:')
print(confusion_matrix(y_train, train_predictions))
print(classification_report(y_train, train_predictions))
print("Train accuracy:", accuracy_score(y_train, train_predictions))

# Use the fitted model to make predictions on the validation dataset
X_val = val_data[model_features]
y_val  = val_data[model_target]

val_predictions = pipeline.predict(X_val)

print('Model performance on the validation set:')
print(confusion_matrix(y_val, val_predictions))
print(classification_report(y_val, val_predictions))
print("Validation accuracy:", accuracy_score(y_val, val_predictions))

Model performance on the train set:
[[435   6]
 [  6 454]]
              precision    recall  f1-score   support

           0       0.99      0.99      0.99       441
           1       0.99      0.99      0.99       460

    accuracy                           0.99       901
   macro avg       0.99      0.99      0.99       901
weighted avg       0.99      0.99      0.99       901

Train accuracy: 0.9866814650388457
Model performance on the validation set:
[[42  1]
 [ 2 46]]
              precision    recall  f1-score   support

           0       0.95      0.98      0.97        43
           1       0.98      0.96      0.97        48

    accuracy                           0.97        91
   macro avg       0.97      0.97      0.97        91
weighted avg       0.97      0.97      0.97        91

Validation accuracy: 0.967032967032967


In [143]:
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV

### PIPELINE GRID_SEARCH ###
############################

# Parameter grid for GridSearch
param_grid={'dt__max_depth': [10, 20, 30],#, 50, 75, 100, 125, 150, 200, 250], 
            'dt__min_samples_leaf': [1, 2, 5],#, 25, 30],
            'dt__min_samples_split': [10, 20, 30]#, 25, 30, 45, 50]
           }

grid_search = GridSearchCV(pipeline, # Base model
                           param_grid, # Parameters to try
                           cv = 5, # Apply 5-fold cross validation
                           verbose = 1, # Print summary
                           n_jobs = -1 # Use all available processors
                          )

# Fit the GridSearch to our training data
grid_search.fit(X_train, y_train)

Fitting 5 folds for each of 27 candidates, totalling 135 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:    6.8s
[Parallel(n_jobs=-1)]: Done 135 out of 135 | elapsed:   14.1s finished


GridSearchCV(cv=5,
             estimator=Pipeline(steps=[('data_preprocessing',
                                        ColumnTransformer(transformers=[('numerical_pre',
                                                                         Pipeline(steps=[('num_imputer',
                                                                                          SimpleImputer())]),
                                                                         ['PD_CD',
                                                                          'KY_CD',
                                                                          'Latitude',
                                                                          'Longitude']),
                                                                        ('text_pre_0',
                                                                         Pipeline(steps=[('text_vect_0',
                                                                                 

In [120]:
print(grid_search.best_params_)
print(grid_search.best_score_)

# Get the best model out of GridSearchCV
classifier = grid_search.best_estimator_

# Fit the best model to the train data once more
classifier.fit(X_train, y_train)

{'dt__min_samples_split': 10, 'dt__min_samples_leaf': 1, 'dt__max_depth': 30}
0.50719459791283


Pipeline(steps=[('data_preprocessing',
                 ColumnTransformer(transformers=[('numerical_pre',
                                                  Pipeline(steps=[('num_imputer',
                                                                   SimpleImputer())]),
                                                  ['PD_CD', 'KY_CD', 'Latitude',
                                                   'Longitude']),
                                                 ('text_pre_0',
                                                  Pipeline(steps=[('text_vect_0',
                                                                   CountVectorizer(binary=True,
                                                                                   max_features=50))]),
                                                  'PD_DESC'),
                                                 ('text_pre_1',
                                                  Pipeline(steps=[('text_vect_1',
                        

### 4. KNN Classification

In [144]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import GridSearchCV


all_X = train[numerical_features]
all_y = train[model_target]

# grid = GridSearchCV(knn, param_grid = hyperparameters, cv=10)

best_knn = KNeighborsClassifier(n_neighbors = 9, weights='distance', algorithm='brute', p=1)
best_knn.fit(all_X, all_y)

KNeighborsClassifier(algorithm='brute', n_neighbors=9, p=1, weights='distance')

In [149]:
train_predictions = best_knn.predict(train[numerical_features])

In [150]:
print('Model performance on the train set:')
print(confusion_matrix(all_y, train_predictions))
print(classification_report(all_y, train_predictions))
print("Train accuracy:", accuracy_score(all_y, train_predictions))

Model performance on the train set:
[[435   6]
 [ 13 447]]
              precision    recall  f1-score   support

           0       0.97      0.99      0.98       441
           1       0.99      0.97      0.98       460

    accuracy                           0.98       901
   macro avg       0.98      0.98      0.98       901
weighted avg       0.98      0.98      0.98       901

Train accuracy: 0.978912319644839


In [151]:
val_predictions = best_knn.predict(X_val[numerical_features])

In [152]:
print('Model performance on the validation set:')
print(confusion_matrix(y_val, val_predictions))
print(classification_report(y_val, val_predictions))
print("Validation accuracy:", accuracy_score(y_val, val_predictions))

Model performance on the validation set:
[[42  1]
 [ 1 47]]
              precision    recall  f1-score   support

           0       0.98      0.98      0.98        43
           1       0.98      0.98      0.98        48

    accuracy                           0.98        91
   macro avg       0.98      0.98      0.98        91
weighted avg       0.98      0.98      0.98        91

Validation accuracy: 0.978021978021978
