# Feature Selection Notebook
&nbsp;
### Joe Bobby: ME17B016
### UMA T V: ME17B170
### Omkar Sunil Nath: ME17B158

In [1]:
# import necessary libraries

import pandas as pd
import numpy as np
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from catboost import CatBoostClassifier, Pool


In [2]:
# install new ones

pip install gcsfs

Note: you may need to restart the kernel to use updated packages.


In [3]:
# reading data
data = pd.read_csv("gs://bdl2021_final_project/nyc_tickets_train.csv/part-00000-743d223e-bd0d-40df-8f2d-52793b171b69-c000.csv")

In [4]:
# view the data
data

Unnamed: 0,Summons Number,Plate ID,Registration State,Plate Type,Issue Date,Violation Code,Vehicle Body Type,Vehicle Make,Issuing Agency,Street Code1,...,Hydrant Violation,Double Parking Violation,Latitude,Longitude,Community Board,Community Council,Census Tract,BIN,BBL,NTA
0,8002531292,EPC5238,NY,PAS,10/01/2014,21,SUBN,CHEVR,T,20390,...,,,,,,,,,,
1,8015318440,5298MD,NY,COM,03/06/2015,14,VAN,FRUEH,T,27790,...,,,,,,,,,,
2,7611181981,FYW2775,NY,PAS,07/28/2014,46,SUBN,SUBAR,T,8130,...,,,,,,,,,,
3,7445908067,GWE1987,NY,PAS,04/13/2015,19,4DSD,LEXUS,T,59990,...,,,,,,,,,,
4,7037692864,T671196C,NY,PAS,05/19/2015,19,4DSD,CHRYS,T,36090,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
533559,8006834040,GST7309,NY,PAS,12/21/2014,40,4DSD,BMW,T,34770,...,,,,,,,,,,
533560,7526350019,GRS3910,NY,PAS,12/09/2014,21,4DSD,VOLVO,T,60440,...,,,,,,,,,,
533561,7061061893,XZ132X,NJ,PAS,05/19/2015,47,DELV,FRUEH,T,34750,...,,,,,,,,,,
533562,7436338183,DGL7026,NY,PAS,12/20/2014,20,4DSD,NISSA,T,24940,...,,,,,,,,,,


In [5]:
data['Violation_County']

0         NY
1         NY
2          K
3          Q
4         NY
          ..
533559    NY
533560     K
533561    NY
533562     Q
533563    NY
Name: Violation_County, Length: 533564, dtype: object

In [6]:
features_list = ['Summons Number',
 'Plate ID',
 'Registration State',
 'Plate Type',
 'Issue Date',
 'Violation Code',
 'Vehicle Body Type',
 'Vehicle Make',
 'Issuing Agency',
 'Street Code1',
 'Street Code2',
 'Street Code3',
 'Vehicle Expiration Date',
 'Issuer Code',
 'Issuer Command',
 'Issuer Squad',
 'Violation Time',
 'Time First Observed',
 'Violation_County',
 'Violation In Front Of Or Opposite',
 'House Number',
 'Street Name',
 'Intersecting Street',
 'Date First Observed',
 'Law Section',
 'Sub Division',
 'Violation Legal Code',
 'Days Parking In Effect',
 'From Hours In Effect',
 'To Hours In Effect',
 'Vehicle Color',
 'Unregistered Vehicle?',
 'Vehicle Year',
 'Meter Number',
 'Feet From Curb',
 'Violation Post Code',
 'Violation Description',
 'No Standing or Stopping Violation',
 'Hydrant Violation',
 'Double Parking Violation',
 'Latitude',
 'Longitude',
 'Community Board',
 'Community Council',
 'Census Tract',
 'BIN',
 'BBL',
 'NTA']

In [7]:
len(features_list)

48

In [8]:
# take a part of the data
df = data[:1000000]

In [10]:
df = df.astype(str)

In [11]:
extra_features = []

for col in features_list:
    if col not in data.columns and col != 'Violation County':
        extra_features.append(col)

In [13]:
to_be_removed = []

for col in df.columns:
    if col not in features_list and col != 'Violation County':
        to_be_removed.append(col)
        

In [15]:
df = df.drop(columns = to_be_removed)

In [16]:
len(df.columns)

48

In [17]:
threshold = 0.4
#Dropping columns with missing value rate higher than threshold
df = df[df.columns[df.isnull().mean() < threshold]]

#Dropping rows with missing value rate higher than threshold
df = df.loc[df.isnull().mean(axis=1) < threshold]

In [18]:
# Filling na
for col in df.columns:
    df[col].fillna(df[col].value_counts().idxmax(), inplace=True)

In [21]:
# take a part of the data
X = df.drop(columns = ['Violation_County'])
y = df['Violation_County']

In [22]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)

In [23]:
cat_features = list(range(0, X.shape[1]))
print(cat_features)

[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46]


In [24]:
X_train

Unnamed: 0,Summons Number,Plate ID,Registration State,Plate Type,Issue Date,Violation Code,Vehicle Body Type,Vehicle Make,Issuing Agency,Street Code1,...,Hydrant Violation,Double Parking Violation,Latitude,Longitude,Community Board,Community Council,Census Tract,BIN,BBL,NTA
239604,7704789820,J031109,NJ,PAS,11/04/2014,38,SUBN,JEEP,T,5580,...,,,,,,,,,,
522337,7698324011,DYC5791,NY,PAS,03/19/2015,70,SUBN,TOYOT,T,10730,...,,,,,,,,,,
320860,7421179148,GFZ2810,NY,PAS,07/24/2014,46,SUBN,FORD,T,42820,...,,,,,,,,,,
94489,1376919035,FZW7135,NY,PAS,11/19/2014,19,VAN,TOYOT,P,0,...,,,,,,,,,,
59058,8000755865,GLL5721,NY,PAS,02/07/2015,71,SUBN,CHEVR,T,18040,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
110268,7004228289,GKB5944,NY,PAS,09/17/2014,51,SUBN,CHEVR,T,9140,...,,,,,,,,,,
259178,7317541345,GMZ4379,NY,PAS,11/01/2014,14,SUBN,DODGE,T,42820,...,,,,,,,,,,
365838,7039870381,21706MD,NY,COM,05/01/2015,16,DELV,GMC,T,34090,...,,,,,,,,,,
131932,1377664648,BZJ4430,NY,PAS,03/01/2015,20,SDN,TOYOT,P,10810,...,,,,,,,,,,


In [25]:
# verifying the nan removal/replacement
X_train.isna().sum()

Summons Number                       0
Plate ID                             0
Registration State                   0
Plate Type                           0
Issue Date                           0
Violation Code                       0
Vehicle Body Type                    0
Vehicle Make                         0
Issuing Agency                       0
Street Code1                         0
Street Code2                         0
Street Code3                         0
Vehicle Expiration Date              0
Issuer Code                          0
Issuer Command                       0
Issuer Squad                         0
Violation Time                       0
Time First Observed                  0
Violation In Front Of Or Opposite    0
House Number                         0
Street Name                          0
Intersecting Street                  0
Date First Observed                  0
Law Section                          0
Sub Division                         0
Violation Legal Code     

In [26]:
X_train.dtypes

Summons Number                       object
Plate ID                             object
Registration State                   object
Plate Type                           object
Issue Date                           object
Violation Code                       object
Vehicle Body Type                    object
Vehicle Make                         object
Issuing Agency                       object
Street Code1                         object
Street Code2                         object
Street Code3                         object
Vehicle Expiration Date              object
Issuer Code                          object
Issuer Command                       object
Issuer Squad                         object
Violation Time                       object
Time First Observed                  object
Violation In Front Of Or Opposite    object
House Number                         object
Street Name                          object
Intersecting Street                  object
Date First Observed             

In [28]:

clf = CatBoostClassifier(
    iterations=5, 
    learning_rate=0.1, 
    #loss_function='CrossEntropy'
)


clf.fit(X_train, y_train, 
        cat_features=cat_features, 
        eval_set=(X_test, y_test), 
        verbose=False
)

print('CatBoost model is fitted: ' + str(clf.is_fitted()))
print('CatBoost model parameters:')
print(clf.get_params())

CatBoost model is fitted: True
CatBoost model parameters:
{'iterations': 5, 'learning_rate': 0.1}


In [31]:
# extracting dimensions of dataframe
m = len(X_train)
n = len(X_train.columns)

In [32]:
train_data = Pool(X_train.to_numpy().resize(m,n), y_train.to_numpy().resize(m,1))

In [None]:
clf.get_feature_importance(data = train_data, type= "LossFunctionChange")

In [34]:
X_train.columns

Index(['Summons Number', 'Plate ID', 'Registration State', 'Plate Type',
       'Issue Date', 'Violation Code', 'Vehicle Body Type', 'Vehicle Make',
       'Issuing Agency', 'Street Code1', 'Street Code2', 'Street Code3',
       'Vehicle Expiration Date', 'Issuer Code', 'Issuer Command',
       'Issuer Squad', 'Violation Time', 'Time First Observed',
       'Violation In Front Of Or Opposite', 'House Number', 'Street Name',
       'Intersecting Street', 'Date First Observed', 'Law Section',
       'Sub Division', 'Violation Legal Code', 'Days Parking In Effect',
       'From Hours In Effect', 'To Hours In Effect', 'Vehicle Color',
       'Unregistered Vehicle?', 'Vehicle Year', 'Meter Number',
       'Feet From Curb', 'Violation Post Code', 'Violation Description',
       'No Standing or Stopping Violation', 'Hydrant Violation',
       'Double Parking Violation', 'Latitude', 'Longitude', 'Community Board',
       'Community Council', 'Census Tract', 'BIN', 'BBL', 'NTA'],
      dtype='o

In [35]:
feature_importance = pd.DataFrame()
feature_importance['feature_name'] = X_train.columns
feature_importance['importance'] = clf.get_feature_importance(data = train_data, type= "PredictionValuesChange")

In [36]:
feature_importance

Unnamed: 0,feature_name,importance
0,Summons Number,0.0
1,Plate ID,0.0
2,Registration State,0.0
3,Plate Type,0.0
4,Issue Date,0.0
5,Violation Code,0.0
6,Vehicle Body Type,0.0
7,Vehicle Make,0.0
8,Issuing Agency,0.0
9,Street Code1,0.0


In [245]:
# writing the data locally
feature_importance.to_csv('Feature_Importance_on_100000_rows.csv')

In [37]:
# using a threshold of 0 to remove the non essential features
feature_list =[]
for i in feature_importance.index:
    if(feature_importance['importance'][i]>0):
        feature_list.append(feature_importance['feature_name'][i])

In [38]:
# Finally selected features
feature_list

['Issuer Command',
 'Days Parking In Effect',
 'From Hours In Effect',
 'Vehicle Year']