<center><h1>San Francisco Crime Classification - Featurization</h1></center>

### `import` Packages

In [1]:
# Mount Google Drive at /content/drive; the project folder used throughout
# this notebook lives under that mount point.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
! pip install mpu --quiet

[?25l[K     |████▊                           | 10 kB 25.9 MB/s eta 0:00:01[K     |█████████▍                      | 20 kB 26.4 MB/s eta 0:00:01[K     |██████████████                  | 30 kB 31.4 MB/s eta 0:00:01[K     |██████████████████▉             | 40 kB 23.7 MB/s eta 0:00:01[K     |███████████████████████▌        | 51 kB 19.8 MB/s eta 0:00:01[K     |████████████████████████████▏   | 61 kB 15.4 MB/s eta 0:00:01[K     |████████████████████████████████| 69 kB 4.9 MB/s 
[?25h

In [3]:
import warnings
warnings.filterwarnings('ignore')

import json
import os
import pickle
import pandas as pd
import numpy as np

from mpu import haversine_distance
from tqdm import tqdm

from sklearn.preprocessing import StandardScaler
from sklearn.manifold import TSNE
from sklearn.model_selection import train_test_split
from sklearn.feature_selection import SelectKBest
from sklearn.feature_extraction.text import (
    CountVectorizer,
    TfidfVectorizer
)

### Data Reading

Reading `train.csv`, `test.csv`, and `sf-police-districts.shp` files

In [4]:
# Root folder of the project on the mounted Drive. The CSV inputs/outputs
# ('csv_files/') and the pickled vectorizers ('models/') are addressed
# relative to this path.
project_path = '/content/drive/MyDrive/AAIC/SCS-1/sf_crime_classification/'

In [5]:
# Load the train and test CSV files from the project's csv_files/ folder.
train_sf_df = pd.read_csv(filepath_or_buffer=project_path + 'csv_files/train.csv')
test_sf_df = pd.read_csv(filepath_or_buffer=project_path + 'csv_files/test.csv')

In [6]:
# (rows, columns) of the train and test frames.
train_sf_df.shape, test_sf_df.shape

((878049, 9), (884262, 7))

Renaming columns

In [7]:
# Column labels are assigned positionally, so each list must match the order
# and number of columns in the loaded frame (9 for train, 7 for test).
train_cols_renamed = ['time', 'category', 'description', 'weekday', 'police_dept', 
                      'resolution', 'address', 'longitude', 'latitude']
train_sf_df.columns = train_cols_renamed

# The test frame has an 'id' column and no 'category', 'description' or 'resolution'.
test_cols_renamed = ['id', 'time', 'weekday', 'police_dept', 'address', 'longitude', 'latitude']
test_sf_df.columns = test_cols_renamed

Removing the `description` and `resolution` columns from `train_sf_df`

In [8]:
# 'description' and 'resolution' are not among the test columns, so they are
# dropped from the training frame. Rebinding (instead of inplace=True) together
# with errors='ignore' keeps this cell idempotent: re-running it no longer
# raises KeyError once the columns are already gone.
train_sf_df = train_sf_df.drop(columns=['description', 'resolution'], errors='ignore')

In [9]:
# Preview the first two training rows after renaming/dropping columns.
train_sf_df.head(2)

Unnamed: 0,time,category,weekday,police_dept,address,longitude,latitude
0,2015-05-13 23:53:00,WARRANTS,Wednesday,NORTHERN,OAK ST / LAGUNA ST,-122.425892,37.774599
1,2015-05-13 23:53:00,OTHER OFFENSES,Wednesday,NORTHERN,OAK ST / LAGUNA ST,-122.425892,37.774599


In [10]:
# Preview the first two test rows after renaming columns.
test_sf_df.head(2)

Unnamed: 0,id,time,weekday,police_dept,address,longitude,latitude
0,0,2015-05-10 23:59:00,Sunday,BAYVIEW,2000 Block of THOMAS AV,-122.399588,37.735051
1,1,2015-05-10 23:51:00,Sunday,BAYVIEW,3RD ST / REVERE AV,-122.391523,37.732432


### Time Manipulation

In [11]:
def extract_date(time):
    """Return the date part of a timestamp string (the text before the first space)."""
    date_part, _, _ = time.partition(' ')
    return date_part

def extract_year(date):
    """Return the year (first '-'-separated field of the date string) as an int."""
    year_text, _, _ = date.partition('-')
    return int(year_text)

def extract_month(date):
    """Return the month (second '-'-separated field of the date string) as an int."""
    fields = date.split('-')
    month_text = fields[1]
    return int(month_text)

def extract_day(date):
    """Return the day of month (third '-'-separated field of the date string) as an int."""
    fields = date.split('-')
    day_text = fields[2]
    return int(day_text)

def extract_hour(time):
    """Return the hour of a 'date hh:mm:ss' timestamp string as an int.

    The timestamp must consist of exactly two space-separated parts.
    """
    _, clock = time.split(' ')
    hour_text, _, _ = clock.partition(':')
    return int(hour_text)

def extract_minute(time):
    """Return the minute of a 'date hh:mm:ss' timestamp string as an int.

    The timestamp must consist of exactly two space-separated parts.
    """
    _, clock = time.split(' ')
    clock_fields = clock.split(':')
    return int(clock_fields[1])

def extract_season(month):
    """Map a month number to the season label used as a feature.

    4-6 -> 'summer', 7-9 -> 'rainy', 10-12 -> 'winter'; every other value
    (including 1-3) -> 'spring'.
    """
    season_table = (
        ((4, 5, 6), 'summer'),
        ((7, 8, 9), 'rainy'),
        ((10, 11, 12), 'winter'),
    )
    for months, season in season_table:
        if month in months:
            return season
    return 'spring'

def extract_hour_type(hour):
    """Bucket an hour of the day into a coarse part-of-day label.

    [4, 12) -> 'morning', [12, 15) -> 'noon', [15, 18) -> 'evening',
    [18, 22) -> 'night'; anything else -> 'mid-night'.
    """
    if 4 <= hour < 12:
        return 'morning'
    if 12 <= hour < 15:
        return 'noon'
    if 15 <= hour < 18:
        return 'evening'
    if 18 <= hour < 22:
        return 'night'
    return 'mid-night'

def extract_time_period(hour):
    """Return 'am' when hour is one of 0..11, otherwise 'pm'."""
    return 'am' if hour in range(12) else 'pm'

### Text Titling

In [12]:
def title_text(text):
    """Title-case string values; return any non-string value unchanged."""
    return text.title() if isinstance(text, str) else text

### Address Type (extraction)

In [13]:
def extract_address_type(addr):
    """Classify an address string.

    Addresses containing ' / ' are labelled 'Cross'; otherwise the last
    space-separated token of the address (e.g. 'ST', 'AV') is returned.
    """
    return 'Cross' if ' / ' in addr else addr.rsplit(' ', 1)[-1]

### Writing Time Based Features

In [14]:
def write_temporal_address_features(df, path):
    """Derive temporal and address-type columns and write the result to CSV.

    The derived columns are added to ``df`` itself, so the caller's frame is
    modified in place. Title-casing of string values is applied only to the
    copy that is written to ``path``; ``df`` keeps its original casing.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with at least the 'time' and 'address' columns.
    path : str
        Destination CSV path (written without the index).

    Returns
    -------
    bool
        Always True.
    """
    # (new column, source column, transform). Order matters: 'date' must exist
    # before year/month/day, 'month' before 'season', 'hour' before the
    # hour-derived labels.
    derivations = (
        ('date', 'time', extract_date),
        ('year', 'date', extract_year),
        ('month', 'date', extract_month),
        ('day', 'date', extract_day),
        ('hour', 'time', extract_hour),
        ('minute', 'time', extract_minute),
        ('season', 'month', extract_season),
        ('hour_type', 'hour', extract_hour_type),
        ('time_period', 'hour', extract_time_period),
        ('address_type', 'address', extract_address_type),
    )
    for new_column, source_column, transform in derivations:
        df[new_column] = df[source_column].apply(func=transform)

    # Title-case every string cell of a new frame, then persist it.
    titled_df = df.applymap(func=title_text)
    titled_df.to_csv(path_or_buf=path, index=None)

    return True

In [15]:
train_cleaned_path = project_path + 'csv_files/train_time_address_cleaned.csv'
test_cleaned_path = project_path + 'csv_files/test_time_address_cleaned.csv'

if os.path.isfile(train_cleaned_path) and os.path.isfile(test_cleaned_path):
    print("Data already exists in the directory.")
else:
    # Build whichever cleaned file is missing. Checking each file separately
    # avoids the case where only one file exists and the other is then read
    # without ever having been written.
    if not os.path.isfile(train_cleaned_path):
        write_temporal_address_features(df=train_sf_df, path=train_cleaned_path)
    if not os.path.isfile(test_cleaned_path):
        write_temporal_address_features(df=test_sf_df, path=test_cleaned_path)

# Always continue from the files on disk: write_temporal_address_features
# title-cases only the written copy, so reloading keeps a first run and a
# cached run working on identical data.
train_sf_df = pd.read_csv(filepath_or_buffer=train_cleaned_path)
test_sf_df = pd.read_csv(filepath_or_buffer=test_cleaned_path)

Data already exists in the directory.


### One-Hot-Encoding

Extracting time based features via OHE.

In [16]:
def split_categories_numericals(df):
    """Split the frame's column names into non-numeric and numeric groups.

    Both lists follow the column order of ``df``. (Computing the categorical
    list through a set difference made its order depend on string hashing,
    so the one-hot column order could vary between frames and sessions.)

    Parameters
    ----------
    df : pandas.DataFrame

    Returns
    -------
    (list, list)
        ``(cate_cols, num_cols)``: names of the non-numeric columns and of
        the numeric columns.
    """
    num_cols = list(df._get_numeric_data().columns)
    numeric_lookup = set(num_cols)
    cate_cols = [col for col in df.columns if col not in numeric_lookup]
    return cate_cols, num_cols

In [17]:
# Non-numeric columns that encode_multiple_columns skips (no one-hot encoding).
ignore_columns = ['category', 'time', 'address', 'date']

def extract_feature_dummies(df, column):
    """One-hot encode a single column of ``df`` with ``pandas.get_dummies``."""
    return pd.get_dummies(df[column])

def encode_multiple_columns(df, ignore_columns=ignore_columns):
    """One-hot encode the non-numeric columns and join them to the numeric ones.

    The numeric columns come first, followed by one block of dummy columns
    per non-numeric column that is not listed in ``ignore_columns``. The
    pieces are concatenated column-wise (axis=1).
    """
    cate_cols, num_cols = split_categories_numericals(df=df)

    dummy_frames = [
        extract_feature_dummies(df=df, column=name)
        for name in cate_cols
        if name not in ignore_columns
    ]

    return pd.concat([df[num_cols], *dummy_frames], axis=1)

### Extracting Spatial Distance Features

In [18]:
# Reference locations used for the spatial-distance features: one distance
# column is produced per key. Each value is a [latitude, longitude] pair, the
# same order as the (lat, lon) tuples built in extract_spatial_distance_feature.
# NOTE(review): "sfpd", "kma438 sfpd" and "police commission" share identical
# coordinates, so they produce three identical distance columns.
# NOTE(review): the "southern" latitude (37.6556) lies well south of the other
# station entries — confirm the coordinates.
sf_pstations_tourists = {
    "sfpd"                : [37.7725, -122.3894],
    "ingleside"           : [37.7247, -122.4463],
    "central"             : [37.7986, -122.4101],
    "northern"            : [37.7802, -122.4324],
    "mission"             : [37.7628, -122.4220],
    "tenderloin"          : [37.7838, -122.4129],
    "taraval"             : [37.7437, -122.4815],
    "sfpd park"           : [37.7678, -122.4552],
    "bayview"             : [37.7298, -122.3977],
    "kma438 sfpd"         : [37.7725, -122.3894],
    "richmond"            : [37.7801, -122.4644],
    "police commission"   : [37.7725, -122.3894],
    "juvenile"            : [37.7632, -122.4220],
    "southern"            : [37.6556, -122.4366],
    "sfpd pistol range"   : [37.7200, -122.4996],
    "sfpd public affairs" : [37.7754, -122.4039],
    "broadmoor"           : [37.6927, -122.4748],
    #################
    # Entries below appear to be tourist landmarks rather than police
    # facilities (judging by their names) — TODO confirm.
    "napa wine country"      : [38.2975, -122.2869],
    "sonoma wine country"    : [38.2919, -122.4580],
    "muir woods"             : [37.8970, -122.5811],
    "golden gate"            : [37.8199, -122.4783],
    "yosemite national park" : [37.865101, -119.538330],
}

In [19]:
def get_distance(ij):
    """Return the haversine distance between a pair of coordinates.

    ``ij`` is an indexable pair ``(origin, destination)``; both items are
    passed unchanged to ``mpu.haversine_distance``.
    NOTE(review): the result is presumably in kilometres — confirm against
    the mpu documentation.
    """
    origin, destination = ij[0], ij[1]
    return haversine_distance(origin=origin, destination=destination)

def extract_spatial_distance_feature(df, lat_column, lon_column, pname, pcoords):
    """Distance from one reference point to every row of ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
    lat_column, lon_column : str
        Names of the latitude and longitude columns in ``df``.
    pname : str
        Name of the resulting column.
    pcoords : sequence
        Coordinates of the reference point, used as the origin.

    Returns
    -------
    pandas.DataFrame
        Single-column frame named ``pname`` with one distance per row.
    """
    row_points = zip(df[lat_column].to_list(), df[lon_column].to_list())
    distances = [get_distance((pcoords, point)) for point in row_points]

    feature = pd.DataFrame()
    feature[pname] = distances

    return feature

In [20]:
def extract_spatial_distance_multi_features(df, lat_column, lon_column, stations=sf_pstations_tourists):
    """Build one distance column per entry of ``stations``.

    Each ``name -> coordinates`` entry yields a single-column frame via
    ``extract_spatial_distance_feature``; the frames are concatenated
    column-wise (axis=1) in the iteration order of ``stations``.
    """
    per_station_frames = [
        extract_spatial_distance_feature(df, lat_column, lon_column, pname, pcoords)
        for pname, pcoords in stations.items()
    ]
    return pd.concat(per_station_frames, axis=1)

### Extract Features only based on Latitudes and Longitudes

In [21]:
def lat_lon_sum(ll):
    """Return lat + lon for a (lat, lon) pair."""
    return ll[0] + ll[1]

def lat_lon_diff(ll):
    """Return lon - lat for a (lat, lon) pair (longitude minus latitude)."""
    latitude, longitude = ll[0], ll[1]
    return longitude - latitude

def lat_lon_sum_square(ll):
    """Return (lat + lon) squared for a (lat, lon) pair."""
    total = ll[0] + ll[1]
    return total ** 2

def lat_lon_diff_square(ll):
    """Return (lat - lon) squared for a (lat, lon) pair."""
    gap = ll[0] - ll[1]
    return gap ** 2

def lat_lon_sum_sqrt(ll):
    """Return sqrt(lat**2 + lon**2) for a (lat, lon) pair."""
    latitude, longitude = ll[0], ll[1]
    sum_of_squares = latitude**2 + longitude**2
    return sum_of_squares ** (1 / 2)

def lat_lon_diff_sqrt(ll):
    """Return (lon**2 - lat**2) ** 0.5 for a (lat, lon) pair.

    The difference is taken as longitude squared minus latitude squared.
    """
    latitude, longitude = ll[0], ll[1]
    difference_of_squares = longitude**2 - latitude**2
    return difference_of_squares ** (1 / 2)

In [22]:
def features_by_lat_lon(df, lat_column, lon_column):
    """Compute the six latitude/longitude combination features for every row.

    Returns a new frame with the columns 'lat_lon_sum', 'lat_lon_diff',
    'lat_lon_sum_square', 'lat_lon_diff_square', 'lat_lon_sum_sqrt' and
    'lat_lon_diff_sqrt', in that order, each produced by the helper of the
    same name applied to the row's (lat, lon) pair.
    """
    lat_lon_pairs = list(zip(df[lat_column].to_list(), df[lon_column].to_list()))

    transforms = (
        ('lat_lon_sum', lat_lon_sum),
        ('lat_lon_diff', lat_lon_diff),
        ('lat_lon_sum_square', lat_lon_sum_square),
        ('lat_lon_diff_square', lat_lon_diff_square),
        ('lat_lon_sum_sqrt', lat_lon_sum_sqrt),
        ('lat_lon_diff_sqrt', lat_lon_diff_sqrt),
    )

    df_ll = pd.DataFrame()
    for feature_name, transform in transforms:
        df_ll[feature_name] = [transform(pair) for pair in lat_lon_pairs]

    return df_ll

### BoW representation for Address

In [23]:
# Module-level cache of the BoW feature indices chosen by SelectKBest.
# Starts empty; create_bow_vectorizer fills it on the first call that runs the
# selection and reuses it on later calls.
best_bow_columns = np.array([])

In [24]:
def create_bow_vectorizer(df, column, target='category', write_vect=True, kbest=20):
    """Bag-of-words features for ``column``, reduced to the k best columns.

    The CountVectorizer is fitted on the first frame passed in (or loaded from
    ``models/vect_bow_<column>.pkl`` if that file exists), and SelectKBest is
    fitted on the first call while the global ``best_bow_columns`` is empty.
    To avoid data leakage, call this with the training frame first; later
    calls (e.g. the test frame) reuse the stored vectorizer and column indices.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing ``column`` (and ``target`` when the selector is fitted).
    column : str
        Text column to vectorize.
    target : str
        Label column used to fit SelectKBest.
    write_vect : bool
        When True, a newly fitted vectorizer is pickled to the models folder.
    kbest : int
        Number of features to keep; a falsy value disables selection.

    Returns
    -------
    pandas.DataFrame or None
        Dense frame whose columns are the selected feature indices, or None
        when ``kbest`` is falsy.
    """
    global best_bow_columns

    model_name = 'vect_bow_{}.pkl'.format(column)
    print(model_name)
    model_path = project_path + 'models/' + model_name
    df_col_val = df[column]

    if not os.path.isfile(model_path):
        vect = CountVectorizer()
        vect.fit(raw_documents=df_col_val)
        if write_vect:
            # Context manager guarantees the pickle file is flushed and closed.
            with open(model_path, "wb") as model_file:
                pickle.dump(vect, model_file)
    else:
        print("Model already exists in the directory.")
        # NOTE: unpickling can execute arbitrary code; only load files this
        # notebook wrote itself.
        with open(model_path, "rb") as model_file:
            vect = pickle.load(model_file)

    df_col_features = vect.transform(raw_documents=df_col_val)

    if not kbest:
        # No feature selection requested: nothing is returned.
        return None

    # `.size` tests for "already selected"; `.any()` would be False for a
    # selection consisting only of index 0.
    if best_bow_columns.size > 0:
        return pd.DataFrame(df_col_features[:, best_bow_columns].toarray(), columns=best_bow_columns)

    fs = SelectKBest(k=kbest)
    fs.fit(df_col_features, df[target])
    df_col_features = fs.transform(df_col_features)
    best_bow_columns = fs.get_support(indices=True)
    return pd.DataFrame(df_col_features.toarray(), columns=best_bow_columns)

### TfIdf representation for Address

In [25]:
# Module-level cache of the TF-IDF feature indices chosen by SelectKBest.
# Starts empty; create_tfidf_vectorizer fills it on the first call that runs
# the selection and reuses it on later calls.
best_tfidf_cols = np.array([])

In [26]:
def create_tfidf_vectorizer(df, column, target='category', write_vect=True, kbest=20):
    """TF-IDF features for ``column``, reduced to the k best columns.

    The TfidfVectorizer is fitted on the first frame passed in (or loaded from
    ``models/vect_tfidf_<column>.pkl`` if that file exists), and SelectKBest is
    fitted on the first call while the global ``best_tfidf_cols`` is empty.
    To avoid data leakage, call this with the training frame first; later
    calls (e.g. the test frame) reuse the stored vectorizer and column indices.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing ``column`` (and ``target`` when the selector is fitted).
    column : str
        Text column to vectorize.
    target : str
        Label column used to fit SelectKBest.
    write_vect : bool
        When True, a newly fitted vectorizer is pickled to the models folder.
    kbest : int
        Number of features to keep; a falsy value disables selection.

    Returns
    -------
    pandas.DataFrame or None
        Dense frame whose columns are the selected feature indices, or None
        when ``kbest`` is falsy.
    """
    global best_tfidf_cols

    model_name = 'vect_tfidf_{}.pkl'.format(column)
    print(model_name)
    model_path = project_path + 'models/' + model_name
    df_col_val = df[column]

    if not os.path.isfile(model_path):
        vect = TfidfVectorizer()
        vect.fit(raw_documents=df_col_val)
        if write_vect:
            # Context manager guarantees the pickle file is flushed and closed.
            with open(model_path, "wb") as model_file:
                pickle.dump(vect, model_file)
    else:
        print("Model already exists in the directory.")
        # NOTE: unpickling can execute arbitrary code; only load files this
        # notebook wrote itself.
        with open(model_path, "rb") as model_file:
            vect = pickle.load(model_file)

    df_col_features = vect.transform(raw_documents=df_col_val)

    if not kbest:
        # No feature selection requested: nothing is returned.
        return None

    # `.size` tests for "already selected"; `.any()` would be False for a
    # selection consisting only of index 0.
    if best_tfidf_cols.size > 0:
        return pd.DataFrame(df_col_features[:, best_tfidf_cols].toarray(), columns=best_tfidf_cols)

    fs = SelectKBest(k=kbest)
    fs.fit(df_col_features, df[target])
    df_col_features = fs.transform(df_col_features)
    best_tfidf_cols = fs.get_support(indices=True)
    return pd.DataFrame(df_col_features.toarray(), columns=best_tfidf_cols)

### Combining the data

* OHE data
* Spatial distance features
* Spatial latitude and longitude features
* Address BoW
* Address TfIdf

In [27]:
# temporal based features have already been written for both train and test datasets

def write_data_features(df, path, write_to_file=True):
    """Assemble all feature groups for ``df`` and optionally write them to CSV.

    The groups are computed in this order and concatenated column-wise:
    one-hot encoded columns, spatial distances, lat/lon combinations,
    address BoW and address TF-IDF. The frame is written to ``path`` (without
    the index) only when ``write_to_file`` is true. Always returns True.
    """
    coordinate_columns = {'lat_column': 'latitude', 'lon_column': 'longitude'}

    feature_groups = [
        encode_multiple_columns(df=df),
        extract_spatial_distance_multi_features(df=df, **coordinate_columns),
        features_by_lat_lon(df=df, **coordinate_columns),
        create_bow_vectorizer(df=df, column='address'),
        create_tfidf_vectorizer(df=df, column='address'),
    ]
    sf_df_featurized = pd.concat(feature_groups, axis=1)

    if write_to_file:
        sf_df_featurized.to_csv(path_or_buf=path, index=None)

    return True

In [28]:
train_features_path = project_path + 'csv_files/train_data_features.csv'
test_features_path = project_path + 'csv_files/test_data_features.csv'

if os.path.isfile(train_features_path) and os.path.isfile(test_features_path):
    print("Data already exists in the directory.")
else:
    # If either file is missing, rebuild both. Train must be processed first
    # because the vectorizers and SelectKBest column choices are fitted on it
    # and then reused for the test frame.
    # Training
    print("Train data")
    write_data_features(df=train_sf_df, path=train_features_path)
    print('-' * 30)
    # Test
    print("Test data")
    write_data_features(df=test_sf_df, path=test_features_path)
    print('-' * 30)
Data already exists in the directory.


### Featurized Data Reading

In [29]:
# Rebind both names to the featurized frames stored on disk; from here on
# train_sf_df / test_sf_df no longer hold the cleaned temporal/address data.
train_sf_df = pd.read_csv(filepath_or_buffer=project_path + 'csv_files/train_data_features.csv')
test_sf_df = pd.read_csv(filepath_or_buffer=project_path + 'csv_files/test_data_features.csv')

In [30]:
# Preview the featurized training frame. The BoW and TF-IDF blocks both use
# the selected feature indices as column names, so the second block shows up
# with a '.1' suffix (e.g. '17' and '17.1') after the CSV round trip.
train_sf_df.head()

Unnamed: 0,longitude,latitude,year,month,day,hour,minute,Bayview,Central,Ingleside,Mission,Northern,Park,Richmond,Southern,Taraval,Tenderloin,Friday,Monday,Saturday,Sunday,Thursday,Tuesday,Wednesday,Am,Pm,Rainy,Spring,Summer,Winter,Evening,Mid-Night,Morning,Night,Noon,/,Al,Av,Bl,Cr,...,17,236,328,421,718,869,940,1023,1078,1163,1178,1180,1392,1466,1500,1550,1582,1817,1854,1971,17.1,236.1,328.1,421.1,718.1,869.1,940.1,1023.1,1078.1,1163.1,1178.1,1180.1,1392.1,1466.1,1500.1,1550.1,1582.1,1817.1,1854.1,1971.1
0,-122.425892,37.774599,2015,5,13,23,53,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0,0,1,0,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,2,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.260309,0.0
1,-122.425892,37.774599,2015,5,13,23,53,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0,0,1,0,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,2,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.260309,0.0
2,-122.424363,37.800414,2015,5,13,23,33,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0,0,1,0,0,1,0,0,0,0,0,0,0,0,...,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0.0,0.260319,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.130387,0.0
3,-122.426995,37.800873,2015,5,13,23,30,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0,0,1,0,0,1,0,0,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0.0,0.0,0.15815,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.158147,0.0,0.0,0.0,0.0,0.144994,0.0
4,-122.438738,37.771541,2015,5,13,23,30,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,1,0,0,1,0,0,1,0,0,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0.0,0.0,0.15805,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.158047,0.0,0.0,0.0,0.0,0.144902,0.0


In [31]:
# Preview the featurized test frame. The BoW and TF-IDF blocks both use the
# selected feature indices as column names, so the second block shows up with
# a '.1' suffix (e.g. '17' and '17.1') after the CSV round trip.
test_sf_df.head()

Unnamed: 0,id,longitude,latitude,year,month,day,hour,minute,Bayview,Central,Ingleside,Mission,Northern,Park,Richmond,Southern,Taraval,Tenderloin,Friday,Monday,Saturday,Sunday,Thursday,Tuesday,Wednesday,Am,Pm,Rainy,Spring,Summer,Winter,Evening,Mid-Night,Morning,Night,Noon,/,Al,Av,Bl,...,17,236,328,421,718,869,940,1023,1078,1163,1178,1180,1392,1466,1500,1550,1582,1817,1854,1971,17.1,236.1,328.1,421.1,718.1,869.1,940.1,1023.1,1078.1,1163.1,1178.1,1180.1,1392.1,1466.1,1500.1,1550.1,1582.1,1817.1,1854.1,1971.1
0,0,-122.399588,37.735051,2015,5,10,23,59,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,1,0,0,1,0,0,0,0,0,1,0,...,0,1,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0.0,0.256337,0.140042,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.140039,0.0,0.0,0.0,0.0,0.0,0.0
1,1,-122.391523,37.732432,2015,5,10,23,51,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,1,0,0,1,0,0,0,0,0,0,0,...,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0.0,0.274941,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.13771,0.0
2,2,-122.426002,37.792212,2015,5,10,23,50,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,1,0,0,1,0,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0.0,0.0,0.15896,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.158957,0.0,0.0,0.0,0.0,0.145736,0.0
3,3,-122.437394,37.721412,2015,5,10,23,45,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,1,0,0,1,0,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0.0,0.0,0.14883,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.148827,0.0,0.0,0.0,0.0,0.136449,0.0
4,4,-122.437394,37.721412,2015,5,10,23,45,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,1,0,0,1,0,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0.0,0.0,0.14883,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.148827,0.0,0.0,0.0,0.0,0.136449,0.0
