In [216]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score
from sklearn.feature_extraction.text import HashingVectorizer

In [217]:
data = pd.read_csv("./data/parking_citations.corrupted.csv")

In [218]:
def missing_values_table(df):
        # Total missing values
        mis_val = df.isnull().sum()
        
        # Percentage of missing values
        mis_val_percent = 100 * df.isnull().sum() / len(df)
        
        # Make a table with the results
        mis_val_table = pd.concat([mis_val, mis_val_percent], axis=1)
        
        # Rename the columns
        mis_val_table_ren_columns = mis_val_table.rename(
        columns = {0 : 'Missing Values', 1 : '% of Total Values'})
        
        # Sort the table by percentage of missing descending
        mis_val_table_ren_columns = mis_val_table_ren_columns[
            mis_val_table_ren_columns.iloc[:,1] != 0].sort_values(
        '% of Total Values', ascending=False).round(1) 
       
        # Return the dataframe with missing information
        return mis_val_table_ren_columns

In [219]:
missing_values_table(data)

Unnamed: 0,Missing Values,% of Total Values
VIN,8709705,99.8
Marked Time,8435415,96.7
Meter Id,6456512,74.0
Make,4368470,50.1
Plate Expiry Date,794827,9.1
Route,65354,0.7
Body Style,8890,0.1
Fine amount,6507,0.1
Color,4115,0.0
Issue time,2583,0.0


In [220]:
make = data['Make']
data = data.drop(['Make'], axis=1)

In [151]:
data[data["VIN"].isna()==False]

Unnamed: 0,Ticket number,Issue Date,Issue time,Meter Id,Marked Time,RP State Plate,Plate Expiry Date,VIN,Body Style,Color,Location,Route,Agency,Violation code,Violation Description,Fine amount,Latitude,Longitude
1235876,4241125932,2014-10-31T00:00:00,805.0,,,CA,,1ZVFT80N465228314,PA,BK,7045 LINDLEY AVE,00374,53.0,80.69BS,NO PARK/STREET CLEAN,73.0,6402218.1,1894969.7
1235877,4241125943,2014-10-31T00:00:00,807.0,,,CA,201509.0,0199,PA,SL,6941 LINDLEY AVE,00374,53.0,80.69BS,NO PARK/STREET CLEAN,73.0,6402214.7,1894293.0
1235878,4241125954,2014-10-31T00:00:00,809.0,,,CA,201502.0,3491,PA,BK,6621 LINDLEY AVE,00374,53.0,80.69BS,NO PARK/STREET CLEAN,73.0,6402205.6,1892169.7
1235879,4241125965,2014-10-31T00:00:00,814.0,,,CA,,1G1ZS58F47F170546,PA,SL,6430 RESEDA BLVD,00374,53.0,80.69BS,NO PARK/STREET CLEAN,73.0,6399558.0,1890949.9
1235880,4241125976,2014-10-31T00:00:00,815.0,,,CA,201412.0,9817,PA,GN,6430 RESEDA BLVD,00374,53.0,80.69BS,NO PARK/STREET CLEAN,73.0,6399558.0,1890949.9
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1252333,4247320195,2014-11-03T00:00:00,1110.0,CB3067A,,CA,,1N4AL3AP2EN224587,PA,GY,355 4TH ST E,00667,56.0,88.13B+,METER EXP.,63.0,6485950.2,1840638.1
1252334,4247320206,2014-11-03T00:00:00,1131.0,,,CA,11.0,NV,VN,BN,300 LOS ANGELES ST S,00667,56.0,80.56E4+,RED ZONE,93.0,6487537.5,1840128.2
1252335,4247320221,2014-11-03T00:00:00,1153.0,LT288,,CA,201509.0,7870,PA,GY,471 3RD ST E,00667,56.0,88.13B+,METER EXP.,63.0,6504955.2,1738919.6
1252336,4247320232,2014-11-03T00:00:00,1209.0,LT372A,,CA,201503.0,9425,PA,SL,323 BOYD ST,00667,56.0,88.13B+,METER EXP.,63.0,6487783.2,1839600.4


In [221]:
vin = data["VIN"]
data["VIN"].fillna("NV", inplace=True)

In [222]:
marked_time = data["Marked Time"]
data["Marked Time"].fillna(0, inplace=True)

In [223]:
meter_id = data["Meter Id"]
data["Meter Id"].fillna("!!", inplace=True)

In [224]:
plate_expiry_date = data["Plate Expiry Date"]
data["Plate Expiry Date"].fillna(0, inplace=True)

In [225]:
route = data["Route"]
data["Route"].fillna("00600", inplace=True)

In [226]:
body_style = data["Body Style"]
data['Body Style'].fillna("PA", inplace=True)

In [227]:
fine_amount = data["Fine amount"]
data["Fine amount"].fillna(data["Fine amount"].mean(), inplace=True)

In [228]:
color = data["Color"]
data["Color"].fillna("BK", inplace=True)

In [229]:
issue_time = data["Issue time"]
data["Issue time"].fillna(data["Issue time"].mean(), inplace=True)

In [230]:
Violation_Description = data["Violation Description"]
data["Violation Description"]=data["Violation Description"].fillna("NO DESCRIPTION")

In [231]:
location = data["Location"]
data["Location"].fillna("NO LOCATION", inplace=True)

In [232]:
RP_state_plate = data["RP State Plate"]
data["RP State Plate"].fillna("CA", inplace=True)

In [233]:
agency = data["Agency"]
data["Agency"].fillna(data["Agency"].mean(), inplace=True)

In [234]:
issue_date = data["Issue Date"]
data["Issue Date"].fillna("2016-01-19T00:00:00", inplace=True)

In [235]:
latitude = data["Latitude"]
longitude = data["Longitude"]
data["Latitude"].fillna("99999.0", inplace=True)
data["Longitude"].fillna("99999.0", inplace=True)

In [236]:
data.dtypes

Ticket number             object
Issue Date                object
Issue time               float64
Meter Id                  object
Marked Time              float64
RP State Plate            object
Plate Expiry Date        float64
VIN                       object
Body Style                object
Color                     object
Location                  object
Route                     object
Agency                   float64
Violation code            object
Violation Description     object
Fine amount              float64
Latitude                  object
Longitude                 object
dtype: object

In [237]:
le = LabelEncoder()
data["Meter Id"] = le.fit_transform(data["Meter Id"])
data["RP State Plate"] = le.fit_transform(data["RP State Plate"])
data["VIN"] = le.fit_transform(data["VIN"])
data["Body Style"] = le.fit_transform(data["Body Style"])
data["Color"] = le.fit_transform(data["Color"])
data["Route"] = le.fit_transform(data["Route"])
data["Violation code"] = le.fit_transform(data["Violation code"])
data["Violation Description"] = le.fit_transform(data["Violation Description"])
data["Location"] = le.fit_transform(data["Location"])

In [238]:
data["Ticket number"] = data["Ticket number"].apply(lambda x: str(x))

In [239]:
data["Ticket number"] = le.fit_transform(data["Ticket number"])

In [240]:
data["Issue Date"] = pd.to_datetime(data["Issue Date"])

In [241]:
data["Issue Date"] = le.fit_transform(data["Issue Date"])

In [242]:
data.dtypes

Ticket number              int64
Issue Date                 int64
Issue time               float64
Meter Id                   int64
Marked Time              float64
RP State Plate             int64
Plate Expiry Date        float64
VIN                        int64
Body Style                 int64
Color                      int64
Location                   int64
Route                      int64
Agency                   float64
Violation code             int64
Violation Description      int64
Fine amount              float64
Latitude                  object
Longitude                 object
dtype: object

In [243]:
data["Latitude"] = data["Latitude"].apply(lambda x: float(x))
data["Longitude"] = data["Longitude"].apply(lambda x: float(x))

In [245]:
clusters = 3
while clusters < 20:
    k = KMeans(n_clusters = clusters, n_jobs = -1)
    preds = k.fit_predict(data.values)
    sil = silhouette_score(data.values, preds)
    print(clusters, sil)
    clusters +=1

MemoryError: 

In [181]:
marked_time.dtype

dtype('O')