In this project a dataset containing crime statistics in the San Francisco area from X to Y is analyzed with the goal of creating a model to predict the nature of a crime based only on its time and location. 

More information on the project and its dataset can be found here:
https://www.kaggle.com/c/sf-crime

In [1]:
%matplotlib inline

import sys, os
from datetime import datetime
import numpy as np
import pandas
import matplotlib.pyplot as plt

from pandas.tseries.holiday import USFederalHolidayCalendar as holidaysCalendar
from sklearn.neighbors import KNeighborsClassifier

from sklearn import preprocessing
from sklearn.model_selection import GridSearchCV
import tensorflow as tf

pandas.options.display.max_columns = 200

In [2]:
# Globals
WRITE = False

# Define header (from sample submission)
header = ["ID", "ARSON", "ASSAULT", "BAD CHECKS", "BRIBERY", "BURGLARY",
          "DISORDERLY CONDUCT", "DRIVING UNDER THE INFLUENCE", "DRUG/NARCOTIC",
          "DRUNKENNESS", "EMBEZZLEMENT", "EXTORTION", "FAMILY OFFENSES",
          "FORGERY/COUNTERFEITING", "FRAUD", "GAMBLING", "KIDNAPPING", "LARCENY/THEFT",
          "LIQUOR LAWS", "LOITERING", "MISSING PERSON", "NON-CRIMINAL", "OTHER OFFENSES",
          "PORNOGRAPHY/OBSCENE MAT", "PROSTITUTION", "RECOVERED VEHICLE", "ROBBERY",
          "RUNAWAY", "SECONDARY CODES", "SEX OFFENSES FORCIBLE", "SEX OFFENSES NON FORCIBLE",
          "STOLEN PROPERTY", "SUICIDE", "SUSPICIOUS OCC", "TREA", "TRESPASS", "VANDALISM",
          "VEHICLE THEFT", "WARRANTS", "WEAPON LAWS"]

In [3]:
# Helper Functions

def PredictAndPrint(data, classifier, outputname):
    # Predict
    result = classifier.predict(data)

    # Output
    result = pandas.DataFrame(result)
    result = pandas.get_dummies(result, prefix='', prefix_sep='')
    
    # Add null categories to make kaggle happy
    result = result.T.reindex(header).T.fillna(0)
    result.to_csv(outputname, compression='gzip', chunksize=1000)
    
def DetermineTime(hour):
    ## Add early morning, morning, afternoon, early evening, evening, late evening
    ## 0500-0800, 0800-1100, 1100-1600, 1600-2100, 2100-0200, 0200-0500
    if hour <= 2:
        return "evening"
    elif hour <= 5:
        return "lateE"
    elif hour <= 8:
        return "earlyM"
    elif hour <= 11:
        return "morning"
    elif hour <= 16:
        return "afternoon"
    elif hour <= 21:
        return "earlyE"
    else:
        return "evening"

def DetermineLocationType(address):
    ##Add block or corner
    if "block" in address.lower():
        return "block"
    elif "/" in address:
        return "corner"
    return None

def DetermineDayTime(time):
    ##Add Day Time
    time = int(time)
    if time >= 7 and time <= 19:
        return True
    elif time < 7 or time >= 19:
        return False
    return None


#### Load data

In [4]:
# Load data
df_train = pandas.read_csv("train.csv")
df_test = pandas.read_csv("test.csv")

# Drop duplicates
df_train = df_train.drop_duplicates()
df_test = df_test.drop_duplicates()

# Shuffle
df_train = df_train.sample(frac=1).reset_index(drop=True)
df_test = df_test.sample(frac=1).reset_index(drop=True)

#### Exploratory Data Analysis
##### Revelations:
- Add euclidean distance using X,Y
- Add different size 'plots' of land for X,Y (trimming)
- Add temperature, precipitation
- Add block or corner
- Add time of day, day of week, weekend, week of year, season of year, holiday
- Add early morning, morning, afternoon, early evening, evening, late evening
- - 0500-0800, 0800-1100, 1100-1600, 1600-2100, 2100-0200, 0200-0500
- Convert 'TREA' to 'TRESPASS'

In [5]:
print ("Exploring data, columns, and formats")
print ("\nColumns:")
# View all columns
print (df_train.columns)

# Explore 'Dates'
print ("\nDates:")
print (df_train['Dates'].head(3))
## Add time of day, day of week, weekend, week of year, season of year, holiday
## Add early morning, morning, afternoon, early evening, evening, late evening
## 0500-0800, 0800-1100, 1100-1600, 1600-2100, 2100-0200, 0200-0500

# Explore 'Category'
print ("\nCategory:")
print (df_train['Category'].head(3))
print (set(df_train['Category']))
## OUTCOME VARIABLE
## Possible issue with 'Trea' and 'Trespass'
## Convert 'TREA' to 'TRESPASS'
len(df_train[df_train['Category'] == 'TREA'])

# Explore 'Descript'
print ("\nDescript:")
print (df_train['Descript'].head(3))
## NOT IN TEST DATA
## Does not appear to be useful

# Explore 'DayOfWeek'
print ("\nDayOfWeek:")
print (df_train['DayOfWeek'].head(3))
print (set(df_train['DayOfWeek']))
total = 0
for x in set(df_train['DayOfWeek']):
    total += len(df_train[df_train['DayOfWeek'] == x])
print ("Missing values in DayOfWeek: " + str(len(df_train) - total))
## Looks good

# Explore 'PdDistrict'
print ("\nPdDistrict:")
print (df_train['PdDistrict'].head(3))
print (set(df_train['PdDistrict']))
total = 0
for x in set(df_train['PdDistrict']):
    total += len(df_train[df_train['PdDistrict'] == x])
print ("Missing values in PdDistrict: " + str(len(df_train) - total))
## Looks good

# Explore 'Resolution'
print ("\nResolution:")
print (df_train['Resolution'].head(3))
## NOT IN TEST DATA
## Does not appear to be useful

# Explore 'Address'
print ("\nAddress:")
print (df_train['Address'].head(3))
## Count Blocks, Corners
blocks = corners = neither = 0
for address in df_train['Address']:
    if "block" in address.lower():
        blocks += 1
    elif "/" in address:
        corners += 1
    else:
        neither += 1
print ("Number of blocks: " + str(blocks))
print ("Number of corners: " + str(corners))
print ("Number of neither: " + str(neither))
print ("Number of both: " + str(blocks + corners - (len(df_train) - neither)))
## Add block or corner

# Explore 'X' and 'Y'
print ("\nX and Y:")
print ("Empty X coords :" + str(len(df_train[df_train['X'] == 0])))
print ("Empty Y coords :" + str(len(df_train[df_train['Y'] == 0])))
print ("Distinct X coords :" + str(len(set(df_train['X']))))
print ("Distinct Y coords :" + str(len(set(df_train['Y']))))
## Form coordinate
df_train['XY'] = df_train['X']**2 + df_train['Y']**2
print ("Distinct X,Y pairs :" + str(len(set(df_train['XY']))))
## Trim
df_train['Ytrim5'] = df_train['Y'].apply(lambda x: round(x,5))
df_train['Xtrim5'] = df_train['X'].apply(lambda x: round(x,5))
df_train['XYtrim5'] = df_train['Xtrim5']**2 + df_train['Ytrim5']**2
print ("Distinct X coords (trim5) :" + str(len(set(df_train['Xtrim5']))))
print ("Distinct Y coords (trim5):" + str(len(set(df_train['Ytrim5']))))
print ("Distinct X,Y pairs (trim5):" + str(len(set(df_train['XYtrim5']))))
df_train['Ytrim4'] = df_train['Y'].apply(lambda x: round(x,4))
df_train['Xtrim4'] = df_train['X'].apply(lambda x: round(x,4))
df_train['XYtrim4'] = df_train['Xtrim4']**2 + df_train['Ytrim4']**2
print ("Distinct X coords (trim4) :" + str(len(set(df_train['Xtrim4']))))
print ("Distinct Y coords (trim4):" + str(len(set(df_train['Ytrim4']))))
print ("Distinct X,Y pairs (trim4):" + str(len(set(df_train['XYtrim4']))))
df_train['Ytrim3'] = df_train['Y'].apply(lambda x: round(x,3))
df_train['Xtrim3'] = df_train['X'].apply(lambda x: round(x,3))
df_train['XYtrim3'] = df_train['Xtrim3']**2 + df_train['Ytrim3']**2
print ("Distinct X coords (trim3) :" + str(len(set(df_train['Xtrim3']))))
print ("Distinct Y coords (trim3):" + str(len(set(df_train['Ytrim3']))))
print ("Distinct X,Y pairs (trim3):" + str(len(set(df_train['XYtrim3']))))
df_train['Ytrim2'] = df_train['Y'].apply(lambda x: round(x,2))
df_train['Xtrim2'] = df_train['X'].apply(lambda x: round(x,2))
df_train['XYtrim2'] = df_train['Xtrim2']**2 + df_train['Ytrim2']**2
print ("Distinct X coords (trim2) :" + str(len(set(df_train['Xtrim2']))))
print ("Distinct Y coords (trim2):" + str(len(set(df_train['Ytrim2']))))
print ("Distinct X,Y pairs (trim2):" + str(len(set(df_train['XYtrim2']))))
df_train['Ytrim1'] = df_train['Y'].apply(lambda x: round(x,1))
df_train['Xtrim1'] = df_train['X'].apply(lambda x: round(x,1))
df_train['XYtrim1'] = df_train['Xtrim1']**2 + df_train['Ytrim1']**2
print ("Distinct X coords (trim1) :" + str(len(set(df_train['Xtrim1']))))
print ("Distinct Y coords (trim1):" + str(len(set(df_train['Ytrim1']))))
print ("Distinct X,Y pairs (trim1):" + str(len(set(df_train['XYtrim1']))))
## Drop X, Y individuals
df_train = df_train.drop(['X', 'Y', \
                          'Ytrim1', 'Xtrim1', \
                          'Ytrim2', 'Xtrim2', \
                          'Ytrim3', 'Xtrim3', \
                          'Ytrim4', 'Xtrim4', \
                          'Ytrim5', 'Xtrim5', \
                          'XYtrim5', 'XYtrim4', \
                          'XYtrim3', 'XYtrim1', \
                          'XY'],axis=1)

Exploring data, columns, and formats

Columns:
Index(['Dates', 'Category', 'Descript', 'DayOfWeek', 'PdDistrict',
       'Resolution', 'Address', 'X', 'Y'],
      dtype='object')

Dates:
0    2011-01-02 14:00:00
1    2009-11-12 15:50:00
2    2007-03-24 23:54:00
Name: Dates, dtype: object

Category:
0             LARCENY/THEFT
1            OTHER OFFENSES
2    FORGERY/COUNTERFEITING
Name: Category, dtype: object
{'TRESPASS', 'SEX OFFENSES NON FORCIBLE', 'EMBEZZLEMENT', 'FORGERY/COUNTERFEITING', 'FAMILY OFFENSES', 'DISORDERLY CONDUCT', 'PROSTITUTION', 'VEHICLE THEFT', 'OTHER OFFENSES', 'STOLEN PROPERTY', 'KIDNAPPING', 'LARCENY/THEFT', 'BURGLARY', 'EXTORTION', 'ASSAULT', 'NON-CRIMINAL', 'DRUNKENNESS', 'GAMBLING', 'ARSON', 'RUNAWAY', 'MISSING PERSON', 'SECONDARY CODES', 'LOITERING', 'RECOVERED VEHICLE', 'LIQUOR LAWS', 'FRAUD', 'WARRANTS', 'ROBBERY', 'SEX OFFENSES FORCIBLE', 'BAD CHECKS', 'BRIBERY', 'DRUG/NARCOTIC', 'PORNOGRAPHY/OBSCENE MAT', 'SUICIDE', 'DRIVING UNDER THE INFLUENCE', 'TREA',

#### Feature Engineering
- Add euclidean distance using X,Y
- Add different size 'plots' of land for X,Y (trimming)
- Add temperature, precipitation
- Add block or corner
- Add time of day, week of year, season of year, holiday, sun up or down
- Add early morning, morning, afternoon, early evening, evening, late evening
- - 0500-0800, 0800-1100, 1100-1600, 1600-2100, 2100-0200, 0200-0500
- Convert 'TREA' to 'TRESPASS'

In [6]:
# Binarize with dummy variables
#dummies_df_train = df_train[['DayOfWeek', 'PdDistrict']]
#dummies_df_train = pandas.get_dummies(dummies_df_train)
#dummies_df_test = df_test[['DayOfWeek', 'PdDistrict']]
#dummies_df_test = pandas.get_dummies(dummies_df_test)

# Format Dates
## Hour of day
df_train['Hour'] = pandas.to_datetime(df_train['Dates']).dt.hour
df_test['Hour'] = pandas.to_datetime(df_test['Dates']).dt.hour

## Time of day

df_train['Time'] = df_train['Hour'].apply(lambda x: DetermineTime(x))
df_test['Time'] = df_test['Hour'].apply(lambda x: DetermineTime(x))

## Day
df_train['DayTime'] = df_train['Hour'].apply(lambda x: DetermineDayTime(x))
df_test['DayTime'] = df_test['Hour'].apply(lambda x: DetermineDayTime(x))

## Week of year
df_train['Week'] = pandas.to_datetime(df_train['Dates']).dt.week
df_test['Week'] = pandas.to_datetime(df_test['Dates']).dt.week

## Season of year
#df_train['Season'] = pandas.to_datetime(df_train['Dates']).dt.week
#df_test['Season'] = pandas.to_datetime(df_test['Dates']).dt.week

## Reduce to date
df_train['Date'] = pandas.to_datetime(df_train['Dates']).dt.date
df_test['Date'] = pandas.to_datetime(df_test['Dates']).dt.date

## Holidays
cal = holidaysCalendar()
holidays = cal.holidays(start=pandas.to_datetime(df_train['Date']).min(), end=pandas.to_datetime(df_train['Date']).max())
df_train['Holiday'] = (pandas.to_datetime(df_train['Date'])).dt.date.isin(holidays)
df_test['Holiday'] = (pandas.to_datetime(df_test['Date'])).dt.date.isin(holidays)

# Format Coords
## Trim 1-5
## Create euclidean distance
### CREATED IN INITIAL EDA FOR TRAIN ###
## Form coordinate
df_test['XY'] = df_test['X']**2 + df_test['Y']**2
## Trim

df_test['Ytrim5'] = df_test['Y'].apply(lambda x: round(x,5))
df_test['Xtrim5'] = df_test['X'].apply(lambda x: round(x,5))
df_test['XYtrim5'] = df_test['Xtrim5']**2 + df_test['Ytrim5']**2
df_test['Ytrim4'] = df_test['Y'].apply(lambda x: round(x,4))
df_test['Xtrim4'] = df_test['X'].apply(lambda x: round(x,4))
df_test['XYtrim4'] = df_test['Xtrim4']**2 + df_test['Ytrim4']**2
df_test['Ytrim3'] = df_test['Y'].apply(lambda x: round(x,3))
df_test['Xtrim3'] = df_test['X'].apply(lambda x: round(x,3))
df_test['XYtrim3'] = df_test['Xtrim3']**2 + df_test['Ytrim3']**2
df_test['Ytrim2'] = df_test['Y'].apply(lambda x: round(x,2))
df_test['Xtrim2'] = df_test['X'].apply(lambda x: round(x,2))
df_test['XYtrim2'] = df_test['Xtrim2']**2 + df_test['Ytrim2']**2
df_test['Ytrim1'] = df_test['Y'].apply(lambda x: round(x,1))
df_test['Xtrim1'] = df_test['X'].apply(lambda x: round(x,1))
df_test['XYtrim1'] = df_test['Xtrim1']**2 + df_test['Ytrim1']**2

## Drop X, Y individuals
df_test = df_test.drop(['X', 'Y', \
                          'Ytrim1', 'Xtrim1', \
                          'Ytrim2', 'Xtrim2', \
                          'Ytrim3', 'Xtrim3', \
                          'Ytrim4', 'Xtrim4', \
                          'Ytrim5', 'Xtrim5', \
                          'XYtrim5', 'XYtrim4', \
                          'XYtrim3', 'XYtrim1', \
                          'XY'],axis=1)
# Format Address
## Add address type
df_train['LocationType'] = df_train['Address'].apply(lambda x: DetermineLocationType(x))
df_test['LocationType'] = df_test['Address'].apply(lambda x: DetermineLocationType(x))
# Add Temperature
## Create client
#ow = openweather.OpenWeather()

## Find weather stations closest
#df_train['X'].mean
#df_train['Y'].mean
#stations = ow.find_stations_near(-122.425892, 37.774599, 1000)
#print (stations)

# Drop unnecessary columns
df_train = df_train.drop(['Date', 'Dates', 'Address', 'Resolution', 'Descript'],axis=1)
df_test = df_test.drop(['Date', 'Dates', 'Address', 'Id'],axis=1)

train_data = df_train
test_data = df_test

#### Encoding

In [7]:
# Encode time
train_data = train_data.join(pandas.get_dummies(train_data['Time'])).drop('Time', axis=1)
test_data = test_data.join(pandas.get_dummies(test_data['Time'])).drop('Time', axis=1)
# Encode LocationType
train_data = train_data.join(pandas.get_dummies(train_data['LocationType'])).drop('LocationType', axis=1)
test_data = test_data.join(pandas.get_dummies(test_data['LocationType'])).drop('LocationType', axis=1)
# Encode PdDistrict
train_data = train_data.join(pandas.get_dummies(train_data['PdDistrict'])).drop('PdDistrict', axis=1)
test_data = test_data.join(pandas.get_dummies(test_data['PdDistrict'])).drop('PdDistrict', axis=1) 
# Encode DayOfWeek
train_data = train_data.join(pandas.get_dummies(train_data['DayOfWeek'])).drop('DayOfWeek', axis=1)
test_data = test_data.join(pandas.get_dummies(test_data['DayOfWeek'])).drop('DayOfWeek', axis=1)
# Encode XY
## Re-trim
train_data['XYtrim2'] = train_data['XYtrim2'].apply(lambda x: round(x,0))
test_data['XYtrim2'] = test_data['XYtrim2'].apply(lambda x: round(x,0))
train_data = train_data.join(pandas.get_dummies(train_data['XYtrim2'])).drop('XYtrim2', axis=1)
test_data = test_data.join(pandas.get_dummies(test_data['XYtrim2'])).drop('XYtrim2', axis=1)
        

In [8]:
train_data.columns



Index([  'Category',       'Hour',    'DayTime',       'Week',    'Holiday',
        'afternoon',     'earlyE',     'earlyM',    'evening',      'lateE',
          'morning',      'block',     'corner',    'BAYVIEW',    'CENTRAL',
        'INGLESIDE',    'MISSION',   'NORTHERN',       'PARK',   'RICHMOND',
         'SOUTHERN',    'TARAVAL', 'TENDERLOIN',     'Friday',     'Monday',
         'Saturday',     'Sunday',   'Thursday',    'Tuesday',  'Wednesday',
            16398.0,      16399.0,      16400.0,      16401.0,      16402.0,
            16403.0,      16404.0,      16405.0,      16406.0,      16407.0,
            16408.0,      16409.0,      16410.0,      16411.0,      16412.0,
            16413.0,      16414.0,      16415.0,      16416.0,      16417.0,
            16418.0,      16419.0,      16420.0,      16421.0,      16422.0,
            16423.0,      16424.0,      16425.0,      16426.0,      16427.0,
            16428.0,      16429.0,      16430.0,      16431.0,      16432.0,

In [9]:
train_data.columns

Index([  'Category',       'Hour',    'DayTime',       'Week',    'Holiday',
        'afternoon',     'earlyE',     'earlyM',    'evening',      'lateE',
          'morning',      'block',     'corner',    'BAYVIEW',    'CENTRAL',
        'INGLESIDE',    'MISSION',   'NORTHERN',       'PARK',   'RICHMOND',
         'SOUTHERN',    'TARAVAL', 'TENDERLOIN',     'Friday',     'Monday',
         'Saturday',     'Sunday',   'Thursday',    'Tuesday',  'Wednesday',
            16398.0,      16399.0,      16400.0,      16401.0,      16402.0,
            16403.0,      16404.0,      16405.0,      16406.0,      16407.0,
            16408.0,      16409.0,      16410.0,      16411.0,      16412.0,
            16413.0,      16414.0,      16415.0,      16416.0,      16417.0,
            16418.0,      16419.0,      16420.0,      16421.0,      16422.0,
            16423.0,      16424.0,      16425.0,      16426.0,      16427.0,
            16428.0,      16429.0,      16430.0,      16431.0,      16432.0,

In [10]:


le = preprocessing.LabelEncoder()
for column in train_data.columns:
    if train_data[column].dtype == type(object):
        train_data[column] = le.fit_transform(train_data[column])
for column in test_data.columns:
    if test_data[column].dtype == type(object):
        test_data[column] = le.fit_transform(test_data[column])

In [11]:
# Split into data/labels, train/dev
train_labels = train_data['Category'][:-50000]
dev_labels = train_data['Category'][-50000:]

train_data = train_data[train_data.columns.difference(['Category'])][:-50000]
dev_data = train_data[train_data.columns.difference(['Category'])][-50000:]


# Labels list
labels = list(set(train_labels))
train_labels = train_labels.apply(lambda x: labels.index(x))
dev_labels = dev_labels.apply(lambda x: labels.index(x))

### Baseline Model
##### K-Nearest Neighbor with k=50

Need to add GridSearchCV here

In [None]:

parameters = {'n_neighbors': range(5,50, 5)}

neigh = GridSearchCV(KNeighborsClassifier(), parameters)
neigh.fit(train_data, train_labels)
print (sorted(neigh.cv_results_.keys()))

# Train KNN Classifier
#neigh = KNeighborsClassifier(n_neighbors=15)

# Fit
#neigh.fit(train_data, train_labels) 

# Score
#print (neigh.score(dev_data, dev_labels))

#if WRITE:
#    PredictAndPrint(test_data, neigh, "output_knn")


### Logistic Regression
##### Final Parameters:

In [27]:

from sklearn.ensemble import RandomForestClassifier, BaggingClassifier

#parameters = {'n_estimators': range(10,50, 10), 'min_samples_split': range(2,5, 1)}
#rfc = GridSearchCV(RandomForestClassifier(), parameters)
#rfc.fit(train_data, train_labels)
#print (sorted(rfc.cv_results_.keys()))

# Train RandomForestClassifier
rfc = RandomForestClassifier(n_estimators=10, max_depth=None, min_samples_split=3, random_state=0)

# Fit
rfc.fit(train_data, train_labels)

# Score
print (rfc.score(dev_data, dev_labels))


0.10686


### Support Vector Machines
##### Final Parameters:

### Random Forests
##### Final Parameters:

In [14]:
"""
# Predict
rfc_result = rfc.predict(test_data)

# Output
rfc_result = pandas.DataFrame(rfc_result)
rfc_result = pandas.get_dummies(rfc_result, prefix='', prefix_sep='')
# Add null categories to make kaggle happy
rfc_result = rfc_result.T.reindex(header).T.fillna(0)
rfc_result.to_csv("output_rfc.csv", compression='gzip', chunksize=1000)
"""

'\n# Predict\nrfc_result = rfc.predict(test_data)\n\n# Output\nrfc_result = pandas.DataFrame(rfc_result)\nrfc_result = pandas.get_dummies(rfc_result, prefix=\'\', prefix_sep=\'\')\n# Add null categories to make kaggle happy\nrfc_result = rfc_result.T.reindex(header).T.fillna(0)\nrfc_result.to_csv("output_rfc.csv", compression=\'gzip\', chunksize=1000)\n'

### Neural-Network
##### Final Parameters:

In [15]:
numFeatures = len(train_data.columns)
numClasses = len(labels)
numSamples = len(train_data)
#numTestExamples = len(dev_data)

print (numFeatures)
print (numClasses)
print (numSamples)
#print (numTestExamples)

69
39
825726


#### Sequential

In [16]:
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import Dropout, ActivityRegularization
from keras.layers import Flatten, Reshape
from keras.wrappers.scikit_learn import KerasClassifier
from keras import optimizers

# All parameter gradients will be clipped to
# a maximum value of 0.5 and
# a minimum value of -0.5.
sgd = optimizers.SGD(lr=0.01, clipvalue=.5)

# Define
sequential_model = Sequential()
sequential_model.add(Dense(units=numFeatures * 2, activation='relu', input_shape=(numFeatures,)))
sequential_model.add(Dense(units=int(numFeatures), activation='softmax' ))
#sequential_model.add(Dropout(0.05, noise_shape=None, seed=None))
sequential_model.add(ActivityRegularization(l1=1000.0, l2=1000.0))

# Compile
sequential_model.compile(loss='sparse_categorical_crossentropy',
              optimizer=sgd,
              metrics=['accuracy'])
# Fit
sequential_model.fit(train_data.values, train_labels.values, epochs=3, batch_size=100)

Using TensorFlow backend.


Epoch 1/3
Epoch 2/3
Epoch 3/3


<keras.callbacks.History at 0x14a0d7d6438>

In [17]:
# Define
sequential_model = Sequential()
sequential_model.add(Dense(units=numFeatures * 4, activation='relu', input_shape=(numFeatures,)))
sequential_model.add(Dense(units=int(numFeatures * 2), activation='softmax' ))
sequential_model.add(Dense(units=int(numFeatures)))
#sequential_model.add(Dropout(0.05, noise_shape=None, seed=None))
sequential_model.add(ActivityRegularization(l1=1000.0, l2=1000.0))

# Compile
sequential_model.compile(loss='sparse_categorical_crossentropy',
              optimizer=sgd,
              metrics=['accuracy'])
# Fit
sequential_model.fit(train_data.values, train_labels.values, epochs=3, batch_size=100)

Epoch 1/3
Epoch 2/3
Epoch 3/3


<keras.callbacks.History at 0x14a7918ef98>

In [18]:
# Define
sequential_model = Sequential()
sequential_model.add(Dense(units=numFeatures * 2, activation='relu', input_shape=(numFeatures,)))
sequential_model.add(Dense(units=int(numFeatures), activation='softmax' ))
#sequential_model.add(Dropout(0.05, noise_shape=None, seed=None))
sequential_model.add(ActivityRegularization(l1=1000.0, l2=1000.0))

# Compile
sequential_model.compile(loss='sparse_categorical_crossentropy',
              optimizer='sgd',
              metrics=['accuracy'])
# Fit
sequential_model.fit(train_data.values, train_labels.values, epochs=3, batch_size=100)

Epoch 1/3
Epoch 2/3
Epoch 3/3


<keras.callbacks.History at 0x14a01a63e10>

In [19]:
# Define
sequential_model = Sequential()
sequential_model.add(Dense(units=numFeatures * 4, activation='relu', input_shape=(numFeatures,)))
sequential_model.add(Dense(units=int(numFeatures * 2), activation='softmax' ))
sequential_model.add(Dense(units=int(numFeatures)))
#sequential_model.add(Dropout(0.05, noise_shape=None, seed=None))
sequential_model.add(ActivityRegularization(l1=1000.0, l2=1000.0))

# Compile
sequential_model.compile(loss='sparse_categorical_crossentropy',
              optimizer='sgd',
              metrics=['accuracy'])
# Fit
sequential_model.fit(train_data.values, train_labels.values, epochs=3, batch_size=100)

Epoch 1/3
Epoch 2/3
Epoch 3/3


<keras.callbacks.History at 0x14a02352f98>

In [20]:
# Define
sequential_model = Sequential()
sequential_model.add(Dense(units=numFeatures * 4, activation='sigmoid', input_shape=(numFeatures,)))
sequential_model.add(Dense(units=int(numFeatures * 2), activation='softmax' ))
sequential_model.add(Dense(units=int(numFeatures)))
#sequential_model.add(Dropout(0.05, noise_shape=None, seed=None))
sequential_model.add(ActivityRegularization(l1=1000.0, l2=1000.0))

# Compile
sequential_model.compile(loss='sparse_categorical_crossentropy',
              optimizer='sgd',
              metrics=['accuracy'])
# Fit
sequential_model.fit(train_data.values, train_labels.values, epochs=3, batch_size=100)

Epoch 1/3
Epoch 2/3
Epoch 3/3


<keras.callbacks.History at 0x14a02d06e48>

In [21]:
# Define
sequential_model = Sequential()
sequential_model.add(Dense(units=numFeatures * 4, activation='sigmoid', input_shape=(numFeatures,)))
sequential_model.add(Dense(units=int(numFeatures * 2), activation='softmax' ))
sequential_model.add(Dense(units=int(numFeatures)))
#sequential_model.add(Dropout(0.05, noise_shape=None, seed=None))
sequential_model.add(ActivityRegularization(l1=10.0, l2=10.0))

# Compile
sequential_model.compile(loss='sparse_categorical_crossentropy',
              optimizer='sgd',
              metrics=['accuracy'])
# Fit
sequential_model.fit(train_data.values, train_labels.values, epochs=3, batch_size=100)

Epoch 1/3
Epoch 2/3
Epoch 3/3


<keras.callbacks.History at 0x14a01a524a8>

In [22]:
# Define
sequential_model = Sequential()
sequential_model.add(Dense(units=numFeatures * 4, activation='sigmoid', input_shape=(numFeatures,)))
sequential_model.add(Dense(units=int(numFeatures * 2), activation='softmax' ))
sequential_model.add(Dense(units=int(numFeatures)))
#sequential_model.add(Dropout(0.05, noise_shape=None, seed=None))
sequential_model.add(ActivityRegularization(l1=10.0, l2=10.0))

# Compile
sequential_model.compile(loss='sparse_categorical_crossentropy',
              optimizer='adam',
              metrics=['accuracy'])
# Fit
sequential_model.fit(train_data.values, train_labels.values, epochs=3, batch_size=100)

Epoch 1/3
Epoch 2/3
Epoch 3/3


<keras.callbacks.History at 0x14a0ae5ae10>

In [23]:
# Define
sequential_model = Sequential()
sequential_model.add(Dense(units=numFeatures * 4, activation='relu', input_shape=(numFeatures,)))
sequential_model.add(Dense(units=int(numFeatures * 2), activation='softmax' ))
sequential_model.add(Dense(units=int(numFeatures)))
#sequential_model.add(Dropout(0.05, noise_shape=None, seed=None))
sequential_model.add(ActivityRegularization(l1=10.0, l2=10.0))

# Compile
sequential_model.compile(loss='sparse_categorical_crossentropy',
              optimizer='adam',
              metrics=['accuracy'])
# Fit
sequential_model.fit(train_data.values, train_labels.values, epochs=3, batch_size=100)

Epoch 1/3
Epoch 2/3
Epoch 3/3


<keras.callbacks.History at 0x14a0b31cfd0>

In [24]:
# Define
sequential_model = Sequential()
sequential_model.add(Dense(units=numFeatures * 4, activation='relu', input_shape=(numFeatures,)))
sequential_model.add(Dense(units=int(numFeatures * 2), activation='softmax' ))
sequential_model.add(Dense(units=int(numFeatures)))
#sequential_model.add(Dropout(0.05, noise_shape=None, seed=None))
sequential_model.add(ActivityRegularization(l1=1000.0, l2=1000.0))

# Compile
sequential_model.compile(loss='sparse_categorical_crossentropy',
              optimizer='adam',
              metrics=['accuracy'])
# Fit
sequential_model.fit(train_data.values, train_labels.values, epochs=3, batch_size=100)

Epoch 1/3
Epoch 2/3
Epoch 3/3


<keras.callbacks.History at 0x14a0b982b70>

In [None]:
# Score
sequential_model.evaluate(dev_data, dev_labels, batch_size=128)

# Predict
keras_seq_result = sequential_model.predict(test_data, batch_size=128)

# Output
keras_seq_result = pandas.DataFrame(keras_seq_result)
keras_seq_result = pandas.get_dummies(keras_seq_result, prefix='', prefix_sep='')
# Add null categories to make kaggle happy
keras_seq_result = keras_seq_result.T.reindex(header).T.fillna(0)
keras_seq_result.to_csv("output_keras_seq.csv", compression='gzip', chunksize=1000)

#### Dense

In [None]:
# (1) Parameters
tf.reset_default_graph()

# Constants
testX = tf.constant(dev_data.values, dtype=tf.float32)
hiddenlayer1_size = 2
hiddenlayer2_size = 1
miniBatchSize = 1

# placeholders
x_ = tf.placeholder(tf.float32, shape=[None, numFeatures], name='x')
y_ = tf.placeholder(tf.int32, shape=[None], name='y')

# and Variables
w1 = tf.get_variable('w1', shape=[numFeatures, hiddenlayer1_size])
b1 = tf.get_variable('b1', shape=[hiddenlayer1_size])
w2 = tf.get_variable('w2', shape=[hiddenlayer1_size, numClasses])
b2 = tf.get_variable('b3', shape=[numClasses])


# (2) Model
def model(input_layer):
    hidden_layer1 = tf.nn.sigmoid(tf.matmul(input_layer, w1) + b1)
    output_layer = tf.nn.softmax(tf.matmul(hidden_layer1, w2) + b2)
    return output_layer

# (2) Model
def model_r(input_layer):
    hidden_layer1 = tf.nn.relu(tf.matmul(input_layer, w1) + b1)
    output_layer = tf.nn.softmax(tf.matmul(hidden_layer1, w2) + b2)
    return output_layer
    

# (3) Cost
def cost(data, labels):
    cc = tf.sqrt(tf.square(labels - model(data)))
    return  cc

# (4) Ojbective (and solver)
y_one_hot = tf.one_hot(y_, numClasses)
cc = cost(x_, y_one_hot)
gd = tf.train.GradientDescentOptimizer(0.1)
step = gd.minimize(cc)
test_preds = model(testX)
test_preds_r = model_r(testX)
output = ""
with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    cost_vec = []
    cost_vec_r = []
    for i in range(15):
        print (i)
        for start, end in zip(range(0, numSamples, miniBatchSize), range(miniBatchSize, numSamples, miniBatchSize)):
            batch = train_data.values[start:end], train_labels[start:end]
            _, cost, test__preds_r = sess.run([step, cc, test_preds_r], feed_dict={x_: batch[0], y_: batch[1]})
    
    prediction=tf.argmax(test_preds_r,axis=1)
    output = prediction.eval(feed_dict={x_: test_data.values})
    print ("hi")