In [None]:
%matplotlib inline
import pandas as pd
import os
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import sklearn
import scipy.stats as st
import random
from datetime import datetime

In [None]:
# os.path.dirname() of the literal string '__file__' is '', so the paths
# below are resolved relative to the current working directory.
cur_dir = os.path.dirname('__file__')
data_dir = os.path.join(cur_dir, "data")

# Read the train and test CSVs from the data/ folder.
train = pd.read_csv(os.path.join(data_dir, "train.csv"))
test = pd.read_csv(os.path.join(data_dir, "test.csv"))

In [None]:
def get_random_subset(df, n):
    """Return `n` rows of `df` chosen at random without replacement.

    NOTE: a second `get_random_subset` (with a default value for `n`) is
    defined further down in this cell and replaces this one once the cell
    has run.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to sample from (anything supporting `len()` and `.iloc`).
    n : int
        Number of rows to draw.

    Returns
    -------
    pandas.DataFrame
        The selected rows, in the order they were drawn.

    Raises
    ------
    ValueError
        Raised by `random.sample` when `n` exceeds `len(df)`.
    """
    # `range` instead of the Python-2-only `xrange`, so this also runs on
    # Python 3 (and still works on Python 2).
    sub = random.sample(range(len(df)), n)
    return df.iloc[sub]

def preprocess(df):
    """Return a cleaned copy of `df` with derived time features added.

    Steps:
      * rows whose `X` equals the column maximum are removed, and then the
        row holding the largest remaining `X` is removed as well;
      * `Dates` is parsed with `get_datetime`;
      * the columns `Hour`, `Month`, `Hour_Minutes`, `Minutes_Since_03`,
        `Minutes_Since_New_Year` and `DOW` are added.

    Parameters
    ----------
    df : pandas.DataFrame
        Must provide the columns `X`, `Dates` and `DayOfWeek`.

    Returns
    -------
    pandas.DataFrame
        A new frame; `df` itself is left untouched.
    """
    res = df.copy()
    # Remove every row sitting at the maximum X value.
    res = res[res.X != res.X.max()]
    # NOTE(review): this additionally removes the row with the largest
    # remaining X -- behavior kept from the original code; confirm it is
    # intended. Done without `inplace` so no filtered view is mutated.
    res = res.drop(res.X.idxmax())
    datetimes = res.Dates.apply(get_datetime)
    res['Hour'] = datetimes.apply(lambda dt: dt.hour)
    res['Month'] = datetimes.apply(lambda dt: dt.month)
    # Time of day as fractional hours, e.g. 13:30 -> 13.5.
    res['Hour_Minutes'] = datetimes.apply(lambda dt: dt.hour + dt.minute / 60.0)
    # Minutes elapsed since 2003-01-01 00:00.
    res['Minutes_Since_03'] = datetimes.apply(lambda dt: (dt-datetime(2003, 1, 1)).total_seconds() / 60)
    # Minutes elapsed since the start of the row's own calendar year.
    res['Minutes_Since_New_Year'] = datetimes.apply(lambda dt: (dt-datetime(dt.year, 1, 1)).total_seconds() / 60)
    # Use the frame being processed (not the global `train`) so the function
    # also gives correct results for other frames such as `test`.
    # The value is the day name's position in the module-level `dow` list.
    res['DOW'] = res.DayOfWeek.apply(lambda x: dow.index(x))
    return res

def get_datetime(s):
    """Parse a timestamp string laid out as 'YYYY-MM-DD HH:MM:SS'.

    Returns the corresponding `datetime`; `datetime.strptime` raises
    ValueError when `s` does not match that layout.
    """
    return datetime.strptime(s, "%Y-%m-%d %H:%M:%S")

def get_random_subset(df, n=5000):
    """Return `n` rows of `df` chosen at random without replacement.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to sample from (anything supporting `len()` and `.iloc`).
    n : int, optional
        Number of rows to draw; defaults to 5000.

    Returns
    -------
    pandas.DataFrame
        The selected rows, in the order they were drawn.

    Raises
    ------
    ValueError
        Raised by `random.sample` when `n` exceeds `len(df)`.
    """
    # `range` instead of the Python-2-only `xrange`, so this also runs on
    # Python 3 (and still works on Python 2).
    sub = random.sample(range(len(df)), n)
    return df.iloc[sub]

# Day names in calendar order, Monday first. A name's position in this list
# (what `dow.index(name)` returns) serves as its integer day code.
dow = [
    'Monday',
    'Tuesday',
    'Wednesday',
    'Thursday',
    'Friday',
    'Saturday',
    'Sunday',
]

In [None]:
# Replace the raw training frame with its preprocessed version.
# NOTE: this rebinds `train`, so the cell is not idempotent -- running it a
# second time feeds already-preprocessed data back into preprocess(), which
# again removes the rows with the largest X values. Reload the CSV first if
# this cell has to be re-run.
train = preprocess(train)

In [None]:
# Counts of the five most frequent categories inside each of the five most
# frequent police districts, one panel per district.
# "OTHER OFFENSES" is excluded before the categories are ranked.
most_common_cats = train.Category[train.Category != "OTHER OFFENSES"].value_counts().index[0:5]
most_common_districts = train.PdDistrict.value_counts().index[0:5]

# One panel per *district* (the loop below iterates over districts), so the
# grid is sized from the district list rather than the category list.
fig, axtuple = plt.subplots(1, len(most_common_districts), sharey=True)

for ax, district in zip(axtuple, most_common_districts):
    subset = train[(train.PdDistrict == district) & train.Category.isin(most_common_cats)]

    # Sorting by category keeps the bar order identical across panels.
    sns.countplot(x="Category", data=subset.sort_values("Category"), ax=ax)
    ax.set_title(district)
    # Rotate the category names so the long labels do not overlap.
    plt.sca(ax)
    plt.xticks(rotation=90)

fig.set_size_inches(16, 6, forward=True)


In [None]:
# Share of each district's incidents that falls into each of the five most
# frequent categories, one panel per district.
# "OTHER OFFENSES" is excluded before the categories are ranked.
most_common_cats = train.Category[train.Category != "OTHER OFFENSES"].value_counts().index[0:5]
most_common_districts = train.PdDistrict.value_counts().index[0:5]

# One panel per *district* (the loop below iterates over districts), so the
# grid is sized from the district list rather than the category list.
fig, axtuple = plt.subplots(1, len(most_common_districts), sharey=True)

for ax, district in zip(axtuple, most_common_districts):
    subset = train[train.PdDistrict == district]
    # Fraction of this district's rows per category, restricted to the
    # selected categories and ordered alphabetically.
    proportions = (subset.Category.value_counts().astype(float) / len(subset)).loc[most_common_cats]
    proportions = proportions.sort_index()

    sns.barplot(x=proportions.index, y=proportions, ax=ax)
    ax.set_title(district)
    ax.set_ylabel("")
    # Rotate the category names so the long labels do not overlap.
    plt.sca(ax)
    plt.xticks(rotation=90)

# Only the left-most panel carries the (shared) y-axis label.
axtuple[0].set_ylabel("Frequency")
fig.set_size_inches(16, 6, forward=True)


In [None]:
# 2-D kernel density estimate of incident coordinates for four categories,
# drawn on a 2x2 grid whose layout mirrors `cats`.
cats = [['LARCENY/THEFT', 'NON-CRIMINAL'], ['ASSAULT', 'DRUG/NARCOTIC']]

# Keep only the categories shown, then draw a random sample of those rows.
train_subset = train[train.Category.isin([c for row in cats for c in row])]
train_subset = get_random_subset(train_subset)

# Common plotting window: coordinate extent of the full training frame.
xmin, xmax = train.X.min(), train.X.max()
ymin, ymax = train.Y.min(), train.Y.max()

fig, axtuple = plt.subplots(2, 2, sharey=True, sharex=True)
rows, cols = axtuple.shape

# The 100x100 evaluation grid is identical for every panel, so it is built
# once here instead of inside the loop.
xx, yy = np.mgrid[xmin:xmax:100j, ymin:ymax:100j]
positions = np.vstack([xx.ravel(), yy.ravel()])

for j in range(rows):
    for i in range(cols):
        category = cats[j][i]
        subset = train_subset[train_subset.Category == category]

        # Perform the kernel density estimate and evaluate it on the grid.
        # The result is kept in `density` so the Figure handle is not
        # overwritten.
        kernel = st.gaussian_kde(np.vstack([subset.X, subset.Y]))
        density = np.reshape(kernel(positions).T, xx.shape)

        ax = axtuple[j, i]
        ax.set_xlim(xmin, xmax)
        ax.set_ylim(ymin, ymax)
        ax.set_title(category)
        ax.contourf(xx, yy, density, cmap='Blues')

fig.set_size_inches(16, 6, forward=True)

In [None]:
# The eight most frequent categories, with "OTHER OFFENSES" left out.
not_other = train.Category != "OTHER OFFENSES"
most_common_cats = train.Category[not_other].value_counts().index[:8]
# Day names ordered from most to least frequent (at most seven entries).
day_counts = train.DayOfWeek.value_counts().index[:7]

def isWeekend(day):
    """Classify a day name.

    Returns "Weekend" for "Saturday" or "Sunday" and "Weekday" for any
    other value.
    """
    return "Weekend" if day in ("Saturday", "Sunday") else "Weekday"


# Tag every row as "Weekend" or "Weekday".
train["Weekend"] = train["DayOfWeek"].map(isWeekend)

# The distinct day types, most frequent first.
day_type_counts = train.Weekend.value_counts().index

f, axtuple = plt.subplots(1, len(day_type_counts), sharey=True)

# One panel per day type: share of that day type's incidents per category.
for ax, day in zip(axtuple, day_type_counts):
    subset = train[train.Weekend == day]
    category_share = subset.Category.value_counts().astype(float) / len(subset)
    proportions = category_share.loc[most_common_cats].sort_index()

    sns.barplot(x=proportions.index, y=proportions, ax=ax)
    ax.set_title(day)
    ax.set_ylabel("")
    # Rotate the category names so the long labels do not overlap.
    plt.sca(ax)
    plt.xticks(rotation=90)

# Only the left-most panel carries the (shared) y-axis label.
axtuple[0].set_ylabel("Frequency")
plt.gcf().set_size_inches(16, 6, forward=True)

In [None]:
def plot_Category(df):
    """Draw a horizontal bar chart of the number of rows per category.

    The bar length is the count of non-null `Dates` values within each
    `Category`; bars are ordered by that count, ascending.

    Parameters
    ----------
    df : pandas.DataFrame
        Needs the columns `Category` and `Dates`.
    """
    # Only the `Dates` count is plotted, so count just that column.
    cat_Count = df.groupby("Category")["Dates"].count().sort_values(ascending=True)
    # Create the figure at its final size so that tight_layout() below fits
    # the labels to the size that is actually displayed.
    plt.figure(figsize=(16, 10))
    cat_Count.plot(kind="barh")
    # Plain numbers (no scientific notation) on the count axis.
    plt.ticklabel_format(style='plain', axis='x', scilimits=(0,0))
    plt.tight_layout()

plot_Category(train_subset)

In [None]:
# Incidents per day of the week for the eight most frequent categories,
# one line per category.
categories = train.Category.value_counts().index[0:8]
for c in categories:
    subset = train[train.Category == c]
    # DOW already holds an integer day code (position in `dow`), so counting
    # rows per code gives the per-day totals directly -- no binning needed.
    per_day = subset.DOW.value_counts().sort_index()
    plt.plot(per_day.index, per_day.values, label=c)
plt.legend(loc=2)
plt.gcf().set_size_inches(16, 10, forward=True)
# Pin one tick to each day code before labelling. Calling set_xticklabels()
# alone labels whatever ticks the automatic locator picked, which can put
# the day names on the wrong positions.
plt.xticks(np.arange(len(dow)), dow)
plt.xlabel("Day of week")
plt.ylabel("Number of incidents")

In [None]:

# Incidents per hour of the day for the six most frequent categories,
# one line per category.
categories = train.Category.value_counts().index[0:6]
# Bin edges at 1, 2, ..., 23.
bins = np.arange(1, 24, 1)
for c in categories:
    subset = train[train.Category == c]
    hour_bin = np.digitize(subset.Hour_Minutes, bins)
    by_bin = subset.Hour_Minutes.groupby(hour_bin)
    # x: mean time of day inside each bin, y: number of rows in the bin.
    plt.plot(by_bin.mean().values, by_bin.size().values, label=c)
plt.legend()
plt.gcf().set_size_inches(16, 6, forward=True)