In [19]:
import pyspark

In [20]:
## Pyspark
from pyspark.sql import SQLContext
from pyspark import SparkContext, SparkConf
from pyspark.sql.functions import col
from pyspark.sql.types import IntegerType

## Numerical frameworks
import numpy as np
import pandas as pd

## Matplotlib
import matplotlib.pyplot as plt

## Bokeh
from bokeh.io import show, output_file, output_notebook
from bokeh.palettes import Spectral6
from bokeh.plotting import figure
from bokeh.transform import factor_cmap
from bokeh.io import show
from bokeh.models import (
    ColumnDataSource,
    HoverTool,
    LinearColorMapper,
    BasicTicker,
    PrintfTickFormatter,
    ColorBar
)

from bokeh.layouts import column, row, WidgetBox
from bokeh.application.handlers import FunctionHandler
from bokeh.application import Application

In [21]:
sc = SparkContext.getOrCreate()
sqlContext = SQLContext(sc)

In [22]:
df = sqlContext.read.format('com.databricks.spark.csv')\
    .options(header='true', inferschema='true')\
    .load('data.csv')

In [23]:
df.printSchema()

root
 |-- ID: integer (nullable = true)
 |-- Case Number: string (nullable = true)
 |-- Date: string (nullable = true)
 |-- Block: string (nullable = true)
 |-- IUCR: string (nullable = true)
 |-- Primary Type: string (nullable = true)
 |-- Description: string (nullable = true)
 |-- Location Description: string (nullable = true)
 |-- Arrest: boolean (nullable = true)
 |-- Domestic: boolean (nullable = true)
 |-- Beat: integer (nullable = true)
 |-- District: integer (nullable = true)
 |-- Ward: integer (nullable = true)
 |-- Community Area: integer (nullable = true)
 |-- FBI Code: string (nullable = true)
 |-- X Coordinate: integer (nullable = true)
 |-- Y Coordinate: integer (nullable = true)
 |-- Year: integer (nullable = true)
 |-- Updated On: string (nullable = true)
 |-- Latitude: double (nullable = true)
 |-- Longitude: double (nullable = true)
 |-- Location: string (nullable = true)



In [24]:
df.groupBy("Primary Type") \
    .count() \
    .orderBy(col("count").desc()) \
    .show()

+--------------------+-------+
|        Primary Type|  count|
+--------------------+-------+
|               THEFT|1371774|
|             BATTERY|1196547|
|     CRIMINAL DAMAGE| 751428|
|           NARCOTICS| 701699|
|       OTHER OFFENSE| 406530|
|             ASSAULT| 403897|
|            BURGLARY| 379662|
| MOTOR VEHICLE THEFT| 307555|
|             ROBBERY| 249023|
|  DECEPTIVE PRACTICE| 248402|
|   CRIMINAL TRESPASS| 188642|
|        PROSTITUTION|  67754|
|   WEAPONS VIOLATION|  66762|
|PUBLIC PEACE VIOL...|  46812|
|OFFENSE INVOLVING...|  43603|
| CRIM SEXUAL ASSAULT|  25816|
|         SEX OFFENSE|  24164|
|            GAMBLING|  14234|
|INTERFERENCE WITH...|  14209|
|LIQUOR LAW VIOLATION|  13856|
+--------------------+-------+
only showing top 20 rows



### Number of crime over the years

In [25]:
year_list = df.groupBy("Year") \
    .count() \
    .orderBy(col("count").desc()).toPandas()

In [14]:
p = figure(title="Chicago crimes history",
           x_range= years, y_range = list(reversed(months)),
           x_axis_location="above", plot_width=900, plot_height=400,
           toolbar_location='below')

p.vbar(x=Year, width=0.8, fill_alpha=0.2, line_color=None, legend="MPG 1 stddev", fill_color='red', line_color='black')

show(p)

SyntaxError: keyword argument repeated (<ipython-input-14-1a43b4e25164>, line 6)

In [86]:
# Create the blank plot
source = year_list

p = figure(plot_height = 300, plot_width = 700, 
           title = 'Histogram of crimes from 2001 to 2018',
           x_axis_label = 'Years',
           y_axis_label = 'Number of crimes')

# Add a quad glyph
p.vbar(x='Year', top='count', width=1, source = source,
       line_color='black', fill_color='red', fill_alpha = 0.75,
           hover_fill_alpha = 1.0, hover_fill_color = 'navy')

p.y_range.start = 0
p.x_range.start = 2000
p.xgrid.grid_line_color = None
p.xaxis.major_label_orientation = 1.2
p.outline_line_color = None

# Add a hover tool referring to the formatted columns
hover = HoverTool(tooltips = [('Year', '@Year'),
                             ('Num of crimes', '@count')])


# Add the hover tool to the graph
p.add_tools(hover)

show(p)

In [35]:
# bokeh basics
from bokeh.plotting import figure
from bokeh.io import show, output_notebook

# Create a blank figure with labels
p = figure(plot_width = 600, plot_height = 600, 
           title = 'Example Glyphs',
           x_axis_label = 'X', y_axis_label = 'Y')

# Example data
squares_x = [1, 3, 4, 5, 8]
squares_y = [8, 7, 3, 1, 10]
circles_x = [9, 12, 4, 3, 15]
circles_y = [8, 4, 11, 6, 10]

# Add squares glyph
p.square(squares_x, squares_y, size = 12, color = 'navy', alpha = 0.6)
# Add circle glyph
p.circle(circles_x, circles_y, size = 12, color = 'red')

# Set to output the plot in the notebook
output_notebook()
# Show the plot
show(p)

In [19]:
yr_month.head()

Unnamed: 0,Year,Month,count
0,2018,January,19702
1,2018,March,6361
2,2018,February,16577
3,2017,March,20436
4,2017,August,24580


### Month wise crime visualization

In [16]:
years = np.unique(yr_month['Year'])
number_of_months = 12
yr_month_matrix = np.zeros((len(years), number_of_months))

NameError: name 'yr_month' is not defined

In [15]:
yr_month.head()

NameError: name 'yr_month' is not defined

In [79]:
yr_month.head()

Unnamed: 0,Year,Month,count
0,2018,1,19702
1,2018,3,6361
2,2018,2,16577
3,2017,3,20436
4,2017,8,24580


In [31]:
yr_month = df_date.groupby('Year', 'Month').count().orderBy(col('Year').desc()).toPandas()

In [35]:
yr_month_matrix = pd.DataFrame(yr_month_matrix, columns = months)

In [37]:
data['Year'] = data['Year'].astype(str)
data = data.set_index('Year')
data.columns.name = 'Month'

years = list(data.index)
months = list(data.columns)

In [40]:
df_yr = pd.DataFrame(data.stack(), columns=['rate']).reset_index()

In [36]:
yr_month_matrix['Year'] = years
data = yr_month_matrix

## Interactive individual crime vizualization

In [122]:
from bokeh.models.widgets import CheckboxGroup
from bokeh.models.widgets import RadioButtonGroup

In [96]:
individual_crimes = df.groupBy("Primary Type") \
                      .count() \
                      .orderBy(col("count").desc()) \
                      .toPandas()

In [123]:
crimes_chk = CheckboxGroup(labels = list(individual_crimes['Primary Type'][0:10]), 
                                  active = [0, 1]) 

In [125]:
crimes_rdbtn = RadioButtonGroup(labels = list(individual_crimes['Primary Type'][0:10])) 

In [134]:
def modify_doc(doc):
    
    def make_dataset(crime_type):
        ind_crime = df_date.filter(df_date["Primary Type"] == crime_type)\
        .groupby('Year', 'Month') \
        .count()\
        .orderBy(col('Year')\
        .desc())\
        .toPandas()

        return ind_crime_count

    def make_plot(yr_month):

        months = ['January', 'February', 'March', 'April', 'May', 'June', 'July', \
                  'August', 'September', 'October', 'November', 'December']
        yr_month['Month'] = yr_month['Month'].map(lambda x: months[x-1])

        years = np.unique(yr_month['Year'])
        number_of_months = 12
        yr_month_matrix = np.zeros((len(years), number_of_months))

        for i, year in enumerate(years):
            no_yr = yr_month[yr_month['Year'] == year]

            for j in range(len(no_yr)):
                yr_month_matrix[i][j] = no_yr.iloc[j]['count']

        yr_month_matrix = pd.DataFrame(yr_month_matrix, columns = months)

        yr_month_matrix['Year'] = years
        data = yr_month_matrix

        data['Year'] = data['Year'].astype(str)
        data = data.set_index('Year')
        data.columns.name = 'Month'

        years = list(data.index)
        months = list(data.columns)

        df_yr = pd.DataFrame(data.stack(), columns=['rate']).reset_index()

        # this is the colormap from the original NYTimes plot
        colors = ["#75968f", "#a5bab7", "#c9d9d3", "#e2e2e2", "#dfccce", "#ddb7b1", "#cc7878", "#933b41", "#550b1d"]

        mapper = LinearColorMapper(palette=colors, low=df_yr.rate.min(), high=df_yr.rate.max())
        source = ColumnDataSource(df_yr)

        p = figure(title="Chicago crimes ({0} - {1})".format(years[0], years[-1]),
                   x_range= years, y_range = list(reversed(months)),
                   x_axis_location="above", plot_width=900, plot_height=400,
                   toolbar_location='below')

        p.rect(x="Year", y="Month", width=1, height=1,
               source=source,
               fill_color={'field': 'rate', 'transform': mapper},
               line_color='black')

        color_bar = ColorBar(color_mapper=mapper, major_label_text_font_size="6pt",
                             ticker=BasicTicker(desired_num_ticks=len(colors)),
                             label_standoff = 6, border_line_color= None, location=(0, 0))

        p.add_layout(color_bar, 'right')

        p.grid.grid_line_color = None
        p.axis.axis_line_color = None
        p.axis.major_tick_line_color = None
        p.axis.major_label_text_font_size = "12pt"
        p.axis.major_label_standoff = 0
        p.xaxis.major_label_orientation = 1.55

        # Add a hover tool referring to the formatted columns
        hover = HoverTool(tooltips = [('Date', '@Month @Year'),('No of crimes', '@rate')])
        p.add_tools(hover)  
        return p
    

    def update(attr, old, new):
        new_src = make_dataset(crime_type)
        src.data.update(new_src.data)
        

    crimes = CheckboxGroup(labels=available_carriers, active = [0, 1])
    crimes.on_change('active', update)

    controls = WidgetBox(crimes)

    selected_crime = crimes.active

    src = make_dataset(selected_crime)

    p = make_plot(src)

    layout = row(controls, p)
    doc.add_root(layout)

In [136]:
# Set up an application
handler = FunctionHandler(modify_doc)
app = Application(handler)

In [145]:
crimes.active

[0, 1]

In [34]:
import numpy as np

from bokeh.io import curdoc
from bokeh.layouts import row, column, widgetbox, gridplot
from bokeh.models import ColumnDataSource
from bokeh.models.widgets import Slider, TextInput, MultiSelect
from bokeh.plotting import figure
from bokeh.sampledata.autompg import autompg

In [146]:
import matplotlib.pyplot as plt

In [147]:
show(app)

ERROR:tornado.application:Uncaught exception GET /autoload.js?bokeh-autoload-element=e76acc88-946c-48e4-aa6c-8ce4b1e7d45e&bokeh-absolute-url=http://localhost:59918 (::1)
HTTPServerRequest(protocol='http', host='localhost:59918', method='GET', uri='/autoload.js?bokeh-autoload-element=e76acc88-946c-48e4-aa6c-8ce4b1e7d45e&bokeh-absolute-url=http://localhost:59918', version='HTTP/1.1', remote_ip='::1', headers={'Host': 'localhost:59918', 'Connection': 'keep-alive', 'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.162 Safari/537.36', 'Accept': '*/*', 'Referer': 'http://localhost:8888/lab', 'Accept-Encoding': 'gzip, deflate, br', 'Accept-Language': 'en-US,en;q=0.9', 'Cookie': '_ga=GA1.1.312361907.1498613469; _xsrf=2|b144c817|54a1c4b59c7e922835d81d9bf5eb2615|1520450803; username-localhost-8891="2|1:0|10:1520539164|23:username-localhost-8891|44:YjQxMmRhYmEwMTFmNGViNGExZDg2NjNkMGUzZmRiMmU=|874a15fea66163564314d84d3d66b7478d6c

In [41]:
for w in [mpg, cyl, hp, yr, origin]:
    w.on_change('value', update_data)

# Set up layouts and add to document
inputs_1 = widgetbox(mpg, cyl, hp)
inputs_2 = widgetbox(yr, origin)
inputs_row = row(inputs_1, inputs_2, width=800)
layout = column(inputs_row, p, width=800)

curdoc().add_root(layout)
curdoc().title = "AutoMPG"

In [33]:
autompg

Unnamed: 0,mpg,cyl,displ,hp,weight,accel,yr,origin,name
0,18.0,8,307.0,130,3504,12.0,70,1,chevrolet chevelle malibu
1,15.0,8,350.0,165,3693,11.5,70,1,buick skylark 320
2,18.0,8,318.0,150,3436,11.0,70,1,plymouth satellite
3,16.0,8,304.0,150,3433,12.0,70,1,amc rebel sst
4,17.0,8,302.0,140,3449,10.5,70,1,ford torino
5,15.0,8,429.0,198,4341,10.0,70,1,ford galaxie 500
6,14.0,8,454.0,220,4354,9.0,70,1,chevrolet impala
7,14.0,8,440.0,215,4312,8.5,70,1,plymouth fury iii
8,14.0,8,455.0,225,4425,10.0,70,1,pontiac catalina
9,15.0,8,390.0,190,3850,8.5,70,1,amc ambassador dpl
