In [1]:
from pyspark.ml import Pipeline
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.feature import HashingTF, Tokenizer
from pyspark.sql import Row
from pyspark.sql.functions import UserDefinedFunction, col
from pyspark.sql.types import *
from functools import reduce
import matplotlib.pyplot as plt

Starting Spark application


ID,YARN Application ID,Kind,State,Spark UI,Driver log,Current session?
4,application_1520184071391_0008,pyspark3,idle,Link,Link,✔


SparkSession available as 'spark'.


In [2]:
# Load the four yearly extracts of the Chicago crime data. Every file is read
# with the same options: first row as header, column types inferred by Spark.
crime_one, crime_two, crime_three, crime_four = [
    spark.read.csv(csv_path, header=True, inferSchema=True)
    for csv_path in (
        "wasb:///ChicagoCrimeData/Chicago_Crimes_2001_to_2004.csv",
        "wasb:///ChicagoCrimeData/Chicago_Crimes_2005_to_2007.csv",
        "wasb:///ChicagoCrimeData/Chicago_Crimes_2008_to_2011.csv",
        "wasb:///ChicagoCrimeData/Chicago_Crimes_2012_to_2017.csv",
    )
]

In [None]:
# Inspect the inferred schema of the 2001-2004 extract.
crime_one.printSchema()

In [None]:
# Inspect the inferred schema of the 2005-2007 extract.
crime_two.printSchema()

In [None]:
# Inspect the inferred schema of the 2008-2011 extract.
crime_three.printSchema()

In [None]:
# Inspect the inferred schema of the 2012-2017 extract.
crime_four.printSchema()

In [3]:
# Align crime_one's column types with crime_two's before the union. All casts
# are done in a single select(): calling withColumn() once per column in a loop
# adds one projection to the query plan per call, which bloats the plan.
crime_schema = {field.name: field.dataType for field in crime_two.schema.fields}
crime_one_cols = crime_one.columns
crime_one = crime_one.select(
    [crime_one[name].cast(crime_schema[name]).alias(name) for name in crime_one_cols]
)

# Stack the four extracts into one DataFrame. union() matches columns by
# position, not by name, so all four extracts must share the same column order.
ChicagoCrime = crime_one.union(crime_two).union(crime_three).union(crime_four)

# Rename every column positionally to a name without spaces.
OldColumnNames = ChicagoCrime.columns
NewColumnNames = [
    '_c0', 'ID', 'CaseNumber', 'Date', 'Block', 'IUCR', 'PrimaryType',
    'Description', 'LocationDescription', 'Arrest', 'Domestic', 'Beat',
    'District', 'Ward', 'CommunityArea', 'FBICode', 'XCoordinate',
    'YCoordinate', 'Year', 'UpdatedOn', 'Latitude', 'Longitude', 'Location',
]
# Fail fast if the file layout changed: a positional rename against a list of
# the wrong length would attach names to the wrong columns.
if len(OldColumnNames) != len(NewColumnNames):
    raise ValueError(
        "Expected {} columns but the combined data has {}: {}".format(
            len(NewColumnNames), len(OldColumnNames), OldColumnNames
        )
    )
ChicagoCrime = ChicagoCrime.toDF(*NewColumnNames)

# Keep only the columns used in the rest of the notebook (this drops the
# unnamed index column and the coordinate/location columns).
ChicagoCrime_Cols = [
    'ID', 'CaseNumber', 'Date', 'Block', 'IUCR', 'PrimaryType', 'Description',
    'LocationDescription', 'Arrest', 'Domestic', 'Beat', 'District', 'Ward',
    'CommunityArea', 'FBICode', 'Year', 'UpdatedOn',
]
ChicagoCrime = ChicagoCrime.select(ChicagoCrime_Cols)
ChicagoCrime.printSchema()

root
 |-- ID: integer (nullable = true)
 |-- CaseNumber: string (nullable = true)
 |-- Date: string (nullable = true)
 |-- Block: string (nullable = true)
 |-- IUCR: string (nullable = true)
 |-- PrimaryType: string (nullable = true)
 |-- Description: string (nullable = true)
 |-- LocationDescription: string (nullable = true)
 |-- Arrest: boolean (nullable = true)
 |-- Domestic: boolean (nullable = true)
 |-- Beat: integer (nullable = true)
 |-- District: double (nullable = true)
 |-- Ward: double (nullable = true)
 |-- CommunityArea: double (nullable = true)
 |-- FBICode: string (nullable = true)
 |-- Year: integer (nullable = true)
 |-- UpdatedOn: string (nullable = true)

In [4]:
# DataFrames are immutable: fillna() returns a new DataFrame, so the result has
# to be assigned back or the call has no effect on ChicagoCrime.
# Because the fill value is a string, only the string columns in the subset are
# filled; non-string columns listed in ChicagoCrime_Cols are left unchanged.
ChicagoCrime = ChicagoCrime.fillna('Missing Data', ChicagoCrime_Cols)
ChicagoCrime

DataFrame[ID: int, CaseNumber: string, Date: string, Block: string, IUCR: string, PrimaryType: string, Description: string, LocationDescription: string, Arrest: boolean, Domestic: boolean, Beat: int, District: double, Ward: double, CommunityArea: double, FBICode: string, Year: int, UpdatedOn: string]

In [5]:
# Expose the DataFrame to Spark SQL under the name "ChicagoCrime".
# createOrReplaceTempView() is the replacement for registerTempTable(), which
# has been deprecated since Spark 2.0; re-running this cell simply replaces
# the existing view.
ChicagoCrime.createOrReplaceTempView("ChicagoCrime")

In [None]:
%%sql
-- Preview the first 10 rows of the registered ChicagoCrime table.
SELECT *
FROM ChicagoCrime
LIMIT 10

In [None]:
%%sql
-- List each primary crime type that occurs in the data exactly once.
SELECT DISTINCT PrimaryType
FROM ChicagoCrime

In [6]:
# Create a new DataFrame from a query over the ChicagoCrime table, adding two
# derived columns:
#   SeriousCrime          - 'SERIOUS CRIME' / 'NON-SERIOUS CRIME' text label
#   SeriousCrimeIndicator - 1 / 0 flag with the same meaning
# The list of serious crime types is defined once so that both CASE expressions
# always classify a row the same way.
SERIOUS_CRIME_TYPES = (
    'OFFENSE INVOLVING CHILDREN',
    'ARSON',
    'DOMESTIC VIOLENCE',
    'ASSAULT',
    'ROBBERY',
    'HOMICIDE',
    'CRIM SEXUAL ASSAULT',
    'SEX OFFENSE',
    'BURGLARY',
)
# Render the tuple as a quoted, comma-separated SQL list. The values are fixed
# constants from this cell, not user input.
serious_types_sql = ", ".join(
    "'{}'".format(crime_type) for crime_type in SERIOUS_CRIME_TYPES
)

ChicagoCrimeLabeled = spark.sql("""
    SELECT ID, CaseNumber, Date, Block, IUCR, PrimaryType, Description,
           LocationDescription, Arrest, Domestic, Beat, District, Ward,
           CommunityArea, FBICode, Year, UpdatedOn,
           CASE
               WHEN PrimaryType IN ({serious_types}) THEN 'SERIOUS CRIME'
               ELSE 'NON-SERIOUS CRIME'
           END AS SeriousCrime,
           CASE
               WHEN PrimaryType IN ({serious_types}) THEN 1
               ELSE 0
           END AS SeriousCrimeIndicator
    FROM ChicagoCrime
""".format(serious_types=serious_types_sql))

In [8]:
# Print the schema of the labelled DataFrame to confirm that the derived
# SeriousCrime and SeriousCrimeIndicator columns are present.
ChicagoCrimeLabeled.printSchema()

An error was encountered:
Session 4 unexpectedly reached final status 'error'. See logs:



## Work in progress: ignore the cells below. They have not been run successfully, because an error occurs when trying to fit the model.

In [None]:
%%local
%matplotlib inline

labels = ChicagoCrimeCount['SeriousCrimeIndicator']
sizes = ChicagoCrimeCount['SeriousCrimeCount']
colors = ['red', 'blue']
plt.pie(sizes, labels = labels, autopct='%1.1f%%', colors = colors)
plt.axis('equal')

In [None]:
def labelForResults(s):
    """Map a primary crime type to a numeric class label.

    Args:
        s: The crime type to classify (compared by exact, case-sensitive
            equality).

    Returns:
        1.0 if ``s`` is one of the nine crime types treated as serious,
        otherwise 0.0 (this includes ``None`` and unknown values).
    """
    serious_types = (
        'OFFENSE INVOLVING CHILDREN',
        'ARSON',
        'DOMESTIC VIOLENCE',
        'ASSAULT',
        'ROBBERY',
        'HOMICIDE',
        'CRIM SEXUAL ASSAULT',
        'SEX OFFENSE',
        'BURGLARY',
    )
    return 1.0 if s in serious_types else 0.0
# Wrap the Python labelling function as a Spark UDF that returns a double.
label = UserDefinedFunction(labelForResults, DoubleType())

# Build the training frame: the numeric label derived from PrimaryType plus the
# LocationDescription text. The filter keeps rows whose label is >= 0
# (labelForResults itself only ever returns 0.0 or 1.0).
labeledData = (
    ChicagoCrime
    .select(
        label(ChicagoCrime['PrimaryType']).alias('label'),
        ChicagoCrime['LocationDescription'],
    )
    .filter('label >= 0')
)

In [None]:
# Fetch a single row as a quick sanity check that the labelled data can be
# computed.
labeledData.take(1)

In [None]:
# Text-classification pipeline over LocationDescription:
#   1. Tokenizer          - LocationDescription -> "words"
#   2. HashingTF          - "words" -> "features"
#   3. LogisticRegression - fitted with 10 iterations and regParam 0.01
tokenizer = Tokenizer(inputCol="LocationDescription", outputCol="words")
hashingTF = HashingTF(inputCol=tokenizer.getOutputCol(), outputCol="features")
lr = LogisticRegression(regParam=0.01, maxIter=10)

pipeline = Pipeline(stages=[tokenizer, hashingTF, lr])

# Fit all three stages on the labelled data and display the fitted model.
model = pipeline.fit(labeledData)
model