In [2]:


from pyspark.sql.functions import *
# Import StringIndexer class
from pyspark.ml.feature import StringIndexer
# Import the one hot encoder class
from pyspark.ml.feature import OneHotEncoder
from pyspark.ml.feature import Bucketizer
# Import the necessary class
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.classification import RandomForestClassifier
# Import class for creating a pipeline

from pyspark.ml import Pipeline
# Import the logistic regression class
from pyspark.ml.classification import LogisticRegression

from pyspark.ml.evaluation import MulticlassClassificationEvaluator

In [3]:
#import SparkSession
from pyspark.sql import SparkSession
spark=SparkSession.builder.appName('SF_CRIME').getOrCreate()

In [4]:
df_sf = spark.read.format("csv").load("./Police_Department_Incident_Reports__Historical_2003_to_May_2018_1.csv",sep=',',header=True,inferSchema=True ,nullValue='NONE')
df_sf = df_sf.withColumnRenamed('Incident Code','IncidentCode')

In [5]:
df_sf = df_sf.withColumn("Hour", hour(col("Time")))\
              .withColumn("Minute", minute(col("Time")))\
              .withColumn("Date", to_date(col("Date"), "MM/dd/yyyy"))\
              .withColumn("Year", year(col("Date")))\
              .withColumn("Month", month(col("Date")))\
              .withColumn("Day", dayofmonth(col("Date")))\
              .withColumn("Week", weekofyear(col("Date")))\
              .withColumn("X", df_sf.X.cast("double"))\

SyntaxError: unexpected EOF while parsing (<ipython-input-5-2af2b324685f>, line 8)

In [4]:
# dropna
df_sf = df_sf.filter('PdDistrict is not NULL')

In [5]:
df_sf.dtypes

[('PdId', 'double'),
 ('IncidntNum', 'int'),
 ('IncidentCode', 'int'),
 ('Category', 'string'),
 ('Descript', 'string'),
 ('DayOfWeek', 'string'),
 ('Date', 'string'),
 ('Time', 'string'),
 ('PdDistrict', 'string'),
 ('Resolution', 'string'),
 ('Address', 'string'),
 ('X', 'double'),
 ('Y', 'double'),
 ('location', 'string')]

In [6]:
if df_sf.is_cached:
  df_sf.unpersist()
df_sf = df_sf.cache()
display(df_sf)

DataFrame[PdId: double, IncidntNum: int, IncidentCode: int, Category: string, Descript: string, DayOfWeek: string, Date: string, Time: string, PdDistrict: string, Resolution: string, Address: string, X: double, Y: double, location: string]

In [7]:
indexer_df_sf = StringIndexer(inputCol='Category',outputCol='Category_idx')
# Assign index values to strings
indexer_df_sf = indexer_df_sf.fit(df_sf)
# Create column with index values
df_sf = indexer_df_sf.transform(df_sf)
display(df_sf)
df_sf.select('Category','Category_idx').distinct().show()

DataFrame[PdId: double, IncidntNum: int, IncidentCode: int, Category: string, Descript: string, DayOfWeek: string, Date: string, Time: string, PdDistrict: string, Resolution: string, Address: string, X: double, Y: double, location: string, Category_idx: double]

+--------------------+------------+
|            Category|Category_idx|
+--------------------+------------+
|        PROSTITUTION|        21.0|
|             SUICIDE|        28.0|
|          BAD CHECKS|        33.0|
|           EXTORTION|        31.0|
|SEX OFFENSES, NON...|        34.0|
|   RECOVERED VEHICLE|        19.0|
|           LOITERING|        32.0|
|            GAMBLING|        35.0|
|     SECONDARY CODES|        14.0|
|SEX OFFENSES, FOR...|        17.0|
|            WARRANTS|         6.0|
|FORGERY/COUNTERFE...|        18.0|
|     FAMILY OFFENSES|        30.0|
|  DISORDERLY CONDUCT|        22.0|
|        EMBEZZLEMENT|        26.0|
|             ASSAULT|         3.0|
|        NON-CRIMINAL|         2.0|
|             BRIBERY|        29.0|
|            BURGLARY|         7.0|
|      MISSING PERSON|        10.0|
+--------------------+------------+
only showing top 20 rows



In [8]:
# Split into training and testing sets in a 80:20 ratio
df_sf_train, df_sf_test = df_sf.randomSplit([.8,.2], seed=17)

In [15]:
df_sf_train.show(5)

+----------+----------+------------+--------------------+--------------------+---------+----------+-----+----------+--------------+--------------------+------------+-----------+--------------------+------------+
|      PdId|IncidntNum|IncidentCode|            Category|            Descript|DayOfWeek|      Date| Time|PdDistrict|    Resolution|             Address|           X|          Y|            location|Category_idx|
+----------+----------+------------+--------------------+--------------------+---------+----------+-----+----------+--------------+--------------------+------------+-----------+--------------------+------------+
|7.12764E12|  71276391|       62050|            WARRANTS|ENROUTE TO OUTSID...|Wednesday|02/13/2013|22:41|  NORTHERN|ARREST, BOOKED|CHESTNUT ST / FIL...|-122.4362753|37.80081266|POINT (-122.43627...|         6.0|
|7.12764E12|  71276391|       63010|            WARRANTS|      WARRANT ARREST|Wednesday|02/13/2013|22:41|  NORTHERN|ARREST, BOOKED|CHESTNUT ST / FIL...|

In [9]:
# Create an indexer
indexer = StringIndexer(inputCols=['DayOfWeek','PdDistrict','Hour','Month','Year','Day','Week'], outputCols=['DayOfWeek_idx','PdDistrict_idx','Hour_idx','Month_idx','Year_idx','Day_idx','Week_idx'])

In [10]:
# Create an instance of the one hot encoder
onehot = OneHotEncoder(inputCols=['DayOfWeek_idx','PdDistrict_idx','Hour_idx','Month_idx','Year_idx','Day_idx','Week_idx'], outputCols=['DayOfWeek_dummy','PdDistrict_dummy','Hour_dummy','Month_dummy','Year_dummy','Day_dummy','Week_dummy'])

In [11]:
# Create an assembler object
assembler = VectorAssembler(inputCols=['DayOfWeek_dummy','PdDistrict_dummy','Hour_dummy','Month_dummy','Year_dummy','Day_dummy','Week_dummy'], outputCol='features')

In [12]:
forest = RandomForestClassifier(labelCol = 'Category_idx', numTrees=5)

In [13]:
# Construct a pipeline
pipeline = Pipeline(stages=[indexer, onehot, assembler, forest])

# Train the pipeline on the training data
pipeline = pipeline.fit(df_sf_train)

# Make predictions on the testing data
predictions = pipeline.transform(df_sf_test)

Py4JJavaError: An error occurred while calling o97.fit.
: org.apache.spark.SparkException: Input column Hour does not exist.
	at org.apache.spark.ml.feature.StringIndexerBase.$anonfun$validateAndTransformSchema$2(StringIndexer.scala:128)
	at scala.collection.TraversableLike.$anonfun$flatMap$1(TraversableLike.scala:245)
	at scala.collection.IndexedSeqOptimized.foreach(IndexedSeqOptimized.scala:36)
	at scala.collection.IndexedSeqOptimized.foreach$(IndexedSeqOptimized.scala:33)
	at scala.collection.mutable.ArrayOps$ofRef.foreach(ArrayOps.scala:198)
	at scala.collection.TraversableLike.flatMap(TraversableLike.scala:245)
	at scala.collection.TraversableLike.flatMap$(TraversableLike.scala:242)
	at scala.collection.mutable.ArrayOps$ofRef.flatMap(ArrayOps.scala:198)
	at org.apache.spark.ml.feature.StringIndexerBase.validateAndTransformSchema(StringIndexer.scala:123)
	at org.apache.spark.ml.feature.StringIndexerBase.validateAndTransformSchema$(StringIndexer.scala:115)
	at org.apache.spark.ml.feature.StringIndexer.validateAndTransformSchema(StringIndexer.scala:145)
	at org.apache.spark.ml.feature.StringIndexer.transformSchema(StringIndexer.scala:252)
	at org.apache.spark.ml.PipelineStage.transformSchema(Pipeline.scala:71)
	at org.apache.spark.ml.feature.StringIndexer.fit(StringIndexer.scala:237)
	at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
	at sun.reflect.NativeMethodAccessorImpl.invoke(Unknown Source)
	at sun.reflect.DelegatingMethodAccessorImpl.invoke(Unknown Source)
	at java.lang.reflect.Method.invoke(Unknown Source)
	at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)
	at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:357)
	at py4j.Gateway.invoke(Gateway.java:282)
	at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)
	at py4j.commands.CallCommand.execute(CallCommand.java:79)
	at py4j.GatewayConnection.run(GatewayConnection.java:238)
	at java.lang.Thread.run(Unknown Source)
