# Classification of Crime Category

In [0]:
from pyspark.sql.functions import *
# Import StringIndexer class
from pyspark.ml.feature import StringIndexer
# Import the one hot encoder class
from pyspark.ml.feature import OneHotEncoder
from pyspark.ml.feature import Bucketizer
# Import the necessary class
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.classification import RandomForestClassifier
# Import class for creating a pipeline
from pyspark.ml import Pipeline
# Import the logistic regression class
from pyspark.ml.classification import LogisticRegression

from pyspark.ml.evaluation import MulticlassClassificationEvaluator

In [None]:
# Obtain the Spark session (creating one if none exists yet) under the
# application name 'SF_CRIME'.
from pyspark.sql import SparkSession

spark = (
    SparkSession.builder
    .appName('SF_CRIME')
    .getOrCreate()
)

In [0]:
# Load the incident-report CSV: comma-separated, first row is the header,
# column types are inferred, and the literal string 'NONE' is read as null.
# The 'Incident Code' column is then renamed to a name without a space.
df_sf = (
    spark.read.format("csv")
    .options(sep=',', header=True, inferSchema=True, nullValue='NONE')
    .load("./Police_Department_Incident_Reports__Historical_2003_to_May_2018_1.csv")
    .withColumnRenamed('Incident Code', 'IncidentCode')
)

In [0]:
# Derive time-of-day and calendar features.
# 'Date' is parsed in place from MM/dd/yyyy strings *before* Year/Month/Day/Week
# are computed, so those four columns read the parsed date. X and Y are cast
# to double.
df_sf = (
    df_sf
    .withColumn("Hour", hour(col("Time")))
    .withColumn("Minute", minute(col("Time")))
    .withColumn("Date", to_date(col("Date"), "MM/dd/yyyy"))
    .withColumn("Year", year(col("Date")))
    .withColumn("Month", month(col("Date")))
    .withColumn("Day", dayofmonth(col("Date")))
    .withColumn("Week", weekofyear(col("Date")))
    .withColumn("X", col("X").cast("double"))
    .withColumn("Y", col("Y").cast("double"))
)

In [0]:
# Keep only the rows whose police district is present (drop null PdDistrict).
df_sf = df_sf.where(col('PdDistrict').isNotNull())

In [0]:
# Inspect the (column name, type) pairs of the DataFrame built so far.
df_sf.dtypes

In [0]:
# Release any existing cached copy, then cache the current DataFrame.
# unpersist() returns the DataFrame itself, so the result can be chained
# straight into cache().
df_sf = (df_sf.unpersist() if df_sf.is_cached else df_sf).cache()
display(df_sf)

PdId,IncidntNum,IncidentCode,Category,Descript,DayOfWeek,Date,Time,PdDistrict,Resolution,Address,X,Y,location,Hour,Minute,Year,Month,Day,Week
13000100000000.0,130001186,64020,NON-CRIMINAL,"AIDED CASE, MENTAL DISTURBED",Tuesday,2013-01-01,10:12,SOUTHERN,PSYCHOPATHIC CASE,300 Block of MISSION ST,-122.3962112,37.79076136,POINT (-122.396211154038 37.790761361966),10,12,2013,1,1,1
13000400000000.0,130003518,28160,VANDALISM,"MALICIOUS MISCHIEF, VANDALISM OF VEHICLES",Tuesday,2013-01-01,17:30,RICHMOND,,1400 Block of CLEMENT ST,-122.4746799,37.78242784,POINT (-122.474679889558 37.782427844456706),17,30,2013,1,1,1
13026000000000.0,130260445,75025,NON-CRIMINAL,SEARCH WARRANT SERVICE,Tuesday,2013-01-01,0:01,BAYVIEW,"ARREST, BOOKED",23RD ST / WISCONSIN ST,-122.3986956,37.7547456,POINT (-122.39869558968901 37.7547456011189),0,1,2013,1,1,1
13000000000000.0,130000081,4134,ASSAULT,BATTERY,Tuesday,2013-01-01,0:39,SOUTHERN,,MARKET ST / 1ST ST,-122.3991523,37.79101661,POINT (-122.39915226019902 37.7910166130682),0,39,2013,1,1,1
13000300000000.0,130003007,65015,OTHER OFFENSES,TRAFFIC VIOLATION,Tuesday,2013-01-01,23:10,CENTRAL,"ARREST, BOOKED",LARKIN ST / BUSH ST,-122.4186583,37.78890984,POINT (-122.418658294829 37.7889098399421),23,10,2013,1,1,1
13000200000000.0,130001910,4134,ASSAULT,BATTERY,Tuesday,2013-01-01,15:23,SOUTHERN,EXCEPTIONAL CLEARANCE,200 Block of 6TH ST,-122.4063464,37.77916742,POINT (-122.40634642563201 37.7791674218963),15,23,2013,1,1,1
13051100000000.0,130511157,5013,BURGLARY,"BURGLARY OF APARTMENT HOUSE, UNLAWFUL ENTRY",Tuesday,2013-01-01,0:01,TARAVAL,,0 Block of THOMASMORE WY,-122.4729374,37.71225228,POINT (-122.47293744666898 37.7122522752262),0,1,2013,1,1,1
13000200000000.0,130002093,4013,ASSAULT,AGGRAVATED ASSAULT WITH A DEADLY WEAPON,Tuesday,2013-01-01,16:33,SOUTHERN,"ARREST, BOOKED",9TH ST / MISSION ST,-122.4147143,37.77623104,POINT (-122.414714295579 37.7762310404758),16,33,2013,1,1,1
13000300000000.0,130002952,3472,ROBBERY,ATTEMPTED ROBBERY WITH A KNIFE,Tuesday,2013-01-01,23:10,TENDERLOIN,,OFARRELL ST / JONES ST,-122.4129705,37.78578838,POINT (-122.412970537591 37.7857883766888),23,10,2013,1,1,1
13065200000000.0,130651822,19057,ASSAULT,THREATS AGAINST LIFE,Tuesday,2013-01-01,8:00,INGLESIDE,COMPLAINANT REFUSES TO PROSECUTE,600 Block of SAN JOSE AV,-122.4218052,37.74593903,POINT (-122.42180518927502 37.7459390289798),8,0,2013,1,1,1


In [0]:
# Fit a string indexer on the crime category; `indexer_df_sf` holds the fitted
# model that maps each 'Category' string to a numeric 'Category_idx'.
indexer_df_sf = StringIndexer(
    inputCol='Category',
    outputCol='Category_idx',
).fit(df_sf)

# Append the index column (used as the classification label further down).
df_sf = indexer_df_sf.transform(df_sf)
display(df_sf)

# List the category -> index mapping.
df_sf.select('Category', 'Category_idx').distinct().show()

PdId,IncidntNum,IncidentCode,Category,Descript,DayOfWeek,Date,Time,PdDistrict,Resolution,Address,X,Y,location,Hour,Minute,Year,Month,Day,Week,Category_idx
13000100000000.0,130001186,64020,NON-CRIMINAL,"AIDED CASE, MENTAL DISTURBED",Tuesday,2013-01-01,10:12,SOUTHERN,PSYCHOPATHIC CASE,300 Block of MISSION ST,-122.3962112,37.79076136,POINT (-122.396211154038 37.790761361966),10,12,2013,1,1,1,2.0
13000400000000.0,130003518,28160,VANDALISM,"MALICIOUS MISCHIEF, VANDALISM OF VEHICLES",Tuesday,2013-01-01,17:30,RICHMOND,,1400 Block of CLEMENT ST,-122.4746799,37.78242784,POINT (-122.474679889558 37.782427844456706),17,30,2013,1,1,1,4.0
13026000000000.0,130260445,75025,NON-CRIMINAL,SEARCH WARRANT SERVICE,Tuesday,2013-01-01,0:01,BAYVIEW,"ARREST, BOOKED",23RD ST / WISCONSIN ST,-122.3986956,37.7547456,POINT (-122.39869558968901 37.7547456011189),0,1,2013,1,1,1,2.0
13000000000000.0,130000081,4134,ASSAULT,BATTERY,Tuesday,2013-01-01,0:39,SOUTHERN,,MARKET ST / 1ST ST,-122.3991523,37.79101661,POINT (-122.39915226019902 37.7910166130682),0,39,2013,1,1,1,3.0
13000300000000.0,130003007,65015,OTHER OFFENSES,TRAFFIC VIOLATION,Tuesday,2013-01-01,23:10,CENTRAL,"ARREST, BOOKED",LARKIN ST / BUSH ST,-122.4186583,37.78890984,POINT (-122.418658294829 37.7889098399421),23,10,2013,1,1,1,1.0
13000200000000.0,130001910,4134,ASSAULT,BATTERY,Tuesday,2013-01-01,15:23,SOUTHERN,EXCEPTIONAL CLEARANCE,200 Block of 6TH ST,-122.4063464,37.77916742,POINT (-122.40634642563201 37.7791674218963),15,23,2013,1,1,1,3.0
13051100000000.0,130511157,5013,BURGLARY,"BURGLARY OF APARTMENT HOUSE, UNLAWFUL ENTRY",Tuesday,2013-01-01,0:01,TARAVAL,,0 Block of THOMASMORE WY,-122.4729374,37.71225228,POINT (-122.47293744666898 37.7122522752262),0,1,2013,1,1,1,7.0
13000200000000.0,130002093,4013,ASSAULT,AGGRAVATED ASSAULT WITH A DEADLY WEAPON,Tuesday,2013-01-01,16:33,SOUTHERN,"ARREST, BOOKED",9TH ST / MISSION ST,-122.4147143,37.77623104,POINT (-122.414714295579 37.7762310404758),16,33,2013,1,1,1,3.0
13000300000000.0,130002952,3472,ROBBERY,ATTEMPTED ROBBERY WITH A KNIFE,Tuesday,2013-01-01,23:10,TENDERLOIN,,OFARRELL ST / JONES ST,-122.4129705,37.78578838,POINT (-122.412970537591 37.7857883766888),23,10,2013,1,1,1,11.0
13065200000000.0,130651822,19057,ASSAULT,THREATS AGAINST LIFE,Tuesday,2013-01-01,8:00,INGLESIDE,COMPLAINANT REFUSES TO PROSECUTE,600 Block of SAN JOSE AV,-122.4218052,37.74593903,POINT (-122.42180518927502 37.7459390289798),8,0,2013,1,1,1,3.0


In [0]:
# 80% training / 20% testing, with a fixed seed for a repeatable split.
df_sf_train, df_sf_test = df_sf.randomSplit(weights=[0.8, 0.2], seed=17)

In [0]:
# Baseline: logistic regression on two categorical columns
# (day of week and police district).

# Stage 1: map each category string to a numeric index.
indexer = StringIndexer(
    inputCols=['DayOfWeek', 'PdDistrict'],
    outputCols=['DayOfWeek_idx', 'PdDistrict_idx'],
)

# Stage 2: one-hot encode the indices.
onehot = OneHotEncoder(
    inputCols=['DayOfWeek_idx', 'PdDistrict_idx'],
    outputCols=['DayOfWeek_dummy', 'PdDistrict_dummy'],
)

# Stage 3: concatenate the encoded columns into a single 'features' vector.
assembler = VectorAssembler(
    inputCols=['DayOfWeek_dummy', 'PdDistrict_dummy'],
    outputCol='features',
)

# Stage 4: classifier whose label is the indexed crime category.
logistic = LogisticRegression(labelCol='Category_idx')

# Fit every stage on the training split; `pipeline` ends up bound to the
# fitted pipeline model.
pipeline = Pipeline(stages=[indexer, onehot, assembler, logistic]).fit(df_sf_train)

# Score the held-out split.
predictions = pipeline.transform(df_sf_test)

In [0]:
# Compute the test error (1 - accuracy) of the predictions on the test split.
evaluator = MulticlassClassificationEvaluator(
    metricName="accuracy",
    labelCol="Category_idx",
    predictionCol="prediction",
)
accuracy = evaluator.evaluate(predictions)
print("Test Error = %g " % (1.0 - accuracy))

In [0]:
# Show the scored test rows produced by the fitted pipeline above.
display(predictions)

PdId,IncidntNum,IncidentCode,Category,Descript,DayOfWeek,Date,Time,PdDistrict,Resolution,Address,X,Y,location,Hour,Minute,Year,Month,Day,Week,Category_idx,DayOfWeek_idx,PdDistrict_idx,DayOfWeek_dummy,PdDistrict_dummy,features,rawPrediction,probability,prediction
130016000000.0,1300164,75000,MISSING PERSON,FOUND PERSON,Thursday,2014-02-13,9:00,RICHMOND,LOCATED,700 Block of 48TH AV,-122.5089545,37.77422811,POINT (-122.50895453984 37.7742281136725),9,0,2014,2,13,7,10.0,3.0,9.0,"Map(vectorType -> sparse, length -> 6, indices -> List(3), values -> List(1.0))","Map(vectorType -> sparse, length -> 9, indices -> List(), values -> List())","Map(vectorType -> sparse, length -> 15, indices -> List(3), values -> List(1.0))","Map(vectorType -> dense, length -> 38, values -> List(4.553078273342561, 3.5859435154836925, 3.795203790432931, 2.6874769849877493, 2.7600568617445274, 2.776951019153811, 1.9916791947590045, 2.7209259160934156, 2.5569952634751303, 1.332435816750145, 1.868944809862764, 1.3549961913534405, 1.8999999558992329, 0.2131645239952408, 1.0133929446735572, 0.42627060423012636, 0.4588590028276317, 0.33737658801878045, 0.433711508219625, -0.04543663478347182, -0.3666546672967381, 0.17065825554449, -0.7663882083291129, 0.1345398039406489, -1.0497765082534574, -0.7999393264536723, -1.2120630580497762, -1.313093107664503, -2.2564661520387816, -2.7418724151974345, -2.4745613807666698, -2.1405020139186366, -2.9064262175633098, -2.4291968812802063, -2.662715935157706, -3.5262284286527117, -4.7607943929948275, -5.620545496387496))","Map(vectorType -> dense, length -> 38, values -> List(0.32199217729353014, 0.12241201086805353, 0.1509053423237263, 0.049845387411816516, 0.053597683000070756, 0.054510862674721425, 0.02485672084577495, 0.05154086014053149, 0.04374792754415952, 0.012856953514768711, 0.02198573398185637, 0.013150307851901175, 0.022679216529007874, 0.004198022167110843, 0.009345004500116669, 0.00519511926540594, 0.005367208712842385, 0.0047532356033710265, 0.0052339198256731116, 0.0032414227150390613, 0.0023508907772697503, 0.004023319198947317, 0.0015762691691468425, 0.0038805961227892424, 0.0011872888962373425, 0.0015242609220948923, 0.001009430105615659, 9.124297982796008E-4, 3.552202802301289E-4, 2.186192957619016E-4, 
2.8561446473173856E-4, 3.988966029578946E-4, 1.8544865276080708E-4, 2.98869605268477E-4, 2.3662777138768303E-4, 9.978080479717539E-5, 2.9032324982073E-5, 1.228843726368413E-5))",0.0
1308400000000.0,13083956,61030,NON-CRIMINAL,"DEATH REPORT, CAUSE UNKNOWN",Friday,2013-05-10,12:50,TENDERLOIN,,100 Block of TURK ST,-122.4116151,37.78316141,POINT (-122.411615075408 37.7831614109395),12,50,2013,5,10,19,2.0,0.0,7.0,"Map(vectorType -> sparse, length -> 6, indices -> List(0), values -> List(1.0))","Map(vectorType -> sparse, length -> 9, indices -> List(7), values -> List(1.0))","Map(vectorType -> sparse, length -> 15, indices -> List(0, 13), values -> List(1.0, 1.0))","Map(vectorType -> dense, length -> 38, values -> List(3.845590903497494, 3.523395839837581, 3.6249956501756615, 3.380896858169797, 2.1147761290021307, 1.1403818355025546, 3.0423184731020365, 1.689710507515288, 2.456031845274886, 3.265264703536051, 1.4921004656027597, 2.123976798484331, 1.5088882754806812, 1.3211500338195186, 0.7580208839484344, 1.097064209547205, 0.28386517691815494, 0.41282741090645625, 0.24260845232459877, -0.04876295521584609, 0.37168071929611024, -0.311994161280291, 0.04170710002091127, -0.9633832240381839, -0.28722360972310174, -0.9614604352074629, -0.9893441942933503, -0.8975321223929604, -1.9832913270934842, -2.195022377169816, -2.2053108567255246, -2.6838187171123975, -2.170011792140527, -4.07003549106456, -4.669201589101088, -2.829521944535906, -4.642084284844738, -5.829253190023409))","Map(vectorType -> dense, length -> 38, values -> List(0.17948191510437234, 0.13004484957451987, 0.14395189741239742, 0.11277338989154961, 0.03179343038732265, 0.011999504812960709, 0.08038294452833405, 0.020784220216675937, 0.04472422911435576, 0.10045885607467547, 0.017057398060295857, 0.03208730106243125, 0.017346171571276006, 0.014377055699861618, 0.008186647159377716, 0.011490811975673873, 0.005095453771178614, 0.005796828506385778, 0.004889509541388393, 0.0036536278030624178, 0.005563148734636345, 0.0028080474567341433, 0.003999585168505777, 0.0014638944615116786, 0.0028784729796201523, 0.0014667119292613455, 0.0014263794114727961, 0.0015635383492770165, 
5.279214479075158E-4, 4.271846902866001E-4, 4.2281214134475405E-4, 2.6201980950932963E-4, 4.3800355846335375E-4, 6.551003491822693E-5, 3.5982662949773944E-5, 2.2649364390361045E-4, 3.697176606345137E-5, 1.1279485539236573E-5))",0.0
1361440000000.0,13614363,6224,LARCENY/THEFT,GRAND THEFT FROM UNLOCKED AUTO,Friday,2013-10-11,10:30,CENTRAL,,SUTTER ST / STOCKTON ST,-122.4069587,37.78943476,POINT (-122.406958660602 37.789434763004),10,30,2013,10,11,41,0.0,0.0,3.0,"Map(vectorType -> sparse, length -> 6, indices -> List(0), values -> List(1.0))","Map(vectorType -> sparse, length -> 9, indices -> List(3), values -> List(1.0))","Map(vectorType -> sparse, length -> 15, indices -> List(0, 9), values -> List(1.0, 1.0))","Map(vectorType -> dense, length -> 38, values -> List(4.830581478803347, 3.362117596351209, 3.760307352492354, 3.0627622805886734, 3.007274704926417, 2.248421586116664, 2.18070388812391, 2.7123131673638032, 2.310534667491803, 1.3420813676204104, 1.6864729587962277, 1.8873669190977251, 2.189832923892221, 0.7075165916838572, 0.6506017633015694, 1.1815797682256486, 0.949716685094537, 0.4738360552512893, 0.6559361207619675, -0.3379601577290171, 0.0431974243007546, 0.6873752783732276, -0.4994685079487214, -0.8660386278512746, -0.6710502753578186, -0.8535924100046748, -0.6978699544052449, -1.6037360637440903, -1.906493173801794, -2.6369968468166953, -2.612785404988376, -2.1319549026313798, -2.6609467805752023, -3.4170335438057475, -4.664781645072754, -3.6007633200329017, -4.965855093738424, -5.803203870153507))","Map(vectorType -> dense, length -> 38, values -> List(0.38284919935537426, 0.08816211131844906, 0.13128454278526708, 0.06535421769420732, 0.06182664388662851, 0.02894742586818144, 0.02705207157670654, 0.04603365985331104, 0.030802454087482934, 0.011694762185328457, 0.016502842788441337, 0.020174644971954037, 0.027300161594090253, 0.006200167384350312, 0.005857140181197609, 0.009960621464700972, 0.00789931765196141, 0.004908143442721256, 0.005888467742538645, 0.0021795088205704993, 0.0031907535828910754, 0.006076537088171483, 0.0018544556252804527, 0.0012853371656209906, 0.0015620661844406966, 0.0013014347210280775, 0.0015207288751875059, 6.146659273879876E-4, 4.5410198194877657E-4, 
2.1872563357106791E-4, 2.240859248701679E-4, 3.624404097297781E-4, 2.1354940176145765E-4, 1.0026146589409452E-4, 2.8790150478926497E-5, 8.34336459301045E-5, 2.1305385533877863E-5, 9.222176810699617E-6))",0.0
5132620000000.0,51326154,26070,KIDNAPPING,CHILD STEALING,Thursday,2014-02-20,9:00,PARK,EXCEPTIONAL CLEARANCE,1000 Block of 14TH ST,-122.4360766,37.76724463,POINT (-122.43607664974101 37.7672446280514),9,0,2014,2,20,8,24.0,3.0,8.0,"Map(vectorType -> sparse, length -> 6, indices -> List(3), values -> List(1.0))","Map(vectorType -> sparse, length -> 9, indices -> List(8), values -> List(1.0))","Map(vectorType -> sparse, length -> 15, indices -> List(3, 14), values -> List(1.0, 1.0))","Map(vectorType -> dense, length -> 38, values -> List(4.494528363253846, 3.768203501551472, 4.056686263230945, 3.047706353945665, 2.871123016756156, 2.9389097129256396, 2.7497559335844763, 2.94114553138799, 2.459015832859821, 2.4606411505548813, 2.992402334643585, 1.567454207313316, 1.8134509801639829, 0.7268630910713909, 1.041172718541865, 0.8611872291001006, 0.6195371491131157, 0.6041470286006768, 0.2941505346990519, 0.0030743766626921687, -0.0034553872999048274, -2.907913712102718, -0.2384429597005388, -0.25799584084419336, -0.9901241832102177, -0.5356412263177196, -1.5356660912607358, -0.7989282679160246, -1.6358439038290906, -3.372750418395531, -3.1144120237350736, -2.1563142294762856, -4.074698212670963, -3.107723547638644, -2.4426491245367665, -4.614954559997847, -4.74576706593865, -5.77787455508977))","Map(vectorType -> dense, length -> 38, values -> List(0.252069166002134, 0.12192165379184094, 0.16269241898569467, 0.05931614382328411, 0.04971457609118017, 0.05320140829916965, 0.044032636096770744, 0.05332049006306479, 0.03292364154552014, 0.03297719643277337, 0.056124783551080394, 0.013499185660409271, 0.017264047070510276, 0.00582429665321334, 0.007975288897465878, 0.006661617906545553, 0.005231574602426297, 0.005151676437044607, 0.003778494649953489, 0.002824268891184152, 0.002805887161449035, 1.5370216275794303E-4, 0.0022182801649149957, 0.002175327687291878, 0.0010460811790338387, 0.001647953027816876, 6.062329648145257E-4, 0.001266486950665319, 5.484447415722734E-4, 
9.65614907683069E-5, 1.2502558671902012E-4, 3.2590834918353536E-4, 4.7857709412587925E-5, 1.2586462016691828E-4, 2.4476077606458223E-4, 2.788184816936952E-5, 2.446303904157329E-5, 8.715088891543654E-6))",0.0
6084710000000.0,60847080,63010,WARRANTS,WARRANT ARREST,Sunday,2013-03-24,17:33,SOUTHERN,"ARREST, BOOKED",800 Block of BRYANT ST,-122.4034048,37.77542071,POINT (-122.40340479147902 37.775420706711),17,33,2013,3,24,12,6.0,6.0,0.0,"Map(vectorType -> sparse, length -> 6, indices -> List(), values -> List())","Map(vectorType -> sparse, length -> 9, indices -> List(0), values -> List(1.0))","Map(vectorType -> sparse, length -> 15, indices -> List(6), values -> List(1.0))","Map(vectorType -> dense, length -> 38, values -> List(4.66968939986517, 3.4533888904362184, 3.693086938260421, 3.314958281799561, 2.7912541464604033, 1.9252277992355469, 2.629981395998514, 2.183302310938471, 2.355037667695981, 2.206708779054113, 1.73710189642792, 2.0775274914301005, 1.6093701247314327, 1.0584297690426276, 0.9463493922466083, 1.1076884003464094, 0.7593419151057614, 0.6201717574580747, -0.05883816924376502, -0.4221897861830598, 0.6916226328084346, -1.1209419713552955, -0.13611415474091226, 0.03232088172362835, -0.3328393153016098, -0.9195853396158632, -1.0194176774396453, -1.3463470690211436, -2.4449798312741473, -2.2895141050034304, -2.6825321650449645, -3.0244793675912796, -2.6928738588840893, -2.828127900544048, -3.68466286557843, -4.269084142111966, -4.949155516165407, -5.640876635966345))","Map(vectorType -> dense, length -> 38, values -> List(0.34332042233928894, 0.10173421504370993, 0.12929048899383874, 0.08858238907474966, 0.05246933656869132, 0.02206963289317156, 0.0446545489871779, 0.02856771111592028, 0.033920259282709135, 0.029244267343124658, 0.01828492013079845, 0.025700290069190255, 0.01609236461329039, 0.009275760096018729, 0.008292273514633143, 0.009744111827902928, 0.006877922899716521, 0.005984342916555414, 0.003034772970433667, 0.002110204792770573, 0.006427575586538543, 0.0010492050852543432, 0.0028090901261975435, 0.0033244214829179788, 0.002307432586102731, 0.0012832416910757503, 0.0011613197915508273, 8.374679523424405E-4, 2.7915026884891376E-4, 
3.2610387354633136E-4, 2.2012551383114546E-4, 1.5637401949274304E-4, 2.1786077396864252E-4, 1.9030007446103325E-4, 8.080729131881614E-5, 4.5044276308927566E-5, 2.2818567075164418E-5, 1.142556547588087E-5))",0.0
9087410000000.0,90874122,63010,WARRANTS,WARRANT ARREST,Friday,2013-01-18,23:33,SOUTHERN,"ARREST, BOOKED",800 Block of BRYANT ST,-122.4034048,37.77542071,POINT (-122.40340479147902 37.775420706711),23,33,2013,1,18,3,6.0,0.0,0.0,"Map(vectorType -> sparse, length -> 6, indices -> List(0), values -> List(1.0))","Map(vectorType -> sparse, length -> 9, indices -> List(0), values -> List(1.0))","Map(vectorType -> sparse, length -> 15, indices -> List(0, 6), values -> List(1.0, 1.0))","Map(vectorType -> dense, length -> 38, values -> List(4.673626772833375, 3.5127757829278083, 3.683702134906857, 3.121401400646284, 2.710193795134467, 1.8789000134883114, 2.643629498448577, 2.3534877479412257, 2.3341974980477005, 2.187095891908114, 1.9307344709080938, 1.846563259383676, 1.7114662315963844, 1.0031465402347337, 0.708732074138885, 1.077854882829713, 0.6908544629624478, 0.44594109265298826, 0.5448923313353653, -0.16989595473431554, 0.32348617434204596, 0.20105078939679144, -0.35330018735404967, -0.3570603416932532, -0.7028600272640751, -0.9767624704831497, -0.7023868184789803, -1.3567271306242163, -2.5950915708244926, -2.195007910298389, -2.415574679176701, -2.704738580319889, -2.8702179796003375, -2.528827976525446, -4.453766135845518, -4.535363542460487, -4.963056204087392, -5.703095336293158))","Map(vectorType -> dense, length -> 38, values -> List(0.3485535170715243, 0.1091737655853332, 0.1295241233562321, 0.07381532827729145, 0.04892843913153155, 0.021307625708265495, 0.045777582584633804, 0.03424884212505228, 0.03359450485713407, 0.028998987092621675, 0.022441220766541644, 0.020629627465343494, 0.01802268498258314, 0.008875649802483487, 0.006612072096370527, 0.009564132394042512, 0.006494914412491122, 0.00508403939583232, 0.005612842866281554, 0.002746339831792066, 0.004498082451041778, 0.003979737247991741, 0.002286140037564933, 0.002277559939538396, 0.0016117249167798672, 0.0012255657071382447, 0.0016124877796521145, 8.38146711029054E-4, 2.4294345427016786E-4, 
3.6245936748348747E-4, 2.907156406238709E-4, 2.1771387665101707E-4, 1.845097502577187E-4, 2.5958710527552726E-4, 3.7869807274404625E-5, 3.490244018654534E-5, 2.275680188172927E-5, 1.0857161977518703E-5))",0.0
10072500000000.0,100725015,4136,ASSAULT,BATTERY WITH SERIOUS INJURIES,Thursday,2014-03-20,23:31,SOUTHERN,"ARREST, BOOKED",TURK ST / MASON ST,-122.4089536,37.78328787,POINT (-122.408953598279 37.783287873572),23,31,2014,3,20,12,3.0,3.0,0.0,"Map(vectorType -> sparse, length -> 6, indices -> List(3), values -> List(1.0))","Map(vectorType -> sparse, length -> 9, indices -> List(0), values -> List(1.0))","Map(vectorType -> sparse, length -> 15, indices -> List(3, 6), values -> List(1.0, 1.0))","Map(vectorType -> dense, length -> 38, values -> List(4.552266739507848, 3.461084818720808, 3.5986800561865637, 3.026103492385288, 2.5719035703425757, 1.768262393805974, 2.6926755253442645, 2.244258258650145, 2.3202881770104162, 2.308575859706626, 1.675384593722765, 1.8195669178137381, 1.5898170956348552, 0.8947590640557594, 0.7119969873265246, 1.082311599707133, 0.685782499611965, 0.4416173657332905, 0.5434029845675873, -0.20617355667452367, 0.1543710958242255, 0.6275853214881082, -0.1714968629761755, -0.5828835641378824, -0.9448529510148633, -0.8873577317617306, -0.8161518492246236, -1.3574276156317124, -2.6485058753426856, -2.7954476455017736, -2.6757935118390233, -2.545102570792931, -2.792327061617436, -2.2254889523456107, -3.1806621161438287, -4.2848027988254875, -4.9540848202245416, -5.702134933091639))","Map(vectorType -> dense, length -> 38, values -> List(0.3343926379394622, 0.11229551737729375, 0.1288603399595631, 0.07268627797606393, 0.046152572247598354, 0.0206623152034415, 0.05207706553992573, 0.03325840058549397, 0.03588564313161786, 0.03546779087837803, 0.018829668157743805, 0.021750049720733442, 0.01728546925938277, 0.008626235420201992, 0.007185363523670229, 0.010405772762164768, 0.0069994503555804305, 0.00548307728365147, 0.006070567603253822, 0.0028687476310670745, 0.004114100507574621, 0.006603728720907981, 0.0029699712206561635, 0.0019682908189931157, 0.0013705281475543292, 0.00145163628698504, 0.001558770355057842, 9.072125703464623E-4, 
2.4945998694599894E-4, 2.1536984492351988E-4, 2.4274485028348225E-4, 2.766357984189673E-4, 2.1604297432289163E-4, 3.808156700323518E-4, 1.465171210065152E-4, 4.856978384105947E-5, 2.4871425755183146E-5, 1.1771360106167617E-5))",0.0
10072500000000.0,100725015,30190,OTHER OFFENSES,VIOLATION OF MUNICIPAL POLICE CODE,Thursday,2014-03-20,23:31,SOUTHERN,"ARREST, BOOKED",TURK ST / MASON ST,-122.4089536,37.78328787,POINT (-122.408953598279 37.783287873572),23,31,2014,3,20,12,1.0,3.0,0.0,"Map(vectorType -> sparse, length -> 6, indices -> List(3), values -> List(1.0))","Map(vectorType -> sparse, length -> 9, indices -> List(0), values -> List(1.0))","Map(vectorType -> sparse, length -> 15, indices -> List(3, 6), values -> List(1.0, 1.0))","Map(vectorType -> dense, length -> 38, values -> List(4.552266739507848, 3.461084818720808, 3.5986800561865637, 3.026103492385288, 2.5719035703425757, 1.768262393805974, 2.6926755253442645, 2.244258258650145, 2.3202881770104162, 2.308575859706626, 1.675384593722765, 1.8195669178137381, 1.5898170956348552, 0.8947590640557594, 0.7119969873265246, 1.082311599707133, 0.685782499611965, 0.4416173657332905, 0.5434029845675873, -0.20617355667452367, 0.1543710958242255, 0.6275853214881082, -0.1714968629761755, -0.5828835641378824, -0.9448529510148633, -0.8873577317617306, -0.8161518492246236, -1.3574276156317124, -2.6485058753426856, -2.7954476455017736, -2.6757935118390233, -2.545102570792931, -2.792327061617436, -2.2254889523456107, -3.1806621161438287, -4.2848027988254875, -4.9540848202245416, -5.702134933091639))","Map(vectorType -> dense, length -> 38, values -> List(0.3343926379394622, 0.11229551737729375, 0.1288603399595631, 0.07268627797606393, 0.046152572247598354, 0.0206623152034415, 0.05207706553992573, 0.03325840058549397, 0.03588564313161786, 0.03546779087837803, 0.018829668157743805, 0.021750049720733442, 0.01728546925938277, 0.008626235420201992, 0.007185363523670229, 0.010405772762164768, 0.0069994503555804305, 0.00548307728365147, 0.006070567603253822, 0.0028687476310670745, 0.004114100507574621, 0.006603728720907981, 0.0029699712206561635, 0.0019682908189931157, 0.0013705281475543292, 0.00145163628698504, 0.001558770355057842, 9.072125703464623E-4, 
2.4945998694599894E-4, 2.1536984492351988E-4, 2.4274485028348225E-4, 2.766357984189673E-4, 2.1604297432289163E-4, 3.808156700323518E-4, 1.465171210065152E-4, 4.856978384105947E-5, 2.4871425755183146E-5, 1.1771360106167617E-5))",0.0
10089600000000.0,100895931,62071,WARRANTS,PROBATION SEARCH,Wednesday,2013-12-18,13:30,MISSION,"ARREST, BOOKED",2800 Block of MISSION ST,-122.418434,37.75141993,POINT (-122.41843402831802 37.7514199300011),13,30,2013,12,18,51,6.0,2.0,1.0,"Map(vectorType -> sparse, length -> 6, indices -> List(2), values -> List(1.0))","Map(vectorType -> sparse, length -> 9, indices -> List(1), values -> List(1.0))","Map(vectorType -> sparse, length -> 15, indices -> List(2, 7), values -> List(1.0, 1.0))","Map(vectorType -> dense, length -> 38, values -> List(3.7498858497170735, 3.528230008451218, 3.3120630456491265, 3.0970141378729963, 2.349437885288807, 2.3386701109215986, 2.5517832520751913, 2.117502174891343, 2.2484985610994475, 2.20204175709455, 1.693079984836765, 1.8779736939399767, 1.3523166409376755, 1.1765147820511481, 0.8258467275780498, 1.222043871987324, 0.43273517973963005, 0.561578191616612, 0.26723193085407304, -0.5612314710733984, 0.10413695053511066, 0.5665807285397737, 0.5769093311647236, -0.500512254323668, -0.7350194383756092, -0.7557781725707253, -1.5487059760664084, -1.0511643658943457, -2.281360597778964, -1.7752346198047066, -2.3485833409665933, -3.076566952522285, -2.3387387557209585, -3.2628291251635475, -4.012083171436234, -3.0819733622399283, -5.122668760643594, -5.699624432261254))","Map(vectorType -> dense, length -> 38, values -> List(0.18981782884520335, 0.15208034632068504, 0.1225160464215497, 0.09880943585772389, 0.046787536448021375, 0.046286441492669805, 0.057280612691704536, 0.03710238086617447, 0.04229536595047848, 0.04037540143063947, 0.02427043521479536, 0.029199531391038588, 0.01726179379099525, 0.01447891928277174, 0.010196308005599161, 0.01515336832521495, 0.006882033304265155, 0.00782839226336298, 0.005832296874441452, 0.0025470797220487826, 0.004954597421150957, 0.00786765220248244, 0.007949335164942973, 0.0027065282164240295, 0.0021407570509005733, 0.002096775722038969, 9.488286936882859E-4, 0.0015605129808084828, 4.560368639108229E-4, 
7.564978007838636E-4, 4.263885010498646E-4, 2.058951983402259E-4, 4.306068488698852E-4, 1.709045341483044E-4, 8.078982864521119E-5, 2.0478504820641054E-4, 2.6609425238252274E-5, 1.4943998986614195E-5))",0.0
10089600000000.0,100895931,63010,WARRANTS,WARRANT ARREST,Wednesday,2013-12-18,13:30,MISSION,"ARREST, BOOKED",2800 Block of MISSION ST,-122.418434,37.75141993,POINT (-122.41843402831802 37.7514199300011),13,30,2013,12,18,51,6.0,2.0,1.0,"Map(vectorType -> sparse, length -> 6, indices -> List(2), values -> List(1.0))","Map(vectorType -> sparse, length -> 9, indices -> List(1), values -> List(1.0))","Map(vectorType -> sparse, length -> 15, indices -> List(2, 7), values -> List(1.0, 1.0))","Map(vectorType -> dense, length -> 38, values -> List(3.7498858497170735, 3.528230008451218, 3.3120630456491265, 3.0970141378729963, 2.349437885288807, 2.3386701109215986, 2.5517832520751913, 2.117502174891343, 2.2484985610994475, 2.20204175709455, 1.693079984836765, 1.8779736939399767, 1.3523166409376755, 1.1765147820511481, 0.8258467275780498, 1.222043871987324, 0.43273517973963005, 0.561578191616612, 0.26723193085407304, -0.5612314710733984, 0.10413695053511066, 0.5665807285397737, 0.5769093311647236, -0.500512254323668, -0.7350194383756092, -0.7557781725707253, -1.5487059760664084, -1.0511643658943457, -2.281360597778964, -1.7752346198047066, -2.3485833409665933, -3.076566952522285, -2.3387387557209585, -3.2628291251635475, -4.012083171436234, -3.0819733622399283, -5.122668760643594, -5.699624432261254))","Map(vectorType -> dense, length -> 38, values -> List(0.18981782884520335, 0.15208034632068504, 0.1225160464215497, 0.09880943585772389, 0.046787536448021375, 0.046286441492669805, 0.057280612691704536, 0.03710238086617447, 0.04229536595047848, 0.04037540143063947, 0.02427043521479536, 0.029199531391038588, 0.01726179379099525, 0.01447891928277174, 0.010196308005599161, 0.01515336832521495, 0.006882033304265155, 0.00782839226336298, 0.005832296874441452, 0.0025470797220487826, 0.004954597421150957, 0.00786765220248244, 0.007949335164942973, 0.0027065282164240295, 0.0021407570509005733, 0.002096775722038969, 9.488286936882859E-4, 0.0015605129808084828, 4.560368639108229E-4, 
7.564978007838636E-4, 4.263885010498646E-4, 2.058951983402259E-4, 4.306068488698852E-4, 1.709045341483044E-4, 8.078982864521119E-5, 2.0478504820641054E-4, 2.6609425238252274E-5, 1.4943998986614195E-5))",0.0


In [0]:
# Index the categorical columns together with the hour and calendar columns,
# which are treated as categories here as well.
indexer = StringIndexer(
    inputCols=[
        'DayOfWeek', 'PdDistrict', 'Hour', 'Month', 'Year', 'Day', 'Week',
    ],
    outputCols=[
        'DayOfWeek_idx', 'PdDistrict_idx', 'Hour_idx', 'Month_idx',
        'Year_idx', 'Day_idx', 'Week_idx',
    ],
)

In [0]:
# split_Hour=[0,1, 7, 13, 23]

# bucketizer = Bucketizer(splitsArray=[split_Hour],inputCols=['Hour'],outputCols=['Hour_bin'])

In [0]:

# One-hot encode each of the seven indexed columns into a '<name>_dummy' vector.
onehot = OneHotEncoder(
    inputCols=[
        'DayOfWeek_idx', 'PdDistrict_idx', 'Hour_idx', 'Month_idx',
        'Year_idx', 'Day_idx', 'Week_idx',
    ],
    outputCols=[
        'DayOfWeek_dummy', 'PdDistrict_dummy', 'Hour_dummy', 'Month_dummy',
        'Year_dummy', 'Day_dummy', 'Week_dummy',
    ],
)

In [0]:
# Concatenate the seven one-hot vectors into the single 'features' column.
assembler = VectorAssembler(
    inputCols=[
        'DayOfWeek_dummy', 'PdDistrict_dummy', 'Hour_dummy', 'Month_dummy',
        'Year_dummy', 'Day_dummy', 'Week_dummy',
    ],
    outputCol='features',
)


In [0]:
# Random forest over the assembled 'features' vector.
# labelCol is given explicitly: the estimator's default is 'label', a column
# this DataFrame does not have (the indexed target is 'Category_idx'), so
# fitting with the default would fail.
# A fixed seed keeps the trees repeatable, in line with the seeded train/test split.
forest = RandomForestClassifier(
    labelCol='Category_idx',
    featuresCol='features',
    numTrees=5,
    seed=17,
)

In [0]:


# Construct a pipeline: index -> one-hot encode -> assemble -> classify.
# The classifier has to be the final stage; without it the fitted pipeline
# only emits the 'features' vector and `predictions` carries no prediction column.
# The label column is set here as well so this cell works regardless of how
# `forest` was constructed (RandomForestClassifier defaults to 'label').
forest.setLabelCol('Category_idx')
pipeline = Pipeline(stages=[indexer, onehot, assembler, forest])

# Train the pipeline on the training data; `pipeline` is rebound to the fitted model.
pipeline = pipeline.fit(df_sf_train)

# Make predictions on the testing data
predictions = pipeline.transform(df_sf_test)

In [0]:
# Number of incidents per hour of day, in ascending hour order.
display(df_sf.groupBy('Hour').count().orderBy('Hour'))

Hour,count
0,39470
1,23394
2,18988
3,13488
4,9705
5,8859
6,12273
7,19428
8,27830
9,31115


In [0]:
# Number of incidents per day of month, in ascending day order.
display(df_sf.groupBy('Day').count().orderBy('Day'))

Day,count
1,30033
2,25664
3,25811
4,25895
5,26099
6,25894
7,26069
8,25553
9,26014
10,26510


In [0]:
# Number of incidents per calendar month (all years pooled), in month order.
display(df_sf.groupBy('Month').count().orderBy('Month'))

Month,count
1,75860
2,68377
3,75054
4,71826
5,67159
6,61201
7,63333
8,65027
9,63118
10,65790


In [0]:
# Number of incidents per (year, month), in chronological order.
display(df_sf.groupBy('Year', 'Month').count().orderBy('Year', 'Month'))

Year,Month,count
2013,1,12461
2013,2,11436
2013,3,12617
2013,4,12145
2013,5,12556
2013,6,12153
2013,7,12810
2013,8,13383
2013,9,13482
2013,10,13586


In [0]:
# Bucketizer is also imported in the first cell of the notebook.
from pyspark.ml.feature import Bucketizer

# Bucket edges for the 'Hour' column; five edges define four buckets.
split_Hour = [0, 1, 7, 13, 23]

# Maps 'Hour' to a bucket index in 'Hour_bin' using the edges above.
bucketizer = Bucketizer(
    splitsArray=[split_Hour],
    inputCols=['Hour'],
    outputCols=['Hour_bin'],
)
