###### 1. Read ANd Select  Data


In [1]:
from pyspark.sql import SparkSession
from pyspark.sql import functions as F
from pyspark.sql import types 
from pyspark.sql.functions import col
from pyspark.sql.functions import when
from pyspark.sql.types import StringType
from pyspark.sql.functions import split , count



In [2]:

spark = SparkSession \
        .builder \
        .appName("asses") \
        .getOrCreate()

In [3]:
df1= spark.read.options(header='True', inferSchema='True', delimiter=',') \
  .csv("./asses/2022-03/*-outcomes.csv") \
  .withColumn("districtName", F.input_file_name())

                                                                                

In [6]:
df1.printSchema()

root
 |-- Crime ID: string (nullable = true)
 |-- Month: string (nullable = true)
 |-- Reported by: string (nullable = true)
 |-- Falls within: string (nullable = true)
 |-- Longitude: double (nullable = true)
 |-- Latitude: double (nullable = true)
 |-- Location: string (nullable = true)
 |-- LSOA code: string (nullable = true)
 |-- LSOA name: string (nullable = true)
 |-- Outcome type: string (nullable = true)
 |-- districtName: string (nullable = false)



In [7]:
df2= spark.read.options(header='True', inferSchema='True', delimiter=',') \
  .csv("./asses/2022-03/*-street.csv") \
  .withColumn("districtName", F.input_file_name())

In [8]:
df2.printSchema()

root
 |-- Crime ID: string (nullable = true)
 |-- Month: string (nullable = true)
 |-- Reported by: string (nullable = true)
 |-- Falls within: string (nullable = true)
 |-- Longitude: double (nullable = true)
 |-- Latitude: double (nullable = true)
 |-- Location: string (nullable = true)
 |-- LSOA code: string (nullable = true)
 |-- LSOA name: string (nullable = true)
 |-- Crime type: string (nullable = true)
 |-- Last outcome category: string (nullable = true)
 |-- Context: string (nullable = true)
 |-- districtName: string (nullable = false)



In [9]:
outcomes = df1.select(col("Crime ID").alias("CrimeID"),
                      col("Outcome type").alias("outcomeType")
                     )
                     

In [10]:
outcomes.show(10)

+--------------------+---------------+
|             CrimeID|    outcomeType|
+--------------------+---------------+
|c026deed0f8c1fbfe...|Suspect charged|
|a91407d7616943d13...|Suspect charged|
|097f3199e65b4d2fa...|Suspect charged|
|a70d8667129c59595...|Suspect charged|
|577df8e02358a0d3c...|Suspect charged|
|52b59b0978b87cbe9...|Suspect charged|
|49c33aae4ddcc4652...|Suspect charged|
|c1104525f837da6a6...|Suspect charged|
|4f464638a7678ab23...|Suspect charged|
|1a7a1aba66ca20be5...|Suspect charged|
+--------------------+---------------+
only showing top 10 rows



In [11]:

street = df2.select(col("Crime ID").alias("CrimeID"), 
                    col("districtName"),
                    col("Latitude"),
                    col("Longitude"),
                    col("Crime type").alias("crimeType"),
                    col("Last outcome category").alias("lastOutcomeCategory")
                    )

In [13]:
street.show(10)

+--------------------+--------------------+---------+---------+--------------------+--------------------+
|             CrimeID|        districtName| Latitude|Longitude|           crimeType| lastOutcomeCategory|
+--------------------+--------------------+---------+---------+--------------------+--------------------+
|6b1be5a8275fcda29...|file:/mnt/c/Users...|50.803304|-0.445898|Violence and sexu...| Under investigation|
|6b1be5a8275fcda29...|file:/mnt/c/Users...|50.803304|-0.445898|Violence and sexu...| Under investigation|
|b466ed98f60835f37...|file:/mnt/c/Users...|50.804178|-0.530681|Violence and sexu...| Under investigation|
|cc667a5d8c2fa4f30...|file:/mnt/c/Users...|51.137084| 0.876572|Theft from the pe...|Investigation com...|
|a60d9d1cff047a67e...|file:/mnt/c/Users...|51.124173| 0.969845|Violence and sexu...| Under investigation|
|62823a01dfebac344...|file:/mnt/c/Users...|51.809402|-0.813313|Violence and sexu...| Under investigation|
|62823a01dfebac344...|file:/mnt/c/Users...|51.

###### 1. Align and join data


In [14]:
street = street.withColumn('districtName', split(col('districtName'), '-')[3])



In [15]:
street.select(col('districtName')).distinct().show()



+---------------+
|   districtName|
+---------------+
|           west|
|   metropolitan|
|     lancashire|
|      hampshire|
|     merseyside|
|         thames|
|          south|
|           avon|
|          essex|
|    northumbria|
|       northern|
|           kent|
| leicestershire|
|         sussex|
|     derbyshire|
|nottinghamshire|
|       cheshire|
| cambridgeshire|
|      cleveland|
|          devon|
+---------------+
only showing top 20 rows



                                                                                

In [16]:
street.distinct().show(5)




+--------------------+------------+---------+---------+--------------------+-------------------+
|             CrimeID|districtName| Latitude|Longitude|           crimeType|lastOutcomeCategory|
+--------------------+------------+---------+---------+--------------------+-------------------+
|f4818b555bbbd0cf8...|metropolitan|51.582311| 0.140192|Criminal damage a...|Under investigation|
|307998477ae1c5207...|metropolitan|51.589112| 0.140035|Violence and sexu...|Under investigation|
|1ca44b83eaf540d44...|metropolitan|51.563135| 0.178464|            Burglary|Under investigation|
|42bf0d19f10a1ea0b...|metropolitan|51.551164|  0.16253|               Drugs|Under investigation|
|16a31ca5bab76d8cb...|metropolitan|51.534088| 0.145297|            Burglary|Under investigation|
+--------------------+------------+---------+---------+--------------------+-------------------+
only showing top 5 rows



                                                                                

In [17]:
fin = street.join(outcomes,street.CrimeID == outcomes.CrimeID, "inner").drop(outcomes.CrimeID) \
           .select("crimeID",
                              "districtName",
                              "latitude",
                              "longitude",
                              "crimeType",
                              "lastOutcomeCategory"                              
                             )

                      
                


In [18]:
fin.show(10)
fin.printSchema()



+--------------------+--------------+---------+---------+--------------------+--------------------+
|             crimeID|  districtName| latitude|longitude|           crimeType| lastOutcomeCategory|
+--------------------+--------------+---------+---------+--------------------+--------------------+
|0006795b63a1d7fc4...|  metropolitan|51.584823|-0.029418|               Drugs|    Local resolution|
|000a3b00eb39c6404...|    derbyshire|53.190434|-1.403848|            Burglary|Investigation com...|
|000aeb72c5f31a812...|        sussex|50.825336|-0.788895|       Vehicle crime|Investigation com...|
|000d6e122438e6d3a...|leicestershire|52.642532| -1.18148|         Other theft|Investigation com...|
|000f862e7655b2320...|       suffolk| 52.32091| 0.880006|Violence and sexu...|Unable to prosecu...|
|0010576178ac7c1ee...|leicestershire|52.617839|-1.183137|       Vehicle crime|Investigation com...|
|001253213790e1514...|         south|53.353411|-1.407377|Criminal damage a...|Investigation com...|


                                                                                

###### 1. Provide statistic

In [19]:
KPI = fin.groupBy("crimeType") \
       .agg(count("crimeID").alias("KPI")) \
       .orderBy("KPI") 

In [20]:
KPI2 = fin.groupBy("lastOutcomeCategory") \
       .agg(count("crimeID").alias("KPI")) \
       .orderBy("KPI") 

In [21]:
KPI3 = fin.groupBy("districtName") \
       .agg(count("crimeID").alias("KPI")) \
       .orderBy("KPI") 

In [22]:
KPI.show()
KPI2.show()
KPI3.show()

                                                                                

+--------------------+-----+
|           crimeType|  KPI|
+--------------------+-----+
|             Robbery|  886|
|Possession of wea...| 1302|
|       Bicycle theft| 3152|
|         Other crime| 3199|
|Theft from the pe...| 3884|
|               Drugs| 6414|
|            Burglary| 8063|
|         Shoplifting|12139|
|       Vehicle crime|18286|
|        Public order|18333|
|         Other theft|19405|
|Criminal damage a...|21388|
|Violence and sexu...|58656|
+--------------------+-----+



                                                                                

+--------------------+-----+
| lastOutcomeCategory|  KPI|
+--------------------+-----+
|Suspect charged a...|   27|
|Offender given a ...|  253|
|Offender given pe...|  507|
|Formal action is ...|  763|
| Under investigation|  782|
|Further action is...| 1192|
|Further investiga...| 1562|
|Offender given a ...| 1647|
|Action to be take...| 2680|
|    Local resolution| 5046|
|Awaiting court ou...|11051|
|Unable to prosecu...|54851|
|Investigation com...|94746|
+--------------------+-----+





+----------------+----+
|    districtName| KPI|
+----------------+----+
|            city| 180|
|           gwent| 327|
|       wiltshire|1117|
|         cumbria|1658|
|           devon|1681|
| gloucestershire|1686|
|    warwickshire|1687|
|          dorset|1848|
|          durham|1893|
|           dyfed|1920|
|   hertfordshire|2083|
|         suffolk|2154|
|    bedfordshire|2203|
|northamptonshire|2777|
|         norfolk|2904|
|           north|3187|
|        cheshire|3196|
|    lincolnshire|3204|
|            avon|3253|
|      derbyshire|3253|
+----------------+----+
only showing top 20 rows



                                                                                