In [67]:
from pyspark.sql import SparkSession

# Create the Spark session for this notebook, or reuse one that already exists.
spark = (
    SparkSession.builder
    .appName("London Crime")
    .config("spark.some.config.option", "some-value")
    .getOrCreate()
)

# Read the London crime CSV, taking column names from its header row.
# No schema is supplied and no inference option is set, so the columns
# (including value/year/month) are loaded as strings.
data = (
    spark.read
    .format("csv")
    .option("header", "true")
    .load("datasets/london_crime_by_lsoa.csv")
)


In [2]:
# Print the column names and the types Spark assigned when loading the CSV.
data.printSchema()

root
 |-- lsoa_code: string (nullable = true)
 |-- borough: string (nullable = true)
 |-- major_category: string (nullable = true)
 |-- minor_category: string (nullable = true)
 |-- value: string (nullable = true)
 |-- year: string (nullable = true)
 |-- month: string (nullable = true)



In [3]:
# Total number of rows in the loaded dataset.
data.count()

13490604

In [9]:
# Preview five rows to get a feel for the columns and their values.
data.limit(5).show()

+---------+----------+--------------------+--------------------+-----+----+-----+
|lsoa_code|   borough|      major_category|      minor_category|value|year|month|
+---------+----------+--------------------+--------------------+-----+----+-----+
|E01001116|   Croydon|            Burglary|Burglary in Other...|    0|2016|   11|
|E01001646| Greenwich|Violence Against ...|      Other violence|    0|2016|   11|
|E01000677|   Bromley|Violence Against ...|      Other violence|    0|2015|    5|
|E01003774| Redbridge|            Burglary|Burglary in Other...|    0|2016|    3|
|E01004563|Wandsworth|             Robbery|   Personal Property|    0|2008|    6|
+---------+----------+--------------------+--------------------+-----+----+-----+



In [10]:
# Row count after dropping rows that contain nulls; compare with data.count()
# to see whether the dataset has any missing values.
data.dropna().count()

13490604

In [12]:
# Tabular preview of the data using show()'s default row limit and truncation.
data.show()

+---------+--------------------+--------------------+--------------------+-----+----+-----+
|lsoa_code|             borough|      major_category|      minor_category|value|year|month|
+---------+--------------------+--------------------+--------------------+-----+----+-----+
|E01001116|             Croydon|            Burglary|Burglary in Other...|    0|2016|   11|
|E01001646|           Greenwich|Violence Against ...|      Other violence|    0|2016|   11|
|E01000677|             Bromley|Violence Against ...|      Other violence|    0|2015|    5|
|E01003774|           Redbridge|            Burglary|Burglary in Other...|    0|2016|    3|
|E01004563|          Wandsworth|             Robbery|   Personal Property|    0|2008|    6|
|E01001320|              Ealing|  Theft and Handling|         Other Theft|    0|2012|    5|
|E01001342|              Ealing|Violence Against ...|    Offensive Weapon|    0|2010|    7|
|E01002633|            Hounslow|             Robbery|   Personal Property|    0|

In [22]:
# Show the records whose major category contains the substring "Sex".
(
    data
    .where(data["major_category"].contains("Sex"))
    .show()
)

+---------+--------------------+---------------+--------------+-----+----+-----+
|lsoa_code|             borough| major_category|minor_category|value|year|month|
+---------+--------------------+---------------+--------------+-----+----+-----+
|E01002740|           Islington|Sexual Offences|  Other Sexual|    0|2016|   10|
|E01000562|               Brent|Sexual Offences|          Rape|    0|2012|   12|
|E01001646|           Greenwich|Sexual Offences|          Rape|    0|2013|    5|
|E01003120|             Lambeth|Sexual Offences|          Rape|    0|2014|    3|
|E01004757|         Westminster|Sexual Offences|  Other Sexual|    0|2012|    7|
|E01004740|         Westminster|Sexual Offences|          Rape|    0|2010|   10|
|E01002782|           Islington|Sexual Offences|  Other Sexual|    0|2010|    9|
|E01003631|              Newham|Sexual Offences|          Rape|    0|2015|    5|
|E01004552|          Wandsworth|Sexual Offences|  Other Sexual|    0|2015|   12|
|E01003404|              Mer

In [23]:
# Narrower view of the dataset with the LSOA code column removed.
slim_data = data.drop(
    "lsoa_code",
)

# Quick look at the remaining columns.
slim_data.show(n=5)

+----------+--------------------+--------------------+-----+----+-----+
|   borough|      major_category|      minor_category|value|year|month|
+----------+--------------------+--------------------+-----+----+-----+
|   Croydon|            Burglary|Burglary in Other...|    0|2016|   11|
| Greenwich|Violence Against ...|      Other violence|    0|2016|   11|
|   Bromley|Violence Against ...|      Other violence|    0|2015|    5|
| Redbridge|            Burglary|Burglary in Other...|    0|2016|    3|
|Wandsworth|             Robbery|   Personal Property|    0|2008|    6|
+----------+--------------------+--------------------+-----+----+-----+
only showing top 5 rows



In [28]:
# Number of distinct borough values in the dataset (a count, not the list of names).
boroughs = (
    data
    .select("borough")
    .distinct()
    .count()
)
boroughs


33

In [33]:
# Records whose major category contains "Sex", restricted to the years 2015 and 2016.
sex_in_2015_2016 = data.where(
    data.major_category.contains("Sex") & data.year.isin(2015, 2016)
)

print(f"Total count = {sex_in_2015_2016.count()}")

# Display roughly 10% of the matching rows. The seed is fixed so that re-running
# the notebook on the same input shows the same sample instead of a new random one.
sex_in_2015_2016.sample(fraction=0.1, seed=42).show()


Total count = 24024
+---------+--------------------+---------------+--------------+-----+----+-----+
|lsoa_code|             borough| major_category|minor_category|value|year|month|
+---------+--------------------+---------------+--------------+-----+----+-----+
|E01004650|         Westminster|Sexual Offences|  Other Sexual|    0|2015|   11|
|E01003934|           Southwark|Sexual Offences|  Other Sexual|    0|2015|    3|
|E01001953|Hammersmith and F...|Sexual Offences|  Other Sexual|    0|2016|    8|
|E01002358|            Havering|Sexual Offences|  Other Sexual|    0|2015|    1|
|E01001481|             Enfield|Sexual Offences|  Other Sexual|    0|2015|    5|
|E01001235|              Ealing|Sexual Offences|  Other Sexual|    0|2016|    4|
|E01001495|             Enfield|Sexual Offences|  Other Sexual|    0|2015|    3|
|E01001650|           Greenwich|Sexual Offences|  Other Sexual|    0|2016|   10|
|E01002397|          Hillingdon|Sexual Offences|  Other Sexual|    0|2016|    6|
|E010022

In [45]:
# groupBy().count() counts rows, so this is the number of records per borough
# (records with a `value` of 0 are counted too), not a sum of `value`.
records_per_borough = data.groupBy("borough").count()

# Five boroughs with the most records.
records_per_borough.orderBy("count", ascending=False).show(5)

# Five boroughs with the fewest records. `crimes_per_borough` is bound exactly once,
# to the ascending frame, instead of being sorted descending and then re-sorted
# ascending on top of that result.
crimes_per_borough = records_per_borough.orderBy("count", ascending=True)
crimes_per_borough.show(5)
                            

+-------+------+
|borough| count|
+-------+------+
|Croydon|602100|
| Barnet|572832|
| Ealing|549396|
|Bromley|523908|
|Lambeth|519048|
+-------+------+
only showing top 5 rows

+--------------------+------+
|             borough| count|
+--------------------+------+
|      City of London|  9720|
|Kingston upon Thames|259524|
|Kensington and Ch...|296784|
|Richmond upon Thames|304128|
|Barking and Dagenham|311040|
+--------------------+------+
only showing top 5 rows



In [48]:
# Sum the `value` column within each borough and expose the result as "convictions".
borough_conviction_sum_per_borough = (
    data
    .groupBy("borough")
    .agg({"value": "sum"})
    .withColumnRenamed("sum(value)", "convictions")
)

borough_conviction_sum_per_borough.show(n=5)

+--------------------+-----------+
|             borough|convictions|
+--------------------+-----------+
|             Croydon|   260294.0|
|          Wandsworth|   204741.0|
|              Bexley|   114136.0|
|             Lambeth|   292178.0|
|Barking and Dagenham|   149447.0|
+--------------------+-----------+
only showing top 5 rows



In [52]:
# Collapse the per-borough sums into a single-row frame holding the grand total.
total_convictions = (
    borough_conviction_sum_per_borough
    .agg({"convictions": "sum"})
    .withColumnRenamed("sum(convictions)", "Total")
)
total_convictions.show()

# Pull the single aggregated value back to the driver as a plain Python number.
total_row = total_convictions.collect()[0]
total = total_row[0]
print(f"Total convictions = {total}")



+---------+
|    Total|
+---------+
|6447758.0|
+---------+

Total convictions = 6447758.0


In [63]:
import pyspark.sql.functions as func

# Each borough's share of the overall total, as a percentage rounded to two decimals,
# listed from the largest share to the smallest.
#
# The previous version chained three orderBy calls, two of which referenced
# `borough_contribution_pct` inside the expression that first assigns it. On a fresh
# kernel that raises NameError; it only ran because the name was left over from an
# earlier execution. A single sort on the percentage column gives the same ordering.
#
# The column name keeps its existing spelling ("contibution") so that it still
# matches anything that refers to the column by name.
borough_contribution_pct = (
    borough_conviction_sum_per_borough
    .withColumn(
        "% contibution",
        func.round(borough_conviction_sum_per_borough.convictions / total * 100, 2),
    )
    .orderBy("% contibution", ascending=False)
)

borough_contribution_pct.show(5)

+-----------+-----------+-------------+
|    borough|convictions|% contibution|
+-----------+-----------+-------------+
|Westminster|   455028.0|         7.06|
|    Lambeth|   292178.0|         4.53|
|  Southwark|   278809.0|         4.32|
|     Camden|   275147.0|         4.27|
|     Newham|   262024.0|         4.06|
+-----------+-----------+-------------+
only showing top 5 rows



In [75]:
# Sum of `value` per month for the records from 2014.
# The chain is parenthesised instead of using backslash continuations; the old
# version ended with a dangling backslash that only parsed because the following
# line happened to be empty.
convictions_by_month_in_2014 = (
    data
    .where(data.year == 2014)
    .groupBy("month")
    .agg({"value": "sum"})
    .withColumnRenamed("sum(value)", "monthly_convictions")
)

# Show the months ordered from the highest monthly total to the lowest.
(
    convictions_by_month_in_2014
    .orderBy(convictions_by_month_in_2014.monthly_convictions.desc())
    .show()
)




+-----+-------------------+
|month|monthly_convictions|
+-----+-------------------+
|   10|            60537.0|
|   11|            59704.0|
|    7|            58564.0|
|    3|            57669.0|
|   12|            57565.0|
|    6|            57039.0|
|    9|            56933.0|
|    5|            56327.0|
|    8|            55641.0|
|    1|            55515.0|
|    4|            53467.0|
|    2|            51222.0|
+-----+-------------------+

