In [1]:
from pyspark.sql.types import StructType, StructField, FloatType, LongType, StringType
from pyspark.ml.stat import Correlation

import numpy as np

In [2]:
%%time

# Alternate source, currently disabled:
# hdfs_port = "hdfs://orion11:26990"
# hdfs_path = "/FL_insurance_sample.csv"

# HDFS URI prefix and file path of the CSV that is loaded into `df`.
hdfs_port = "hdfs://orion11:13030"
hdfs_path = "/crime-since-2001-chicago.csv"

# Read the CSV with its first row used as the header. No schema is supplied
# and schema inference is not enabled in this call.
df = (
    spark.read
    .format('csv')
    .option("header", "true")
    .load(hdfs_port + hdfs_path)
)

CPU times: user 3.91 ms, sys: 0 ns, total: 3.91 ms
Wall time: 7.72 s


In [3]:
# Display the column names of the loaded DataFrame.
df.columns

['ID',
 'Case Number',
 'Date',
 'Block',
 'IUCR',
 'Primary Type',
 'Description',
 'Location Description',
 'Arrest',
 'Domestic',
 'Beat',
 'District',
 'Ward',
 'Community Area',
 'FBI Code',
 'X Coordinate',
 'Y Coordinate',
 'Year',
 'Updated On',
 'Latitude',
 'Longitude',
 'Location']

In [18]:
%%time

# Make `df` queryable from Spark SQL under the view name `crime`.
df.createOrReplaceTempView('crime')

# Mean character length of Description for each Primary Type, ordered by
# that mean with no explicit direction (i.e. the SQL default sort order).
shortest_desc_sql = '''
    SELECT
        `Primary Type`,
        AVG(LENGTH(Description)) as Desclen
    FROM crime
    GROUP BY `Primary Type`
    ORDER by AVG(LENGTH(Description)) 
    '''
temp = spark.sql(shortest_desc_sql)

# Print the first 20 rows of the result.
temp.show(20)

+--------------------+------------------+
|        Primary Type|           Desclen|
+--------------------+------------------+
|            STALKING| 8.163162570328694|
|               ARSON| 8.750402864816472|
|   CRIMINAL TRESPASS|  9.71141442153263|
|            GAMBLING| 9.887764298093588|
|     CRIMINAL DAMAGE| 11.21564645397782|
|             ASSAULT| 11.85810915560983|
|        INTIMIDATION|12.120182555780932|
| MOTOR VEHICLE THEFT|12.810302908955693|
|               THEFT|13.092669076617286|
|            BURGLARY|14.338210461285009|
|PUBLIC PEACE VIOL...|14.777031972063652|
| CRIM SEXUAL ASSAULT| 15.54262053735738|
|    PUBLIC INDECENCY|              16.0|
|        NON-CRIMINAL|16.389221556886227|
|           OBSCENITY| 16.52920962199313|
|   DOMESTIC VIOLENCE|              17.0|
|      NON - CRIMINAL|              17.0|
|             BATTERY|17.320713808260244|
|CONCEALED CARRY L...| 17.73356401384083|
|OFFENSE INVOLVING...|18.814462709887547|
+--------------------+------------

In [21]:
%%time

# Make `df` queryable from Spark SQL under the view name `crime`.
df.createOrReplaceTempView('crime')

# Mean character length of Description for each Primary Type, ordered from
# the largest mean to the smallest.
longest_desc_sql = '''
    SELECT
        `Primary Type`,
        AVG(LENGTH(Description)) as Desclen
    FROM crime
    GROUP BY `Primary Type`
    ORDER by AVG(LENGTH(Description)) Desc
    '''
temp = spark.sql(longest_desc_sql)

# Print the first 20 rows of the result.
temp.show(20)

+--------------------+------------------+
|        Primary Type|           Desclen|
+--------------------+------------------+
|NON-CRIMINAL (SUB...|              38.0|
|           RITUALISM| 32.91304347826087|
|LIQUOR LAW VIOLATION|27.092963752665245|
|INTERFERENCE WITH...| 25.73384230617349|
|   WEAPONS VIOLATION| 25.03850819533388|
|           NARCOTICS|22.794112975657203|
|          KIDNAPPING| 22.28717026378897|
|OTHER NARCOTIC VI...|              22.0|
|  DECEPTIVE PRACTICE| 21.70683557767184|
|       OTHER OFFENSE| 21.62868365134897|
|        PROSTITUTION|20.607372732592157|
|         SEX OFFENSE|20.524667780695474|
|   HUMAN TRAFFICKING|19.897959183673468|
|             ROBBERY|19.515808794813715|
|            HOMICIDE|18.993972081218274|
|OFFENSE INVOLVING...|18.814462709887547|
|CONCEALED CARRY L...| 17.73356401384083|
|             BATTERY|17.320713808260244|
|   DOMESTIC VIOLENCE|              17.0|
|      NON - CRIMINAL|              17.0|
+--------------------+------------

The five least descriptive crime types are listed below. The first four average fewer than 10 characters per description; Criminal Damage averages about 11.

- Stalking
- Arson
- Criminal Trespass
- Gambling
- Criminal Damage

Some of the most descriptive crime types are:

- Ritualism
- Liquor Law Violation
- Weapons Violation
- Kidnapping
- Deceptive Practice

In [22]:
%%time

# Make `df` queryable from Spark SQL under the view name `crime`.
df.createOrReplaceTempView('crime')

# Mean character length of Description for each (Year, Primary Type) pair,
# ordered by that mean with no explicit direction (the SQL default).
yearly_desc_sql = '''
    SELECT
        Year,
        `Primary Type`,
        AVG(LENGTH(Description)) as Desclen
    FROM crime
    GROUP BY Year, `Primary Type`
    ORDER by AVG(LENGTH(Description)) 
    '''
temp = spark.sql(yearly_desc_sql)

# Print the first 20 rows of the result.
temp.show(20)

+----+------------+------------------+
|Year|Primary Type|           Desclen|
+----+------------+------------------+
|2002|    STALKING|              6.24|
|2001|    STALKING| 6.315270935960591|
|2003|    STALKING| 6.315789473684211|
|2004|    STALKING|6.3534883720930235|
|2005|    STALKING| 6.385416666666667|
|2007|    STALKING| 6.446009389671362|
|2008|    STALKING|6.4526315789473685|
|2006|    STALKING| 6.618279569892473|
|2010|    STALKING| 6.656084656084656|
|2009|    STALKING| 6.748502994011976|
|2014|       ARSON|  8.23425692695214|
|2017|       ARSON| 8.236486486486486|
|2018|       ARSON| 8.249258160237389|
|2016|       ARSON|               8.5|
|2015|       ARSON|  8.54083885209713|
|2003|       ARSON| 8.590575916230366|
|2012|       ARSON| 8.607675906183369|
|2009|       ARSON| 8.691558441558442|
|2011|    STALKING| 8.756906077348066|
|2011|       ARSON| 8.757936507936508|
+----+------------+------------------+
only showing top 20 rows

CPU times: user 3.01 ms, sys: 1.02 ms,

Grouping by year, we can see that the average description length for a given crime type stays relatively consistent over time, with little difference between years. One can probably expect that the descriptions for these crimes are fairly standardized.

In [34]:
%%time

# Make `df` queryable from Spark SQL under the view name `crime`.
df.createOrReplaceTempView('crime')

# For each Primary Type, compute the spread (max - min) of its yearly average
# description length and list the types with the SMALLEST spread first, i.e.
# the types whose average description length varied least across years.
#
# Notes:
# - The sort is ascending. The previous text of this cell sorted descending,
#   which contradicted the recorded output (0.0 values first).
# - The inner query is intentionally not sorted: MAX/MIN in the outer query
#   do not depend on row order, so an inner ORDER BY would only add work.
# - `diff` is non-negative by construction; it measures how much the yearly
#   average varied, not whether it went up or down over time.
temp = spark.sql(
    '''
    SELECT
        `Primary Type`,
        MAX(Desclen) - MIN(Desclen) AS diff
    FROM (
        SELECT
            Year,
            `Primary Type`,
            AVG(LENGTH(Description)) AS Desclen
        FROM crime
        GROUP BY Year, `Primary Type`
    ) AS t1
    GROUP BY `Primary Type`
    ORDER BY diff ASC
    '''
)

# Print the first 20 rows of the result.
temp.show(20)

+--------------------+--------------------+
|        Primary Type|                diff|
+--------------------+--------------------+
|   DOMESTIC VIOLENCE|                 0.0|
|NON-CRIMINAL (SUB...|                 0.0|
|OTHER NARCOTIC VI...|                 0.0|
|      NON - CRIMINAL|                 0.0|
|    PUBLIC INDECENCY|                 0.0|
|            HOMICIDE|0.041867102958306646|
|            BURGLARY| 0.19814330609798247|
|        PROSTITUTION|  0.6551869135384933|
|               ARSON|  0.8289614638524583|
|         SEX OFFENSE|  0.8764401243770799|
|     CRIMINAL DAMAGE|  0.8977967307017334|
|               THEFT|  1.2193330903790098|
|             ROBBERY|  1.2684514692885678|
|   WEAPONS VIOLATION|    1.40473127436454|
|             ASSAULT|  1.4538753455453044|
|   CRIMINAL TRESPASS|  1.5165586230259542|
| CRIM SEXUAL ASSAULT|  1.5936040763993926|
|        INTIMIDATION|   1.633758929928561|
|   HUMAN TRAFFICKING|  1.6666666666666679|
| MOTOR VEHICLE THEFT|  1.911913

In [36]:
%%time

# Make `df` queryable from Spark SQL under the view name `crime`.
df.createOrReplaceTempView('crime')

# Inner query: mean Description length per (Year, Primary Type).
# Outer query: per Primary Type, the gap between its largest and smallest
# yearly mean, listed from the largest gap to the smallest.
widest_spread_sql = '''
    SELECT `Primary Type`,(Max(Desclen)- Min(Desclen)) as diff FROM(
    SELECT
        Year,
        `Primary Type`,
        AVG(LENGTH(Description)) as Desclen
    FROM crime
    GROUP BY Year, `Primary Type`
    ORDER by AVG(LENGTH(Description))) as t1
    Group by `Primary Type`
    ORDER BY Max(Desclen) - Min(Desclen) Desc
    '''
temp = spark.sql(widest_spread_sql)

# Print the first 20 rows of the result.
temp.show(20)

+--------------------+------------------+
|        Primary Type|              diff|
+--------------------+------------------+
|           RITUALISM|              14.5|
|             BATTERY|  9.31719109116828|
|  DECEPTIVE PRACTICE| 7.610527222435543|
|        NON-CRIMINAL| 6.117647058823529|
|            STALKING| 5.993766233766234|
|CONCEALED CARRY L...| 5.180392156862746|
|           OBSCENITY|4.4411764705882355|
|OFFENSE INVOLVING...| 4.089830508474577|
|          KIDNAPPING| 3.783653760617156|
|           NARCOTICS| 3.651637244095305|
|       OTHER OFFENSE|2.7779403523977386|
|PUBLIC PEACE VIOL...|  2.62886029577359|
|            GAMBLING|2.5071414957902167|
|INTERFERENCE WITH...|2.3947280935988573|
|LIQUOR LAW VIOLATION|2.1737830670436367|
| MOTOR VEHICLE THEFT|1.9119132344166108|
|   HUMAN TRAFFICKING|1.6666666666666679|
|        INTIMIDATION| 1.633758929928561|
| CRIM SEXUAL ASSAULT|1.5936040763993926|
|   CRIMINAL TRESPASS|1.5165586230259542|
+--------------------+------------

## Analysis

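Over time, the average description length for Domestic Violence cases does not appear to change (spread of 0.0), while Ritualism shows the largest spread between its yearly averages (14.5 characters), which may reflect its descriptions getting longer over time. Note that `diff` is the maximum minus the minimum yearly average, so it is never negative by construction: it measures how much the description length varied, not the direction of the change. The absence of negative values therefore does not by itself show that no crime became LESS descriptive over time; confirming a direction would require comparing early years against late years. With that caveat, it seems plausible that more complex crimes tend to become more descriptive over time.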
