### Project EDA

In [1]:
# Suppress Hadoop Info looging
!sed -i 's/hadoop.root.logger=INFO,console/hadoop.root.logger=WARN,console/' /usr/hadoop-3.3.2/etc/hadoop/log4j.properties

In [2]:
import pyspark
from pyspark.sql import SparkSession

conf = pyspark.SparkConf().setAll([('spark.master', 'local[*]'),
                                   ('spark.app.name', 'Basic Setup')])
spark = SparkSession.builder.config(conf=conf).getOrCreate()

2023-05-20 00:41:54,422 WARN util.NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [3]:
# Load dataframe

df = spark.read.csv("file:///home/work/Crimes_-_2001_to_Present.csv",header=True)

                                                                                

In [4]:
df.printSchema()

root
 |-- ID: string (nullable = true)
 |-- Case Number: string (nullable = true)
 |-- Date: string (nullable = true)
 |-- Block: string (nullable = true)
 |-- IUCR: string (nullable = true)
 |-- Primary Type: string (nullable = true)
 |-- Description: string (nullable = true)
 |-- Location Description: string (nullable = true)
 |-- Arrest: string (nullable = true)
 |-- Domestic: string (nullable = true)
 |-- Beat: string (nullable = true)
 |-- District: string (nullable = true)
 |-- Ward: string (nullable = true)
 |-- Community Area: string (nullable = true)
 |-- FBI Code: string (nullable = true)
 |-- X Coordinate: string (nullable = true)
 |-- Y Coordinate: string (nullable = true)
 |-- Year: string (nullable = true)
 |-- Updated On: string (nullable = true)
 |-- Latitude: string (nullable = true)
 |-- Longitude: string (nullable = true)
 |-- Location: string (nullable = true)
 |-- Historical Wards 2003-2015: string (nullable = true)
 |-- Zip Codes: string (nullable = true)
 |-- Com

In [5]:
# Limit to data pre-2023

df = df.filter("Year < 2023")

In [6]:
# Check null values

df.count()

                                                                                

7711421

In [7]:
# Drop nulls

df_clean = df.dropna()

In [8]:
# Count again

df_clean.count()

2023-05-20 00:42:30,031 WARN util.package: Truncated the string representation of a plan since it was too large. This behavior can be adjusted by setting 'spark.sql.debug.maxToStringFields'.
                                                                                

6982152

In [9]:
# Iterate how many columns have null values

df.createOrReplaceTempView('crime_view')

cols = [i.name for i in df.schema]
#cols = ['ID']
null_counts = []

for col in cols:
    query = f'''
    SELECT COUNT(*)
    FROM crime_view
    WHERE `{col}` IS NULL
    '''
    null_counts.append(spark.sql(query).collect())
    

                                                                                

In [10]:
null_counts_clean = [null_counts[i][0][0] for i in range(len(null_counts))]

In [11]:
import pandas as pd

null_summary_df = pd.DataFrame(data=cols,columns=['Column Name'])
null_summary_df['Null Rows'] = null_counts_clean
null_summary_df['Fraction of Total Rows'] = [round(null_counts[i][0][0]/total_rows, 4) for i in range(len(null_counts))]

NameError: name 'total_rows' is not defined

In [None]:
null_summary_df.sort_values('Null Rows', ascending=False)

In [None]:
print(7711421-6982152)

In [None]:
# View primary crime types

#df.select('Primary Type').distinct().sort(df['Primary Type'].asc()).show(n=50,truncate=False)

In [None]:
# Reclassify crime types
from pyspark.sql.functions import regexp_replace

new_cats = {'CRIM SEXUAL ASSAULT':'CRIMINAL SEXUAL ASSAULT',
            'NON-CRIMINAL (SUBJECT SPECIFIED)':'NON-CRIMINAL', 
            'NON - CRIMINAL':'NON-CRIMINAL', 
            'OTHER NARCOTIC VIOLATION':'NARCOTICS',
            'PUBLIC INDECENCY':'PUBLIC INDECENCY/OBSCENITY', 
            'OBSCENITY':'PUBLIC INDECENCY/OBSCENITY'}

for i in new_cats:
    df = df.replace(i,new_cats[i],"Primary Type")

In [None]:
# Check number of primary types

df.select("Primary Type").distinct().count()

In [None]:
df.select('Primary Type').distinct().show(n=50,truncate=False)

In [None]:
from pyspark.sql.functions import countDistinct
from pyspark.sql.functions import col

df_type_counts = df.groupBy("Primary Type") \
  .agg(countDistinct('Description').alias("Count of Secondary Types")) \
  .orderBy(col("Count of Secondary Types").desc()) \

df_type_counts_pd = df_type_counts.toPandas()

In [None]:
import matplotlib.pyplot as plt 

df_type_counts_pd.plot(kind='bar',x='Primary Type',y='Count of Secondary Types',figsize=(12,5),title="Number of Secondary Types per Primary Type",ylabel="Count",legend=False)

In [None]:
df.agg(countDistinct('Description')).show()