## Data Transformations

### Libraries import and data ingestions

In [1]:
from common_libraries import * 
import project_function

In [2]:
logger = logging.getLogger("pyspark")
# Filter the NativeCodeLoader warning
logger.addFilter(lambda record: "WARN NativeCodeLoader" not in record.getMessage())  
# Filter the GarbageCollectionMetrics warning
logger.addFilter(lambda record: "WARN GarbageCollectionMetrics" not in record.getMessage())  
# creating a spark session

spark = SparkSession.builder \
    .master("local[*]") \
    .config("spark.sql.debug.catalog", False) \
    .config("spark.logLevel", "ERROR") \
    .config("spark.dynamicAllocation.enabled", "true") \
    .config("spark.executor.memory", "8g") \
    .config("spark.executor.cores", "4") \
    .config("spark.driver.memory", "4g") \
    .config("spark.driver.cores", "2") \
    .getOrCreate()




Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


24/04/10 22:20:52 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
24/04/10 22:20:52 WARN Utils: Service 'SparkUI' could not bind on port 4040. Attempting port 4041.
24/04/10 22:20:52 WARN Utils: Service 'SparkUI' could not bind on port 4041. Attempting port 4042.


In [3]:
df = spark.read.csv("/Users/pasqualesalomone/Desktop/crime_data.csv", header=True, inferSchema=True)


                                                                                

In [4]:
df.columns

['DR_NO',
 'Date Rptd',
 'DATE OCC',
 'TIME OCC',
 'AREA',
 'AREA NAME',
 'Rpt Dist No',
 'Part 1-2',
 'Crm Cd',
 'Crm Cd Desc',
 'Mocodes',
 'Vict Age',
 'Vict Sex',
 'Vict Descent',
 'Premis Cd',
 'Premis Desc',
 'Weapon Used Cd',
 'Weapon Desc',
 'Status',
 'Status Desc',
 'Crm Cd 1',
 'Crm Cd 2',
 'Crm Cd 3',
 'Crm Cd 4',
 'LOCATION',
 'Cross Street',
 'LAT',
 'LON']

### Using lower case snake notation to standardize column names

In [5]:
column_snake_case_lambda = lambda s: re.sub(r'\W+', ' ', s).lower().replace(' ', '_')


In [6]:
new_columns = [column_snake_case_lambda(col_name) for col_name in df.columns]

# Rename the columns in the DataFrame
df = df.toDF(*new_columns)

In [7]:
df.dtypes

[('dr_no', 'int'),
 ('date_rptd', 'string'),
 ('date_occ', 'string'),
 ('time_occ', 'int'),
 ('area', 'int'),
 ('area_name', 'string'),
 ('rpt_dist_no', 'int'),
 ('part_1_2', 'int'),
 ('crm_cd', 'int'),
 ('crm_cd_desc', 'string'),
 ('mocodes', 'string'),
 ('vict_age', 'int'),
 ('vict_sex', 'string'),
 ('vict_descent', 'string'),
 ('premis_cd', 'int'),
 ('premis_desc', 'string'),
 ('weapon_used_cd', 'int'),
 ('weapon_desc', 'string'),
 ('status', 'string'),
 ('status_desc', 'string'),
 ('crm_cd_1', 'int'),
 ('crm_cd_2', 'int'),
 ('crm_cd_3', 'int'),
 ('crm_cd_4', 'int'),
 ('location', 'string'),
 ('cross_street', 'string'),
 ('lat', 'double'),
 ('lon', 'double')]

24/04/10 22:21:05 WARN GarbageCollectionMetrics: To enable non-built-in garbage collector(s) List(G1 Concurrent GC), users should configure it(them) to spark.eventLog.gcMetrics.youngGenerationGarbageCollectors or spark.eventLog.gcMetrics.oldGenerationGarbageCollectors


### Converting data types

In [8]:
df = (df.withColumn("date_rptd", 
                   to_date(concat_ws("-", split(substring(col('date_rptd'),1,10), "/")[2], 
                              split(substring(col('date_rptd'),1,10), "/")[0], 
                              split(substring(col('date_rptd'),1,10), "/")[1]),'yyyy-mm-dd'))\
      .withColumn("date_occ", 
                   to_date(concat_ws("-", split(substring(col('date_occ'),1,10), "/")[2], 
                              split(substring(col('date_occ'),1,10), "/")[0], 
                              split(substring(col('date_occ'),1,10), "/")[1]),'yyyy-mm-dd'))\

      .withColumn('time_occ',col('time_occ').cast('string'))
      .withColumn('weapon_used_cd', col('weapon_used_cd').cast('string'))
      .withColumn('premis_cd', col('premis_cd').cast('string'))
      .withColumn('crm_cd', col('crm_cd').cast('string'))
      .withColumn('area', col('area').cast('string'))
      .withColumn('crm_cd_1', col('crm_cd_1').cast('string'))
      .withColumn('crm_cd_2', col('crm_cd_2').cast('string'))
      .withColumn('crm_cd_3', col('crm_cd_3').cast('string'))
      .withColumn('crm_cd_4', col('crm_cd_4').cast('string')))

In [9]:
df.dtypes

[('dr_no', 'int'),
 ('date_rptd', 'date'),
 ('date_occ', 'date'),
 ('time_occ', 'string'),
 ('area', 'string'),
 ('area_name', 'string'),
 ('rpt_dist_no', 'int'),
 ('part_1_2', 'int'),
 ('crm_cd', 'string'),
 ('crm_cd_desc', 'string'),
 ('mocodes', 'string'),
 ('vict_age', 'int'),
 ('vict_sex', 'string'),
 ('vict_descent', 'string'),
 ('premis_cd', 'string'),
 ('premis_desc', 'string'),
 ('weapon_used_cd', 'string'),
 ('weapon_desc', 'string'),
 ('status', 'string'),
 ('status_desc', 'string'),
 ('crm_cd_1', 'string'),
 ('crm_cd_2', 'string'),
 ('crm_cd_3', 'string'),
 ('crm_cd_4', 'string'),
 ('location', 'string'),
 ('cross_street', 'string'),
 ('lat', 'double'),
 ('lon', 'double')]

### Renaming dr_no column to crime_record_id

In [10]:
df = df.withColumnRenamed('dr_no','crime_record_id')

### Creating a new column days_to_report

In [11]:
df = df.withColumn('column_days_to_report', datediff(
                                                col('date_rptd'),col('date_occ')))
df.show(2,vertical=True)

24/04/10 22:21:15 WARN package: Truncated the string representation of a plan since it was too large. This behavior can be adjusted by setting 'spark.sql.debug.maxToStringFields'.
-RECORD 0-------------------------------------
 crime_record_id       | 190326475            
 date_rptd             | 2020-01-01           
 date_occ              | 2020-01-01           
 time_occ              | 2130                 
 area                  | 7                    
 area_name             | Wilshire             
 rpt_dist_no           | 784                  
 part_1_2              | 1                    
 crm_cd                | 510                  
 crm_cd_desc           | VEHICLE - STOLEN     
 mocodes               | null                 
 vict_age              | 0                    
 vict_sex              | M                    
 vict_descent          | O                    
 premis_cd             | 101                  
 premis_desc           | STREET               
 weapon_used_cd      

### Dropping columns with a nulls ratio grater than 60%

In [12]:
project_function.nulls_buster(df,spark)

                                                                                

Unnamed: 0,column_name,%null
0,crm_cd_4,99.993086
1,crm_cd_3,99.755866
2,crm_cd_2,92.736789
3,cross_street,84.312103
4,weapon_used_cd,65.464611
5,weapon_desc,65.464611
6,mocodes,13.985006
7,vict_descent,13.305103
8,vict_sex,13.304023
9,premis_desc,0.060385


In [13]:
df = df.drop('weapon_desc','weapon_used_cd','cross_street','crm_cd_2','crm_cd_3','crm_cd_4')

In [16]:
#df.groupBy('vict_descent').count().orderBy('count', ascending=False).first()['vict_descent']
   

In [14]:
df = project_function.impute_missing_values(df, [
        "mocodes",
        "vict_descent",
        "vict_sex",
        "premis_desc",
        "premis_cd",
        "crm_cd_1"
    ])

                                                                                

In [17]:

#df.select(sum(col('mocodes').isNull().cast('int'))).collect()[0][0]
#df.filter((F.col('mocodes').isNull()) | (F.col('mocodes') == '')).count()
#df.select(col('mocodes')).show()

In [15]:
project_function.nulls_buster(df,spark)

                                                                                

Unnamed: 0,column_name,%null


In [57]:
#df.groupBy('mocodes').agg(F.count('*').alias('count')).orderBy('count',ascending=False).show()


In [15]:
categorical_columns = [df.dtypes[value][0] for value in range(0,len(df.columns)) if df.dtypes[value][1]=='string']

In [16]:
numeric_columns = [df.dtypes[value][0] for value in range(0,len(df.columns)) if df.dtypes[value][1]=='int']