## Extra Libraries Needed

In [None]:
#!pip install -U dataprep

## 1. Loading Libraries

In [None]:
!pip install filename.whl

In [None]:
import geopandas as gpd

In [None]:
import pandas as pd
import numpy as np
import math
import seaborn as sns
import matplotlib.pyplot as plt
from dataprep.eda import create_report, plot,plot_correlation
# Lift pandas' display truncation so every column and every row is rendered.
pd.options.display.max_columns = None
pd.options.display.max_rows = None

## 2. Loading Data

#### 2.1 Loading Crime Datasets

In [None]:
#df_one = pd.read_csv('Crime_Data_from_2010_to_2019.csv',dtype=str)
#df_two = pd.read_csv('Crime_Data_from_2020_to_Present.csv',dtype=str)

In [None]:
# Concatenating the two raw datasets (one-off step that produced 'merged.parquet')
#df_one.columns = df_two.columns
#df = pd.concat([df_one, df_two], axis=0,ignore_index =True )
#Saving as parquet file
#df.to_parquet('merged.parquet')

#### 2.2 Loading Crime dataset as parquet file

In [None]:
# Load the merged crime records from the parquet file.
merged_parquet_path = 'merged.parquet'
df = pd.read_parquet(merged_parquet_path)

In [None]:
# Preview the first rows of the loaded frame.
df.head()

# 3. Exploratory Data Analysis

### 3.1 Preprocessing

#### 3.1.1 Renaming Columns

In [None]:
# New column labels, assigned positionally: the order must match the
# column order of the loaded frame.
renamed_columns = [
    'DR_NO',
    'Crime Date Reported',
    'Crime Date Occured',
    'Crime Time Occured',
    'Crime Area Code',
    'Crime Area Name',
    'Crime Reported Reported Dist No',
    'Part 1-2',
    'Crime Code',
    'Crime Code Desc',
    'Mocodes',
    'Victim Age',
    'Victim Gender',
    'Victim Descent',
    'Premis Code',
    'Premis Desc',
    'Weapon Used Code',
    'Weapon Desc',
    'Status',
    'Status Desc',
    'Crime Code 1',
    'Crime Code 2',
    'Crime Code 3',
    'Crime Code 4',
    'Crime Location',
    'Cross Street',
    'LAT',
    'LON',
]
df.columns = renamed_columns

#### 3.1.2 Dropping Unused Columns

In [None]:
# Remove the two columns that the analysis does not use.
df = df.drop(columns=['DR_NO', 'Part 1-2'])

#### 3.1.3 Checking Datatypes

In [None]:
# Show the dtype of every column before any conversion is applied.
df.dtypes

#### 3.1.4 Setting DataTypes According to data

#### 3.1.4.A Converting Dates to DateTime object

In [None]:
#converting date time column from series to datetime data type
df['Crime Date Reported'] = pd.to_datetime(df['Crime Date Reported'],errors='coerce')
df['Crime Date Occured'] = pd.to_datetime(df['Crime Date Occured'],errors='coerce')
#df['Crime Time Occured'] = pd.to_datetime(df['Crime Time Occured'],format= '%H%M',errors='coerce')

#### 3.1.4.B Converting Area to Category object

In [None]:
# Store the area identifiers as categoricals.
for area_col in ('Crime Area Code', 'Crime Area Name', 'Crime Reported Reported Dist No'):
    df[area_col] = df[area_col].astype('category')

#### 3.1.4.C Converting Crime Codes to respective datatype

In [None]:
# Primary crime code: cast to a plain integer column.
df['Crime Code'] = df['Crime Code'].astype('int64')

# Descriptive text columns are stored as categoricals.
for label_col in ('Crime Code Desc', 'Mocodes'):
    df[label_col] = df[label_col].astype('category')

# Secondary crime codes are cast to float64 so missing entries stay NaN.
# The previous `fillna(value=np.nan, inplace=True)` calls were removed:
# filling NaN with NaN changes nothing, and calling an in-place method on
# a column selection is a chained-assignment pattern pandas deprecates.
for code_col in ('Crime Code 1', 'Crime Code 2', 'Crime Code 3', 'Crime Code 4'):
    df[code_col] = df[code_col].astype('float64')

#### 3.1.4.D Converting Victim Information Features Datatypes

In [None]:
# Victim age becomes an integer column; gender and descent become categoricals.
df['Victim Age'] = df['Victim Age'].astype('int64')
for victim_col in ('Victim Gender', 'Victim Descent'):
    df[victim_col] = df[victim_col].astype('category')

#### 3.1.4.E Converting Premis Information Features Datatypes

In [None]:
# Premise code and description are stored as categoricals.
for premise_col in ('Premis Code', 'Premis Desc'):
    df[premise_col] = df[premise_col].astype('category')

#### 3.1.4.F Converting Weapon Information Features Datatypes

In [None]:
# Weapon code is cast to float64 so missing entries stay NaN. The former
# `fillna(value=np.nan, inplace=True)` was a no-op applied through a
# chained in-place call and has been removed.
df['Weapon Used Code'] = df['Weapon Used Code'].astype('float64')
df['Weapon Desc'] = df['Weapon Desc'].astype('category')

#### 3.1.4.G Converting Status Features Datatypes

In [None]:
# Case status code and description are stored as categoricals.
for status_col in ('Status', 'Status Desc'):
    df[status_col] = df[status_col].astype('category')

# Cast present cross-street values to str but keep missing entries missing.
# A blanket astype('str') would turn NaN/None into literal text, which hides
# them from the null analysis and counts them as a street name later on.
cross_street = df['Cross Street']
df['Cross Street'] = cross_street.where(cross_street.isna(), cross_street.astype('str'))

## 3.2 Feature Analysis

### 3.2.1 Null Values Analysis

In [None]:
# Share of missing values per column, expressed as a percentage of all rows.
df.isna().sum().div(df.shape[0]).mul(100)

In [None]:
# Short written takeaway from the null-value table.
null_summary = 'It can be observed that most of the information is available to do the analysis, however, data cleaning will be performed'
print(null_summary)

#### 3.2.2 Correlation matrix of dataset

In [None]:
# Pairwise correlation of the numeric columns. The numeric columns are
# selected explicitly because pandas >= 2.0 no longer drops non-numeric
# columns silently in DataFrame.corr() and raises instead.
df.select_dtypes(include='number').corr()#.to_clipboard(excel=True, sep=None)

#### 3.2.4 Feature Distribution

In [None]:
# dataprep.eda overview plot for the whole frame.
plot(df)

#### 3.2.3 Correlation Plots of dataset

In [None]:
# Seaborn pairwise scatter/histogram grid over the frame.
# NOTE(review): this runs on the full frame with no sampling; it may be slow
# on a large dataset - confirm whether a sample would be sufficient.
sns.pairplot(df)

## 3.3 Feature Engineering

#### 3.3.1 Extracting Day, Month, Year from Crime 'Reported Date'

In [None]:
# Split the report date into integer day / month / year columns.
reported_dt = df['Crime Date Reported'].dt
df['Crime Reported Day'] = reported_dt.day.astype(int)
df['Crime Reported Month'] = reported_dt.month.astype(int)
df['Crime Reported Year'] = reported_dt.year.astype(int)

#### 3.3.2 Extracting Day, Month, Year from Crime 'Date Occured'

In [None]:
# Split the occurrence date into integer day / month / year columns.
occurred_dt = df['Crime Date Occured'].dt
df['Crime Occured Day'] = occurred_dt.day.astype(int)
df['Crime Occured Month'] = occurred_dt.month.astype(int)
df['Crime Occured Year'] = occurred_dt.year.astype(int)

#### 3.3.3 Difference of 'Crime Occured' and 'Crime Reported' (in Days)

In [None]:
# Delay between the crime occurring and being reported, in days.
# The timedelta is reduced to whole hours and then divided by 24, as before.
# It goes through total_seconds() because astype('timedelta64[h]') is
# rejected by pandas >= 2.0, which only accepts s/ms/us/ns resolutions.
report_delay = df['Crime Date Reported'] - df['Crime Date Occured']
df['Crime Occured Reported difference (days)'] = report_delay.dt.total_seconds() // 3600 / 24

#### 3.3.4 Extracting "Crime Occured Hour" From "Crime Occured Time"

In [None]:
# Hour of day taken from the HHMM-style time value. Integer division by 100
# drops the minutes and also yields the right hour when the value is not
# zero-padded to four digits (e.g. '845' -> 8), where slicing the first two
# characters would give 84.
df['Crime Occured Hour'] = df['Crime Time Occured'].astype('int64') // 100

#### 3.3.5 Making Categories of Crime Time 

In [None]:
def _describe_hour(hour):
    """Map an hour of the day to a coarse time-of-day label.

    Hours 4-5 are 'Dawn', 6-10 'Morning', 11-13 'Noon', 14-16 'After Noon',
    17-20 'Evening', 21-22 'Night', 23/0/1 'Mid Night'; anything else
    (hours 2-3) is 'Late Night'.
    """
    if 4 <= hour < 6:
        return 'Dawn'
    if 6 <= hour < 11:
        return 'Morning'
    if 11 <= hour < 14:
        return 'Noon'
    if 14 <= hour < 17:
        return 'After Noon'
    if 17 <= hour < 21:
        return 'Evening'
    if 21 <= hour < 23:
        return 'Night'
    # This window wraps around midnight, so the bounds are joined with `or`.
    # Requiring both (hour >= 23 and hour < 2) can never be true, which made
    # 'Mid Night' unreachable.
    if hour >= 23 or hour < 2:
        return 'Mid Night'
    return 'Late Night'


df['Crime Occured Time Description'] = df['Crime Occured Hour'].apply(_describe_hour)

In [None]:
# Re-check column dtypes after adding the date and time features.
df.dtypes

#### 3.3.6 Extracting Information From Victim Age 

In [None]:
# Ensure victim age is an integer column before binning it.
df['Victim Age'] = df['Victim Age'].astype('int64')

In [None]:
# Re-check column dtypes after the victim age conversion.
df.dtypes

#### 3.3.7 Categorizing Victim Age

In [None]:
def _describe_age(age):
    """Map a victim age to an age-group label.

    Ages 1-17 are 'Under 18', 18-29 'Adult', 30-39 "In 30's", 40-49
    "In 40's", 50-59 "In 50's" and 60 or more 'Senior Citizen'. Any other
    value (zero or negative) is labelled 'Not Given'.
    """
    if 1 <= age < 18:
        return 'Under 18'
    if 18 <= age < 30:
        return 'Adult'
    if 30 <= age < 40:
        return "In 30's"
    if 40 <= age < 50:
        return "In 40's"
    if 50 <= age < 60:
        return "In 50's"
    # Inclusive lower bound: a strict `> 60` left exactly 60 in 'Not Given'.
    if age >= 60:
        return 'Senior Citizen'
    return 'Not Given'


# Defining age bins for the respective ages.
df['Victim Age Description'] = df['Victim Age'].apply(_describe_age)

## 3.4 Data Cleaning

#### 3.4.1 Data Cleaning "Victim Gender"  Column

In [None]:
# Collapse the codes X, H, N and '-' into a single 'Unknown' label.
# The result is assigned back to the column rather than calling
# replace(..., inplace=True) on a column selection: that chained in-place
# form is deprecated and does not update the frame under Copy-on-Write.
# The replacement is done on plain objects and the column is then turned
# back into a categorical.
df['Victim Gender'] = (
    df['Victim Gender']
    .astype(object)
    .replace(['X', 'H', 'N', '-'], 'Unknown')
    .astype('category')
)

#### 3.4.2 Renaming "Victim Descent" Data

In [None]:
# Descent code -> readable label. Codes not listed here are left unchanged.
descent_labels = {
    'H': 'Hisp|Ltn|Mxcn',
    'W': 'White',
    'B': 'Black',
    'A': 'Other Asian',
    'O': 'Other',
    'X': 'Unknown',
    'K': 'Korea',
    'F': 'Filipino',
    'C': 'Chinese',
    'I': 'AmInd|AlskNtv',
    'L': 'Laotian',
    'P': 'Islander',
    'J': 'Jpn',
    'V': 'Vietnam',
    'U': 'Unknown',
    'G': 'Guaman',
    'D': 'Cambod',
    'S': 'Samoan',
    'Z': 'Asn-Indian',
}
# The result is assigned back to the column rather than calling
# replace(..., inplace=True) on a column selection: that chained in-place
# form is deprecated and does not update the frame under Copy-on-Write.
# Two codes ('X' and 'U') share the label 'Unknown', so the replacement is
# done on plain objects and the column is re-categorised afterwards.
df['Victim Descent'] = (
    df['Victim Descent']
    .astype(object)
    .replace(descent_labels)
    .astype('category')
)

In [None]:
# Checkpoint the typed, cleaned and feature-enriched frame to parquet.
df.to_parquet('final_p.parquet')

# QUESTIONS

In [None]:
# Reload the checkpoint written by df.to_parquet('final_p.parquet').
df = pd.read_parquet("final_p.parquet")

# Questions 1:

#### Hypothesis: Child abuse (physical, mental and sexual) and child trafficking are increasing all over the world.
Child trafficking reference link: https://www.iom.int/sites/g/files/tmzbdl486/files/our_work/DMM/MAD/A4-Trafficking-External-Brief.pdf <br>
LA Reference link: https://www.dailynews.com/2019/10/20/la-police-failed-to-investigate-4000-serious-child-abuse-reports-in-2018-and-2019-why/
## Question: What are the trends in crimes against children in LA, and how long do such crimes take to be reported?

In [None]:
# Number of victims per gender, split by age group.
fig, ax = plt.subplots(figsize=(18, 9))
sns.countplot(data=df, x='Victim Gender', hue='Victim Age Description', ax=ax)

In [None]:
# Reporting delay (days) per gender, split by age group.
fig, ax = plt.subplots(figsize=(18, 9))
sns.barplot(
    data=df,
    x='Victim Gender',
    y='Crime Occured Reported difference (days)',
    hue='Victim Age Description',
    ax=ax,
)

## Answer
##### Most crimes are committed against adult male and female victims; underage victims are not among the four largest gender/age groups. However, when the gap between the date a crime occurred and the date it was reported is analysed, underage female victims lead the trend. This suggests that underage females are reluctant to report crimes. It is possible that such crimes are reported only once their parents learn about them and report them themselves.

# Question 2:

#### Hypothesis: Violent crimes are most likely to happen at night.
reference link: https://www.securitymagazine.com/articles/90384-murder-robbery-and-driving-while-impaired-happen-at-night
#### Question: Do most crimes in LA happen at night? Further, are females more likely to be affected by crime at night?

## Victim's Gender vs Time

In [None]:
# Crimes per time-of-day label, split by victim gender; bars are ordered
# from the most to the least frequent label.
fig, ax = plt.subplots(figsize=(18, 7))
time_order = df['Crime Occured Time Description'].value_counts().index
sns.countplot(
    data=df,
    x='Crime Occured Time Description',
    hue='Victim Gender',
    order=time_order,
    ax=ax,
)

##### Most crimes happen in the evening and late at night, which verifies the constructed hypothesis. However, females are affected by crime almost equally, so there is no direct relationship between the time of day and crimes specifically affecting females in LA.

# Question 3

### Hypothesis: After COVID-19, there is an increase in crime due to daily wagers losing their jobs because of lockdown.
reference link: https://www.dailynews.com/2021/08/22/report-coronavirus-pandemic-key-factor-in-las-spiking-homicide-rate/
### Question: Is this hypothesis true for LA?

In [None]:
# Keep only the crimes that occurred after 1 January 2020.
cutoff = pd.Timestamp('2020-01-01')
occurred_after_cutoff = df['Crime Date Occured'] > cutoff
df_lockdown = df[occurred_after_cutoff]

In [None]:
# dataprep.eda plot of the occurrence-date column for the filtered frame.
plot(df_lockdown,'Crime Date Occured')

#####  The hypothesis "After COVID-19, there is an increase in crime due to daily wagers losing their jobs because of lockdown" is not true for LA: the distribution is almost the same before, during and after lockdown. Therefore the constructed hypothesis is false.

# Question 4
#### Hypothesis: Criminals are likely to commit crimes close to where they live
https://core.ac.uk/download/pdf/232845703.pdf
#### Question: What is the relationship between Crime places and criminal arrests? 

# Wordcloud

In [None]:
# Peek at the first eight cross-street values.
df['Cross Street'].head(8)

In [None]:
# Read 'final.csv' (latin-1 encoded) into `data`.
# NOTE(review): no visible cell writes 'final.csv', and a later cell rebinds
# `data` to a dict of street counts - confirm this file and cell are still needed.
data = pd.read_csv('final.csv',encoding ="latin-1")

In [None]:
# Build one lower-cased, space-separated string from every CONTENT entry.
# The accumulator is initialised here; it was previously used without ever
# being defined, so the cell raised NameError on a fresh kernel.
comment_words = ''
for entry in data.CONTENT:
    tokens = str(entry).split()
    comment_words += " ".join(token.lower() for token in tokens) + " "

In [None]:
from wordcloud import WordCloud, STOPWORDS

# Word cloud sized by how often each cross street appears.
street_counts = df['Cross Street'].value_counts()
data = street_counts.to_dict()
wc = WordCloud()
wc.generate_from_frequencies(data)

plt.imshow(wc)
plt.axis('off')
plt.show()

# Question 5
#### Hypothesis: Winter stops crime 
https://www.nbcnews.com/news/us-news/does-cold-stop-crime-it-seems-so-n309856.
#### Question: What is the relationship of Winters with the crimes? 

In [None]:
# Crimes that occurred after 5 January 2020, then their date distribution.
# The filtered frame is stored under the same name that is plotted; before,
# it was assigned to `df_lockdown` while the plot call used the undefined
# `df_lockdownTwo`, which raised NameError.
df_lockdownTwo = df[df['Crime Date Occured']>'2020-01-05']
plot(df_lockdownTwo,'Crime Date Occured')

In [None]:
# Victims per age group, split by gender; bars are ordered from the most to
# the least frequent age group.
fig, ax = plt.subplots(figsize=(8, 4))
age_order = df['Victim Age Description'].value_counts().index
sns.countplot(
    data=df,
    x='Victim Age Description',
    hue='Victim Gender',
    order=age_order,
    ax=ax,
)