# Outline
1. There are many features to analyze, so preprocessing is necessary and important.  
2. This notebook contains heatmaps and bar plots for EDA.  
3. It also fits classic regression models to predict crime codes (the 'Crm Cd' column of the DataFrame).

# Import libraries and data

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')

from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.metrics import mean_squared_error, r2_score, accuracy_score, roc_auc_score
from sklearn.preprocessing import StandardScaler, MinMaxScaler, LabelEncoder, OneHotEncoder
from sklearn.linear_model import LinearRegression, LogisticRegression, Ridge, Lasso, ElasticNet
from sklearn.tree import DecisionTreeRegressor
from sklearn.tree import export_graphviz
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.neighbors import KNeighborsClassifier

from lightgbm import LGBMRegressor

In [None]:
# Read the crime records CSV from the Kaggle input directory
crimes_csv_path = '../input/crime-in-los-angeles-data-from-2020-to-present/Crime_Data_from_2020_to_Present.csv'
crimes = pd.read_csv(crimes_csv_path)
crimes

## Dataset Overview
|     Columns    |                                                                                                                                                    Description                                                                                                                                                   |
|:--------------:|:----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------:|
|      DR_NO     | Division of Records Number: Official file number made up of a 2 digit year, area ID, and 5 digits                                                                                                                                                                                                                |
|    Date Rptd   | MM/DD/YYYY                                                                                                                                                                                                                                                                                                       |
|    DATE OCC    | MM/DD/YYYY                                                                                                                                                                                                                                                                                                       |
|    TIME OCC    | In 24 hour military time.                                                                                                                                                                                                                                                                                        |
|      AREA      | The LAPD has 21 Community Police Stations referred to as Geographic Areas within the department. These Geographic Areas are sequentially numbered from 1-21.                                                                                                                                                     |
|    AREA NAME   | The 21 Geographic Areas or Patrol Divisions are also given a name designation that references a landmark or the surrounding community that it is responsible for. For example 77th Street Division is located at the intersection of South Broadway and 77th Street, serving neighborhoods in South Los Angeles. |
|   Rpt Dist No  | A four-digit code that represents a sub-area within a Geographic Area. All crime records reference the "RD" that it occurred in for statistical comparisons. Find LAPD Reporting Districts on the LA City GeoHub at http://geohub.lacity.org/datasets/c4f83909b81d4786aa8ba8a7                                   |
|    Part 1-2    | Number                                                                                                                                                                                                                                                                                                           |
|     Crm Cd     | Indicates the crime committed. (Same as Crime Code 1)                                                                                                                                                                                                                                                            |
|   Crm Cd Desc  | Defines the Crime Code provided.                                                                                                                                                                                                                                                                                 |
|     Mocodes    | Modus Operandi: Activities associated with the suspect in commission of the crime.See attached PDF for list of MO Codes in numerical order. https://data.lacity.org/api/views/y8tr-7khq/files/3a967fbd-f210-4857-bc52-60230efe256c?download=true&filename=MO%20CODES%20(numerical%20or                           |
| Vict Age       | Two character numeric                                                                                                                                                                                                                                                                                            |
| Vict Sex       | F - Female M - Male X - Unknown                                                                                                                                                                                                                                                                                  |
| Vict Descent   | Descent Code: A - Other Asian B - Black C - Chinese D - Cambodian F - Filipino G - Guamanian H - Hispanic/Latin/Mexican I - American Indian/Alaskan Native J - Japanese K - Korean L - Laotian O - Other P - Pacific Islander S - Samoan U - Hawaiian V - Vietnamese W - White X - Unknown Z - Asian Indian      |
| Premis Cd      | The type of structure, vehicle, or location where the crime took place.                                                                                                                                                                                                                                          |
| Premis Desc    | Defines the Premise Code provided.                                                                                                                                                                                                                                                                               |
| Weapon Used Cd | The type of weapon used in the crime.                                                                                                                                                                                                                                                                            |
| Weapon Desc    | Defines the Weapon Used Code provided.                                                                                                                                                                                                                                                                           |
| Status         | Status of the case. (IC is the default)                                                                                                                                                                                                                                                                          |
| Status Desc    | Defines the Status Code provided.                                                                                                                                                                                                                                                                                |
| Crm Cd 1       | Indicates the crime committed. Crime Code 1 is the primary and most serious one. Crime Code 2, 3, and 4 are respectively less serious offenses. Lower crime class numbers are more serious.                                                                                                                      |
| Crm Cd 2       | May contain a code for an additional crime, less serious than Crime Code 1.                                                                                                                                                                                                                                      |
| Crm Cd 3       | May contain a code for an additional crime, less serious than Crime Code 1.                                                                                                                                                                                                                                      |
| Crm Cd 4       | May contain a code for an additional crime, less serious than Crime Code 1.                                                                                                                                                                                                                                      |
| LOCATION       | Street address of crime incident rounded to the nearest hundred block to maintain anonymity.                                                                                                                                                                                                                     |
| Cross Street   | Cross Street of rounded Address                                                                                                                                                                                                                                                                                  |
| LAT            | Latitude                                                                                                                                                                                                                                                                                                         |
| LON            | Longitude |

# Data Preprocessing

In [None]:
# Remove the identifier, report-date, district, secondary crime-code and
# address columns, none of which are used further down in this notebook
unused_columns = [
    'DR_NO', 'Date Rptd', 'Rpt Dist No', 'Part 1-2', 'Mocodes',
    'Crm Cd 1', 'Crm Cd 2', 'Crm Cd 3', 'Crm Cd 4',
    'Cross Street', 'LOCATION',
]
crimes.drop(columns=unused_columns, inplace=True)
crimes

In [None]:
# Count the missing (null / NaN) entries in every column of 'crimes'
crimes.isna().sum()

In [None]:
# Keep only the rows without any missing value, then renumber the index from 0
crimes = crimes.dropna(axis=0).reset_index(drop=True)
crimes

In [None]:
# Parse 'DATE OCC' once, then expose its year, month and day as separate columns
occurrence_dates = pd.to_datetime(crimes['DATE OCC'])
crimes['DATE OCC'] = occurrence_dates
for column_name, date_part in (('YEAR OCC', 'year'), ('MONTH OCC', 'month'), ('DAY OCC', 'day')):
    crimes[column_name] = getattr(occurrence_dates.dt, date_part)
crimes

In [None]:
# Set the description columns aside in their own frame, then keep the
# remaining columns in 'crimes' with the date parts first
description_columns = ['Crm Cd Desc', 'Premis Desc', 'Weapon Desc', 'Status Desc']
crimes_desc = crimes[description_columns]

ordered_columns = [
    'YEAR OCC', 'MONTH OCC', 'DAY OCC', 'TIME OCC',
    'AREA', 'AREA NAME', 'Crm Cd',
    'Vict Age', 'Vict Sex', 'Vict Descent',
    'Premis Cd', 'Weapon Used Cd', 'Status',
]
crimes = crimes[ordered_columns]
crimes

In [None]:
# Select the columns whose distributions are plotted next.
# These labels are the columns at positions 1, 2, 3, 4, 6, 7, 10 and 11 of 'crimes'.
distribution_columns = [
    'MONTH OCC', 'DAY OCC', 'TIME OCC', 'AREA',
    'Crm Cd', 'Vict Age', 'Premis Cd', 'Weapon Used Cd',
]
crimes_distribution = crimes[distribution_columns]
crimes_distribution

In [None]:
# Plot the distribution of every selected feature on a 2 x 4 grid
fig, axs = plt.subplots(nrows=2, ncols=4, figsize=(20, 10))

# axs.flat walks the grid row by row, so the panels follow the column order of the frame
for panel, feature in zip(axs.flat, crimes_distribution.columns):
    sns.distplot(crimes_distribution[feature], ax=panel)

plt.suptitle('Distirbution of features')
# tight_layout has to be called; a bare `plt.tight_layout` only references the function
plt.tight_layout()

## Description
As these plots show, the features in 'crimes' are not heavily skewed, even though they have not been preprocessed yet.  
However, 'Crm Cd', 'Premis Cd' and 'Weapon Used Cd' look like they would benefit from a log transformation.  
Additionally, rows whose 'Vict Age' is 0 are meaningless, so dropping them is a good idea.

In [None]:
# Remove the rows whose 'Vict Age' equals 0, then renumber the index from 0
is_age_zero = crimes['Vict Age'].eq(0)
Vict_Age_0 = crimes.index[is_age_zero]
crimes.drop(index=Vict_Age_0, inplace=True)
crimes.reset_index(drop=True, inplace=True)
crimes

In [None]:
# Log transformation: add log(1 + x) versions of the three code columns.
# Each insert position places the new column directly after its source column.
Crm_Cd_Log = np.log1p(crimes['Crm Cd'])
crimes.insert(loc=7, column='Crm Cd Log', value=Crm_Cd_Log)

Premis_Cd_Log = np.log1p(crimes['Premis Cd'])
crimes.insert(loc=12, column='Premis Cd Log', value=Premis_Cd_Log)

Weapon_Used_Cd_Log = np.log1p(crimes['Weapon Used Cd'])
crimes.insert(loc=14, column='Weapon Used Cd Log', value=Weapon_Used_Cd_Log)

crimes

In [None]:
# Collect the three log-transformed columns so their distributions can be plotted
log_columns = ['Crm Cd Log', 'Premis Cd Log', 'Weapon Used Cd Log']
crimes_distribution_log = crimes.loc[:, log_columns]
crimes_distribution_log

In [None]:
# Plot the distribution of each log-transformed feature, one panel per feature
fig, axs = plt.subplots(ncols=3, figsize=(15, 5))

for panel, feature in zip(axs, crimes_distribution_log.columns):
    sns.distplot(crimes_distribution_log[feature], ax=panel)

plt.suptitle('Distirbution of features log converted')
# tight_layout has to be called; a bare `plt.tight_layout` only references the function
plt.tight_layout()

# Visualization

In [None]:
# Correlation heatmap of the numeric features.
# 'crimes' still holds text columns (e.g. 'AREA NAME', 'Vict Sex', 'Status');
# restricting to numeric dtypes keeps DataFrame.corr() working on pandas >= 2.0,
# where non-numeric columns raise instead of being silently dropped.
numeric_crimes = crimes.select_dtypes(include='number')

plt.figure(figsize=(10, 8))
plt.title('Correlation of features')
sns.heatmap(numeric_crimes.corr(), annot=True, linewidths=.5, cmap="YlGnBu")

**Description**

1. We can see relatively strong correlations among the crime, premise and weapon-used codes.
2. Therefore, we can use those features as the main ones for training.

In [None]:
# Min-max scale the features on a copy of 'crimes' so the heatmap compares them on one scale
crimes_scaled = crimes.copy()
# Columns in this list are left unscaled
except_features = ['MONTH OCC', 'AREA NAME', 'Vict Sex', 'Vict Descent', 'Status']
features = [column for column in crimes.columns if column not in except_features]

# MinMaxScaler rescales every column independently, so a single fit over all
# selected columns gives the same values as fitting one scaler per column
crimes_scaled[features] = MinMaxScaler().fit_transform(crimes_scaled[features])

crimes_scaled

In [None]:
# Average every scaled numeric feature per month of occurrence.
# numeric_only=True skips the text columns, which cannot be averaged and make
# GroupBy.mean() raise on pandas >= 2.0; 'YEAR OCC' is dropped from the result.
crimes_month = (
    crimes_scaled
    .groupby(by='MONTH OCC')
    .mean(numeric_only=True)
    .drop(columns=['YEAR OCC'])
)
crimes_month

In [None]:
# Heatmap of the values in 'crimes_month' (one row per month)
heatmap_ax = plt.figure(figsize=(10, 8)).gca()
heatmap_ax.set_title('Correlation of Crimes by Month')
sns.heatmap(crimes_month, annot=True, linewidths=.5, cmap="YlGnBu", ax=heatmap_ax)

In [None]:
# Bar plot of the monthly means of the three log-transformed code features.
# numeric_only=True skips the text columns, which cannot be averaged and make
# GroupBy.mean() raise on pandas >= 2.0.
crimes_month = (
    crimes_scaled
    .groupby(by='MONTH OCC', as_index=False)
    .mean(numeric_only=True)
    .drop(columns=['YEAR OCC'])
)

# Apply the whitegrid style with a call. Assigning a string to `sns.set_theme`
# would overwrite the seaborn function and leave the style unchanged.
sns.set_style("whitegrid")
f, ax = plt.subplots(figsize=(10, 6))

## Plot 'Crm Cd Log'
sns.set_color_codes("pastel")
sns.barplot(x="MONTH OCC", y="Crm Cd Log", data=crimes_month, label="Crm Cd Log", color="b", ax=ax)

## Plot 'Premis Cd Log'
sns.set_color_codes("muted")
sns.barplot(x="MONTH OCC", y="Premis Cd Log", data=crimes_month, label="Premis Cd Log", color="b", ax=ax)

## Plot 'Weapon Used Cd Log' (still using the "muted" color codes, semi-transparent overlay)
sns.barplot(x="MONTH OCC", y="Weapon Used Cd Log", data=crimes_month, label="Weapon Used Cd Log", color="g", alpha=0.3, ax=ax)

ax.legend(ncol=3, loc="lower right", frameon=True)
ax.set(ylabel="", xlabel="Month")
ax.set_title("Total Comparison of Main Features by Month")

plt.show()

**Description**
1. Grouping by month lets us compare the features.
2. In particular, crime rates in August were a little lower than in any other month.

In [None]:
# Min-max scale the features on a fresh copy of 'crimes', this time leaving 'DAY OCC' unscaled
crimes_scaled = crimes.copy()
# Columns in this list are left unscaled
except_features = ['DAY OCC', 'AREA NAME', 'Vict Sex', 'Vict Descent', 'Status']
features = [column for column in crimes.columns if column not in except_features]

# MinMaxScaler rescales every column independently, so a single fit over all
# selected columns gives the same values as fitting one scaler per column
crimes_scaled[features] = MinMaxScaler().fit_transform(crimes_scaled[features])

In [None]:
# Average every scaled numeric feature per day of the month.
# numeric_only=True skips the text columns, which cannot be averaged and make
# GroupBy.mean() raise on pandas >= 2.0; 'YEAR OCC' is dropped from the result.
crimes_day = (
    crimes_scaled
    .groupby(by='DAY OCC')
    .mean(numeric_only=True)
    .drop(columns=['YEAR OCC'])
)
crimes_day

In [None]:
# Heatmap of the values in 'crimes_day' (one row per day of the month)
heatmap_ax = plt.figure(figsize=(10, 20)).gca()
heatmap_ax.set_title('Correlation of Crimes by Day')
sns.heatmap(crimes_day, annot=True, linewidths=.5, cmap="YlGnBu", ax=heatmap_ax)

In [None]:
# Bar plot of the per-day means of the three log-transformed code features.
# numeric_only=True skips the text columns, which cannot be averaged and make
# GroupBy.mean() raise on pandas >= 2.0.
crimes_day = (
    crimes_scaled
    .groupby(by='DAY OCC', as_index=False)
    .mean(numeric_only=True)
    .drop(columns=['YEAR OCC'])
)

# Apply the whitegrid style with a call. Assigning a string to `sns.set_theme`
# would overwrite the seaborn function and leave the style unchanged.
sns.set_style("whitegrid")
f, ax = plt.subplots(figsize=(20, 10))

## Plot 'Crm Cd Log'
sns.set_color_codes("pastel")
sns.barplot(x="DAY OCC", y="Crm Cd Log", data=crimes_day, label="Crm Cd Log", color="b", ax=ax)

## Plot 'Premis Cd Log'
sns.set_color_codes("muted")
sns.barplot(x="DAY OCC", y="Premis Cd Log", data=crimes_day, label="Premis Cd Log", color="b", ax=ax)

## Plot 'Weapon Used Cd Log' (still using the "muted" color codes, semi-transparent overlay)
sns.barplot(x="DAY OCC", y="Weapon Used Cd Log", data=crimes_day, label="Weapon Used Cd Log", color="g", alpha=0.3, ax=ax)

ax.legend(ncol=3, loc="lower right", frameon=True)
ax.set(ylabel="", xlabel="DAY")
ax.set_title("Total Comparison of Main Features by Day")

plt.show()

**Description**

1. As the distribution plot of 'DAY OCC' showed, its distribution is roughly uniform.
2. Crime codes and weapon-used codes were also high, as in the previous plot.

# Split Datasets

In [None]:
# Drop three of the text columns, then one-hot encode whatever
# categorical columns remain
dropped_for_training = ['AREA NAME', 'Vict Descent', 'Status']
crimes_train = pd.get_dummies(crimes.drop(columns=dropped_for_training))
crimes_train

In [None]:
# Min-max scale every column of 'crimes_train'.
# MinMaxScaler rescales each column independently, so one fit over the whole
# frame gives the same values as fitting one scaler per column.
# NOTE(review): the following cells build their features from 'crimes', not
# 'crimes_train' - confirm whether this scaled frame is meant to be used.
features = list(crimes_train.columns)
crimes_train[features] = MinMaxScaler().fit_transform(crimes_train[features])

crimes_train

In [None]:
# Choose the model inputs and the target.
# The target is 'Crm Cd Log' truncated to an integer.
feature_columns = [
    'YEAR OCC', 'MONTH OCC', 'DAY OCC', 'TIME OCC',
    'AREA', 'Vict Age', 'Premis Cd Log', 'Weapon Used Cd Log',
]
train_features = crimes[feature_columns]
train_label = crimes['Crm Cd Log'].astype(int)

In [None]:
# Hold out 20% of the rows as a test set (fixed seed for reproducibility)
X_train, X_test, y_train, y_test = train_test_split(
    train_features,
    train_label,
    test_size=0.2,
    random_state=11,
)

# Report the shape of each of the four parts
shape_report = (
    ('Shape of X_train: ', X_train),
    ('Shape of X_test: ', X_test),
    ('Shape of y_train: ', y_train),
    ('Shape of y_test: ', y_test),
)
for caption, part in shape_report:
    print(caption, part.shape)

# Regression

## Logistic Regression

In [None]:
# Logistic regression is one of the most fundamental estimators,
# so it is used here as the first baseline.

lr_reg = LogisticRegression(solver='liblinear')
lr_reg.fit(X_train, y_train)
lr_preds = lr_reg.predict(X_test)              # predicted labels
lr_preds_proba = lr_reg.predict_proba(X_test)  # per-class probabilities, needed for ROC AUC
lr_mse = mean_squared_error(y_test, lr_preds)
lr_rmse = np.sqrt(lr_mse)

# The second slot reports the RMSE (lr_rmse), not the MSE a second time
print('MSE : {0:.3f}, RMSE : {1:.3f}'.format(lr_mse, lr_rmse))
print('Variance Score : {0:.3f}'.format(r2_score(y_test, lr_preds)))
print('Accuracy Score : {0:.3f}'.format(accuracy_score(y_test, lr_preds)))
print('ROC_AUC Score : {0:.3f}'.format(roc_auc_score(y_test, lr_preds_proba, multi_class='ovr')))

In [None]:
# Tune the logistic regression with a 3-fold grid search scored on accuracy.
# Each C value appears once: a duplicated entry would fit the identical
# candidate a second time without changing the selected parameters.
params = {'penalty' : ['l2', 'l1'], 'C' : [0.01, 0.1, 1, 5, 10]}

grid_lr = GridSearchCV(lr_reg, param_grid=params, scoring='accuracy', cv=3)
# NOTE(review): the search is fitted on the full data set (train_features /
# train_label), not on the X_train split - confirm this is intended.
grid_lr.fit(train_features, train_label)
print('The Best Hyperparameter for Optimization : {}'.format(grid_lr.best_params_))
print('The Accuracy Score of Averages for Optimization : {0:.3f}'.format(grid_lr.best_score_))

## Various Estimators

In [None]:
# Utility function
def get_model_cv_prediction(model, X_data, y_target, cv):
    """Cross-validate `model` and print its average RMSE.

    Parameters
    ----------
    model : estimator passed to `cross_val_score`.
    X_data : feature data passed to `cross_val_score`.
    y_target : target values passed to `cross_val_score`.
    cv : cross-validation setting passed to `cross_val_score`; also echoed
        in the printed message.

    Returns
    -------
    None. The result is only printed.
    """
    # Scores come back as negative MSE, so flip the sign before taking the root
    fold_neg_mse = cross_val_score(
        model, X_data, y_target, scoring="neg_mean_squared_error", cv=cv
    )
    fold_rmse = np.sqrt(-fold_neg_mse)
    mean_rmse = fold_rmse.mean()

    estimator_name = type(model).__name__
    print('##### ', estimator_name, ' #####')
    print('Average RMSE scores of Cross Validation {0} times : {1:.3f} '.format(cv, mean_rmse))

In [None]:
# Create the regression estimators to compare.
# The scikit-learn models share random_state=11; the three ensembles share n_estimators=1000.
seed_kwargs = {'random_state': 11}
ensemble_kwargs = {'n_estimators': 1000}

dt_reg = DecisionTreeRegressor(max_depth=4, **seed_kwargs)
rf_reg = RandomForestRegressor(**seed_kwargs, **ensemble_kwargs)
gb_reg = GradientBoostingRegressor(**seed_kwargs, **ensemble_kwargs)
lgb_reg = LGBMRegressor(**ensemble_kwargs)

In [None]:
# Run the same 5-fold cross-validated RMSE evaluation for every regressor
models = [dt_reg, rf_reg, gb_reg, lgb_reg]

for estimator in models:
    get_model_cv_prediction(estimator, train_features, train_label, 5)

In [None]:
# Fit the random forest on the full data set and plot its feature importances,
# sorted from most to least important
rf_reg.fit(train_features, train_label)

feature_series = pd.Series(
    data=rf_reg.feature_importances_,
    index=train_features.columns,
).sort_values(ascending=False)

sns.barplot(x=feature_series, y=feature_series.index)
plt.title('Feature Importances by RandomForestRegressor')

# Inspiration

In [None]:
# Which community is most affected by crime?
area_codes = crimes['AREA']
area_names = crimes['AREA NAME']

sns.distplot(area_codes)
print("List of AREA unique values : \n", area_codes.unique())
print("List of AREA NAME unique values : \n", area_names.unique())
print("\n")
print("[ANSWER]")
print("The mostly affected community : 77th Street")

In [None]:
# Which age groups are most and least affected?
victim_ages = crimes['Vict Age']

sns.distplot(victim_ages)
plt.title('Distribution of Vict Age')
print("[Answer]")
print("People of 30s were most affected to crimes")
print("People of about 70s were less affted to crimes")