### Objectifs de ce notebook :
>Utilisation des données du site https://data.cityofchicago.org/ sur la criminalité ainsi que des données  socio-économiques du ministère de la Santé de Chicago https://data.cityofchicago.org/Health-Human-Services/Census-Data-Selected-socioeconomic-indicators-in-C/kn9c-c2s2.
- Extraction des features
- Label ou taget (varialbe à prédire) : nombre de crime par (mois, région et type)
- Implémentation d'algorithme de machine learning sur l'annés 2012 et application sur l'année 2013
- Visualisation des résultats

In [1]:
import warnings
warnings.filterwarnings('ignore')

In [2]:
import numpy as np
import pandas as pd
import matplotlib as mpl
import matplotlib.pyplot as plt
import scipy
import seaborn as sns
import matplotlib
import matplotlib.pyplot as plt
import xgboost as xgb
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score
from sklearn.metrics import explained_variance_score
from sklearn.model_selection import train_test_split

In [3]:
Path_Socio = "/home/ml/Documents/crimes_chigaco/data/raw/Census_Data_-_Selected_socioeconomic_indicators_in_Chicago__2008___2012.csv"
Path_Crime = "/home/ml/Documents/crimes_chigaco/data/raw/Crimes_-_2001_to_present.csv"

In [4]:
df_Socio = pd.read_csv(Path_Socio)
df_Crime = pd.read_csv(Path_Crime, sep=';')

In [5]:
def rename_columns_socio():
    """
    
    """
    return {
'Community Area Number':'community_area_number',
'COMMUNITY AREA NAME':'community_area_name', 
'PERCENT OF HOUSING CROWDED':'pct_housing_crowded', 
'PERCENT HOUSEHOLDS BELOW POVERTY':'pct_households_below_poverty', 
'PERCENT AGED 16+ UNEMPLOYED':'pct_age16_unemployed',
'PERCENT AGED 25+ WITHOUT HIGH SCHOOL DIPLOMA':'pct_age25_no_highschool',
'PERCENT AGED UNDER 18 OR OVER 64': 'pct_not_working_age',
'per_capita_income':'per_capita_income',
'HARDSHIP INDEX' : 'hardship_index'}

def rename_columns_crimes():
    """
    
    """
    return {
        'ID': 'id',
        'Case Number': 'cas_number', 
        'Date':'date',
        'Block':'block', 
        'IUCR':'iucr', 
        'Primary Type':'primary_type',
        'Description':'description', 
        'Location Description':'location_description', 
        'Arrest':'arrest', 
        'Domestic':'domestic', 
        'Beat':'beat',
        'District':'district', 
        'Ward':'ward', 
        'Community Area': 'community_area_number', 
        'FBI Code':'fbi_code', 
        'X Coordinate':'x_coordinate',
        'Y Coordinate':'y_coordinate', 
        'Year':'year', 
        'Updated On':'updated_on', 
        'Latitude':'latitude', 
        'Longitude':'longitude',
        'Location':'location'   
    }

df_Socio.rename(columns=rename_columns_socio(), inplace = True)
df_Crime.rename(columns= rename_columns_crimes(), inplace= True)

In [6]:
def visualisation_prediction(y_test, y_pred):
    
    matplotlib.rc('xtick', labelsize=30) 
    matplotlib.rc('ytick', labelsize=30) 
    fig, ax = plt.subplots(figsize=(50, 40))
    plt.style.use('ggplot')
    plt.plot(y_pred, y_test, 'ro')
    plt.xlabel('Predicted Crime', fontsize = 30)
    plt.ylabel('Actual Crime', fontsize = 30)
    plt.title('Predicted Y (Crimes) to the Actual Y (Crimes)', fontsize = 30)
    ax.plot([y_test.min(), y_test.max()], [y_test.min(), y_test.max()], 'k--', lw=4)

In [7]:
def extract_features(year, df_S, df_C):
    """
    
    """
    list_name_crimes = list(df_C['primary_type'].unique())
    df_year = df_C[df_C['year']==year]
    df_year['month']=pd.DatetimeIndex(df_year['date']).month
    df_year_grouped = df_year.groupby(['community_area_number', 'month', 'primary_type'],\
                                      as_index=False).agg({'id':'count'})
    df_year_grouped.rename(columns={'id':'nb_crimes'}, inplace=True)
    df_merged = pd.merge(df_year_grouped, df_S, on ='community_area_number', how='inner')
    df_merged.dropna(inplace=True)
    df_features = pd.get_dummies(df_merged)
    for col in list_name_crimes:
        if "primary_type_" + col not in list(df_features.columns):
            df_features["primary_type_" + col]=-1
            
    return df_features

In [8]:
df_features = extract_features(2012, df_S=df_Socio, df_C=df_Crime)

In [9]:
df_features.shape

(15667, 122)

In [10]:
df_features.sample(5)

Unnamed: 0,community_area_number,month,nb_crimes,pct_housing_crowded,pct_households_below_poverty,pct_age16_unemployed,pct_age25_no_highschool,pct_not_working_age,PER CAPITA INCOME,hardship_index,...,community_area_name_West Lawn,community_area_name_West Pullman,community_area_name_West Ridge,community_area_name_West Town,community_area_name_Woodlawn,primary_type_HUMAN TRAFFICKING,primary_type_CONCEALED CARRY LICENSE VIOLATION,primary_type_NON - CRIMINAL,primary_type_RITUALISM,primary_type_DOMESTIC VIOLENCE
642,3.0,10,1,3.8,24.0,8.9,11.8,22.2,35787,20.0,...,0,0,0,0,0,-1,-1,-1,-1,-1
1069,6.0,2,1,1.1,11.4,4.7,2.6,17.0,60058,5.0,...,0,0,0,0,0,-1,-1,-1,-1,-1
8774,43.0,10,172,2.8,31.1,20.0,14.0,35.7,19398,55.0,...,0,0,0,0,0,-1,-1,-1,-1,-1
423,2.0,10,12,7.8,17.2,8.8,20.8,38.5,23040,46.0,...,0,0,1,0,0,-1,-1,-1,-1,-1
5525,28.0,1,12,3.8,20.6,10.7,9.6,22.2,44689,15.0,...,0,0,0,0,0,-1,-1,-1,-1,-1


## Création d'un modèle entrainé sur les données de l'année 2012

### Données utilisées

In [None]:
X_2012 = df_features[df_features.columns.difference(['nb_crimes'])]
y_2012 = df_features[['nb_crimes']]

### Création du modèle ainsi que son entrainement

In [None]:
xg_reg = xgb.XGBRegressor(objective ='reg:linear', colsample_bytree = 0.3, learning_rate = 0.1,
                max_depth = 30, alpha = 10, n_estimators = 500)
xg_reg.fit(X_2012,y_2012)

### On prédit le nombre de crimes sur l'année  2013

##### Extraction des features sur l'année 2013 

In [None]:
df_features_2013=extract_features(2013, df_S=df_Socio, df_C=df_Crime)
X_2013 = df_features_2013[df_features_2013.columns.difference(['nb_crimes'])]
y_2013 = df_features_2013[['nb_crimes']]

### Prédiction sur l'année 2013

In [None]:
y_pred_2013 = xg_reg.predict(X_2013)

### Visualisation des résultats

In [None]:
visualisation_prediction(y_pred_2013, y_2013)

### Score : coefficient de détermination

In [None]:
print( "score : coefficient de détermination est : " + str(r2_score(y_2013, y_pred_2013)))

## Dans cette partie nous allons améliorer le score en optimisant les hyper-paramètres de XGBOOST

In [None]:
xg_reg_optim = xgb.XGBRegressor(objective ='reg:linear', colsample_bytree = 0.25, learning_rate = 0.07,
                max_depth = 11, alpha = 8, n_estimators = 250,n_jobs=-1)
xg_reg_optim.fit(X_2012,y_2012)

In [None]:
y_pred_2013_optim = xg_reg_optim.predict(X_2013)

In [None]:
print( "score : coefficient de détermination est : " + str(r2_score(y_2013, y_pred_2013_optim)))

In [None]:
visualisation_prediction(y_pred_2013_optim, y_2013)

In [None]:
xgb.plot_importance(xg_reg_optim)
plt.rcParams['figure.figsize'] = [10,15]

In [None]:
!pip install astral

In [None]:
### Dans cette partie nous allons améliorer la précision de l'algorithme en enrichissant les features

df_year = df_Crime[df_Crime['year']==2012]

liste_utilise = ['id',
 'date',
 'block',
 'primary_type',
 'community_area_number']

df_year = df_year [liste_utilise]

df_year['date'] = pd.to_datetime(df_year['date'])

df_year['month']=df_year['date'].dt.month

df_year['day'] = df_year['date'].dt.day

df_year['dayofweek']=df_year['date'].dt.dayofweek

#df_year['type_block'] = df_year['block'].apply(lambda x : x.split(" ")[-1])

df_with_day = pd.get_dummies(df_year, columns=['dayofweek', 'day']).\
groupby(['community_area_number', 'month', 'primary_type'],\
                                                       as_index=False).count()
df_features_all = pd.merge(df_features, df_with_day, on = ['community_area_number', 'month'], how = 'left')
df_features_all.drop_duplicates(inplace = True)

In [None]:
df_features_all.drop(['primary_type', 'id'], axis=1, inplace=True)

In [None]:
X_2012_all = df_features_all[df_features_all.columns.difference(['nb_crimes'])]
y_2012_all = df_features_all[['nb_crimes']]

In [None]:
X_2012_all.columns.tolist()

In [None]:
%%time
xg_reg_all = xgb.XGBRegressor(objective ='reg:linear', colsample_bytree = 0.3, learning_rate = 0.1,
                max_depth = 20, alpha = 10, n_estimators = 200,n_jobs=-1)
xg_reg_all.fit(X_2012_all ,y_2012_all)

## Prédiction sur l'année 2013

In [None]:
df_features_2013 = extract_features(2013, df_S=df_Socio, df_C=df_Crime)

In [None]:
df_year_2013 = df_Crime[df_Crime['year']==2013]

liste_utilise = ['id',
 'date',
 'block',
 'primary_type',
 'community_area_number']

df_year_2013 = df_year_2013[liste_utilise]

df_year_2013['date'] = pd.to_datetime(df_year_2013['date'])

df_year_2013['month']=df_year_2013['date'].dt.month

df_year_2013['day'] = df_year_2013['date'].dt.day

df_year_2013['dayofweek']=df_year_2013['date'].dt.dayofweek

#df_year['type_block'] = df_year['block'].apply(lambda x : x.split(" ")[-1])

df_with_day_2013 = pd.get_dummies(df_year_2013, columns=['dayofweek', 'day']).\
groupby(['community_area_number', 'month', 'primary_type'],\
                                                       as_index=False).count()
df_features_all_2013 = pd.merge(df_features_2013, df_with_day_2013, on = ['community_area_number', 'month'], how = 'left')
df_features_all_2013.drop_duplicates(inplace = True)

In [None]:
df_year_2013 = df_Crime[df_Crime['year']==2013]

In [None]:
df_year_2013.sample(5)

In [None]:
liste_utilise = ['id',
 'date',
 'block',
 'primary_type',
 'community_area_number']

In [None]:
df_year_2013 = df_year_2013[liste_utilise]

In [None]:
df_year_2013

In [None]:
df_year_2013['date'] = pd.to_datetime(df_year_2013['date'])

In [None]:

df_year_2013['month']=df_year_2013['date'].dt.month

df_year_2013['day'] = df_year_2013['date'].dt.day

df_year_2013['dayofweek']=df_year_2013['date'].dt.dayofweek

In [None]:
df_year_2013.head()

In [None]:
df_features_2013.head(5)

In [None]:
df_features_all_2013.sample(5)

In [None]:
X_2013 = df_features_all_2013[df_features_all_2013.columns.difference(['nb_crimes'])]
y_2013 = df_features_all_2013[['nb_crimes']]

In [None]:
X_2013.drop(['primary_type','id'], axis=1,inplace=True)

In [None]:
y_pred_2013=xg_reg_all.predict(X_2013)

In [None]:
visualisation_prediction(y_pred_2013, y_2013)

In [None]:
print( "score : coefficient de détermination est : " + str(r2_score(y_2013, y_pred_2013)))