In [1]:
import os

import pandas as pd
import geopandas as gpd
from itertools import product

from dateutil.relativedelta import relativedelta

from joblib import load
import pickle

import warnings
# NOTE: this silences every warning for the whole session, including the
# pandas warnings that later cells would otherwise raise.
warnings.filterwarnings('ignore')

# Display settings: never truncate columns, and render up to 4000 rows.
# (max_rows is set once; an earlier value of 1000 was immediately overridden.)
pd.options.display.max_columns = None
pd.options.display.max_rows = 4000

In [2]:
## Define your path
# Project root folder. Set the TESIS_PATH environment variable to point the
# notebook at another location; when it is unset the original local folder
# is used, so existing behaviour is unchanged.
PATH = os.environ.get("TESIS_PATH", r"D:\francisco_madrigal\Desktop\Tesis")


def create_path(file_path, path=PATH):
    """Join ``file_path`` onto the base directory ``path``.

    Parameters
    ----------
    file_path : str
        Path of the file relative to the base directory.
    path : str, optional
        Base directory. Defaults to the module-level ``PATH``.

    Returns
    -------
    str
        The result of ``os.path.join(path, file_path)``.
    """
    return os.path.join(path, file_path)


In [3]:
## Columns of the crimes matrix that are needed to construct the grid
col_list = ['Hora', 'id_colonia', 'month','no_labor_day',
            'dia_semana', 'day_period', 'area_km2', 'metro', 'metrobus', 'rtp', 'supers_minisupers',
            'department_stores', 'banks', 'corporate_offices', 'restaurants', 'g_edu', 'no_healt_s',
            'house_per_km2', 'past_crimes', 'past_near_crimes_500mts', 'TEMP', 'PRCP', 'CO', 'O3', 'PM10']

## Load only those columns, parsing 'Hora' as a datetime
# The relative path is assembled with os.path.join so that exactly one
# separator sits between the folder and the file name (a raw string with
# "\\" would put two literal backslashes in the path).
matrix_crimes = pd.read_csv(create_path(os.path.join("modelo", "base_crimenes.csv")),
                            usecols=col_list,
                            parse_dates=['Hora'])

In [4]:
## Build a grid with every possible combination of the key columns
grid_keys = ['id_colonia', 'day_period', 'month', 'dia_semana', 'no_labor_day']

# One list of the distinct observed values per key column, in key order
unique_values = [matrix_crimes[key].unique().tolist() for key in grid_keys]

# Cartesian product of those lists: one grid row per combination
all_combinations = product(*unique_values)
grid = pd.DataFrame(all_combinations, columns=grid_keys)

In [5]:
## Attach the per-month mean of each climate / air-quality column to the grid
climate_cols = ['TEMP', 'PRCP', 'CO', 'O3', 'PM10']

# Average every climate column within each month, then turn 'month' back
# into a regular column so it can be used as the merge key.
monthly_means = matrix_crimes.groupby('month')[climate_cols].mean()
gpp = monthly_means.reset_index()

# Left merge: every grid row receives the means of its own month
grid = grid.merge(gpp, how='left', on='month')

In [6]:
## Add two columns with the rounded mean of past crimes per neighbourhood
# The averaging window is the three months that END at test_cut, where
# test_cut is three months before the latest 'Hora' in the data; i.e. the
# window runs from (max - 6 months) to (max - 3 months), both inclusive.
test_cut = matrix_crimes['Hora'].max() - relativedelta(months=3)
train_cut = test_cut - relativedelta(months=3)

# Rows inside the window, selected once and reused for both aggregates
in_window = matrix_crimes['Hora'].between(train_cut, test_cut)
window_rows = matrix_crimes[in_window]


def _rounded_mean_by_colonia(frame, column):
    """Return a frame with 'id_colonia' and the rounded mean of ``column``."""
    return frame.groupby('id_colonia')[column].mean().round().reset_index()


pastncr = _rounded_mean_by_colonia(window_rows, 'past_near_crimes_500mts')
pastcr = _rounded_mean_by_colonia(window_rows, 'past_crimes')

# Merge order is kept (nearby crimes first) so the grid's column order is unchanged
for past_frame in (pastncr, pastcr):
    grid = grid.merge(past_frame, how='left', on='id_colonia')

# Constant column: every grid row is given covid_dummy = 0
grid['covid_dummy'] = 0

In [7]:
## Per-neighbourhood attributes to attach to the grid
static_cols = ['id_colonia', 'area_km2', 'metro', 'metrobus', 'rtp', 'supers_minisupers',
               'department_stores', 'banks', 'corporate_offices', 'restaurants', 'g_edu',
               'no_healt_s', 'house_per_km2']

# drop_duplicates is chained rather than called with inplace=True: the
# in-place form mutates a column selection of the original frame, which
# pandas reports as a chained-assignment (SettingWithCopy) warning.
# NOTE(review): presumably this leaves one row per id_colonia - confirm,
# otherwise the left merge below multiplies the grid rows.
matrix_crimes = matrix_crimes[static_cols].drop_duplicates()

grid = grid.merge(matrix_crimes, on='id_colonia',
                  how='left')

In [8]:
## Read the neighbourhoods GeoDataFrame
# Path components are joined with os.path.join so a single separator is used
# between folders (a raw string with "\\" holds two literal backslashes).
colonias = gpd.read_file(create_path(os.path.join("modelo", "colonias_fixed", "colonias_fixed.shp")))

## Declare the coordinate system used
colonias.crs = "epsg:4326"

## Print the Coordinate Reference System (CRS)
# This is important to homologate the CRS with other georeferenced DataFrames.
# An explicit print is needed: a bare expression in the middle of a cell
# is not displayed by the notebook.
print(colonias.crs)

## Merge to locate the municipality ('alcaldi') of each neighbourhood
grid = grid.merge(colonias[['id_colonia', 'alcaldi']],
                            how='left',
                            on='id_colonia')


In [9]:
def memory_usage(df):
    """Return the memory footprint of ``df`` in units of 1024**2 bytes.

    The per-column usage is measured with ``deep=True`` and summed, then the
    total is divided by 1024**2 and rounded to two decimals.
    """
    total_bytes = df.memory_usage(deep=True).sum()
    bytes_per_unit = 1024 ** 2
    return round(total_bytes / bytes_per_unit, 2)

# Report the memory footprint of the assembled grid, as returned by memory_usage
print('Memory used:', memory_usage(grid), 'Mb')

Memory used: 321.83 Mb


In [10]:
# Summarise the grid: number of rows, column dtypes and non-null counts
grid.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1216992 entries, 0 to 1216991
Data columns (total 26 columns):
 #   Column                   Non-Null Count    Dtype  
---  ------                   --------------    -----  
 0   id_colonia               1216992 non-null  int64  
 1   day_period               1216992 non-null  int64  
 2   month                    1216992 non-null  int64  
 3   dia_semana               1216992 non-null  int64  
 4   no_labor_day             1216992 non-null  int64  
 5   TEMP                     1216992 non-null  float64
 6   PRCP                     1216992 non-null  float64
 7   CO                       1216992 non-null  float64
 8   O3                       1216992 non-null  float64
 9   PM10                     1216992 non-null  float64
 10  past_near_crimes_500mts  1216992 non-null  float64
 11  past_crimes              1216992 non-null  float64
 12  covid_dummy              1216992 non-null  int64  
 13  area_km2                 1216992 non-null 

In [11]:
## Predict the crime probability of every grid row, one municipality at a time
# Each municipality ('alcaldi') has its own saved model and its own saved list
# of training column names. The scored frames are collected in a list and
# concatenated once after the loop, instead of re-copying a growing DataFrame
# on every iteration.
grid_parts = []
bytes_so_far = 0  # running memory footprint of the scored frames, in bytes

for alcaldi in grid['alcaldi'].unique():

    ## Rows of this municipality, without the municipality column itself
    grid_alc = grid[grid['alcaldi'] == alcaldi].drop(columns=['alcaldi'])

    ## Create dummies
    grid_alc_dum = pd.get_dummies(grid_alc, columns=['id_colonia', 'day_period', 'dia_semana', 'month'],
                                   prefix=["colonia", "day_per", "weekday", "month"], sparse=True)

    ## File-name stem shared by the column list and the model of this municipality
    alcaldi_slug = alcaldi.replace('.', '').replace(' ', '_').lower()

    ## Load the column names of the original DataFrame used to grow the model
    # NOTE(security): pickle.load can execute arbitrary code - only load
    # files that this project produced itself.
    col_list_name = f"{alcaldi_slug}_col_names.pkl"
    with open(create_path(os.path.join("col_names", col_list_name)), "rb") as col_file:
        save_columns = pickle.load(col_file)

    ## Select the columns in the saved order (KeyError if one is missing)
    grid_alc_dum = grid_alc_dum[save_columns]

    file_name = f"{alcaldi_slug}_brf_model.joblib"
    brf = load(create_path(os.path.join("brf_models", file_name)))

    brf_pred_prob = brf.predict_proba(grid_alc_dum)

    # Free the model and the dummy matrix before handling the next municipality
    del brf, grid_alc_dum

    ## Extract the probability of crime (column 1 of predict_proba)
    # assumes the model's classes are ordered [0, 1] - TODO confirm
    grid_alc["proba_crimen"] = brf_pred_prob[:, 1]

    ## Keep the original (un-dummied) frame together with its probability
    grid_parts.append(grid_alc)
    bytes_so_far += grid_alc.memory_usage(deep=True).sum()

    del brf_pred_prob

    print('Memory used:', round(bytes_so_far / 1024 ** 2, 2), 'Mb')
    print(alcaldi)

# Single concatenation; falls back to an empty frame when nothing was scored
grid_pred = pd.concat(grid_parts) if grid_parts else pd.DataFrame({})

del grid_parts
    

Memory used: 12.18 Mb
MIGUEL HIDALGO
Memory used: 33.5 Mb
COYOACAN
Memory used: 44.71 Mb
VENUSTIANO CARRANZA
Memory used: 51.91 Mb
LA MAGDALENA CONTRERAS
Memory used: 60.77 Mb
BENITO JUAREZ
Memory used: 92.89 Mb
GUSTAVO A. MADERO
Memory used: 117.39 Mb
TLALPAN
Memory used: 132.75 Mb
AZCAPOTZALCO
Memory used: 143.69 Mb
XOCHIMILCO
Memory used: 145.21 Mb
MILPA ALTA
Memory used: 179.82 Mb
ALVARO OBREGON
Memory used: 187.43 Mb
IZTACALCO
Memory used: 196.15 Mb
CUAUHTEMOC
Memory used: 204.18 Mb
TLAHUAC
Memory used: 244.74 Mb
IZTAPALAPA
Memory used: 250.69 Mb
CUAJIMALPA DE MORELOS


In [12]:
## Drop the features that are not relevant to our map of probabilities
not_import_features = ['area_km2', 'metro', 'metrobus', 'rtp',
                       'supers_minisupers', 'department_stores', 'banks', 'corporate_offices',
                       'restaurants', 'g_edu', 'no_healt_s', 'house_per_km2',
                       'TEMP', 'PRCP', 'CO', 'O3', 'PM10', 'past_near_crimes_500mts',
                       'past_crimes', 'covid_dummy']

# `columns=` already names the axis, so no `axis=1` is passed; the result is
# reassigned instead of mutating the frame with inplace=True.
grid_pred = grid_pred.drop(columns=not_import_features)


In [13]:
# Report the memory footprint of the prediction frame, as returned by memory_usage
print('Memory used:', memory_usage(grid_pred), 'Mb')

Memory used: 64.99 Mb


In [14]:
## Write the predictions to CSV without the index column
# The relative path is built with os.path.join rather than a literal with
# backslash separators, so it does not depend on the OS path separator.
grid_pred.to_csv(create_path(os.path.join("modelo", "df_results", "grid_prediction_df.csv")), index=False)