# Homologation of **colonias**

The purpose of this notebook is to homologate the *colonia* values in the *crimes_clean* dataset using a JSON file called *catalogo-de-colonias.json*, which includes names and keys of the location in detail.

In [100]:
import pandas as pd
import zipfile
import numpy as np
import matplotlib.pyplot as plt
import json

# Path
# Replace the second assignment with the path to your local sdc-security repo.
repo_path = "INSERT YOUR LOCAL SDC-SECURITY REPO PATH HERE"
# Raw string: the backslashes of the Windows path are kept literally instead of
# being parsed as (invalid) escape sequences such as "\A" or "\S".
repo_path = r'D:\Archivos\Social Data Challenge\sdc-security'

In [101]:
# Raw string so the backslashes are not parsed as escape sequences.
crime_data_path = r"\datasets\crimes_clean.csv.zip"

# Read the CSV stored inside the zip archive; the context managers make sure
# both the archive and the member file are closed once the frame is loaded.
with zipfile.ZipFile(repo_path + crime_data_path) as zf:
    with zf.open('crimes_clean.csv') as csv_file:
        crimes = pd.read_csv(csv_file)
print(crimes.shape)


# Changing categorical variable 'sexo' to numerical.
# np.where is vectorized, which is much faster than a row-wise
# .apply(lambda ...) on a frame of this size.
# The guard keeps the cell re-runnable after 'sexo' has already been replaced.
if 'sexo' in crimes.columns:
    crimes['sexo_fem'] = np.where(crimes['sexo'].str.upper() == 'FEMENINO', 1, 0)
    del crimes["sexo"]

crimes.head()


(983855, 23)


Unnamed: 0,idcarpeta,delito,categoria,alcaldia,colonia,sexo_fem,edad,tipopersona,calidadjuridica,anio_denuncia,...,competencia,anio_hecho,mes_hecho,fecha_hecho,hora_hecho,colonia_alt,crimen_lat,crimen_lon,hecho_time,denuncia_time
0,8324429,FRAUDE,DELITO DE BAJO IMPACTO,ALVARO OBREGON,GUADALUPE INN,0,62,FISICA,OFENDIDO,2019,...,FUERO COMUN,2018,8,2018-08-29,12:00:00,GUADALUPE INN,19.36125,-99.18314,2018-08-29 12:00:00,2019-01-04 12:19:00
1,8324430,"PRODUCCIÓN, IMPRESIÓN, ENAJENACIÓN, DISTRIBUCI...",DELITO DE BAJO IMPACTO,AZCAPOTZALCO,VICTORIA DE LAS DEMOCRACIAS,1,38,FISICA,VICTIMA Y DENUNCIANTE,2019,...,FUERO COMUN,2018,12,2018-12-15,15:00:00,VICTORIA DE LAS DEMOCRACIAS,19.47181,-99.16458,2018-12-15 15:00:00,2019-01-04 12:20:00
2,8324431,ROBO A TRANSEUNTE SALIENDO DEL BANCO CON VIOLE...,ROBO A CUENTAHABIENTE SALIENDO DEL CAJERO CON ...,COYOACAN,COPILCO UNIVERSIDAD ISSSTE,0,42,FISICA,VICTIMA Y DENUNCIANTE,2019,...,FUERO COMUN,2018,12,2018-12-22,15:30:00,COPILCO EL BAJO,19.33797,-99.18611,2018-12-22 15:30:00,2019-01-04 12:23:00
3,8324435,ROBO DE VEHICULO DE SERVICIO PARTICULAR SIN VI...,ROBO DE VEHÍCULO CON Y SIN VIOLENCIA,IZTACALCO,AGRÍCOLA PANTITLAN,0,35,FISICA,VICTIMA Y DENUNCIANTE,2019,...,FUERO COMUN,2019,1,2019-01-04,06:00:00,PANTITLAN V,19.40327,-99.05983,2019-01-04 06:00:00,2019-01-04 12:27:00
4,8324438,ROBO DE MOTOCICLETA SIN VIOLENCIA,ROBO DE VEHÍCULO CON Y SIN VIOLENCIA,IZTAPALAPA,PROGRESISTA,0,30,FISICA,VICTIMA,2019,...,FUERO COMUN,2019,1,2019-01-03,20:00:00,LAS AMERICAS (U HAB),19.3548,-99.06324,2019-01-03 20:00:00,2019-01-04 12:35:00


### Extracting info from JSON File 'catalogo-de-colonias.json' 

In [102]:
# Raw string so the backslashes are not parsed as escape sequences.
catalog_path = r"\datasets\catalogo-de-colonias.json"

# Load the JSON file (laid out like a GeoJSON FeatureCollection:
# a top-level 'features' list whose items carry 'geometry' and 'properties').
with open(repo_path + catalog_path, 'r', encoding='utf-8') as json_file:
    data = json.load(json_file)

features = data['features']

# One entry per feature. Only the first element of 'coordinates' is kept
# NOTE(review): for a Polygon geometry that is the outer ring; if the file
# contains MultiPolygon features this keeps only their first polygon - confirm.
polygon_list = [feature['geometry']['coordinates'][0] for feature in features]
properties_list = [feature['properties'] for feature in features]

# Build the DataFrame from the properties and attach the polygon coordinates.
catalog = pd.DataFrame(properties_list)
catalog['polygon'] = polygon_list

# Reordering columns
column_order = ['cve_col', 'clasif', 'colonia', 'alc', 'cve_alc', 'cve_ent', 'entidad', 'polygon']
catalog = catalog[column_order]

# Display the DataFrame
catalog.head()


Unnamed: 0,cve_col,clasif,colonia,alc,cve_alc,cve_ent,entidad,polygon
0,002-001,Colonia,Aguilera,Azcapotzalco,2,9,Ciudad de México,"[[-99.15913100449802, 19.47261175522339], [-99..."
1,002-002,Colonia,Aldana,Azcapotzalco,2,9,Ciudad de México,"[[-99.15171083760227, 19.468190578205796], [-9..."
2,002-003,Colonia,Ampliacion Cosmopolita,Azcapotzalco,2,9,Ciudad de México,"[[-99.16361818076089, 19.472229875914007], [-9..."
3,002-004,Colonia,Ampliacion Del Gas,Azcapotzalco,2,9,Ciudad de México,"[[-99.16151817603475, 19.46774486135027], [-99..."
4,002-005,Colonia,Ampliacion Petrolera,Azcapotzalco,2,9,Ciudad de México,"[[-99.19761990655803, 19.482307007833892], [-9..."


Reviewing what a value inside the 'polygon' column looks like


In [103]:
# Show the list of [lon, lat] pairs stored for the first catalog row.
catalog.loc[0, "polygon"]

[[-99.15913100449802, 19.47261175522339],
 [-99.15905951558406, 19.473285887676024],
 [-99.15900881461089, 19.47368593671133],
 [-99.15794643848542, 19.473632739085684],
 [-99.15616814209562, 19.47354369195322],
 [-99.15442193075381, 19.473380151110387],
 [-99.15266341191717, 19.47321472340822],
 [-99.15275750249315, 19.47264915251562],
 [-99.1528071338663, 19.4723511157457],
 [-99.15292459787696, 19.471676013892726],
 [-99.15351363460762, 19.47176437869369],
 [-99.15501209831677, 19.47198914429913],
 [-99.15500458310345, 19.472030004073765],
 [-99.15510539658004, 19.47200313753843],
 [-99.15520542932478, 19.47197558462651],
 [-99.15519777130277, 19.47201698619364],
 [-99.15580174927318, 19.472107529770657],
 [-99.15640573743234, 19.47219806231646],
 [-99.15814628678987, 19.472462267199276],
 [-99.15913100449802, 19.47261175522339]]

In [104]:
# Print the distinct values of each column suspected to be (near) constant.
for column in ("clasif", "entidad", "cve_ent"):
    print(catalog[column].unique())

['Colonia' 'Pueblos y Barrios Originarios']
['Ciudad de México']
['09']


Dropping `cve_ent` and `entidad`, since each holds a single value across the whole catalog


In [105]:
# Drop the columns that hold a single value for the whole catalog.
# drop(..., errors="ignore") keeps the cell idempotent: re-running it after the
# columns are already gone does not raise a KeyError the way `del` would.
catalog = catalog.drop(columns=["cve_ent", "entidad"], errors="ignore")

catalog.head()

Unnamed: 0,cve_col,clasif,colonia,alc,cve_alc,polygon
0,002-001,Colonia,Aguilera,Azcapotzalco,2,"[[-99.15913100449802, 19.47261175522339], [-99..."
1,002-002,Colonia,Aldana,Azcapotzalco,2,"[[-99.15171083760227, 19.468190578205796], [-9..."
2,002-003,Colonia,Ampliacion Cosmopolita,Azcapotzalco,2,"[[-99.16361818076089, 19.472229875914007], [-9..."
3,002-004,Colonia,Ampliacion Del Gas,Azcapotzalco,2,"[[-99.16151817603475, 19.46774486135027], [-99..."
4,002-005,Colonia,Ampliacion Petrolera,Azcapotzalco,2,"[[-99.19761990655803, 19.482307007833892], [-9..."


After dropping the columns that will not be used, Spanish accented characters are replaced by their non-accented counterparts and the text is upper-cased

In [106]:
pip install unidecode





In [107]:
from unidecode import unidecode


catalog_cat_cols = ['clasif', 'colonia', 'alc']

crimes_cat_cols  = ['delito', 'categoria', 'alcaldia', 'colonia',
                    'colonia_alt', 'tipopersona', 'calidadjuridica',
                    'competencia']

# Dataframes and columns to be processed in dictionary
categorical_dict = {'catalog': catalog_cat_cols, 'crimes':  crimes_cat_cols}

# Explicit name -> frame lookup; avoids reaching into globals(), which breaks
# silently if a frame is renamed or the code is moved into a function.
frames_by_name = {'catalog': catalog, 'crimes': crimes}


def normalize_text(value):
    """Return ``value`` without accents and in upper case.

    Non-string values (e.g. NaN for a missing entry) are returned unchanged
    instead of raising inside ``unidecode``.
    """
    if isinstance(value, str):
        return unidecode(value).upper()
    return value


for df_name, columns in categorical_dict.items():
    df = frames_by_name[df_name]
    for col in columns:
        df[col] = df[col].map(normalize_text)

catalog.head()

Unnamed: 0,cve_col,clasif,colonia,alc,cve_alc,polygon
0,002-001,COLONIA,AGUILERA,AZCAPOTZALCO,2,"[[-99.15913100449802, 19.47261175522339], [-99..."
1,002-002,COLONIA,ALDANA,AZCAPOTZALCO,2,"[[-99.15171083760227, 19.468190578205796], [-9..."
2,002-003,COLONIA,AMPLIACION COSMOPOLITA,AZCAPOTZALCO,2,"[[-99.16361818076089, 19.472229875914007], [-9..."
3,002-004,COLONIA,AMPLIACION DEL GAS,AZCAPOTZALCO,2,"[[-99.16151817603475, 19.46774486135027], [-99..."
4,002-005,COLONIA,AMPLIACION PETROLERA,AZCAPOTZALCO,2,"[[-99.19761990655803, 19.482307007833892], [-9..."


Focusing on the CUAUHTEMOC district, a slice of this catalog containing only the rows for that district is created


In [108]:
# Independent copy of the catalog rows whose alcaldia is CUAUHTEMOC.
is_cuauhtemoc = catalog["alc"].eq('CUAUHTEMOC')
catalog_cuau = catalog[is_cuauhtemoc].copy()
catalog_cuau.head()

Unnamed: 0,cve_col,clasif,colonia,alc,cve_alc,polygon
1357,015-001,COLONIA,ALGARIN,CUAUHTEMOC,15,"[[-99.14499269220823, 19.403872867220105], [-9..."
1358,015-002,COLONIA,AMPL. ASTURIAS,CUAUHTEMOC,15,"[[-99.13609631370444, 19.40605476870975], [-99..."
1359,015-003,COLONIA,ASTURIAS,CUAUHTEMOC,15,"[[-99.136483965521, 19.403389149126806], [-99...."
1360,015-004,COLONIA,ATLAMPA,CUAUHTEMOC,15,"[[-99.16375908271087, 19.45579888070804], [-99..."
1361,015-005,COLONIA,BUENAVISTA,CUAUHTEMOC,15,"[[-99.15567247450599, 19.439779410122352], [-9..."


In [109]:
# Number of distinct colonia names listed for the Cuauhtemoc district.
len(catalog_cuau["colonia"].unique())

33

The next step is to homologate the *colonia* names in the *crimes* dataset and fix the names when needed so that, in future merges with other datasets, the catalog is used as the naming convention.

In [110]:
# Column layout for the crimes frame: case id and crime description first,
# then victim attributes, report (denuncia) timing, event (hecho) timing,
# location and finally the combined timestamps.
column_order = [
    'idcarpeta', 'delito', 'categoria', 'alcaldia', 'colonia',
    'sexo_fem', 'edad', 'tipopersona', 'calidadjuridica',
    'anio_denuncia', 'mes_denuncia', 'fecha_denuncia', 'hora_denuncia',
    'competencia',
    'anio_hecho', 'mes_hecho', 'fecha_hecho', 'hora_hecho',
    'colonia_alt', 'crimen_lat', 'crimen_lon',
    'hecho_time', 'denuncia_time',
]
crimes = crimes[column_order]
crimes.head()


Unnamed: 0,idcarpeta,delito,categoria,alcaldia,colonia,sexo_fem,edad,tipopersona,calidadjuridica,anio_denuncia,...,competencia,anio_hecho,mes_hecho,fecha_hecho,hora_hecho,colonia_alt,crimen_lat,crimen_lon,hecho_time,denuncia_time
0,8324429,FRAUDE,DELITO DE BAJO IMPACTO,ALVARO OBREGON,GUADALUPE INN,0,62,FISICA,OFENDIDO,2019,...,FUERO COMUN,2018,8,2018-08-29,12:00:00,GUADALUPE INN,19.36125,-99.18314,2018-08-29 12:00:00,2019-01-04 12:19:00
1,8324430,"PRODUCCION, IMPRESION, ENAJENACION, DISTRIBUCI...",DELITO DE BAJO IMPACTO,AZCAPOTZALCO,VICTORIA DE LAS DEMOCRACIAS,1,38,FISICA,VICTIMA Y DENUNCIANTE,2019,...,FUERO COMUN,2018,12,2018-12-15,15:00:00,VICTORIA DE LAS DEMOCRACIAS,19.47181,-99.16458,2018-12-15 15:00:00,2019-01-04 12:20:00
2,8324431,ROBO A TRANSEUNTE SALIENDO DEL BANCO CON VIOLE...,ROBO A CUENTAHABIENTE SALIENDO DEL CAJERO CON ...,COYOACAN,COPILCO UNIVERSIDAD ISSSTE,0,42,FISICA,VICTIMA Y DENUNCIANTE,2019,...,FUERO COMUN,2018,12,2018-12-22,15:30:00,COPILCO EL BAJO,19.33797,-99.18611,2018-12-22 15:30:00,2019-01-04 12:23:00
3,8324435,ROBO DE VEHICULO DE SERVICIO PARTICULAR SIN VI...,ROBO DE VEHICULO CON Y SIN VIOLENCIA,IZTACALCO,AGRICOLA PANTITLAN,0,35,FISICA,VICTIMA Y DENUNCIANTE,2019,...,FUERO COMUN,2019,1,2019-01-04,06:00:00,PANTITLAN V,19.40327,-99.05983,2019-01-04 06:00:00,2019-01-04 12:27:00
4,8324438,ROBO DE MOTOCICLETA SIN VIOLENCIA,ROBO DE VEHICULO CON Y SIN VIOLENCIA,IZTAPALAPA,PROGRESISTA,0,30,FISICA,VICTIMA,2019,...,FUERO COMUN,2019,1,2019-01-03,20:00:00,LAS AMERICAS (U HAB),19.3548,-99.06324,2019-01-03 20:00:00,2019-01-04 12:35:00


In [111]:
# Independent copy of the crimes reported in the CUAUHTEMOC alcaldia.
crimes_cuau = crimes.loc[crimes["alcaldia"] == 'CUAUHTEMOC'].copy()

Checking how many rows are in each dataframe and how many remain after the merge

In [112]:
# Row and null counts of both inputs before merging.
print(f'Crimes dataframe before merge contains {crimes_cuau.shape[0]} rows')
print(crimes_cuau.isnull().sum())

print(f'Catalog dataframe before merge contains {catalog_cuau.shape[0]} rows')
print(catalog_cuau.isnull().sum())

# Left merge keeps every crime row; crimes whose (alcaldia, colonia) pair is
# not in the catalog end up with nulls in the catalog columns.
merge_colonias_cuau = crimes_cuau.merge(
    catalog_cuau,
    how='left',
    left_on=['alcaldia', 'colonia'],
    right_on=['alc', 'colonia'],
)

print(f'Merge dataframe contains {merge_colonias_cuau.shape[0]} rows')
print(merge_colonias_cuau.isnull().sum())
merge_colonias_cuau.head()

Crimes dataframe before merge contains 135947 rows
idcarpeta          0
delito             0
categoria          0
alcaldia           0
colonia            0
sexo_fem           0
edad               0
tipopersona        0
calidadjuridica    0
anio_denuncia      0
mes_denuncia       0
fecha_denuncia     0
hora_denuncia      0
competencia        0
anio_hecho         0
mes_hecho          0
fecha_hecho        0
hora_hecho         0
colonia_alt        0
crimen_lat         0
crimen_lon         0
hecho_time         0
denuncia_time      0
dtype: int64
Catalog dataframe before merge contains 33 rows
cve_col    0
clasif     0
colonia    0
alc        0
cve_alc    0
polygon    0
dtype: int64
Merge dataframe contains 135947 rows
idcarpeta             0
delito                0
categoria             0
alcaldia              0
colonia               0
sexo_fem              0
edad                  0
tipopersona           0
calidadjuridica       0
anio_denuncia         0
mes_denuncia          0
fecha_denunci

Unnamed: 0,idcarpeta,delito,categoria,alcaldia,colonia,sexo_fem,edad,tipopersona,calidadjuridica,anio_denuncia,...,colonia_alt,crimen_lat,crimen_lon,hecho_time,denuncia_time,cve_col,clasif,alc,cve_alc,polygon
0,8324479,ROBO A TRANSEUNTE A BORDO DE TAXI PUBLICO Y PR...,DELITO DE BAJO IMPACTO,CUAUHTEMOC,DOCTORES,0,39,FISICA,VICTIMA Y DENUNCIANTE,2019,...,DOCTORES I,19.42244,-99.15237,2018-12-25 04:00:00,2019-01-04 13:16:00,015-010,COLONIA,CUAUHTEMOC,15,"[[-99.15519890116673, 19.40656058216308], [-99..."
1,8324482,ROBO DE OBJETOS,DELITO DE BAJO IMPACTO,CUAUHTEMOC,MORELOS,0,40,FISICA,OFENDIDO,2019,...,MORELOS II,19.44962,-99.12782,2018-12-23 10:00:00,2019-01-04 13:19:00,015-019,COLONIA,CUAUHTEMOC,15,"[[-99.13938480039275, 19.44401178894581], [-99..."
2,8324485,ROBO DE OBJETOS,DELITO DE BAJO IMPACTO,CUAUHTEMOC,ROMA NORTE,0,37,FISICA,OFENDIDO,2019,...,ROMA NORTE III,19.41359,-99.15729,2018-12-31 11:00:00,2019-01-04 13:23:00,015-024,COLONIA,CUAUHTEMOC,15,"[[-99.17671177030077, 19.420286327194944], [-9..."
3,8324496,ROBO A CASA HABITACION SIN VIOLENCIA,DELITO DE BAJO IMPACTO,CUAUHTEMOC,JUAREZ,0,36,FISICA,VICTIMA Y DENUNCIANTE,2019,...,JUAREZ,19.42676,-99.1664,2019-01-03 15:00:00,2019-01-04 13:35:00,015-017,COLONIA,CUAUHTEMOC,15,"[[-99.17571912544206, 19.422519931378844], [-9..."
4,8324527,ROBO A PASAJERO A BORDO DE METRO SIN VIOLENCIA,ROBO A PASAJERO A BORDO DEL METRO CON Y SIN VI...,CUAUHTEMOC,GUERRERO,0,32,FISICA,VICTIMA Y DENUNCIANTE,2019,...,GUERRERO III,19.43769,-99.14753,2018-12-22 18:34:00,2019-01-04 13:59:00,015-014,COLONIA,CUAUHTEMOC,15,"[[-99.14910747563131, 19.437890366845085], [-9..."


The following code block uses the rows with null values to identify which *colonias* values need to be homologated with the *catalogo-de-colonias.json* so that a successful merge is done after that.

By checking values inside the *catalogo-de-colonias.json* and validating with the *crimes.colonia_alt* column, a dictionary with substitute *colonia* values is presented. 


In [113]:
# Rows with no catalog key (cve_col) are the crimes whose colonia name did not
# match any catalog entry; collect their distinct colonia names.
condition = merge_colonias_cuau["cve_col"].isna()
missing_colonias = merge_colonias_cuau.loc[condition, "colonia"].unique().tolist()
missing_colonias

['AMPLIACION ASTURIAS',
 'EX-HIPODROMO DE PERALVILLO',
 'CENTRO URBANO BENITO JUAREZ',
 'TLATILCO',
 'CENTRO IV',
 'DOCTORES I',
 'PERALVILLO II',
 'CENTRO VIII',
 'ROMA NORTE II',
 'OBRERA IV',
 'ROMA NORTE I',
 'ROMA SUR I',
 'ROMA NORTE III',
 'SANTA MARIA LA RIBERA IV',
 'MORELOS III',
 'GUERRERO III',
 'ROMA SUR II',
 'DOCTORES V',
 'ZONA CENTRO',
 'NICOLAS BRAVO',
 'SAN MIGUEL CHAPULTEPEC',
 'AMPLIACION DEL GAS',
 'LORENZO BOTURINI',
 'VALLEJO',
 'MERCED BALBUENA',
 'AGRICULTURA']

The substitutions will be made on the full *crimes_clean* dataframe (not only on the Cuauhtémoc slice) in case a future analysis with other alcaldias is needed. Because of this, the merge with the *catalogo* dataframe will be done again.

A lot of inconsistencies happen because of the roman numerals, so these will be removed.

Checking the number of occurrences of each missing colonia value. If the number is low and the name does not contain Roman numerals, it is possible that the *colonia* actually belongs to another *alcaldia*

In [114]:
# For each unmatched colonia, print the shape (rows, columns) of the matching
# Cuauhtemoc crime records.
for colonia in missing_colonias:
    matching_rows = crimes_cuau.loc[crimes_cuau["colonia"] == colonia]
    print(f"{colonia}: {matching_rows.shape}")


AMPLIACION ASTURIAS: (1128, 23)
EX-HIPODROMO DE PERALVILLO: (1630, 23)
CENTRO URBANO BENITO JUAREZ: (202, 23)
TLATILCO: (4, 23)
CENTRO IV: (7, 23)
DOCTORES I: (2, 23)
PERALVILLO II: (1, 23)
CENTRO VIII: (4, 23)
ROMA NORTE II: (1, 23)
OBRERA IV: (1, 23)
ROMA NORTE I: (2, 23)
ROMA SUR I: (3, 23)
ROMA NORTE III: (1, 23)
SANTA MARIA LA RIBERA IV: (2, 23)
MORELOS III: (2, 23)
GUERRERO III: (2, 23)
ROMA SUR II: (1, 23)
DOCTORES V: (1, 23)
ZONA CENTRO: (13, 23)
NICOLAS BRAVO: (3, 23)
SAN MIGUEL CHAPULTEPEC: (6, 23)
AMPLIACION DEL GAS: (2, 23)
LORENZO BOTURINI: (1, 23)
VALLEJO: (2, 23)
MERCED BALBUENA: (1, 23)
AGRICULTURA: (1, 23)


While checking the coordinates in the dataset, it was noticed that some rows have identical values in `crimen_lat` and `crimen_lon`. These rows will be dropped for now, since they won't be useful for geographical location

In [115]:
# Rows where latitude equals longitude cannot be a usable location.
coordinates_condition = crimes.crimen_lat == crimes.crimen_lon
print(crimes.loc[coordinates_condition].shape)

print('With those rows, shape is ',crimes.shape)
# .copy() makes the filtered frame independent of the original, so the later
# in-place `.loc[...] = ...` assignments on `crimes` do not raise
# SettingWithCopyWarning or risk writing to a temporary slice.
crimes = crimes.loc[~coordinates_condition].copy()
print('Without those rows, shape is ',crimes.shape)

(19871, 23)
With those rows, shape is  (983855, 23)
Without those rows, shape is  (963984, 23)


A validation was made using the `crimen_lat` and `crimen_lon` columns and Google Maps so that, for each case, it is known whether the *colonia* column or the *alcaldia* column is the one to be fixed

In [116]:
# Colonia names as found in the crimes data (keys) mapped to the name used by
# the catalog (values). Applied to the 'colonia' column.
homologation_dict = {
    'AMPLIACION ASTURIAS':         'AMPL. ASTURIAS',
    'CENTRO URBANO BENITO JUAREZ': 'ROMA SUR',
    'EX-HIPODROMO DE PERALVILLO':  'EX HIPODROMO DE PERALVILLO',
    'LORENZO BOTURINI':            'TRANSITO',
    'MERCED BALBUENA':             'ESPERANZA',
    'SAN MIGUEL CHAPULTEPEC':      'CONDESA',
    'ZONA CENTRO':                 'CENTRO',
}

# Colonia names (keys) mapped to the alcaldia they should be assigned to
# (values). Used to correct the 'alcaldia' column rather than the colonia.
alcaldia_homologation = {
    'AGRICULTURA':        'MIGUEL HIDALGO',
    'AMPLIACION DEL GAS': 'AZCAPOTZALCO',
    'NICOLAS BRAVO':      'VENUSTIANO CARRANZA',
    'TLATILCO':           'AZCAPOTZALCO',
    'VALLEJO':            'GUSTAVO A. MADERO',
}



In [117]:
import re

# A well-formed Roman numeral standing as the last word of the text, together
# with the whitespace around it. The lookahead rejects the empty numeral.
_ROMAN_SUFFIX = re.compile(
    r'\s+(?=[IVXLCDM])'
    r'M{0,3}(?:CM|CD|D?C{0,3})(?:XC|XL|L?X{0,3})(?:IX|IV|V?I{0,3})'
    r'\s*$'
)


def remove_roman_numerals(text):
    """Strip a trailing Roman-numeral suffix from ``text``.

    ``'CENTRO IV'`` becomes ``'CENTRO'``. Only a well-formed numeral that is
    the last word is removed, so ordinary words spelled with the letters
    I, V, X, L, C, D, M (e.g. ``'CIVIL'``) are kept, and words in the middle of
    a name are never deleted or glued together.

    Parameters
    ----------
    text : str
        Upper-case name to clean.

    Returns
    -------
    str
        ``text`` without the numeral suffix; unchanged when there is none.
    """
    return _ROMAN_SUFFIX.sub('', text)

In [118]:
# Rows currently labelled CUAUHTEMOC, and among them those whose colonia did
# not match the catalog in the first merge.
condition = crimes['alcaldia'] == 'CUAUHTEMOC'
condition2 = crimes['colonia'].isin(missing_colonias)
needs_numeral_fix = condition & condition2

# 1) Strip Roman numerals, computing the cleaned names only for the rows that
#    are actually updated instead of for the whole column.
crimes.loc[needs_numeral_fix, 'colonia'] = (
    crimes.loc[needs_numeral_fix, 'colonia'].apply(remove_roman_numerals)
)

# 2) Rename colonias to the catalog spelling (Cuauhtemoc rows only).
crimes.loc[condition, 'colonia'] = crimes.loc[condition, 'colonia'].replace(homologation_dict)

# 3) Reassign the alcaldia for colonias that belong to another district.
#    Restricted to the Cuauhtemoc rows: those are the only cases that were
#    validated, so rows of other alcaldias that happen to share a colonia name
#    are left untouched. Colonias not in the mapping keep their alcaldia.
corrected_alcaldia = crimes.loc[condition, 'colonia'].map(alcaldia_homologation)
crimes.loc[condition, 'alcaldia'] = corrected_alcaldia.fillna(crimes.loc[condition, 'alcaldia'])


# Rebuild the Cuauhtemoc slice from the corrected data.
crimes_cuau = crimes[crimes.alcaldia == 'CUAUHTEMOC'].copy()

Validating that all the rows were merged successfully, without null values

In [122]:
print('Crimes dataframe before merge contains {} rows'.format(crimes_cuau.shape[0]))
print(crimes_cuau.isnull().sum())

print('Catalog dataframe before merge contains {} rows'.format(catalog_cuau.shape[0]))
print(catalog_cuau.isnull().sum())

# validate='many_to_one' makes pandas raise if (alc, colonia) is duplicated in
# the catalog, which would otherwise silently duplicate crime rows.
merge_colonias_cuau = crimes_cuau.merge(catalog_cuau, left_on = ['alcaldia', 'colonia'],
                                 right_on = ['alc','colonia'], how = 'left',
                                 validate = 'many_to_one')

print('Merge dataframe contains {} rows'.format(merge_colonias_cuau.shape[0]))
print(merge_colonias_cuau.isnull().sum())

# Turn the visual check into an enforced one: every crime row must have found
# its catalog entry, and the left merge must not have changed the row count.
unmatched_colonias = merge_colonias_cuau.loc[merge_colonias_cuau['cve_col'].isna(), 'colonia'].unique()
assert len(merge_colonias_cuau) == len(crimes_cuau), 'Merge changed the number of crime rows'
assert len(unmatched_colonias) == 0, (
    'Colonias still missing from the catalog: {}'.format(list(unmatched_colonias))
)


Crimes dataframe before merge contains 133215 rows
idcarpeta          0
delito             0
categoria          0
alcaldia           0
colonia            0
sexo_fem           0
edad               0
tipopersona        0
calidadjuridica    0
anio_denuncia      0
mes_denuncia       0
fecha_denuncia     0
hora_denuncia      0
competencia        0
anio_hecho         0
mes_hecho          0
fecha_hecho        0
hora_hecho         0
colonia_alt        0
crimen_lat         0
crimen_lon         0
hecho_time         0
denuncia_time      0
dtype: int64
Catalog dataframe before merge contains 33 rows
cve_col    0
clasif     0
colonia    0
alc        0
cve_alc    0
polygon    0
dtype: int64
Merge dataframe contains 133215 rows
idcarpeta          0
delito             0
categoria          0
alcaldia           0
colonia            0
sexo_fem           0
edad               0
tipopersona        0
calidadjuridica    0
anio_denuncia      0
mes_denuncia       0
fecha_denuncia     0
hora_denuncia      0
comp

In [123]:
# After homologation: colonia names that still have no catalog key.
# An empty list means every Cuauhtemoc crime matched a catalog entry.
condition = merge_colonias_cuau["cve_col"].isna()
still_unmatched = merge_colonias_cuau.loc[condition, "colonia"]
missing_colonias = still_unmatched.unique().tolist()
missing_colonias

[]

In [124]:
# Save the homologated crimes data. archive_name fixes the name of the CSV
# stored inside the zip, so it can be read back with zf.open('crimes_clean2.csv')
# regardless of how the installed pandas version names the member by default.
crimes.to_csv(
    '../datasets/crimes_clean2.csv.zip',
    compression={'method': 'zip', 'archive_name': 'crimes_clean2.csv'},
    index=False,
)


In [125]:
# Save the Cuauhtemoc crimes merged with their catalog entries. archive_name
# fixes the name of the CSV stored inside the zip, independent of the default
# member naming of the installed pandas version.
merge_colonias_cuau.to_csv(
    '../datasets/cuau_homologated.csv.zip',
    compression={'method': 'zip', 'archive_name': 'cuau_homologated.csv'},
    index=False,
)

In [126]:
# Save the cleaned catalog. archive_name fixes the name of the CSV stored
# inside the zip, independent of the default member naming of the installed
# pandas version.
# NOTE(review): the 'polygon' column holds Python lists, which CSV stores as
# their string representation - they need parsing when the file is read back.
catalog.to_csv(
    '../datasets/catalogo_colonias.csv.zip',
    compression={'method': 'zip', 'archive_name': 'catalogo_colonias.csv'},
    index=False,
)
