# Transform and Ingest Population

## Department Population

### Department Population History

In [6]:
import pathlib
import pandas as pd

# Folder (under the user's home directory) that holds the raw CSV extracts.
data_dir = pathlib.Path.home().joinpath("ds4a", "project", "infrastructure", "data")

# The file is semicolon-delimited; only the columns listed here are loaded.
dept_pop_columns = ['DP', 'DPNOM', 'AÑO', 'Total Hombres', 'Total Mujeres', 'Total']
dept_pop = pd.read_csv(
    data_dir / "department_population_2016_2022.csv",
    sep=";",
    usecols=dept_pop_columns,
)

dept_pop.head()

Unnamed: 0,DP,DPNOM,AÑO,Total Hombres,Total Mujeres,Total
0,5,Antioquia,2016,3000534,3210778,6211312
1,8,Atlántico,2016,1182828,1244218,2427046
2,11,"Bogotá, D.C.",2016,3495609,3805309,7300918
3,13,Bolívar,2016,1000545,1012465,2013010
4,15,Boyacá,2016,590818,609160,1199978


In [7]:
# Number of rows in the department population frame.
dept_pop.shape[0]

231

In [8]:
# History table: year, population counts and the department code (the name is dropped).
dept_pop_hist_columns = ['AÑO', 'Total Hombres', 'Total Mujeres', 'Total', 'DP']
dept_pop_hist = dept_pop[dept_pop_hist_columns]
dept_pop_hist

Unnamed: 0,AÑO,Total Hombres,Total Mujeres,Total,DP
0,2016,3000534,3210778,6211312,5
1,2016,1182828,1244218,2427046,8
2,2016,3495609,3805309,7300918,11
3,2016,1000545,1012465,2013010,13
4,2016,590818,609160,1199978,15
...,...,...,...,...,...
226,2022,42406,39662,82068,91
227,2022,26940,25121,52061,94
228,2022,47393,42964,90357,95
229,2022,25520,23412,48932,97


In [9]:
# Fraction of rows where Total equals Total Hombres + Total Mujeres (1.0 means every row).
men_plus_women = dept_pop_hist["Total Hombres"] + dept_pop_hist["Total Mujeres"]
totals_match = dept_pop_hist["Total"] == men_plus_women
totals_match.mean()

1.0

Keeping the `Total` column is not necessary, since for every row it equals the sum of `Total Hombres` and `Total Mujeres`.

### Departments Table

In [10]:
# Distinct department names vs. distinct department codes; equal counts are
# what a one-to-one code/name mapping would produce.
tuple(dept_pop[column].nunique() for column in ("DPNOM", "DP"))

(33, 33)

In [11]:
# List the distinct department codes present in the data.
department_codes = dept_pop["DP"]
department_codes.unique()

array([ 5,  8, 11, 13, 15, 17, 18, 19, 20, 23, 25, 27, 41, 44, 47, 50, 52,
       54, 63, 66, 68, 70, 73, 76, 81, 85, 86, 88, 91, 94, 95, 97, 99])

In [12]:
# Departments lookup table: one row per (code, name) pair, indexed by the code.
department_pairs = dept_pop[["DP", "DPNOM"]]
departments = (
    department_pairs
    .drop_duplicates()
    .set_index("DP")
)
departments

Unnamed: 0_level_0,DPNOM
DP,Unnamed: 1_level_1
5,Antioquia
8,Atlántico
11,"Bogotá, D.C."
13,Bolívar
15,Boyacá
17,Caldas
18,Caquetá
19,Cauca
20,Cesar
23,Córdoba


In [13]:
# Length, in characters, of the longest department name.
department_name_lengths = departments["DPNOM"].apply(len)
department_name_lengths.max()

26

## Municipality Population

### Municipality Population History

In [14]:
# Municipality-level population file; semicolon-delimited, all columns are loaded.
mun_pop_path = data_dir / "municipality_population_2016_2022.csv"
mun_pop = pd.read_csv(mun_pop_path, sep=";")

mun_pop.head()

Unnamed: 0,DP,DPNOM,COD_MUNICIPIO,MPNOM,AÑO,ÁREA GEOGRÁFICA,Total
0,5,Antioquia,5001,Medellín,2016,Total,2351077
1,5,Antioquia,5002,Abejorral,2016,Total,20534
2,5,Antioquia,5004,Abriaquí,2016,Total,2629
3,5,Antioquia,5021,Alejandría,2016,Total,4620
4,5,Antioquia,5030,Amagá,2016,Total,29394


In [15]:
# Print the column dtypes, non-null counts and memory usage of the municipality frame.
mun_pop.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7854 entries, 0 to 7853
Data columns (total 7 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   DP               7854 non-null   int64 
 1   DPNOM            7854 non-null   object
 2   COD_MUNICIPIO    7854 non-null   int64 
 3   MPNOM            7854 non-null   object
 4   AÑO              7854 non-null   int64 
 5   ÁREA GEOGRÁFICA  7854 non-null   object
 6   Total            7854 non-null   int64 
dtypes: int64(4), object(3)
memory usage: 429.6+ KB


In [16]:
# Keep only the year, the population total and the municipality code.
mun_pop_filt_columns = ["AÑO", "Total", "COD_MUNICIPIO"]
mun_pop_filt = mun_pop[mun_pop_filt_columns]
mun_pop_filt

Unnamed: 0,AÑO,Total,COD_MUNICIPIO
0,2016,2351077,5001
1,2016,20534,5002
2,2016,2629,5004
3,2016,4620,5021
4,2016,29394,5030
...,...,...,...
7849,2022,1211,97889
7850,2022,20798,99001
7851,2022,9846,99524
7852,2022,4297,99624


### Municipality Table

In [11]:
# Municipalities lookup table: department code and name per municipality,
# de-duplicated and indexed by the municipality code.
municipality_rows = mun_pop[["DP", "COD_MUNICIPIO", "MPNOM"]]
municipalities = (
    municipality_rows
    .drop_duplicates()
    .set_index("COD_MUNICIPIO")
)
municipalities.head()

Unnamed: 0_level_0,DP,MPNOM
COD_MUNICIPIO,Unnamed: 1_level_1,Unnamed: 2_level_1
5001,5,Medellín
5002,5,Abejorral
5004,5,Abriaquí
5021,5,Alejandría
5030,5,Amagá


In [12]:
# Show the municipalities with the longest names.
# The helper LEN column is attached with .assign(), which returns a new frame,
# so `municipalities` itself is not mutated: the table keeps only its real
# columns (DP, MPNOM) and the cell can be re-run without side effects.
(
    municipalities
    .assign(LEN=municipalities["MPNOM"].str.len())  # name length in characters
    .sort_values(by="LEN", ascending=False)
    .head()
)

Unnamed: 0_level_0,DP,MPNOM,LEN
COD_MUNICIPIO,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
25843,25,Villa de San Diego de Ubaté,27
47692,47,San Sebastián de Buenavista,27
25645,25,San Antonio del Tequendama,26
23586,23,Purísima de La Concepción,25
5664,5,San Pedro de Los Milagros,25


## Interfamily Violence

In [26]:
# Comma-delimited (default separator) interfamily violence records.
interfamily_violence = pd.read_csv(data_dir / "interfamily_violence_2010_2022.csv")

# Collapse equivalent weapon labels onto a single canonical spelling.
# NOTE(review): the recorded output also shows a bare "-" label that is left
# untouched here; confirm whether it should map to "NO REPORTADO" as well.
weapon_aliases = {
    "ARMA BLANCA / CORTOPUNZANTE": "CORTOPUNZANTES",
    "NO REPORTA": "NO REPORTADO",
    "CORTANTES": "CORTOPUNZANTES",
    "PUNZANTES": "CORTOPUNZANTES",
}
normalized_weapons = interfamily_violence["ARMAS MEDIOS"].replace(weapon_aliases)
weapon_means = normalized_weapons.drop_duplicates()

# Displayed with a fresh 0..n-1 index; `weapon_means` keeps its original row labels.
weapon_means.reset_index(drop=True)

0         CORTOPUNZANTES
1          ARMA DE FUEGO
2           CONTUNDENTES
3           NO REPORTADO
4    SIN EMPLEO DE ARMAS
5           ESCOPOLAMINA
6                      -
Name: ARMAS MEDIOS, dtype: object

In [17]:
# Length, in characters, of the longest weapon label.
# NOTE(review): the recorded output (27) looks stale -- the longest label shown
# after normalisation has 19 characters; re-run the notebook top to bottom to confirm.
weapon_label_lengths = weapon_means.apply(len)
weapon_label_lengths.max()

27

In [19]:
# Count records per age group after unifying the "not reported" label.
age_group_aliases = {"NO REPORTA": "NO REPORTADO"}
age_groups = interfamily_violence["GRUPO ETARIO"].replace(age_group_aliases)
age_groups.value_counts()

ADULTOS         464213
ADOLESCENTES     32076
MENORES          30601
NO REPORTADO       371
Name: GRUPO ETARIO, dtype: int64

## 