## Load libraries

In [1]:
import pandas as pd
import numpy as np
import matplotlib
import matplotlib.pyplot as plt
import seaborn as sns
import os
import sys
sys.path.append("../libs")
sys.path.append("../")
from definitions import ROOT_DIR
import utils as utils

# Set pandas options
# Not use scientific notation and use two decimal places and use comma as thousand separator
pd.options.display.float_format = '{:,.2f}'.format


# ETL

## 1. Load dataset

In [2]:
# Load dataset from data_files/internet.xlsx - sheet: 'Ingresos '
df = utils.get_xls_sheet_data('/data_files/internet.xlsx', 'Penetración-poblacion')
df.head().style.format(thousands=',')

Unnamed: 0,Año,Trimestre,Provincia,Accesos por cada 100 hab
0,2023,4,Buenos Aires,27.68
1,2023,4,Capital Federal,47.8
2,2023,4,Catamarca,17.46
3,2023,4,Chaco,11.85
4,2023,4,Chubut,26.04


## 2. Handling missing values

### Look for missing values in all cells

In [3]:
#Find missing values
df.isnull().sum()

Año                         0
Trimestre                   0
Provincia                   0
Accesos por cada 100 hab    0
dtype: int64

#### There's no missing values

## 3. Look for Duplicates

### Find duplicates for complete rows

In [4]:
#Find duplicates by complete row
df.duplicated().sum()

0

#### There's no complete duplicated rows

### Fin duplicated rows for year and quarter ('Trimestre') and state ('Provincia')

In [5]:
#Find duplicated rows by row, for year and quarter
df.duplicated(subset=['Año', 'Trimestre', 'Provincia']).sum()

0

#### There's no duplicated rows for year and quarter

## 4. Finding outliers

In [6]:
#Finding outliers
df.describe()

Unnamed: 0,Año,Trimestre,Accesos por cada 100 hab
count,1000.0,1000.0,1000.0
mean,2018.8,2.46,15.73
std,3.02,1.11,9.24
min,2014.0,1.0,2.72
25%,2016.0,1.0,9.02
50%,2019.0,2.0,13.67
75%,2021.0,3.0,20.32
max,2024.0,4.0,52.24


#### In the statistics, we can see there's no outliers values for the column year ("Año"), because the maximum value is 2024 and the data is historical. For the quarter ("Trimestre") column the minimum an maximum values are correct, 1 and 4. For the "Accesos 100/hab" column the distance from max value are close to 75 percentile. No outliers where found.

In [7]:
df.groupby(['Año','Trimestre'])['Provincia'].agg(['count'])

Unnamed: 0_level_0,Unnamed: 1_level_0,count
Año,Trimestre,Unnamed: 2_level_1
2014,1,16
2014,2,24
2014,3,24
2014,4,24
2015,1,24
2015,2,24
2015,3,24
2015,4,24
2016,1,24
2016,2,24


#### For the year 2014, 1st quarter ("Trimestre") we have 8 provinces without information:

In [8]:
df[(df['Año'] == 2014) & (df['Trimestre'] == 1)]

Unnamed: 0,Año,Trimestre,Provincia,Accesos por cada 100 hab
984,2014,1,Buenos Aires,16.69
985,2014,1,Capital Federal,42.66
986,2014,1,Catamarca,6.18
987,2014,1,Chaco,5.55
988,2014,1,Chubut,12.7
989,2014,1,Córdoba,16.13
990,2014,1,Corrientes,5.89
991,2014,1,Entre Ríos,10.3
992,2014,1,Formosa,4.44
993,2014,1,Jujuy,4.72


#### The provinces without information are: ['Salta','San Juan', 'San Luis', 'Santa Cruz', 'Santa Fe', 'Santiago Del Estero', 'Tierra Del Fuego', 'Tucumán'] we will input 0 because the age of the lost values makes one think that there were no measurements even in that place.

In [9]:
lost_provinces = ['Salta','San Juan', 'San Luis', 'Santa Cruz', 'Santa Fe', 'Santiago Del Estero', 'Tierra Del Fuego', 'Tucumán']

for lost_province in lost_provinces:
  lost_df = df.head(1).copy()
  lost_df['Año'] = 2014
  lost_df['Trimestre'] = 1
  lost_df['Provincia'] = lost_province
  lost_df['Accesos por cada 100 hab'] = 0
  df=pd.concat([df, lost_df])

df=df.reset_index()
df[(df['Año'] == 2014) & (df['Trimestre'] == 1)]
  


Unnamed: 0,index,Año,Trimestre,Provincia,Accesos por cada 100 hab
984,984,2014,1,Buenos Aires,16.69
985,985,2014,1,Capital Federal,42.66
986,986,2014,1,Catamarca,6.18
987,987,2014,1,Chaco,5.55
988,988,2014,1,Chubut,12.7
989,989,2014,1,Córdoba,16.13
990,990,2014,1,Corrientes,5.89
991,991,2014,1,Entre Ríos,10.3
992,992,2014,1,Formosa,4.44
993,993,2014,1,Jujuy,4.72


## 5. Data Types

### We will review the data types for each column.

In [10]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1008 entries, 0 to 1007
Data columns (total 5 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   index                     1008 non-null   int64  
 1   Año                       1008 non-null   int64  
 2   Trimestre                 1008 non-null   int64  
 3   Provincia                 1008 non-null   object 
 4   Accesos por cada 100 hab  1008 non-null   float64
dtypes: float64(1), int64(3), object(1)
memory usage: 39.5+ KB


#### The 'Provincia' column have an object data type, and have strings values. We should change the data type to string.

In [11]:
#Change provincia to string
df['Provincia'] = df['Provincia'].astype('string')
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1008 entries, 0 to 1007
Data columns (total 5 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   index                     1008 non-null   int64  
 1   Año                       1008 non-null   int64  
 2   Trimestre                 1008 non-null   int64  
 3   Provincia                 1008 non-null   string 
 4   Accesos por cada 100 hab  1008 non-null   float64
dtypes: float64(1), int64(3), string(1)
memory usage: 39.5 KB


## New columns

### Create a new column with the year and quarter

In [12]:
#Create a new column with the quarter and year
df['Periodo'] = (df['Año'].astype(str) + 'T' + df['Trimestre'].astype(str)).astype('string')
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1008 entries, 0 to 1007
Data columns (total 6 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   index                     1008 non-null   int64  
 1   Año                       1008 non-null   int64  
 2   Trimestre                 1008 non-null   int64  
 3   Provincia                 1008 non-null   string 
 4   Accesos por cada 100 hab  1008 non-null   float64
 5   Periodo                   1008 non-null   string 
dtypes: float64(1), int64(3), string(2)
memory usage: 47.4 KB


# Save dataset

In [13]:
df=df.reset_index()
df.to_parquet(ROOT_DIR + '/data_files/penetracion_poblacion_clean.parquet')