## Load libraries

In [11]:
import pandas as pd
import numpy as np
import matplotlib
import matplotlib.pyplot as plt
import seaborn as sns
import os
import sys
sys.path.append("../libs")
sys.path.append("../")
from definitions import ROOT_DIR
import utils as utils

# Pandas display configuration:
# show floats in fixed-point notation (no scientific notation), with two
# decimal places and a comma as the thousands separator.
pd.set_option('display.float_format', '{:,.2f}'.format)


# ETL

## 1. Load dataset

In [12]:
# Load the 'Penetracion-totales' sheet of data_files/internet.xlsx into a DataFrame.
# NOTE(review): the path is passed with a leading '/', so utils.get_xls_sheet_data
# presumably resolves it against the project root — confirm in the utils module.
df = utils.get_xls_sheet_data('/data_files/internet.xlsx', 'Penetracion-totales')
# Display the frame through the Styler, using ',' as the thousands separator
df.style.format(thousands=',')

Unnamed: 0,Año,Trimestre,Accesos por cada 100 hogares,Accesos por cada 100 hab,Periodo
0,2024,2,78.13,24.57,Abr-Jun 2024
1,2024,1,78.89,24.79,Ene-Mar 2024
2,2023,4,78.56,24.66,Oct-Dic 2023
3,2023,3,77.84,24.41,Jul-Sept 2023
4,2023,2,77.02,24.14,Abr-Jun 2023
5,2023,1,77.2,24.17,Ene-Mar 2023
6,2022,4,77.21,24.15,Oct-Dic 2022
7,2022,3,76.64,23.95,Jul-Sept 2022
8,2022,2,75.965424,23.720442,Abr-Jun 2022
9,2022,1,73.878803,23.049171,Ene-Mar 2022


## 2. Handling missing values

### Look for missing values in all cells

In [13]:
# Count the null cells in each column
df.isna().sum()

Año                             0
Trimestre                       0
Accesos por cada 100 hogares    0
Accesos por cada 100 hab        0
Periodo                         0
dtype: int64

#### There are no missing values

## 3. Look for Duplicates

### Find duplicates for complete rows

In [14]:
# Count the rows that repeat an earlier row in every column
df.duplicated(keep='first').sum()

0

#### There are no fully duplicated rows

### Find duplicated rows for year ('Año') and quarter ('Trimestre')

In [15]:
# Count the rows whose (year, quarter) key repeats an earlier row
period_key = ['Año', 'Trimestre']
df.duplicated(subset=period_key).sum()

0

#### There are no duplicated rows for year and quarter

## 4. Finding outliers

In [16]:
# Summary statistics of the numeric columns, used to look for outliers
# (the quartiles listed are the pandas defaults)
df.describe(percentiles=[0.25, 0.5, 0.75])

Unnamed: 0,Año,Trimestre,Accesos por cada 100 hogares,Accesos por cada 100 hab
count,42.0,42.0,42.0,42.0
mean,2018.76,2.45,63.2,19.57
std,3.07,1.13,9.95,3.27
min,2014.0,1.0,49.55,15.05
25%,2016.0,1.25,53.78,16.52
50%,2019.0,2.0,62.67,19.4
75%,2021.0,3.0,72.53,22.6
max,2024.0,4.0,78.89,24.79


#### In the statistics, we can see there are no outlier values for the year ("Año") column, because the maximum value is 2024 (the minimum is 2014) and the data is historical. For the quarter ("Trimestre") column, the minimum and maximum values are correct: 1 and 4. For the quantitative columns "Accesos por cada 100 hogares" and "Accesos por cada 100 hab", the min and max values are consistent with the mean and standard deviation. No outliers detected.

In [17]:
# List the values indexed by (year, quarter) to check that every period is present.
# The duplicate check above found no repeated (year, quarter) keys, so each group
# holds a single row and sum() leaves the values unchanged.
df.groupby(by=['Año', 'Trimestre']).sum()

Unnamed: 0_level_0,Unnamed: 1_level_0,Accesos por cada 100 hogares,Accesos por cada 100 hab,Periodo
Año,Trimestre,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2014,1,49.55,15.05,Ene-Mar 2014
2014,2,49.86,15.16,Abr-Jun 2014
2014,3,50.67,15.43,Jul-Sept 2014
2014,4,50.5,15.39,Oct-Dic 2014
2015,1,51.36,15.68,Ene-Mar 2015
2015,2,51.76,15.82,Abr-Jun 2015
2015,3,52.46,16.05,Jul-Sept 2015
2015,4,52.63,16.12,Oct-Dic 2015
2016,1,51.85,15.9,Ene-Mar 2016
2016,2,53.34,16.37,Abr-Jun 2016


#### All year-quarter pairs are present; there is no missing data.

#### All columns have a consistent growth pattern. This will be explored in detail in the EDA section. The growth pattern is not consistent with the exponential behaviour of the incomes dataset, which reaffirms the hypothesis that the behaviour of incomes is driven by inflation and devaluations.

## 5. Data Types

### We will review the data types for each column.

In [18]:
# Print each column's dtype and non-null count
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 42 entries, 0 to 41
Data columns (total 5 columns):
 #   Column                        Non-Null Count  Dtype  
---  ------                        --------------  -----  
 0   Año                           42 non-null     int64  
 1   Trimestre                     42 non-null     int64  
 2   Accesos por cada 100 hogares  42 non-null     float64
 3   Accesos por cada 100 hab      42 non-null     float64
 4   Periodo                       42 non-null     object 
dtypes: float64(2), int64(2), object(1)
memory usage: 1.8+ KB


#### The data types are consistent with the data provided. Only the "Periodo" column has an object data type, and it holds string values. We could change its data type to string, but since its information is redundant with the year ("Año") and quarter ("Trimestre") columns, we will drop it.

In [19]:
# Remove the textual 'Periodo' column, then re-check the remaining schema
df = df.drop(columns=['Periodo'])
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 42 entries, 0 to 41
Data columns (total 4 columns):
 #   Column                        Non-Null Count  Dtype  
---  ------                        --------------  -----  
 0   Año                           42 non-null     int64  
 1   Trimestre                     42 non-null     int64  
 2   Accesos por cada 100 hogares  42 non-null     float64
 3   Accesos por cada 100 hab      42 non-null     float64
dtypes: float64(2), int64(2)
memory usage: 1.4 KB


## 6. New columns

### Create a new column with the year and quarter

In [20]:
# Build a compact period label of the form '<year>T<quarter>' (e.g. '2024T2'),
# stored with the pandas 'string' dtype
year_label = df['Año'].astype(str)
quarter_label = df['Trimestre'].astype(str)
df['Periodo'] = (year_label + 'T' + quarter_label).astype('string')
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 42 entries, 0 to 41
Data columns (total 5 columns):
 #   Column                        Non-Null Count  Dtype  
---  ------                        --------------  -----  
 0   Año                           42 non-null     int64  
 1   Trimestre                     42 non-null     int64  
 2   Accesos por cada 100 hogares  42 non-null     float64
 3   Accesos por cada 100 hab      42 non-null     float64
 4   Periodo                       42 non-null     string 
dtypes: float64(2), int64(2), string(1)
memory usage: 1.8 KB


# Save dataset

In [21]:
# Persist the cleaned frame as a parquet file under the project's data_files folder
clean_parquet_path = ROOT_DIR + '/data_files/penetracion_totales_clean.parquet'
df.to_parquet(clean_parquet_path)