## Load libraries

In [18]:
import pandas as pd
import numpy as np
import matplotlib
import matplotlib.pyplot as plt
import seaborn as sns
import os
import sys
sys.path.append("../libs")
sys.path.append("../")
from definitions import ROOT_DIR
import utils as utils

# Display configuration for pandas:
# render floats in fixed-point form (no scientific notation), with two
# decimal places and a comma as thousands separator.
pd.set_option('display.float_format', '{:,.2f}'.format)


# ETL

## 1. Load dataset

In [19]:
# Load the 'Accesos Por Tecnología' sheet of data_files/internet.xlsx through the project helper.
# NOTE(review): the path starts with '/', so the helper presumably resolves it against the project root — confirm in utils.
df = utils.get_xls_sheet_data('/data_files/internet.xlsx', 'Accesos Por Tecnología')
# Preview the first rows, rendering numbers with a comma as thousands separator.
df.head().style.format(thousands=',')

Unnamed: 0,Año,Trimestre,Provincia,ADSL,Cablemodem,Fibra óptica,Wireless,Otros,Total
0,2024,2,Buenos Aires,214055.0,2722466.0,1849476.0,138638.0,64745.0,4989380.0
1,2024,2,Capital Federal,54102.0,1144781.0,230402.0,4493.0,29821.0,1463599.0
2,2024,2,Catamarca,4951.0,10303.0,58355.0,1384.0,81.0,75074.0
3,2024,2,Chaco,9448.0,57935.0,68944.0,8407.0,2358.0,147092.0
4,2024,2,Chubut,25955.0,80704.0,26516.0,31118.0,9930.0,174223.0


## 2. Handling missing values

### Look for missing values in all cells

In [20]:
# Count the missing entries in each column
df.isna().sum()

Año             1
Trimestre       1
Provincia       2
ADSL            2
Cablemodem      2
Fibra óptica    2
Wireless        2
Otros           2
Total           2
dtype: int64

#### There are 16 missing values. Next we will analyse them and decide how to handle them.

In [21]:
# Show every row that has at least one missing cell
has_missing = df.isna().any(axis='columns')
df.loc[has_missing]

Unnamed: 0,Año,Trimestre,Provincia,ADSL,Cablemodem,Fibra óptica,Wireless,Otros,Total
1007,,,,,,,,,
1008,*,Los datos provinciales no coinciden a nivel na...,,,,,,,


#### All missing values are concentrated in 2 rows. Row index 1007 corresponds to "Buenos Aires", T1 of year 2024, and row 1008 is empty in the original data source, so we can drop it.

In [22]:
# Fill row 1007 (all-NaN in the raw sheet) with the Buenos Aires 2024-Q1 record.
# Values are positional and must follow the column order:
# Año, Trimestre, Provincia, ADSL, Cablemodem, Fibra óptica, Wireless, Otros, Total.
# NOTE(review): the figures are hard-coded; their source is not recorded in this notebook — confirm.

df.loc[1007] = [2024, 1, 'Buenos Aires', 218723, 2760083, 1817035, 138331, 66344, 5000500]

In [23]:
# Discard every row that still contains a missing cell, then re-inspect the frame
df = df.dropna(axis='index', how='any')
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 1008 entries, 0 to 1007
Data columns (total 9 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   Año           1008 non-null   object 
 1   Trimestre     1008 non-null   object 
 2   Provincia     1008 non-null   object 
 3   ADSL          1008 non-null   float64
 4   Cablemodem    1008 non-null   float64
 5   Fibra óptica  1008 non-null   float64
 6   Wireless      1008 non-null   float64
 7   Otros         1008 non-null   float64
 8   Total         1008 non-null   float64
dtypes: float64(6), object(3)
memory usage: 78.8+ KB


## 3. Look for Duplicates

### Find duplicates for complete rows

In [24]:
# Count rows that are exact copies of an earlier row (all columns compared)
df.duplicated(keep='first').sum()

0

#### There are no fully duplicated rows

### Find duplicated rows for year, quarter ('Trimestre') and state ('Provincia')

In [25]:
# Count rows that repeat an earlier (year, quarter, province) key
key_columns = ['Año', 'Trimestre', 'Provincia']
df.duplicated(subset=key_columns).sum()

0

#### There are no duplicated rows for year, quarter and state

## 4. Data Types

### We will review the data types for each column.

In [26]:
# Print each column's dtype and non-null count to decide which columns need casting
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 1008 entries, 0 to 1007
Data columns (total 9 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   Año           1008 non-null   object 
 1   Trimestre     1008 non-null   object 
 2   Provincia     1008 non-null   object 
 3   ADSL          1008 non-null   float64
 4   Cablemodem    1008 non-null   float64
 5   Fibra óptica  1008 non-null   float64
 6   Wireless      1008 non-null   float64
 7   Otros         1008 non-null   float64
 8   Total         1008 non-null   float64
dtypes: float64(6), object(3)
memory usage: 78.8+ KB


#### The 'Año', 'Trimestre' and 'Provincia' columns have an object data type and contain string values. We should change the data type to string.

In [27]:
# List the distinct values stored in the year column
pd.unique(df['Año'])

array([2024, 2023, 2022, 2021, 2020, 2019, '2019 *', 2018, 2017, 2016,
       2015, 2014], dtype=object)

#### For quarters 1 to 3 of year 2019 we have the value "2019 *"; this means the data is not reliable because the values were misreported by the source. I think the best solution is to copy the 4th quarter values for the year 2019.

In [28]:
# Replace the unreliable '2019 *' rows (quarters 1 to 3) with copies of the
# 2019 rows that are not flagged (the 4th quarter).
flagged_2019 = df['Año'] == '2019 *'

# Guard: once the flagged rows are gone, `Año == 2019` matches all four
# quarters, so re-running this cell unguarded would append duplicate rows.
if flagged_2019.any():
    ok_2019 = df[df['Año'] == 2019]
    # Use integer quarters (like the row inserted at index 1007) rather than
    # strings; the column is cast to int64 later on anyway.
    quarter_copies = [ok_2019.assign(Trimestre=quarter) for quarter in (3, 2, 1)]
    # Filter with the boolean mask instead of dropping by index label: the
    # copies reuse the index labels of the rows they were copied from, so the
    # index is no longer unique after the concat.
    df = pd.concat([df[~flagged_2019], *quarter_copies])

# Number of province rows per (year, quarter) after the replacement
df.groupby(['Año','Trimestre'])['Provincia'].agg(['count'])


Unnamed: 0_level_0,Unnamed: 1_level_0,count
Año,Trimestre,Unnamed: 2_level_1
2014,1,24
2014,2,24
2014,3,24
2014,4,24
2015,1,24
2015,2,24
2015,3,24
2015,4,24
2016,1,24
2016,2,24


In [29]:
# Re-check row count and dtypes after replacing the flagged 2019 rows
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 1008 entries, 0 to 454
Data columns (total 9 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   Año           1008 non-null   object 
 1   Trimestre     1008 non-null   object 
 2   Provincia     1008 non-null   object 
 3   ADSL          1008 non-null   float64
 4   Cablemodem    1008 non-null   float64
 5   Fibra óptica  1008 non-null   float64
 6   Wireless      1008 non-null   float64
 7   Otros         1008 non-null   float64
 8   Total         1008 non-null   float64
dtypes: float64(6), object(3)
memory usage: 78.8+ KB


#### We need to change the data type to string for the 'Provincia' column and to integer for the 'Año' and 'Trimestre' columns.

In [30]:
# Cast year and quarter to 64-bit integers and province to the pandas string dtype
target_dtypes = {'Año': 'int64', 'Trimestre': 'int64', 'Provincia': 'string'}
df = df.astype(target_dtypes)
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 1008 entries, 0 to 454
Data columns (total 9 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   Año           1008 non-null   int64  
 1   Trimestre     1008 non-null   int64  
 2   Provincia     1008 non-null   string 
 3   ADSL          1008 non-null   float64
 4   Cablemodem    1008 non-null   float64
 5   Fibra óptica  1008 non-null   float64
 6   Wireless      1008 non-null   float64
 7   Otros         1008 non-null   float64
 8   Total         1008 non-null   float64
dtypes: float64(6), int64(2), string(1)
memory usage: 78.8 KB


## 5. Finding outliers

In [31]:
# Summary statistics (count, mean, std, min, quartiles, max) of the numeric columns,
# used to look for implausible values
df.describe()

Unnamed: 0,Año,Trimestre,ADSL,Cablemodem,Fibra óptica,Wireless,Otros,Total
count,1008.0,1008.0,1008.0,1008.0,1008.0,1008.0,1008.0,1008.0
mean,2018.76,2.45,110525.34,185490.05,51616.0,12763.99,6500.71,366896.07
std,3.04,1.12,229440.01,454732.85,186501.73,21380.6,11277.12,788194.42
min,2014.0,1.0,2230.0,0.0,0.0,0.0,2.0,12557.0
25%,2016.0,1.0,19521.25,11015.0,355.0,814.0,341.5,62418.25
50%,2019.0,2.0,43967.5,40892.5,3224.5,5245.0,1901.0,113834.5
75%,2021.0,3.0,88040.5,77159.25,31092.75,14531.0,7622.0,196158.5
max,2024.0,4.0,1586343.0,2797700.0,1849476.0,138638.0,73415.0,5011620.0


#### In the statistics, we can see there are no outlier values for the year ("Año") column, because the maximum value is 2024 and the data is historical. For the quarter ("Trimestre") column the minimum and maximum values are correct, 1 and 4. We have 0 values in some technologies; this probably means that the technology wasn't available in that province in that year, e.g. "Cable Modem" in "La Rioja" province from 2014 to 2017. No outliers detected.

In [32]:
# Number of province rows for each (year, quarter) pair
df.groupby(['Año', 'Trimestre'])['Provincia'].count().to_frame('count')

Unnamed: 0_level_0,Unnamed: 1_level_0,count
Año,Trimestre,Unnamed: 2_level_1
2014,1,24
2014,2,24
2014,3,24
2014,4,24
2015,1,24
2015,2,24
2015,3,24
2015,4,24
2016,1,24
2016,2,24


#### All year-quarter pairs are present, and there is no missing data for any of the 24 provinces.

## New columns

### Create a new column with the year and quarter

In [33]:
# Build a period label of the form '<year>T<quarter>' and store it in the pandas string dtype
year_text = df['Año'].astype(str)
quarter_text = df['Trimestre'].astype(str)
df['Periodo'] = (year_text + 'T' + quarter_text).astype('string')
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 1008 entries, 0 to 454
Data columns (total 10 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   Año           1008 non-null   int64  
 1   Trimestre     1008 non-null   int64  
 2   Provincia     1008 non-null   string 
 3   ADSL          1008 non-null   float64
 4   Cablemodem    1008 non-null   float64
 5   Fibra óptica  1008 non-null   float64
 6   Wireless      1008 non-null   float64
 7   Otros         1008 non-null   float64
 8   Total         1008 non-null   float64
 9   Periodo       1008 non-null   string 
dtypes: float64(6), int64(2), string(2)
memory usage: 86.6 KB


# Save dataset

In [34]:
# Move the index labels into a regular 'index' column before saving.
# Guarded so that re-running this cell does not add a second 'level_0'
# column to the saved file (and raise on a third run).
if 'index' not in df.columns:
    df = df.reset_index()
# NOTE(review): the saved 'index' column holds the pre-reset row labels, which
# repeat for the copied 2019 rows; drop it downstream if it is not needed.
df.to_parquet(ROOT_DIR + '/data_files/accesos_por_tecnologia_por_provincia_clean.parquet')