## Load libraries

In [12]:
import pandas as pd
import numpy as np
import matplotlib
import matplotlib.pyplot as plt
import seaborn as sns
import os
import sys
sys.path.append("../libs")
sys.path.append("../")
from definitions import ROOT_DIR
import utils as utils

# Configure pandas display: show floats in fixed-point notation (no scientific
# notation), with two decimal places and a comma as the thousands separator.
pd.set_option('display.float_format', '{:,.2f}'.format)

# ETL

## 1. Load dataset

In [13]:
# Load the 'Totales VMD' sheet from data_files/internet.xlsx.
# NOTE(review): the path starts at '/data_files', so utils.get_xls_sheet_data
# presumably resolves it against the project root — confirm in utils.
df = utils.get_xls_sheet_data('/data_files/internet.xlsx', 'Totales VMD')
# Preview the first rows.
df.head()

Unnamed: 0,Año,Trimestre,Mbps (Media de bajada),Trimestre.1
0,2024,2,139.25,Abr-Jun 2024
1,2024,1,139.15,Ene-Mar 2024
2,2023,4,139.04,Oct-Dic 2023
3,2023,3,129.67,Jul-Sept 2023
4,2023,2,123.95,Abr-Jun 2023


## 2. Handling missing values

### Look for missing values in all cells

In [14]:
# Count the missing entries in each column
df.isna().sum()

Año                       0
Trimestre                 0
Mbps (Media de bajada)    0
Trimestre.1               0
dtype: int64

#### There are no missing values.

## 3. Look for Duplicates

### Find duplicates for complete rows

In [15]:
# Count rows that repeat an earlier row across all columns
df.duplicated().sum()

0

#### There are no fully duplicated rows.

### Find duplicated rows for year and quarter ('Trimestre')

In [16]:
# Count rows whose (year, quarter) pair repeats an earlier row
df.duplicated(subset=['Año', 'Trimestre']).sum()

0

#### There are no duplicated rows for year and quarter.

## 4. Finding outliers

In [17]:
# Summary statistics of the numeric columns, used to screen for outliers
df.describe()

Unnamed: 0,Año,Trimestre,Mbps (Media de bajada)
count,42.0,42.0,42.0
mean,2018.76,2.45,39.33
std,3.07,1.13,43.64
min,2014.0,1.0,3.62
25%,2016.0,1.25,6.04
50%,2019.0,2.0,18.85
75%,2021.0,3.0,51.37
max,2024.0,4.0,139.25


#### The summary statistics show no outliers in the year ("Año") column: the data is historical and the values range from 2014 to 2024. For the quarter ("Trimestre") column, the minimum and maximum values, 1 and 4, are correct.

In [18]:
# List the values for each (year, quarter) pair. The duplicate check on
# ['Año', 'Trimestre'] found no repeated pairs, so every group holds a single
# row and sum() simply returns that row's values.
df.groupby(['Año','Trimestre']).sum()

Unnamed: 0_level_0,Unnamed: 1_level_0,Mbps (Media de bajada),Trimestre.1
Año,Trimestre,Unnamed: 2_level_1,Unnamed: 3_level_1
2014,1,3.62,Ene-Mar 2014
2014,2,3.76,Abr-Jun 2014
2014,3,3.87,Jul-Sept 2014
2014,4,4.16,Oct-Dic 2014
2015,1,4.35,Ene-Mar 2015
2015,2,4.55,Abr-Jun 2015
2015,3,4.79,Jul-Sept 2015
2015,4,4.99,Oct-Dic 2015
2016,1,5.08,Ene-Mar 2016
2016,2,5.42,Abr-Jun 2016


#### All year-quarter pairs are present; there is no missing data.

#### The Mbps ("Media de bajada") values show consistent growth. Although the value for the fourth quarter of 2022 is almost double that of the previous quarter, growth continues in the following years. This will be explored in detail in the EDA section.

## 5. Data Types

### We will review the data types for each column.

In [19]:
# Show each column's name, non-null count and dtype
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 42 entries, 0 to 41
Data columns (total 4 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   Año                     42 non-null     int64  
 1   Trimestre               42 non-null     int64  
 2   Mbps (Media de bajada)  42 non-null     float64
 3   Trimestre.1             42 non-null     object 
dtypes: float64(1), int64(2), object(1)
memory usage: 1.4+ KB


#### The data types are consistent with the data provided. Only the column "Trimestre.1" has an object data type, and it holds string values. We could convert it to a string data type, but since it is redundant with the year ("Año") and quarter ("Trimestre") columns, we will drop it instead.

In [20]:
# Drop the redundant 'Trimestre.1' column (the textual quarter label).
# errors='ignore' keeps this cell idempotent: because it reassigns `df`,
# re-running it after the column is already gone would otherwise raise KeyError.
df = df.drop(columns=['Trimestre.1'], errors='ignore')
# Confirm the remaining columns and their dtypes.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 42 entries, 0 to 41
Data columns (total 3 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   Año                     42 non-null     int64  
 1   Trimestre               42 non-null     int64  
 2   Mbps (Media de bajada)  42 non-null     float64
dtypes: float64(1), int64(2)
memory usage: 1.1 KB


## 6. New columns

### Create a new column with the year and quarter

In [21]:
# Build a period label such as '2024T2' by joining the year and the quarter
# (both cast to strings) with a 'T' separator.
df['Periodo'] = df['Año'].astype(str).str.cat(df['Trimestre'].astype(str), sep='T')
df.head()

Unnamed: 0,Año,Trimestre,Mbps (Media de bajada),Periodo
0,2024,2,139.25,2024T2
1,2024,1,139.15,2024T1
2,2023,4,139.04,2023T4
3,2023,3,129.67,2023T3
4,2023,2,123.95,2023T2


# Save dataset

In [22]:
# Persist the cleaned dataset as a parquet file under ROOT_DIR/data_files
df.to_parquet(f"{ROOT_DIR}/data_files/totales_vmd_clean.parquet")