#### Primer ejemplo: uso de la librería urllib3 para leer los datos desde una URL externa, procesarlos, convertirlos a un DataFrame de pandas y guardar copias con extensión (CSV, JSON, XLSX)

In [1]:
# Step 1: Import the libraries used in this notebook (standard library first,
# then third-party) and create the urllib3 connection pool used for the request.
import os

import pandas as pd
import urllib3

http = urllib3.PoolManager()

In [2]:
# Step 1.1: Request the dataset from its URL, keep the body of the response
# and report the HTTP status code that came back.
data = "https://data.wa.gov/api/views/f6w7-q2d2/rows.csv"
rsp = http.request("GET", data)
response = rsp.data
print(f'El estado de la respuesta es: {rsp.status}')

El estado de la respuesta es: 200


In [3]:
# Step 2: Decode the response body into a text string using UTF-8.
str_data = str(response, encoding="utf-8")

In [7]:
# Step 3: Split the text into a list with one string per row, using the newline
# character as the separator.
lines = str_data.split("\n")
# A text that ends with "\n" leaves an empty string as the last element. Drop it
# only in that case, so the final record is kept when there is no trailing newline.
if lines[-1] == "":
    lines.pop()
len(lines)

143597

In [8]:
# Step 4: Take the first row, which holds the header, and split it on commas to
# obtain the column names; then show how many columns there are.
header_line = lines[0]
col_names = header_line.split(",")
n_cols = len(col_names)
n_cols

17

In [9]:
# Step 5: Build a dictionary with one empty list per column name; the values
# parsed from the downloaded text are stored in these lists.
main_dict = {col: [] for col in col_names}

In [10]:
# Step 6: Fill the dictionary row by row.
# The first row is skipped because it is the header. Every other row is split on
# commas and each value is appended to the list of its column.
# NOTE: plain splitting does not interpret quoted fields, so a value that itself
# contains a comma is broken into several fields.

# Start from empty lists so that re-running this cell does not append the same
# rows a second time.
main_dict = {col: [] for col in col_names}

counter = 0
for line in lines:
    if counter > 0:
        values = line.strip().split(",")
        for i, col in enumerate(col_names):
            main_dict[col].append(values[i])
    counter += 1

# counter counts every line that was read, the header row included.
print("El data set tiene %d filas y %d columnas" %(counter, n_cols))

El data set tiene 143597 filas y 17 columnas


In [11]:
# Step 7: Build the DataFrame from the dictionary of columns and preview its
# first rows to check the data.
df = pd.DataFrame(data=main_dict)
df.head(n=5)

Unnamed: 0,VIN (1-10),County,City,State,Postal Code,Model Year,Make,Model,Electric Vehicle Type,Clean Alternative Fuel Vehicle (CAFV) Eligibility,Electric Range,Base MSRP,Legislative District,DOL Vehicle ID,Vehicle Location,Electric Utility,2020 Census Tract
0,5UXTA6C03P,King,Seattle,WA,98177,2023,BMW,X5,Plug-in Hybrid Electric Vehicle (PHEV),Clean Alternative Fuel Vehicle Eligible,30,0,36,218985539,POINT (-122.38242499999996 47.77279000000004),CITY OF SEATTLE - (WA)|CITY OF TACOMA - (WA),53033001600
1,1FMCU0EZXN,Yakima,Moxee,WA,98936,2022,FORD,ESCAPE,Plug-in Hybrid Electric Vehicle (PHEV),Clean Alternative Fuel Vehicle Eligible,38,0,15,197264322,POINT (-120.37951169999997 46.55609000000004),PACIFICORP,53077001702
2,1G1FW6S03J,King,Seattle,WA,98117,2018,CHEVROLET,BOLT EV,Battery Electric Vehicle (BEV),Clean Alternative Fuel Vehicle Eligible,238,0,36,168549727,POINT (-122.37275999999997 47.689685000000054),CITY OF SEATTLE - (WA)|CITY OF TACOMA - (WA),53033003000
3,5YJSA1AC0D,King,Newcastle,WA,98059,2013,TESLA,MODEL S,Battery Electric Vehicle (BEV),Clean Alternative Fuel Vehicle Eligible,208,69900,41,244891062,POINT (-122.15733999999998 47.487175000000036),PUGET SOUND ENERGY INC||CITY OF TACOMA - (WA),53033025005
4,1FADP5CU8F,Kitsap,Bremerton,WA,98312,2015,FORD,C-MAX,Plug-in Hybrid Electric Vehicle (PHEV),Not eligible due to low battery range,19,0,26,134915000,POINT (-122.65223 47.57192),PUGET SOUND ENERGY INC,53035081100


In [19]:
# Step 8: Save local copies of the data (CSV, JSON, XLSX).
# Base name of the output files; the extension is appended when saving.
filename = "Bat_Elec_V_df"
# Folder where the output files are written.
mainpath = "/proyectos/learning/MachineLearning/datasets/outfiles/"
# Folder and base name joined with os.path.join.
fullpath = os.path.join(mainpath, filename)

In [20]:
# Write one copy of the DataFrame per format (CSV, JSON, XLSX), in that order,
# using the path built in the previous cell plus the matching extension.
for extension, writer in ((".csv", df.to_csv), (".json", df.to_json), (".xlsx", df.to_excel)):
    writer(fullpath + extension)

### Creando la función downloadFromURL() con el código anterior

In [21]:
# Define the downloadFromURL function

def downloadFromURL(url, decode,lines_split,split,mainpath,filename):
    """Download a delimited text file, load it into a DataFrame and save copies.

    The body returned for ``url`` is decoded, split into rows and fields with
    plain string splitting, and stored column by column. The resulting frame is
    written to ``<mainpath>/<filename>.csv`` and ``<mainpath>/<filename>.json``.

    Plain splitting does not interpret quoted fields: a value that contains the
    field separator is broken into several fields.

    Args:
        url: Address of the file to download (requested with HTTP GET).
        decode: Name of the text encoding of the response body, e.g. 'utf-8'.
        lines_split: String that separates rows, e.g. "\\n".
        split: String that separates the fields of a row, e.g. ",".
        mainpath: Folder where the output files are written.
        filename: Base name of the output files, without extension.

    Returns:
        pandas.DataFrame with one column per header field; all values are
        strings.

    Raises:
        RuntimeError: If the HTTP status of the response is not 2xx.
        IndexError: If the response contains no header row, or a data row has
            fewer fields than the header (same exception type the original
            list indexing produced).
    """
    import os

    import pandas as pd
    import urllib3

    http = urllib3.PoolManager()
    rsp = http.request('GET', url)
    # Do not parse an error page as if it were the dataset.
    if not 200 <= rsp.status < 300:
        raise RuntimeError(f"Request to {url} failed with HTTP status {rsp.status}")

    str_data = rsp.data.decode(decode)
    lines = str_data.split(lines_split)
    # Remove empty trailing entries only when they exist, so the last record is
    # kept when the text does not end with the row separator.
    while lines and not lines[-1].strip():
        lines.pop()
    if not lines:
        raise IndexError("The downloaded content is empty: no header row found")

    # Strip only line-ending characters from the header, so a leftover "\r" does
    # not end up inside the last column name while empty edge fields are kept.
    col_names = lines[0].rstrip("\r\n").split(split)
    main_dict = {name: [] for name in col_names}

    # Row numbers start at 2 because row 1 is the header.
    for row_number, line in enumerate(lines[1:], start=2):
        values = line.strip().split(split)
        if len(values) < len(col_names):
            raise IndexError(
                f"Row {row_number} has {len(values)} field(s), "
                f"expected at least {len(col_names)}"
            )
        # zip stops at the header length, so extra fields in a row are ignored.
        for name, value in zip(col_names, values):
            main_dict[name].append(value)

    df = pd.DataFrame(main_dict)

    fullpath = os.path.join(mainpath, filename)
    df.to_csv(fullpath + ".csv")
    df.to_json(fullpath + ".json")
    # The XLSX export is disabled in this function:
    # df.to_excel(fullpath + ".xlsx")
    print(f'Archivos generados y listos en la ruta: {fullpath}')

    return df

In [149]:
# Arguments for downloadFromURL.
# Source: address of the file, its text encoding and its row/field separators.
url = "https://data.wa.gov/api/views/f6w7-q2d2/rows.csv"
decode = "utf-8"
lines_split, split = "\n", ","
# Destination: output folder and base name of the generated files.
mainpath = "/proyectos/learning/MachineLearning/datasets/outfiles/"
filename = "Bat_Elec_df"

In [150]:
# Call downloadFromURL with the arguments defined in the previous cell and
# preview the first rows of the DataFrame it returns.
Bat_Elec_V_df = downloadFromURL(
    url=url,
    decode=decode,
    lines_split=lines_split,
    split=split,
    mainpath=mainpath,
    filename=filename,
)
Bat_Elec_V_df.head()

Archivos generados y listos en la ruta: /proyectos/learning/MachineLearning/datasets/outfiles/Bat_Elec_df


Unnamed: 0,VIN (1-10),County,City,State,Postal Code,Model Year,Make,Model,Electric Vehicle Type,Clean Alternative Fuel Vehicle (CAFV) Eligibility,Electric Range,Base MSRP,Legislative District,DOL Vehicle ID,Vehicle Location,Electric Utility,2020 Census Tract
0,5UXTA6C03P,King,Seattle,WA,98177,2023,BMW,X5,Plug-in Hybrid Electric Vehicle (PHEV),Clean Alternative Fuel Vehicle Eligible,30,0,36,218985539,POINT (-122.38242499999996 47.77279000000004),CITY OF SEATTLE - (WA)|CITY OF TACOMA - (WA),53033001600
1,1FMCU0EZXN,Yakima,Moxee,WA,98936,2022,FORD,ESCAPE,Plug-in Hybrid Electric Vehicle (PHEV),Clean Alternative Fuel Vehicle Eligible,38,0,15,197264322,POINT (-120.37951169999997 46.55609000000004),PACIFICORP,53077001702
2,1G1FW6S03J,King,Seattle,WA,98117,2018,CHEVROLET,BOLT EV,Battery Electric Vehicle (BEV),Clean Alternative Fuel Vehicle Eligible,238,0,36,168549727,POINT (-122.37275999999997 47.689685000000054),CITY OF SEATTLE - (WA)|CITY OF TACOMA - (WA),53033003000
3,5YJSA1AC0D,King,Newcastle,WA,98059,2013,TESLA,MODEL S,Battery Electric Vehicle (BEV),Clean Alternative Fuel Vehicle Eligible,208,69900,41,244891062,POINT (-122.15733999999998 47.487175000000036),PUGET SOUND ENERGY INC||CITY OF TACOMA - (WA),53033025005
4,1FADP5CU8F,Kitsap,Bremerton,WA,98312,2015,FORD,C-MAX,Plug-in Hybrid Electric Vehicle (PHEV),Not eligible due to low battery range,19,0,26,134915000,POINT (-122.65223 47.57192),PUGET SOUND ENERGY INC,53035081100
