In [None]:
import glob
import os
import shutil
import subprocess
import zipfile
from datetime import date

import boto3
import pandas as pd

In [118]:
def runcmd(cmd, verbose = False, *args, **kwargs):
    """Run ``cmd`` through the shell and return its exit status.

    Parameters
    ----------
    cmd : str
        Command line to execute. It is interpreted by the shell
        (``shell=True``), so it must not be built from untrusted input.
    verbose : bool, default False
        When true, print the captured stdout (stripped) followed by the
        captured stderr once the command has finished.
    *args, **kwargs
        Accepted for backward compatibility with existing callers; ignored.

    Returns
    -------
    int
        The exit code of the process. A non-zero code is returned rather
        than raised, so callers that ignore the result keep working while
        callers that care can detect a failed command.
    """
    process = subprocess.Popen(
        cmd,
        stdout = subprocess.PIPE,
        stderr = subprocess.PIPE,
        text = True,
        shell = True
    )
    # communicate() drains both pipes and waits for the process to exit,
    # so returncode is populated afterwards.
    std_out, std_err = process.communicate()
    if verbose:
        print(std_out.strip(), std_err)
    return process.returncode

In [119]:
# boto3 session built with default arguments; the S3 resource is created from it.
session = boto3.Session()

In [120]:
# Resource-style S3 handle created from the session.
s3 = session.resource("s3")

In [121]:
# Destination bucket name and the key prefix under which files are stored.
bucketname = "lems-spot-geodata-dev"
data_folder = "data/raw"

# Resource-level handle to the destination bucket.
bucket = s3.Bucket(bucketname)

In [122]:
# Zip archives to download, one URL per entry. List order determines the
# number used in the local file names (denue_1, denue_2, ...).
source_files = [
    "https://www.inegi.org.mx/contenidos/masiva/denue/denue_00_46111_csv.zip",
    "https://www.inegi.org.mx/contenidos/masiva/denue/denue_00_46112-46311_csv.zip",
    "https://www.inegi.org.mx/contenidos/masiva/denue/denue_00_46321-46531_csv.zip",
    "https://www.inegi.org.mx/contenidos/masiva/denue/denue_00_46591-46911_csv.zip",
    "https://www.inegi.org.mx/contenidos/masiva/denue/denue_00_43_csv.zip",
]

In [123]:
# Start from an empty /data working directory so files left by a previous run
# cannot be mixed into this one.
# ignore_errors=True keeps a fresh environment (no /data yet) from failing.
shutil.rmtree("/data", ignore_errors=True)
os.makedirs("/data", exist_ok=True)

In [124]:
# Download every archive to /data/denue_<n>.zip, numbering from 1 in list order.
for file_number, url_file in enumerate(source_files, start=1):
    zip_path = f"/data/denue_{file_number}.zip"
    print(f"Downloading file: {url_file}")
    runcmd(f"wget -O {zip_path} {url_file}", verbose = False)
    # wget's exit status is not checked here, so validate the result directly:
    # a missing, empty or truncated file should stop the run now instead of
    # surfacing later as an unzip error.
    if not zipfile.is_zipfile(zip_path):
        raise RuntimeError(
            f"Download failed or is not a valid zip archive: {url_file} -> {zip_path}"
        )

Downloading file: https://www.inegi.org.mx/contenidos/masiva/denue/denue_00_46111_csv.zip
Downloading file: https://www.inegi.org.mx/contenidos/masiva/denue/denue_00_46112-46311_csv.zip
Downloading file: https://www.inegi.org.mx/contenidos/masiva/denue/denue_00_46321-46531_csv.zip
Downloading file: https://www.inegi.org.mx/contenidos/masiva/denue/denue_00_46591-46911_csv.zip


In [125]:
# Extract each archive into /data/denue_<n>/ and give the CSV found in its
# conjunto_de_datos/ folder the predictable name denue_<n>.csv.
for file_number in range(1, len(source_files) + 1):
    extract_dir = f"/data/denue_{file_number}"
    with zipfile.ZipFile(f"{extract_dir}.zip", "r") as zip_ref:
        zip_ref.extractall(f"{extract_dir}/")

    csv_dir = os.path.join(extract_dir, "conjunto_de_datos")
    csv_paths = glob.glob(os.path.join(csv_dir, "*.csv"))
    # The rename is only well defined for exactly one CSV; with zero or several
    # matches, fail loudly instead of reporting success.
    if len(csv_paths) != 1:
        raise RuntimeError(
            f"Expected exactly one CSV in {csv_dir}, found {len(csv_paths)}: {csv_paths}"
        )
    os.replace(csv_paths[0], os.path.join(csv_dir, f"denue_{file_number}.csv"))
print('Unzipped files successfully')

Unzipped files successfully


In [126]:
# Sanity check: list everything in /data (long format, including dot entries,
# sorted by modification time with the oldest first).
! ls -lart /data

total 192712
-rw-r--r-- 1 root root 51584880 Dec 16 22:00 denue_1.zip
-rw-r--r-- 1 root root 48430218 Dec 16 22:00 denue_2.zip
-rw-r--r-- 1 root root 49738819 Dec 16 22:00 denue_3.zip
-rw-r--r-- 1 root root 47579045 Dec 16 22:00 denue_4.zip
drwxr-xr-x 1 root root      127 May 24 00:43 ..
drwxr-xr-x 5 root root       76 May 24 00:45 denue_1
drwxr-xr-x 5 root root       76 May 24 00:45 denue_2
drwxr-xr-x 5 root root       76 May 24 00:45 denue_3
drwxr-xr-x 6 root root      142 May 24 00:45 .
drwxr-xr-x 5 root root       76 May 24 00:45 denue_4


In [127]:
# Low-level S3 client. From this cell on, `s3` refers to this client rather
# than the resource object bound to the same name in an earlier cell.
s3 = boto3.client("s3")

# Uploads are grouped under a prefix named after today's date (YYYY-MM-DD).
today = date.today().isoformat()
print(today)
directory_name = today

# Object with no body whose key ends in "/", presumably a folder-style
# placeholder for the dated prefix. Left as the last expression so the
# response is displayed.
s3.put_object(Bucket=bucketname, Key=f"{data_folder}/{directory_name}/")

2022-05-24


{'ResponseMetadata': {'RequestId': 'XT1AYGSGQ4RH3AKE',
  'HostId': '8seoH8Kt5vnhdlTQaMnBN8nx7Utc5gYeY5eViyZazxiw+bqPv9ZgRrR7M16XMjcHTaEGpzzp7y4=',
  'HTTPStatusCode': 200,
  'HTTPHeaders': {'x-amz-id-2': '8seoH8Kt5vnhdlTQaMnBN8nx7Utc5gYeY5eViyZazxiw+bqPv9ZgRrR7M16XMjcHTaEGpzzp7y4=',
   'x-amz-request-id': 'XT1AYGSGQ4RH3AKE',
   'date': 'Tue, 24 May 2022 00:45:13 GMT',
   'etag': '"d41d8cd98f00b204e9800998ecf8427e"',
   'server': 'AmazonS3',
   'content-length': '0'},
  'RetryAttempts': 0},
 'ETag': '"d41d8cd98f00b204e9800998ecf8427e"'}

In [128]:
# Upload each renamed CSV to <data_folder>/<directory_name>/ in the bucket,
# keeping its local file name as the last key component.
for file_number in range(1, len(source_files) + 1):
    csv_name = f"denue_{file_number}.csv"
    s3.upload_file(
        Filename=f"/data/denue_{file_number}/conjunto_de_datos/{csv_name}",
        Bucket=bucketname,
        Key=f"{data_folder}/{directory_name}/{csv_name}",
    )
print('Ingestion to S3 was finished successfully')

Ingestion to S3 was finished successfully
