In [3]:
import os
import subprocess
import zipfile
from datetime import date

import boto3
import pandas as pd

In [4]:
def runcmd(cmd, verbose = False, *args, check = False, **kwargs):
    """Run ``cmd`` through the shell, wait for it to finish, and report its exit status.

    Args:
        cmd: Command line handed to the shell unchanged (``shell=True``), so it
            must only be built from trusted values.
        verbose: When true, print the captured stdout (stripped) followed by
            the captured stderr.
        *args: Accepted for backward compatibility; ignored.
        check: When true, raise ``subprocess.CalledProcessError`` if the
            command exits with a non-zero status. Defaults to ``False`` so
            existing best-effort calls keep working.
        **kwargs: Accepted for backward compatibility; ignored.

    Returns:
        int: The command's exit status (0 means success).

    Raises:
        subprocess.CalledProcessError: Only when ``check`` is true and the
            command fails.
    """
    completed = subprocess.run(
        cmd,
        shell = True,
        capture_output = True,
        text = True,
    )
    if verbose:
        print(completed.stdout.strip(), completed.stderr)
    if check:
        # Raises CalledProcessError carrying the return code and captured output.
        completed.check_returncode()
    return completed.returncode

In [5]:
# boto3 session created with no explicit arguments; credentials and region are
# presumably resolved from the environment's default AWS configuration — confirm.
session = boto3.session.Session()

In [6]:
# Resource-level S3 interface obtained from the session.
s3 = session.resource("s3")

In [7]:
# Destination bucket name and the key prefix used for the uploaded files.
bucketname = "lems-spot-geodata-dev"
data_folder = "data/raw"

# Resource-level handle to the destination bucket.
bucket = s3.Bucket(bucketname)

In [8]:
# Zip archives to download from inegi.org.mx. The files differ only in the
# token between "denue_00_" and "_csv.zip" (presumably activity-code ranges —
# confirm), so the list is built from one URL template.
_DENUE_URL_TEMPLATE = "https://www.inegi.org.mx/contenidos/masiva/denue/denue_00_{}_csv.zip"
source_files = [
    _DENUE_URL_TEMPLATE.format(file_code)
    for file_code in (
        "46111",
        "46112-46311",
        "46321-46531",
        "46591-46911",
        "43",
    )
]

In [9]:
runcmd(f"rm -R /data")
! mkdir -p /data

In [10]:
# Download every archive to /data/denue_<n>.zip, numbering from 1 in list order.
for i, url_file in enumerate(source_files, start=1):
    zip_path = f"/data/denue_{i}.zip"
    print(f"Downloading file: {url_file}")
    runcmd(f"wget -O {zip_path} {url_file}", verbose = False)
    # wget's output is not shown (verbose=False), so a failed download would go
    # unnoticed here. Validate the result and stop early, naming the URL.
    if not zipfile.is_zipfile(zip_path):
        raise RuntimeError(
            f"Download failed or is not a valid zip archive: {url_file} -> {zip_path}"
        )

Downloading file: https://www.inegi.org.mx/contenidos/masiva/denue/denue_00_46111_csv.zip
Downloading file: https://www.inegi.org.mx/contenidos/masiva/denue/denue_00_46112-46311_csv.zip
Downloading file: https://www.inegi.org.mx/contenidos/masiva/denue/denue_00_46321-46531_csv.zip
Downloading file: https://www.inegi.org.mx/contenidos/masiva/denue/denue_00_46591-46911_csv.zip
Downloading file: https://www.inegi.org.mx/contenidos/masiva/denue/denue_00_43_csv.zip


In [11]:
# Extract each downloaded archive into /data/denue_<n>/ and rename the CSV found
# in its conjunto_de_datos/ folder to denue_<n>.csv.
for i in range(len(source_files)):
    extract_dir = f"/data/denue_{i+1}"
    with zipfile.ZipFile(f"{extract_dir}.zip", "r") as zip_ref:
        zip_ref.extractall(f"{extract_dir}/")

    csv_dir = os.path.join(extract_dir, "conjunto_de_datos")
    # Same selection as the shell glob `*.csv`: non-hidden names ending in .csv.
    csv_names = sorted(
        name
        for name in os.listdir(csv_dir)
        if name.endswith(".csv") and not name.startswith(".")
    )
    # The rename is only well-defined for a single CSV. Fail loudly here rather
    # than printing a success message after a rename that did not happen.
    if len(csv_names) != 1:
        raise RuntimeError(
            f"Expected exactly one CSV in {csv_dir}, found {len(csv_names)}: {csv_names}"
        )
    os.replace(
        os.path.join(csv_dir, csv_names[0]),
        os.path.join(csv_dir, f"denue_{i+1}.csv"),
    )
print('Unzipped files successfully')

Unzipped files successfully


In [12]:
# Inspect /data: long listing, hidden entries included, sorted by time with the oldest first.
! ls -lart /data

total 209960
-rw-r--r-- 1 root root 17661624 Dec 16 22:00 denue_5.zip
-rw-r--r-- 1 root root 51584880 Dec 16 22:00 denue_1.zip
-rw-r--r-- 1 root root 48430218 Dec 16 22:00 denue_2.zip
-rw-r--r-- 1 root root 49738819 Dec 16 22:00 denue_3.zip
-rw-r--r-- 1 root root 47579045 Dec 16 22:00 denue_4.zip
drwxr-xr-x 1 root root       62 May 25 01:16 ..
drwxr-xr-x 5 root root       76 May 25 01:18 denue_1
drwxr-xr-x 5 root root       76 May 25 01:18 denue_2
drwxr-xr-x 5 root root       76 May 25 01:18 denue_3
drwxr-xr-x 5 root root       76 May 25 01:18 denue_4
drwxr-xr-x 7 root root      176 May 25 01:18 .
drwxr-xr-x 5 root root       76 May 25 01:18 denue_5


In [13]:
# From here on `s3` is the low-level client (it replaces the resource object
# bound to the same name earlier in the notebook).
s3 = boto3.client("s3")

# Today's date in ISO format (YYYY-MM-DD) names the destination "directory".
today = date.today().isoformat()
print(today)
directory_name = today

# Put an object with no body under a key ending in "/", under the data folder.
s3.put_object(Bucket=bucketname, Key=f"{data_folder}/{directory_name}/")

2022-05-25


{'ResponseMetadata': {'RequestId': 'GQXTY8ABJ9B6TV9C',
  'HostId': 'uO4zG1UP0a1EdzBV41ueODmhX6bZepEUDC6oza8pIui7Sy44DYLc0w47WWSkiwiu31tfdGpamH0=',
  'HTTPStatusCode': 200,
  'HTTPHeaders': {'x-amz-id-2': 'uO4zG1UP0a1EdzBV41ueODmhX6bZepEUDC6oza8pIui7Sy44DYLc0w47WWSkiwiu31tfdGpamH0=',
   'x-amz-request-id': 'GQXTY8ABJ9B6TV9C',
   'date': 'Wed, 25 May 2022 01:18:51 GMT',
   'etag': '"d41d8cd98f00b204e9800998ecf8427e"',
   'server': 'AmazonS3',
   'content-length': '0'},
  'RetryAttempts': 0},
 'ETag': '"d41d8cd98f00b204e9800998ecf8427e"'}

In [14]:
# Upload each renamed CSV to <data_folder>/<directory_name>/denue_<n>.csv.
for i in range(len(source_files)):
    file_number = i + 1
    local_csv = f"/data/denue_{file_number}/conjunto_de_datos/denue_{file_number}.csv"
    s3_key = f"{data_folder}/{directory_name}/denue_{file_number}.csv"
    s3.upload_file(Filename=local_csv, Bucket=bucketname, Key=s3_key)
print("Ingestion to S3 was finished successfully")

Ingestion to S3 was finished successfully
