### Brazilian funds allocation
Data source: http://dados.cvm.gov.br/dataset/fi-doc-cda

Data description: http://cvmweb.cvm.gov.br/SWB/Sistemas/SCW/PadroesXML/PadraoXMLCDANetV4.aspx

Data description version: 4.0

Download date: 2018-09-21

In this script:
* load zip files
* check for columns inconsistencies along files
* merge data into BLC and PL dataframes. For the BLC, it is divided by block number
* save dataframes

In [1]:
# Input
date = '2018_09_21'
homePath = 'C:/Users/Mamed/Python4DS/'
projPath = homePath + 'FundsBR/'
dataPath = projPath + 'Data_' + date + '/'
unzipTemp = projPath + 'unzipTemp/'
strucPath = projPath + 'Structures_' + date + '/'
libsPath = projPath + 'Libs/'

In [2]:
import pandas as pd
import numpy as np
import zipfile
import os

In [3]:
# Load my libraries
exec(open(libsPath + 'fundsLib.py').read())

In [4]:
# Unzip data
# Unziped files will be created at 'unzipTemp' directory

# Create directory to unzip data
if not os.path.exists(unzipTemp):
    os.makedirs(unzipTemp)
else:
    print('Error: folder already existis.')
    
# Unzip
zFiles = [f for f in os.listdir(dataPath) if os.path.isfile(os.path.join(dataPath, f))]
for f in zFiles:
    zip_ref = zipfile.ZipFile(dataPath + f, 'r')
    zip_ref.extractall(unzipTemp)
    zip_ref.close()

In [5]:
# Check files names

unzFiles = [f for f in os.listdir(unzipTemp) if os.path.isfile(os.path.join(unzipTemp, f))]
for f in unzFiles:
    
    s = f.split('_')
    
    # Check CDA, FI, BLC and PL
    if (s[0] != 'cda') | (s[1] != 'fi') | (not ((s[2] != 'BLC') | (s[2] != 'PL'))): 
        raise ValueError('Wrong file name: %s' % f)
    
    # BLC
    if s[2] == 'BLC':
        
        # Block
        if int(s[3]) not in list(range(1,9)):
            raise ValueError('Block number error: %s' % f)
        
        # Year and CSV
        sBLC = s[4].split('.')
        if len(sBLC[0]) == 4 & (int(sBLC[0]) not in list(range(2005,2018))):
            raise ValueError('Year error: %s' % f)
        if len(sBLC[0]) == 5 & (int(sBLC[0][:4]) != 2018):
            raise ValueError('Year error: %s' % f)
        if sBLC[1] != 'csv':
            raise ValueError('File type error: %s' % f)
     
    # PL
    if s[2] == 'PL':
        
        # Year and CSV
        sPL = s[3].split('.')
        if len(sPL[0]) == 4 & (int(sPL[0]) not in list(range(2005,2018))):
            raise ValueError('Year error: %s' % f)
        if len(sPL[0]) == 5 & (int(sPL[0][:4]) != 2018):
            raise ValueError('Year error: %s' % f)
        if sPL[1] != 'csv':
            raise ValueError('File type error: %s' % f)

In [6]:
# Check columns conformity for BLC type

blc1Head = []
blc2Head = []
blc3Head = []
blc4Head = []
blc5Head = []
blc6Head = []
blc7Head = []
blc8Head = []

for f in unzFiles:
    s = f.split('_')
    
    # Type (BLC or PL)
    fType = s[2]
    
    # For BLC type
    if fType == 'BLC':
        
        # Block
        fBlk = int(s[3])
        
        # Year
        fYear = s[4].split('.')[0]
        
        if len(fYear) == 4: # Year <= 2017
            fMonth = None
            fYear = int(fYear)
        
        else: # Year = 2018
            fMonth = int(fYear[-2:])
            fYear = int(fYear[:4])
        
        # Read file head
        df_ = pd.read_csv(unzipTemp + f, sep = ';', 
                 encoding = 'ISO-8859-1', 
                 low_memory = False, 
                 nrows = 1)
        
        # Append head to respective block
        if fBlk == 1: blc1Head.append(list(df_.keys()))
        if fBlk == 2: blc2Head.append(list(df_.keys()))
        if fBlk == 3: blc3Head.append(list(df_.keys()))
        if fBlk == 4: blc4Head.append(list(df_.keys()))
        if fBlk == 5: blc5Head.append(list(df_.keys()))
        if fBlk == 6: blc6Head.append(list(df_.keys()))
        if fBlk == 7: blc7Head.append(list(df_.keys()))
        if fBlk == 8: blc8Head.append(list(df_.keys()))

# Arrange
blc1Head = np.array(blc1Head)
blc2Head = np.array(blc2Head)
blc3Head = np.array(blc3Head)
blc4Head = np.array(blc4Head)
blc5Head = np.array(blc5Head)
blc6Head = np.array(blc6Head)
blc7Head = np.array(blc7Head)
blc8Head = np.array(blc8Head)

# Check if files from respective blocks have the same head
def checkCols(col):
    return(all(x == col[0] for x in col))

chk1 = all(np.apply_along_axis(checkCols, 0, blc1Head))
chk2 = all(np.apply_along_axis(checkCols, 0, blc2Head))
chk3 = all(np.apply_along_axis(checkCols, 0, blc3Head))
chk4 = all(np.apply_along_axis(checkCols, 0, blc4Head))
chk5 = all(np.apply_along_axis(checkCols, 0, blc5Head))
chk6 = all(np.apply_along_axis(checkCols, 0, blc6Head))
chk7 = all(np.apply_along_axis(checkCols, 0, blc7Head))
chk8 = all(np.apply_along_axis(checkCols, 0, blc8Head))

print(chk1, chk2, chk3, chk4, chk5, chk6, chk7, chk8)

True True True True True True True True


In [7]:
# Check columns conformity for PL type

blcHead = []


for f in unzFiles:
    s = f.split('_')
    
    # Type (BLC or PL)
    fType = s[2]
    
    # For PL type
    if fType == 'PL':
        
        # Year
        fYear = s[3].split('.')[0]
        
        if len(fYear) == 4: # Year <= 2017
            fMonth = None
            fYear = int(fYear)
        
        else: # Year = 2018
            fMonth = int(fYear[-2:])
            fYear = int(fYear[:4])
        
        # Read file head
        df_ = pd.read_csv(unzipTemp + f, sep = ';', 
                 encoding = 'ISO-8859-1', 
                 low_memory = False, 
                 nrows = 1)
        
        # Append head to respective block
        blcHead.append(list(df_.keys()))


# Arrange
blcHead = np.array(blcHead)

# # Check if files from respective blocks have the same head
def checkCols(col):
    return(all(x == col[0] for x in col))

chk = all(np.apply_along_axis(checkCols, 0, blcHead))

print(chk1, chk2, chk3, chk4, chk5, chk6, chk7, chk8)

True True True True True True True True


In [None]:
# Create directory to save structured data
if not os.path.exists(strucPath):
    os.makedirs(strucPath)

    # Read BLC files, append blocks in dataframes and save
    print('BLC data:')
    for blk in range(8, 8+1):
        dfBlk = readBlcBlock(blk = blk, folder = unzipTemp)
        print(blk, dfBlk.shape)
        dfBlk.to_pickle(strucPath + 'BLC_blk_' + str(blk) + '_.pkl')
    
else:
    print('Error: folder already existis.')

BLC data:
8 2005 None
8 2006 None
8 2007 None
8 2008 None
8 2009 None
8 2010 None
8 2011 None
8 2012 None
8 2013 None
8 2014 None
8 2015 None
8 2016 None
8 2017 None
8 2018 1
8 2018 2
8 2018 3
8 2018 4
8 2018 5
8 2018 6
8 2018 7
8 2018 8


In [13]:
blk

8

In [None]:
# Read PL files, append blocks in dataframes and save
dfPL = readPL(folder = unzipTemp)
print(dfPL.shape)
dfPL.to_pickle(strucPath + 'PL_.pkl')

In [None]:
# Delete temporary unziped files and folder
shutil.rmtree(unzipTemp)