In [1]:
%matplotlib inline
import matplotlib.pyplot as plt
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.compose import ColumnTransformer
import pandas as pd
import numpy as np
import scipy as sp
import datetime as dt
import os
import utils as u

# Seed NumPy's global random generator once, up front, so that any
# NumPy-based randomness in later cells is repeatable across runs.
RANDOM_SEED = 42
np.random.seed(RANDOM_SEED)

In [2]:
# Build the location of the scraped funds dataset from path components so the
# notebook resolves it on any operating system; literal "\\" separators are
# only understood on Windows. On Windows this yields the same path as before.
FILE_PATH = os.path.abspath(
    os.path.join(os.curdir, os.pardir, "data-processing", "data", "funds.pkl")
)

# NOTE(security): unpickling can execute arbitrary code. Only load pickle
# files that come from a trusted source.
funds_df_raw = pd.read_pickle(FILE_PATH)

In [3]:
# Summarise the raw frame (column names, non-null counts, dtypes) to see
# which columns have missing values before building the pipeline.
funds_df_raw.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 213 entries, 0 to 212
Data columns (total 21 columns):
 #   Column                         Non-Null Count  Dtype         
---  ------                         --------------  -----         
 0   Ticker                         213 non-null    object        
 1   Nome                           213 non-null    object        
 2   Administrador                  213 non-null    object        
 3   Descrição                      190 non-null    object        
 4   Data de Constituição do Fundo  87 non-null     datetime64[ns]
 5   Cotas Emitidas                 212 non-null    float64       
 6   Tipo de Gestão                 204 non-null    object        
 7   Público Alvo                   209 non-null    object        
 8   Mandato                        204 non-null    object        
 9   Segmento                       204 non-null    object        
 10  Prazo de Duração               213 non-null    object        
 11  Taxa de Administraç

In [4]:
# Columns handed to the one-hot encoding step.
cat_columns = [
    'Administrador',
    'Tipo de Gestão',
    'Público Alvo',
    'Mandato',
    'Segmento',
    'Prazo de Duração',
]
# Columns handed to the count-vectorizer step.
text_columns = ['Descrição', 'Taxa de Administração']

# Holds the category information after vectorization.
cat_manager = u.CategoryManager()

# Column names passed to the 'drop-columns' step.
columns_to_drop = [
    "Taxa de Performance",
    "Ativos Atuais",
    "Cotações Históricas",
    "Dividendos Históricos",
    "Dividend Yield Histórico",
    "Valor Patrimonial Histórico",
    "Vacância Histórica",
]

# Column names passed to the 'drop-rows' step.
# NOTE(review): presumably rows lacking a value in any of these columns are
# removed -- confirm against utils.DropRows.
row_filter_columns = [
    "Cotas Emitidas",
    "Tipo de Gestão",
    "Público Alvo",
    "Mandato",
    "Segmento",
    "Patrimônio Líquido",
    "Descrição",
    "Data de Constituição do Fundo",
    "Val. Patr. Desv. Pad. Rel.",
    "Vacância Desv. Pad. Rel.",
]

# Steps run in the order listed; columns are addressed by their position in
# the raw frame (see the funds_df_raw.info() listing).
fund_pipeline_steps = [
    # Description text (column 3).
    ('clean-description-headers', u.CleanHeaders(col=3)),
    ('clean-description-punct', u.CleanPunct(col=3)),

    # Administration fee text (column 11): fill with a constant, then clean.
    ('input-const-admin-tax', u.FillColumn(col=11, method='const', const='0,2% a.a.')),
    ('clean-admin-tax-headers', u.CleanHeaders(col=11)),
    ('clean-admin-tax-punct', u.CleanPunct(col=11)),

    # Fund constitution date (column 4), using column 16 as the reference.
    ('input-foundation-date', u.InputDate(col=4, ref_col=16)),

    ('input-mean-daily-liquidity', u.FillColumn(col=14, method='mean')),

    # Per-column processing of the historical series and current assets.
    ('process-dividends', u.ProcessDividends(col=17)),
    ('process-prices', u.ProcessPrices(col=16)),
    ('process-equity', u.ProcessEquity(col=19)),
    ('process-vacancy', u.ProcessVacancy(col=20)),
    ('process-assets', u.ProcessAssets(col=13)),

    ('drop-columns', u.DropColumns(cols=columns_to_drop)),
    ('drop-rows', u.DropRows(rows=row_filter_columns)),

    # Encoding of the categorical and text columns.
    ('cat-encoding', u.OneHotEncoder(col=cat_columns, category_manager=cat_manager)),
    ('text-encoding', u.CountVectorizer(col=text_columns)),

    ('convert-available2float', u.Convert2Float()),
]

pipeline = Pipeline(fund_pipeline_steps)

In [5]:
# Fit every pipeline step on the raw funds frame and keep the transformed result.
# NOTE(review): the recorded run prints "Unable to convert column ..." for the
# non-numeric columns; presumably this comes from the final
# 'convert-available2float' step -- confirm in utils.
funds_df = pipeline.fit_transform(funds_df_raw)

  **kwargs)
  ret = ret.dtype.type(ret / rcount)


Unable to convert column 'Ticker'
Unable to convert column 'Nome'
Unable to convert column 'Administrador'
Unable to convert column 'Descrição'
Unable to convert column 'Data de Constituição do Fundo'
Unable to convert column 'Tipo de Gestão'
Unable to convert column 'Público Alvo'
Unable to convert column 'Mandato'
Unable to convert column 'Segmento'
Unable to convert column 'Prazo de Duração'
Unable to convert column 'Taxa de Administração'


In [6]:
# Remove the displayed-column cap so every generated feature column is
# visible, then preview the first three funds.
pd.set_option("display.max_columns", None)
funds_df.head(3)

Unnamed: 0,Ticker,Nome,Administrador,Descrição,Data de Constituição do Fundo,Cotas Emitidas,Tipo de Gestão,Público Alvo,Mandato,Segmento,Prazo de Duração,Taxa de Administração,Liquidez Diária,Patrimônio Líquido,Div. M-0,Div. M-1,Div. M-2,Div. M-3,Div. M-4,Div. M-5,Div. M-6,Div. M-7,Div. M-8,Div. M-9,Div. M-10,Div. M-11,Div. Acum. Últ. Trimestre,Div. Média,Div. Min,Div. Max,Div. Desv. Pad. Rel.,Div. Assimetria,Div. Curtose,Preços Média M-0,Preços Média M-1,Preços Média M-2,Preços Média M-3,Preços Média M-4,Preços Média M-5,Preços Média M-6,Preços Média M-7,Preços Média M-8,Preços Média M-9,Preços Média M-10,Preços Média M-11,Preços Média,Preços Min,Preços Max,Preços Desv. Pad. Rel.,Preços Assimetria,Preços Curtose,Preços Variação Total,Val. Patr. M-0,Val. Patr. M-1,Val. Patr. M-2,Val. Patr. M-3,Val. Patr. M-4,Val. Patr. M-5,Val. Patr. M-6,Val. Patr. M-7,Val. Patr. M-8,Val. Patr. M-9,Val. Patr. M-10,Val. Patr. M-11,Val. Patr. Média,Val. Patr. Min,Val. Patr. Max,Val. Patr. Desv. Pad. Rel.,Val. Patr. Assimetria,Val. Patr. Curtose,Va. Patr. Variação Total,Vacância M-0,Vacância M-1,Vacância M-2,Vacância M-3,Vacância M-4,Vacância M-5,Vacância M-6,Vacância M-7,Vacância M-8,Vacância M-9,Vacância M-10,Vacância M-11,Vacância Média,Vacância Min,Vacância Max,Vacância Desv. Pad. Rel.,Vacância Assimetria,Vacância Curtose,Área dos Ativos AC,Área dos Ativos AL,Área dos Ativos AP,Área dos Ativos AM,Área dos Ativos BA,Área dos Ativos CE,Área dos Ativos DF,Área dos Ativos ES,Área dos Ativos GO,Área dos Ativos MA,Área dos Ativos MT,Área dos Ativos MS,Área dos Ativos MG,Área dos Ativos PA,Área dos Ativos PB,Área dos Ativos PR,Área dos Ativos PE,Área dos Ativos PI,Área dos Ativos RJ,Área dos Ativos RN,Área dos Ativos RS,Área dos Ativos RO,Área dos Ativos RR,Área dos Ativos SC,Área dos Ativos SP,Área dos Ativos SE,Área dos Ativos TO
0,ABCP11,FDO INV IMOB GRAND PLAZA SHOPPING,"(0, 26)\t1.0\n (1, 4)\t1.0\n (2, 4)\t1.0\n...","(0, 0)\t1",2007-01-08,1000000.0,"(0, 1)\t1.0\n (1, 1)\t1.0\n (2, 0)\t1.0\n ...","(0, 3)\t1.0\n (1, 3)\t1.0\n (2, 3)\t1.0\n ...","(0, 3)\t1.0\n (1, 3)\t1.0\n (2, 3)\t1.0\n ...","(0, 7)\t1.0\n (1, 3)\t1.0\n (2, 4)\t1.0\n ...","(0, 1)\t1.0\n (1, 1)\t1.0\n (2, 1)\t1.0\n ...","(0, 2)\t1\n (0, 1)\t1\n (0, 0)\t1",3152.0,1000000000.0,0.0,0.0,0.49,0.83,0.49,0.49,0.49,0.49,0.49,0.45,0.45,0.45,0.49,0.296093,0.0,0.83,0.4031,0.848723,1.974819,81.089,84.7635,89.652273,107.0,107.905,91.620455,104.264091,107.596842,101.969545,105.812381,106.091053,96.069,61.500297,22.8,110.0,0.347212,0.702069,-0.759723,3.639912,85.639938,85.42911,85.027593,85.010016,85.317364,85.048048,74.628604,74.532345,74.55538,74.56569,74.285603,74.232626,64.558682,51.9,85.639938,0.171725,0.526526,-0.976952,1.650095,0.014,0.0,0.013,0.01,0.01,0.01,0.009,0.011,0.013,0.013,0.011,0.011,0.011583,0.009,0.014,0.149341,0.128655,-1.404736,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,69628.0,0.0,0.0
1,ALMI11,FDO INV IMOB - FII TORRE ALMIRANTE,"(0, 26)\t1.0\n (1, 4)\t1.0\n (2, 4)\t1.0\n...","(0, 0)\t1",2004-11-12,104700.0,"(0, 1)\t1.0\n (1, 1)\t1.0\n (2, 0)\t1.0\n ...","(0, 3)\t1.0\n (1, 3)\t1.0\n (2, 3)\t1.0\n ...","(0, 3)\t1.0\n (1, 3)\t1.0\n (2, 3)\t1.0\n ...","(0, 7)\t1.0\n (1, 3)\t1.0\n (2, 4)\t1.0\n ...","(0, 1)\t1.0\n (1, 1)\t1.0\n (2, 1)\t1.0\n ...","(0, 2)\t1\n (0, 1)\t1\n (0, 0)\t1",186.0,220000000.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,14.73907,0.0,85.717631,0.705546,2.711537,16.145885,1239.927,1343.8935,1615.951818,2010.349444,2042.954091,1451.683636,1426.359091,1402.484211,1399.163182,1411.615238,1445.675263,1490.339,2020.515499,1100.0,4030.0,0.343438,0.985789,0.021057,1.118182,1996.680047,1986.162618,1985.134078,1982.361029,1984.565191,1717.393662,1714.044626,1713.481249,1713.570527,1714.689076,1715.537833,1715.513565,2263.883237,1713.481249,3284.48,0.245868,0.791605,-1.053469,0.607914,0.7556,0.0,0.7556,0.7556,0.7856,0.82,0.82,0.82,0.82,0.82,0.82,0.82,0.795667,0.7556,0.82,0.039138,-0.495029,-1.653063,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,41468.6,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,ALZR11,ALIANZA TRUST RENDA IMOBILIARIA FDO INV IMOB,"(0, 26)\t1.0\n (1, 4)\t1.0\n (2, 4)\t1.0\n...","(0, 0)\t1",2018-01-04,997042.0,"(0, 1)\t1.0\n (1, 1)\t1.0\n (2, 0)\t1.0\n ...","(0, 3)\t1.0\n (1, 3)\t1.0\n (2, 3)\t1.0\n ...","(0, 3)\t1.0\n (1, 3)\t1.0\n (2, 3)\t1.0\n ...","(0, 7)\t1.0\n (1, 3)\t1.0\n (2, 4)\t1.0\n ...","(0, 1)\t1.0\n (1, 1)\t1.0\n (2, 1)\t1.0\n ...","(0, 2)\t1\n (0, 1)\t1\n (0, 0)\t1",11570.0,270000000.0,0.596226,0.586388,0.586496,0.594587,0.567433,0.55926,0.547685,0.466299,0.482042,0.465255,0.438669,0.659525,1.76911,0.573835,0.0,0.78091,0.246215,-2.455367,7.571108,108.2845,106.9535,102.744545,126.201667,136.273636,105.477273,105.062273,105.944211,98.480455,96.623333,98.095263,98.1465,103.227184,83.51,148.99,0.108114,1.515555,2.461624,1.115857,94.652729,94.654281,94.655696,94.650318,94.645384,94.646935,94.648537,94.618676,94.65164,0.0,94.652244,93.287197,95.857256,93.287197,97.404583,0.013899,0.066718,-1.471773,0.986809,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-3.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,8178.0,0.0,0.0,0.0,0.0,0.0,44462.0,0.0,0.0


In [7]:
# Diagnostic: check whether the encoded 'Administrador' column can be cast to
# float. In the recorded run it cannot ("setting an array element with a
# sequence"), i.e. the cells hold sequences rather than scalars.
# The failure is caught and reported so that Restart & Run All continues to
# the cells below instead of stopping here.
try:
    administrador_as_float = funds_df['Administrador'].astype(float)
except (TypeError, ValueError) as exc:
    administrador_as_float = None
    print(f"{type(exc).__name__}: {exc}")

# Shown as the cell output when the cast succeeds; nothing is shown otherwise.
administrador_as_float

ValueError: setting an array element with a sequence.

In [None]:
# Inspect the category information held by the manager that was passed to the
# 'cat-encoding' pipeline step.
# NOTE(review): presumably populated during pipeline.fit_transform -- confirm in utils.
cat_manager.categories