# Importing modules

In [1]:
import numpy as np
import pandas as pd

# Data

colunas = ['ANO_CINEMATOGRAFICO', 'SEMANA_CINEMATOGRAFICA', 'TIPO_SESSAO',
       'REGISTRO_COMPLEXO', 'REGISTRO_GRUPO','REGISTRO_SALA', 'CPB_ROE', 'ASSENTOS_INFERIDO',
       'OCUPAÇÃO_SALA_INFERIDA', 'd_t', 'id_NAC', 'xt_comp', 't_comp',
       'OBG_FINAL_COMP', 'SALAS_COMP', 'DIA_abs', 'COMP_CUMPRIU', 'xt_frac',
       'cump_frac', 'cpb_id', 'beta', 'HORA_ajustada'] # all cols with defined dtypes (see below)

remover = {'CPB_ROE','TIPO_SESSAO','ANO_CINEMATOGRAFICO','d_t','beta',
          'OBG_FINAL_COMP','OCUPAÇÃO_SALA_INFERIDA','t_comp','cpb_id','COMP_CUMPRIU','cump_frac','xt_comp',
           'SEMANA_CINEMATOGRAFICA','REGISTRO_SALA','REGISTRO_GRUPO'} # cols to remove

importar = list(set(colunas).difference(remover)) # cols to import

painel = pd.read_csv('Painel 2018 final.csv', dtype={
    'ANO_CINEMATOGRAFICO':'int16', 'SEMANA_CINEMATOGRAFICA':'int8', 'REGISTRO_COMPLEXO':'uint16',
    'CPB_ROE':str, 'OCUPAÇÃO_SALA_INFERIDA':float, 'd_t':int, 'x_t':float,
    'id_NAC':bool, 'xt_comp':float, 't_comp':int, 'OBG_FINAL_COMP':float,
    'SALAS_COMP':'int8', 'DIA_abs':'int16', 'COMP_CUMPRIU':bool, 'cpb_id':'int16', 'cump_frac':float, 
    'xt_frac':float, 'ASSENTOS_INFERIDO':'int16', 'TIPO_SESSAO':str, 'beta':float, 'HORA_ajustada':'int8',
    'REGISTRO_GRUPO':int,'REGISTRO_SALA':'int16'},usecols=importar)

del colunas
del importar
del remover

print(painel.shape)

print(painel.columns)

print(painel.dtypes)

print(painel.info())

## Export as pickle

painel.to_pickle('Painel_2018_pickle')

# Creating design matrix

In [9]:
painel = pd.read_pickle('Painel_2018_pickle')

In [4]:
import psutil

print(psutil.virtual_memory().available / 1024 / 1024) # available v memory in MBs

4072.91015625


In [10]:
painel = pd.get_dummies(painel, columns=['DIA_abs','REGISTRO_COMPLEXO','HORA_ajustada'],drop_first=True)

In [6]:
print(painel.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4243342 entries, 0 to 4243341
Columns: 1186 entries, ASSENTOS_INFERIDO to HORA_ajustada_23
dtypes: bool(1), float64(1), int16(1), int8(1), uint8(1182)
memory usage: 4.7 GB
None


In [11]:
for c in painel.columns:
    if 'COMPLEXO' in c:
        painel.loc[painel[c] != 0, 'xt:'+c] = np.multiply(
            painel.loc[painel[c] != 0, 'xt_frac'], painel.loc[painel[c] != 0, c])
        painel.loc[painel[c] == 0, 'xt:'+c] = 0

MemoryError: Unable to allocate 5.28 GiB for an array with shape (167, 4243342) and data type float64

In [16]:
print(painel.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4243342 entries, 0 to 4243341
Columns: 1186 entries, ASSENTOS_INFERIDO to HORA_ajustada_23
dtypes: bool(1), float64(1), int16(1), int8(1), uint8(1182)
memory usage: 4.7 GB


# Model fit

In [2]:
from sklearn import linear_model

In [None]:
reg = linear_model.LogisticRegression(
    solver='lbfgs', max_iter=2000, fit_intercept=True).fit(
    painel.drop('id_NAC',axis=1),painel['id_NAC']) # model fit

In [None]:
import shelve

with shelve.open(r'bin_logit') as bn:
    bn['bin_logit'] = reg
    bn['X_col_names'] = painel.drop('id_NAC',axis=1).columns