# Importing modules

In [1]:
import numpy as np
import pandas as pd

# Data

In [None]:
colunas = ['ANO_CINEMATOGRAFICO', 'SEMANA_CINEMATOGRAFICA', 'TIPO_SESSAO',
       'REGISTRO_COMPLEXO', 'REGISTRO_GRUPO','REGISTRO_SALA', 'CPB_ROE', 'ASSENTOS_INFERIDO',
       'OCUPAÇÃO_SALA_INFERIDA', 'd_t', 'id_NAC', 'xt_comp', 't_comp',
       'OBG_FINAL_COMP', 'SALAS_COMP', 'DIA_abs', 'COMP_CUMPRIU', 'xt_frac',
       'cump_frac', 'cpb_id', 'beta', 'HORA_ajustada'] # all cols with defined dtypes (see below)

remover = {'CPB_ROE','TIPO_SESSAO','ANO_CINEMATOGRAFICO','d_t','beta',
          'OBG_FINAL_COMP','OCUPAÇÃO_SALA_INFERIDA','t_comp','cpb_id','COMP_CUMPRIU','cump_frac','xt_comp',
           'SEMANA_CINEMATOGRAFICA','REGISTRO_SALA','REGISTRO_GRUPO'} # cols to remove

importar = list(set(colunas).difference(remover)) # cols to import

In [None]:
painel = pd.read_csv('Painel 2018 final.csv', dtype={
    'ANO_CINEMATOGRAFICO':'int16', 'SEMANA_CINEMATOGRAFICA':'int8', 'REGISTRO_COMPLEXO':'uint16',
    'CPB_ROE':str, 'OCUPAÇÃO_SALA_INFERIDA':float, 'd_t':int, 'x_t':float,
    'id_NAC':bool, 'xt_comp':float, 't_comp':int, 'OBG_FINAL_COMP':float,
    'SALAS_COMP':'int8', 'DIA_abs':'int16', 'COMP_CUMPRIU':bool, 'cpb_id':'int16', 'cump_frac':float, 
    'xt_frac':float, 'ASSENTOS_INFERIDO':'int16', 'TIPO_SESSAO':str, 'beta':float, 'HORA_ajustada':'int8',
    'REGISTRO_GRUPO':int,'REGISTRO_SALA':'int16'},usecols=importar)

del colunas
del importar
del remover

In [None]:
print(painel.shape)

In [None]:
print(painel.columns)

In [None]:
print(painel.dtypes)

In [None]:
print(painel.info())

## Export as pickle

In [None]:
painel.to_pickle('Painel_2018_pickle')

# Creating design matrix

In [2]:
painel = pd.read_pickle('Painel_2018_pickle')

In [3]:
painel.columns

Index(['REGISTRO_COMPLEXO', 'ASSENTOS_INFERIDO', 'HORA_ajustada', 'id_NAC',
       'SALAS_COMP', 'DIA_abs', 'xt_frac'],
      dtype='object')

In [6]:
import psutil

print(psutil.virtual_memory().available / 1024 / 1024) # available virtual memory in MBs

4782.34375


In [7]:
painel = pd.get_dummies(painel, columns=['DIA_abs','REGISTRO_COMPLEXO','HORA_ajustada'],drop_first=True)

In [6]:
print(len(painel.columns))

1186


In [6]:
print(painel.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4243342 entries, 0 to 4243341
Columns: 1186 entries, ASSENTOS_INFERIDO to HORA_ajustada_23
dtypes: bool(1), float64(1), int16(1), int8(1), uint8(1182)
memory usage: 4.7 GB
None


In [6]:
print(painel.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4243342 entries, 0 to 4243341
Columns: 1218 entries, ASSENTOS_INFERIDO to xt:REGISTRO_COMPLEXO_2415
dtypes: bool(1), float64(33), int16(1), int8(1), uint8(1182)
memory usage: 5.7 GB
None


# Model fit

In [8]:
from sklearn import linear_model

In [9]:
cols = painel.columns
y = painel['id_NAC'].values
painel.drop('id_NAC',axis=1, inplace=True)

In [10]:
reg = linear_model.LogisticRegression(
    solver='lbfgs', max_iter=10000, fit_intercept=True).fit(
    painel,y) # model fit

MemoryError: Unable to allocate 37.5 GiB for an array with shape (1185, 4243342) and data type float64

In [None]:
import shelve

with shelve.open(r'bin_logit') as bn:
    bn['bin_logit'] = reg
    bn['X_col_names'] = cols

# Ignore

from patsy import dmatrices

y, X = dmatrices('id_NAC ~ C(DIA_abs) + C(REGISTRO_COMPLEXO)*xt_frac + C(HORA_ajustada) + ASSENTOS_INFERIDO + SALAS_COMP',
                data=painel, return_type='dataframe')

cols = painel.columns

for i, c in enumerate(cols[1150:]):
    print(i, c)
    if 'COMPLEXO' in c:
        painel.loc[painel[c] != 0, 'xt:'+c] = np.multiply(
            painel.loc[painel[c] != 0, 'xt_frac'], painel.loc[painel[c] != 0, c])
        painel.loc[painel[c] == 0, 'xt:'+c] = 0