In [4]:
import numpy as np
import pandas as pd

from sklearn import linear_model

## Importing data

In [5]:
# All columns with declared dtypes (see the read_csv call below)
colunas = [
    'ANO_CINEMATOGRAFICO', 'SEMANA_CINEMATOGRAFICA', 'TIPO_SESSAO',
    'REGISTRO_COMPLEXO', 'CPB_ROE', 'ASSENTOS_INFERIDO',
    'OCUPAÇÃO_SALA_INFERIDA', 'd_t', 'id_NAC', 'xt_comp', 't_comp',
    'OBG_FINAL_COMP', 'SALAS_COMP', 'DIA_abs', 'COMP_CUMPRIU', 'xt_frac',
    'cump_frac', 'cpb_id', 'beta',
]

# Columns that are dropped before importing
remover = {
    'CPB_ROE',
    'ASSENTOS_INFERIDO',
    'TIPO_SESSAO',
    'ANO_CINEMATOGRAFICO',
    'd_t',
}

# Columns to import: everything in `colunas` that is not in `remover`
importar = list(set(colunas) - remover)

In [6]:
# Read the panel CSV, keeping only the columns listed in `importar`.
# The dtype map also names columns that are not imported; entries are grouped by type.
painel = pd.read_csv(
    'Painel 2018 final.csv',
    usecols=importar,
    dtype={
        # integer columns
        'ANO_CINEMATOGRAFICO': int,
        'SEMANA_CINEMATOGRAFICA': int,
        'REGISTRO_COMPLEXO': int,
        'd_t': int,
        'id_NAC': int,
        't_comp': int,
        'DIA_abs': int,
        'cpb_id': int,
        'ASSENTOS_INFERIDO': int,
        # float columns
        'OCUPAÇÃO_SALA_INFERIDA': float,
        'x_t': float,
        'xt_comp': float,
        'OBG_FINAL_COMP': float,
        'SALAS_COMP': float,
        'cump_frac': float,
        'xt_frac': float,
        'beta': float,
        # string columns
        'CPB_ROE': str,
        'TIPO_SESSAO': str,
        # boolean columns
        'COMP_CUMPRIU': bool,
    },
)

## Defining regressors

In [7]:
reg_cols = ['xt_frac','DIA_abs', 'REGISTRO_COMPLEXO'] # defining regressors vector
# xt_frac is the proportion of quota fulfillment up to time t
# DIA_abs is the absolute day of the year (1,365)
# REGISTRO_COMPLEXO is movie theater id

# Regressors that are NOT expanded into dummy variables
non_categorical = {'xt_frac', 'ASSENTOS_INFERIDO'}

# Categorical variables, kept in the same order as in reg_cols.
# The order matters: pd.get_dummies emits the dummy groups in this order, and the
# later cells slice the design matrix by position assuming the DIA_abs dummies come
# before the REGISTRO_COMPLEXO dummies. Going through a set (as before) made the
# order depend on string hashing, i.e. it could change between kernel sessions.
dummies = [c for c in reg_cols if c not in non_categorical]

### Single-week test

In [27]:
week = 52 # week used for the test

# Rows of the panel that belong to the chosen week (filtered once, reused for y and X)
week_rows = painel[painel['SEMANA_CINEMATOGRAFICA'] == week]

y = week_rows['cpb_id'] # target is movie id
X = week_rows[reg_cols] # regressors defined above

In [28]:
# Expand the categorical regressors into indicator columns, dropping the first level of each
X = pd.get_dummies(X, drop_first=True, columns=dummies)

In [29]:
# Interactions between the theater dummies and xt_frac.
# The theater dummies are picked by name rather than by a fixed column offset: after
# get_dummies the layout is [xt_frac, DIA_abs dummies, REGISTRO_COMPLEXO dummies], so a
# hard-coded offset of 8 only lines up when the week has exactly 8 distinct DIA_abs
# values (the full-panel loops use DIA_abs.nunique() for this offset instead).
# Columns that are already interactions are excluded, so re-running this cell simply
# recomputes the same columns instead of stacking ':xt_frac:xt_frac' terms.
theater_cols = [c for c in X.columns
                if c.startswith('REGISTRO_COMPLEXO_') and not c.endswith(':xt_frac')]

for col in theater_cols:
    X[f'{col}:xt_frac'] = X[col]*X['xt_frac'] # creating interaction between dummy vars and xt_frac

In [10]:
# Multinomial logistic regression for the test week.
# The SAG solver shuffles the data, so the seed is fixed to make the fit reproducible.
reg = linear_model.LogisticRegression(
    multi_class='multinomial', solver='sag', max_iter=500, random_state=0,
).fit(X.values, y.values) # model fit

### Full panel regressions

#### First pass

In [10]:
from joblib import Parallel, delayed # parallel processing with joblib

import time

simultaneous = 4 # number of weekly regressions fitted in parallel per iteration
n_weeks = 53 # weeks are numbered 0..52 in SEMANA_CINEMATOGRAFICA

# Number of iterations, rounded UP so the final partial batch (week 52) is processed too.
# With floor division the loop stopped after week 51 and week 52 was never fitted.
r = -(-n_weeks // simultaneous)

regs = {} # dict to store fitted models, keyed 'reg_<week>'
cols = {} # dict to store col names for each fitted model, keyed 'semana_<week>'


def build_week_design(week):
    """Build the design matrix and targets for one week of the panel.

    Reads the notebook-level `painel`, `reg_cols` and `dummies`.

    Parameters
    ----------
    week : int
        Value of SEMANA_CINEMATOGRAFICA to select.

    Returns
    -------
    X : pandas.DataFrame
        The regressors in `reg_cols` with the `dummies` columns one-hot encoded
        (first level dropped), followed by one '<theater dummy>:xt_frac'
        interaction column per REGISTRO_COMPLEXO dummy.
    y : numpy.ndarray
        The cpb_id (movie id) targets for the same rows.
    """
    rows = painel.loc[painel['SEMANA_CINEMATOGRAFICA'] == week]
    y = rows['cpb_id'].values
    X = pd.get_dummies(rows[reg_cols], columns=dummies, drop_first=True)

    # Select the theater dummies by name instead of by position: a positional slice
    # (skipping DIA_abs.nunique() columns) is only right when the DIA_abs dummies
    # happen to be emitted before the REGISTRO_COMPLEXO ones.
    theater_cols = [c for c in X.columns if c.startswith('REGISTRO_COMPLEXO_')]

    # Build all interactions at once and append them in the same order as the dummies
    interactions = X[theater_cols].mul(X['xt_frac'], axis=0).add_suffix(':xt_frac')
    X = pd.concat([X, interactions], axis=1)
    return X, y


for n in range(r):
    print(f'Iteration {n} started at {time.asctime(time.localtime())}') # time for iteration start

    # Weeks handled in this iteration; the last batch is simply shorter
    start = n * simultaneous
    sem_list = list(range(start, min(start + simultaneous, n_weeks)))

    ar = {} # week -> (X array, y array) for the present iteration only, to bound memory use
    for sem in sem_list:
        X, y = build_week_design(sem)
        cols[f'semana_{sem}'] = X.columns.values # storing col names
        ar[sem] = (X.values, y)

    # Model fit for each week with multiprocessing; note that we start with only 100 max
    # iters for efficiency reasons. random_state is fixed because SAG shuffles the data.
    fitted = Parallel(n_jobs=len(sem_list), backend='multiprocessing')(
        delayed(linear_model.LogisticRegression(
            multi_class='multinomial', solver='sag', max_iter=100, random_state=0).fit)(*ar[s])
        for s in sem_list)

    # Parallel returns results in submission order, so they pair up with sem_list
    for s, model in zip(sem_list, fitted):
        regs[f'reg_{s}'] = model

Iteration 0 started at Wed Mar 10 22:59:48 2021
Iteration 1 started at Wed Mar 10 23:29:05 2021
Iteration 2 started at Thu Mar 11 00:06:08 2021
Iteration 3 started at Thu Mar 11 00:29:41 2021
Iteration 4 started at Thu Mar 11 01:23:09 2021
Iteration 5 started at Thu Mar 11 02:59:13 2021
Iteration 6 started at Thu Mar 11 05:06:33 2021
Iteration 7 started at Thu Mar 11 06:56:20 2021
Iteration 8 started at Thu Mar 11 08:25:12 2021
Iteration 9 started at Thu Mar 11 10:18:49 2021
Iteration 10 started at Thu Mar 11 13:15:13 2021
Iteration 11 started at Thu Mar 11 15:56:39 2021
Iteration 12 started at Thu Mar 11 20:00:55 2021
Iteration 13 started at Thu Mar 11 22:13:37 2021




#### Second pass

In [16]:
non_conv = [] # weeks whose model used up all of its allowed iterations (did not converge)

for k, v in regs.items(): # keys look like 'reg_<week>'
    # Compare against the model's own max_iter instead of a hard-coded 100, so models
    # already refitted with a larger budget are judged against that budget.
    if np.max(v.n_iter_) >= v.max_iter:
        # The week number is whatever follows the last underscore; this handles one- and
        # two-digit weeks alike without relying on a bare except.
        non_conv.append(int(k.rsplit('_', 1)[-1]))

In [27]:
# creating tuples from @non_conv list for loop

sim2 = 7 # no. of simultaneous processes in the second pass

# Consecutive slices of at most sim2 weeks each.
# The previous while-loop compared the slice end against len(non_conv)-1, which produced a
# final tuple of sim2+1 weeks whenever len(non_conv) % sim2 == 1, and a single empty tuple
# when non_conv was empty. Stepping through the list in strides of sim2 avoids both.
regression = [tuple(non_conv[b:b + sim2]) for b in range(0, len(non_conv), sim2)] # list of tuples

In [29]:
len(regression) # number of week tuples (batches) queued for the second pass

4

In [31]:
count = 1
max_jobs = 5 # upper bound on worker processes per batch

# Second pass: refit, with max_iter=500, every week listed in the `regression` tuples.
# Same design matrix as the first pass; col names are not stored again here.

for tup in regression:
    if not tup: # nothing to refit in an empty batch
        continue

    print(f'Iteration {count} started at {time.asctime(time.localtime())}') # time iter start
    count += 1
    ar = {} # week -> (X array, y array) for the present batch

    for t in tup:
        rows = painel.loc[painel['SEMANA_CINEMATOGRAFICA'] == t]
        X = pd.get_dummies(rows[reg_cols], columns=dummies, drop_first=True)

        # Theater dummies are selected by name, so the result does not depend on the
        # order in which get_dummies emitted the dummy groups.
        theater_cols = [c for c in X.columns if c.startswith('REGISTRO_COMPLEXO_')]
        interactions = X[theater_cols].mul(X['xt_frac'], axis=0).add_suffix(':xt_frac')
        X = pd.concat([X, interactions], axis=1)

        ar[t] = (X.values, rows['cpb_id'].values)

    # Fit every week of the batch, whatever its size. Previously only batches of exactly
    # 7 or 4 weeks were fitted and any other size was silently skipped.
    # random_state is fixed because SAG shuffles the data.
    fitted = Parallel(n_jobs=min(max_jobs, len(tup)), backend='multiprocessing')(
        delayed(linear_model.LogisticRegression(
            multi_class='multinomial', solver='sag', max_iter=500, random_state=0).fit)(*ar[s])
        for s in tup)

    # Parallel returns results in submission order, so they pair up with tup
    for s, model in zip(tup, fitted):
        regs[f'reg_{s}'] = model

Iteration 1 started at Fri Mar 12 14:12:18 2021
Iteration 2 started at Fri Mar 12 18:38:03 2021
Iteration 3 started at Sat Mar 13 01:58:50 2021
Iteration 4 started at Sat Mar 13 15:10:46 2021


#### Storing results

In [34]:
import shelve

# Persist the fitted models and their column names in a shelf file
with shelve.open(r'bbl.out') as ws:
    ws.update({
        'logits_regs': regs, # fitted models
        'logits_cols': cols, # col names
    })

## Old/unused snippets of code

In [43]:
# this code gets all col names directly, without fitting any model

cols = {}

for n in range(53):
    week_rows = painel.query("SEMANA_CINEMATOGRAFICA == @n")[reg_cols]
    num = week_rows['DIA_abs'].nunique() # number of distinct days in the week
    X = pd.get_dummies(week_rows, columns=dummies, drop_first=True)

    # interact every column from position `num` onwards with xt_frac
    for col in X.columns[num:]:
        X[col + ':xt_frac'] = X[col] * X['xt_frac']

    cols['semana_' + str(n)] = X.columns