In [2]:
%matplotlib inline

import numpy as np
import pandas as pd
import os

import matplotlib.pyplot as plt
import seaborn as sns
sns.set_style('whitegrid')
sns.set_context('poster')

import warnings
warnings.filterwarnings('ignore')

np.random.seed(2313)
DATA_DIR = './data'

In [3]:
# load files
repair_train = pd.read_csv(os.path.join(DATA_DIR, 'RepairTrain.csv'), parse_dates=[2, 3])
sale_train = pd.read_csv(os.path.join(DATA_DIR, 'SaleTrain.csv'), parse_dates=[2])
output_mapping = pd.read_csv(os.path.join(DATA_DIR, 'Output_TargetID_Mapping.csv'))
sample_sub = pd.read_csv(os.path.join(DATA_DIR, 'SampleSubmission.csv'))

In [4]:
saleyear = sale_train['year/month'].dt.year
salemonth = sale_train['year/month'].dt.month
saletime = (saleyear - 2005) * 12 + (salemonth - 1)

In [5]:
repairyearsale = repair_train['year/month(sale)'].dt.year
repairmonthsale = repair_train['year/month(sale)'].dt.month
repairtimesale = (repairyearsale - 2005) * 12 + (repairmonthsale - 1)

In [6]:
repairyearrepair = repair_train['year/month(repair)'].dt.year
repairmonthrepair = repair_train['year/month(repair)'].dt.month
repairtimerepair = (repairyearrepair - 2005) * 12 + (repairmonthrepair - 1)

In [8]:
# processed dataset
sale_train_processed = pd.DataFrame({
                            'mc': sale_train.module_category,
                            'cc': sale_train.component_category,
                            'time_sale': saletime,
                            'number_sale': sale_train.number_sale
                        })

sale_train_processed = sale_train_processed.loc[sale_train_processed.mc != 'M0'] # no need to include this module as this not in the test set

repair_train_processed = pd.DataFrame({
                            'mc': repair_train.module_category,
                            'cc': repair_train.component_category,
                            'time_sale': repairtimesale,
                            'timerepair': repairtimerepair,
                            'timedelta': (repairtimerepair - repairtimesale),
                            'number_repair': repair_train.number_repair
                        })

In [19]:
# group by module, component category and time_sale, time_repair, time_delta to summarize values

sale_train_processed['mc_cc_time'] = sale_train_processed.groupby(['mc', 'cc', 'time_sale'])['number_sale'].transform(lambda x: sum(x))
repair_train_processed['mc_cc_time_delta'] = repair_train_processed.groupby(['mc', 'cc', 'timedelta'])['number_repair'].transform(lambda x: sum(x))
repair_train_processed['mc_cc_time_repair'] = repair_train_processed.groupby(['mc', 'cc', 'timerepair'])['number_repair'].transform(lambda x: sum(x))

In [20]:
# aggregate across modules

sale_train_processed['cc_time'] = sale_train_processed.groupby(['cc', 'time_sale'])['number_sale'].transform(lambda x: sum(x))
repair_train_processed['cc_time_delta'] = repair_train_processed.groupby(['cc', 'timedelta'])['number_repair'].transform(lambda x: sum(x))

In [21]:
sale_train_processed.head()

Unnamed: 0,cc,mc,number_sale,time_sale,mc_cc_time,cc_time
0,P27,M4,0,0,9080,9080
1,P27,M4,1042,4,50718,50718
2,P27,M4,1677,8,46076,124524
3,P27,M4,918,9,43228,198661
4,P27,M4,0,10,33387,276055


In [32]:
def func_(df):
    recent_observations = df.loc[df['timerepair'] > 57]
    
    if sum(recent_observations['number_repair']) > 0:
        return sum(recent_observations['number_repair'])
    else:
        return 0
        
repairs = repair_train_processed.groupby(['mc', 'cc']).apply(func_)

repairs_df = pd.DataFrame({'last_2': repairs})
repairs_df = repairs_df.sort_values(by='last_2', ascending=False)

repairs_df['pctvar'] = np.round(repairs_df.last_2 / np.sum(repairs_df.last_2) * 100) / 100
repairs_df['cumvar'] = np.cumsum(repairs_df['pctvar'])

In [42]:
sale_train_processed.columns

Index(['cc', 'mc', 'number_sale', 'time_sale', 'mc_cc_time', 'cc_time'], dtype='object')

In [50]:
# calculate failure model using component filter as baseline
# each entry in filterlist gives the module category, component category, filter coefficients and repair estimate

filterlist = []
index = repairs_df.index.values

for i in range(len(repairs_df)):
    mc, cc = index[i]
    
    # generate filter coefficients
    # tmp contains aggregated sales data for a given component across all modules
    tmp = sale_train_processed.loc[sale_train_processed.cc == cc]
    
    # figure out "denominator" for filter
    opsforrepair = np.zeros((59 - tmp['time_sale'].values[0]))
    
    for l in range(len(tmp)):
        myl = 59 - tmp['time_sale'].values[l]
        
        print('myl ', myl)
        print('ops for repair ', len(opsforrepair))
        
        opsforrepair[0:myl] = opsforrepair[0:myl] + np.tile(tmp.iloc[l]['number_sale'], myl)
    
    mask = (repair_train_processed.cc == cc) & (repair_train_processed.timedelta > 0)
    nreps = repair_train_processed.loc[mask, 'number_repair'][:len(opsforrepair)] / opsforrepair
    
    nreps = nreps[0: len(opsforrepair)]
    nreps[nreps.isnull()] = 0
    
    nsales = np.zeros((60))
    
    for l in range(len(tmp)):
        nsales[tmp['time_sale'][l]] = tmp['number_sale'][l]
        
    filtercoef = nreps

myl  29
ops for repair  29
myl  27
ops for repair  29
myl  54
ops for repair  29


ValueError: operands could not be broadcast together with shapes (29,) (54,) 