In [113]:
%matplotlib inline

# Core data / numeric stack and filesystem helpers.
import pandas as pd
import numpy as np
import os

# Plotting libraries plus notebook-wide seaborn styling.
import matplotlib.pyplot as plt
import seaborn as sns
sns.set_style('whitegrid')
sns.set_context('poster')

from collections import defaultdict

import warnings
# NOTE(review): this silences *every* warning, including pandas deprecation
# warnings that would flag outdated API usage further down the notebook.
warnings.filterwarnings('ignore')

# Seed NumPy's legacy global RNG so any random draws are reproducible.
np.random.seed(2323)

# Directory (relative to the notebook) that holds the competition CSV files.
DATA_DIR = './data/'

Two kinds of historical information are given: __sale log__ and __repair log__. The time period of the __sale log__ is from _January/2005_ to _February/2008_; while the time period of the __repair log__ is from _February/2005_ to _December/2009_. Details of these two files are described in the File description section.

Participants should exploit the sale and repair logs to predict the __monthly repair amount__ for each __module-component__ from _January/2010 to July/2011_. In other words, the model should output a series (nineteen elements, one element per month) of predicted __real values__ (amount of repair) for each module-component.

In [213]:
# Load the four competition CSVs from DATA_DIR, in the same order as before.
# Each entry pairs a file name with the extra keyword arguments for read_csv;
# parse_dates takes positional column indices (the year/month columns of the
# repair and sale logs, judging by the displayed heads).
repair_train, sale_train, output_mapping, sample_sub = (
    pd.read_csv(os.path.join(DATA_DIR, csv_name), **read_options)
    for csv_name, read_options in (
        ('RepairTrain.csv', {'parse_dates': [2, 3]}),
        ('SaleTrain.csv', {'parse_dates': [2]}),
        ('Output_TargetID_Mapping.csv', {}),
        ('SampleSubmission.csv', {}),
    )
)

In [81]:
repair_train.head()

Unnamed: 0,module_category,component_category,year/month(sale),year/month(repair),number_repair
0,M6,P16,2007-09-01,2009-04-01,1
1,M2,P30,2007-09-01,2009-08-01,1
2,M1,P12,2006-10-01,2008-02-01,2
3,M1,P30,2006-05-01,2007-07-01,1
4,M3,P06,2007-08-01,2007-12-01,1


In [82]:
sale_train.head()

Unnamed: 0,module_category,component_category,year/month,number_sale
0,M4,P27,2005-01-01,0
1,M4,P27,2005-05-01,1042
2,M4,P27,2005-09-01,1677
3,M4,P27,2005-10-01,918
4,M4,P27,2005-11-01,0


In [96]:
output_mapping.head()

Unnamed: 0,module_category,component_category,year,month
0,M1,P02,2010,1
1,M1,P02,2010,2
2,M1,P02,2010,3
3,M1,P02,2010,4
4,M1,P02,2010,5


**How many of the module-component pairs in the output mapping also appear in the training sets?**

In [60]:
def count_module_components_in_train(df, output_mapping):
    """Count the distinct (module, component) pairs of ``output_mapping`` found in ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with ``module_category`` and ``component_category`` columns
        (e.g. the repair or sale log).
    output_mapping : pandas.DataFrame
        Frame with the same two columns listing the pairs to look for; it may
        repeat each pair on several rows.

    Returns
    -------
    int
        Number of distinct pairs from ``output_mapping`` that occur on at
        least one row of ``df``.
    """
    pair_columns = ['module_category', 'component_category']

    # Pairs with a missing module/component can never equal a row of df, so
    # drop them up front; the set() below takes care of de-duplication.
    wanted_rows = output_mapping[pair_columns].dropna()
    wanted_pairs = set(zip(wanted_rows['module_category'], wanted_rows['component_category']))

    # One pass over df instead of one full boolean scan per wanted pair.
    available_pairs = set(zip(df['module_category'], df['component_category']))

    return len(wanted_pairs & available_pairs)

# Count, for each training log, how many target pairs have at least one row.
repair_pair_count = count_module_components_in_train(repair_train, output_mapping)
sale_pair_count = count_module_components_in_train(sale_train, output_mapping)

print('Number of module and component in the repair_train ', repair_pair_count)
print('Number of module and component in the sale_train ', sale_pair_count)

Number of module and component in the repair_train  224
Number of module and component in the sale_train  224


In [84]:
# Feature: calendar month of each repair record, taken from the parsed
# 'year/month(repair)' datetime column.
repair_dates = repair_train['year/month(repair)']
repair_train['repair_month'] = repair_dates.dt.month

**So all of the module-component pairs are present in both the sale and repair datasets.**

In [186]:
# Total repairs per (module, component) pair, one column per repair_month;
# combinations with no repair records are filled with 0.
# NOTE(review): repair_month carries only the calendar month, so the same
# month from different years is summed into one column - confirm intended.
repair_per_month = pd.pivot_table(
    repair_train,
    index=['module_category', 'component_category'],
    columns=['repair_month'],
    values='number_repair',
    aggfunc='sum',
    fill_value=0,
)

### Exponential Decay

For every module-component pair, take the number of repairs in the month of December as the initial value $N_0$ and then, at a decay rate $k$, predict the next 19 months as $N_i = N_0 e^{-k i}$ for $i = 1, \dots, 19$.

In [187]:
# Distinct (module, component) pairs that must be forecast, in mapping order.
module_component_unique_pairs = output_mapping[['module_category', 'component_category']].drop_duplicates()

# Materialised as a list: a bare zip() is a one-shot iterator that would be
# left exhausted after the loop below.
unique_pairs = list(zip(module_component_unique_pairs['module_category'],
                        module_component_unique_pairs['component_category']))

# Pairs with no repairs recorded in month 12 cannot use December as the
# starting value of the decay and need a fallback.
affected_modules = []
affected_components = []

for mod, comp in unique_pairs:
    # .loc replaces the .ix indexer, which was removed in pandas 1.0.
    if repair_per_month.loc[(mod, comp), 12] == 0:
        affected_modules.append(mod)
        affected_components.append(comp)

needs_adjustment = len(affected_modules)

print('Number of pairs that need adjustment ', needs_adjustment)
print('\nAffected modules ', affected_modules)
print('\nAffected Components ', affected_components)

Number of pairs that need adjustment  29

Affected modules  ['M1', 'M1', 'M2', 'M2', 'M2', 'M2', 'M2', 'M2', 'M3', 'M3', 'M3', 'M3', 'M3', 'M4', 'M4', 'M5', 'M5', 'M5', 'M6', 'M6', 'M6', 'M6', 'M6', 'M7', 'M8', 'M8', 'M8', 'M9', 'M9']

Affected Components  ['P25', 'P27', 'P07', 'P08', 'P10', 'P14', 'P27', 'P29', 'P01', 'P10', 'P18', 'P23', 'P27', 'P01', 'P27', 'P01', 'P10', 'P27', 'P01', 'P03', 'P10', 'P11', 'P27', 'P27', 'P01', 'P10', 'P27', 'P01', 'P27']


In [202]:
def get_initial_value(mod, comp, repair_per_month):
    """Return the starting repair count N0 for one (module, component) pair.

    Parameters
    ----------
    mod, comp : hashable
        Module and component labels; ``(mod, comp)`` must be a row key of
        ``repair_per_month``.
    repair_per_month : pandas.DataFrame
        Table indexed by (module, component) with one column per month
        number, holding repair counts.

    Returns
    -------
    number
        The month-12 count when it is non-zero; otherwise the count of the
        latest month (largest column label) with a non-zero value; ``0`` when
        the pair has no repairs in any month.

    Raises
    ------
    KeyError
        If ``(mod, comp)`` is not present in the table's index.
    """
    # .loc replaces the .ix indexer, which was removed in pandas 1.0.
    repair_by_months = repair_per_month.loc[(mod, comp), :]

    # A month-12 column that is absent altogether means no December repairs.
    december = repair_by_months.get(12, 0)
    if december != 0:
        return december

    # Select by month *label* rather than array position, so the fallback is
    # still correct when some month columns are missing from the table.
    non_zero_months = repair_by_months[repair_by_months != 0]
    if non_zero_months.empty:
        return 0

    return non_zero_months.loc[non_zero_months.index.max()]

def exponential_decay(decay_rate, modules, components, repair_per_month, n_months=19):
    """Forecast repairs per (module, component) pair with exponential decay.

    For each pair the starting value ``N0`` comes from ``get_initial_value``
    and the forecast for step ``i`` (1-based) is ``N0 * exp(-decay_rate * i)``.

    Parameters
    ----------
    decay_rate : float
        Decay constant ``k`` in the exponent.
    modules, components : iterable
        Parallel iterables of module and component labels; they are zipped
        together, so the shorter one determines how many pairs are processed.
    repair_per_month : pandas.DataFrame
        Table passed through to ``get_initial_value``.
    n_months : int, optional
        Number of forecast steps per pair. Defaults to 19, the horizon that
        was previously hard-coded.

    Returns
    -------
    collections.defaultdict
        Maps ``(module, component)`` to the list of ``n_months`` forecasts.
        A pair that occurs more than once in the inputs gets its forecasts
        appended again, as before.
    """
    prediction_dict = defaultdict(list)

    # exp(-k), exp(-2k), ... do not depend on the pair, so compute them once.
    decay_factors = [np.exp(-decay_rate * step) for step in range(1, n_months + 1)]

    for mod, comp in zip(modules, components):
        N0 = get_initial_value(mod, comp, repair_per_month)
        prediction_dict[(mod, comp)].extend(N0 * factor for factor in decay_factors)

    return prediction_dict

In [220]:
# Build the per-pair forecasts with decay rate k = 0.91.
prediction_dict = exponential_decay(
    0.91,
    module_component_unique_pairs['module_category'],
    module_component_unique_pairs['component_category'],
    repair_per_month,
)

### Submissions

In [223]:
# Placeholder column (all ones, one value per mapping row) that the real
# predictions overwrite afterwards.
output_mapping['predictions'] = np.ones(output_mapping.shape[0])

In [224]:
def prepare_submission(modules, components, output_mapping, predictions=None):
    """Write per-pair forecasts into the ``predictions`` column of ``output_mapping``.

    Parameters
    ----------
    modules, components : iterable
        Parallel iterables of module and component labels to fill in.
    output_mapping : pandas.DataFrame
        Frame with ``module_category`` and ``component_category`` columns.
        It is modified in place.
    predictions : mapping, optional
        Maps ``(module, component)`` to the sequence of values for that
        pair's rows. When omitted, the notebook-level ``prediction_dict``
        is used, which matches the previous behaviour.

    Returns
    -------
    pandas.DataFrame
        The same ``output_mapping`` object that was passed in.
    """
    if predictions is None:
        # Backward-compatible fallback to the notebook global.
        predictions = prediction_dict

    for mod, comp in zip(modules, components):
        mask = (output_mapping.module_category == mod) & (output_mapping.component_category == comp)
        # Values are assigned positionally to the pair's rows in frame order.
        # NOTE(review): presumably those rows are in chronological order - confirm.
        output_mapping.loc[mask, 'predictions'] = predictions[(mod, comp)]

    return output_mapping

In [225]:
# Fill the predictions column for every unique pair; prepare_submission
# returns the frame it was given, so the name is simply rebound to it.
output_mapping = prepare_submission(
    module_component_unique_pairs['module_category'],
    module_component_unique_pairs['component_category'],
    output_mapping,
)

In [226]:
# Copy the predictions into the submission frame and write it to disk.
# Assigning a Series aligns on the index, so a row-count mismatch would
# silently produce NaN targets; fail loudly instead.
assert len(sample_sub) == len(output_mapping), 'sample_sub and output_mapping differ in length'
sample_sub['target'] = output_mapping.predictions

# to_csv does not create missing directories, so make sure the target exists.
submission_path = './submissions/exponential_decay.csv'
os.makedirs(os.path.dirname(submission_path), exist_ok=True)
sample_sub.to_csv(submission_path, index=False)