In [268]:
%matplotlib inline

import pandas as pd
import numpy as np
import os

import matplotlib.pyplot as plt
import seaborn as sns
sns.set_style('whitegrid')
sns.set_context('poster')

from collections import defaultdict
from scipy import stats

import warnings
# Silences every warning for the rest of the session (library deprecation
# notices included), so API changes will not be announced while this runs.
warnings.filterwarnings('ignore')

# Seed NumPy's legacy global random state.
np.random.seed(2323)

# Folder the input CSV files are read from.
DATA_DIR = './data/'

Two kinds of historical information are given: __sale log__ and __repair log__. The time period of the __sale log__ is from _January/2005_ to _February/2008_; while the time period of the __repair log__ is from _February/2005_ to _December/2009_. Details of these two files are described in the File description section.

Participants should exploit the sale and repair logs to predict the __monthly repair amount__ for each __module-component__ pair from _January/2010 to July/2011_. In other words, the model should output a series (nineteen elements, one per month) of predicted __real values__ (amounts of repair) for each module-component pair.

In [281]:
# Load the competition files from DATA_DIR.
# NOTE(review): parse_dates selects columns by position; presumably these are
# the year/month columns of each CSV -- confirm against the raw files.
repair_train = pd.read_csv(
    os.path.join(DATA_DIR, 'RepairTrain.csv'),
    parse_dates=[2, 3],
)
sale_train = pd.read_csv(
    os.path.join(DATA_DIR, 'SaleTrain.csv'),
    parse_dates=[2],
)
output_mapping = pd.read_csv(
    os.path.join(DATA_DIR, 'Output_TargetID_Mapping.csv'),
)
sample_sub = pd.read_csv(
    os.path.join(DATA_DIR, 'SampleSubmission.csv'),
)

In [243]:
repair_train.head()

Unnamed: 0,module_category,component_category,year/month(sale),year/month(repair),number_repair
0,M6,P16,2007-09-01,2009-04-01,1
1,M2,P30,2007-09-01,2009-08-01,1
2,M1,P12,2006-10-01,2008-02-01,2
3,M1,P30,2006-05-01,2007-07-01,1
4,M3,P06,2007-08-01,2007-12-01,1


In [244]:
sale_train.head()

Unnamed: 0,module_category,component_category,year/month,number_sale
0,M4,P27,2005-01-01,0
1,M4,P27,2005-05-01,1042
2,M4,P27,2005-09-01,1677
3,M4,P27,2005-10-01,918
4,M4,P27,2005-11-01,0


In [245]:
output_mapping.head()

Unnamed: 0,module_category,component_category,year,month
0,M1,P02,2010,1
1,M1,P02,2010,2
2,M1,P02,2010,3
3,M1,P02,2010,4
4,M1,P02,2010,5


**How many of the module-component pairs in the output mapping also appear in the training sets?**

In [246]:
def count_module_components_in_train(df, output_mapping):
    """Count the distinct (module, component) pairs of ``output_mapping`` found in ``df``.

    Parameters
    ----------
    df : DataFrame with ``module_category`` and ``component_category`` columns.
    output_mapping : DataFrame with the same two columns; repeated pairs are
        counted once.

    Returns
    -------
    int
        Number of distinct pairs that have at least one matching row in ``df``.
    """
    wanted_pairs = output_mapping[['module_category', 'component_category']].drop_duplicates()

    # The pairs are already unique, so each one is tested exactly once.
    return sum(
        1
        for module, component in wanted_pairs.itertuples(index=False, name=None)
        if ((df.module_category == module) & (df.component_category == component)).any()
    )

# Report how many output-mapping pairs occur in each training table.
print('Number of module and component in the repair_train ', count_module_components_in_train(repair_train, output_mapping))
print('Number of module and component in the sale_train ', count_module_components_in_train(sale_train, output_mapping))

Number of module and component in the repair_train  224
Number of module and component in the sale_train  224


**So all of the module-component pairs are present in both the sale and repair datasets.**

In [282]:
# Monthly repair totals: one row per (module, component) pair and one column
# per repair month; cells with no matching rows are filled with 0.
repair_per_month = repair_train.pivot_table(
    index=['module_category', 'component_category'],
    columns=['year/month(repair)'],
    values='number_repair',
    aggfunc='sum',
    fill_value=0,
)

In [283]:
def decrease_in_last_6_months(pair, table=None):
    """Count month-over-month drops in the last six monthly repair totals.

    Parameters
    ----------
    pair : tuple
        ``(module, component)`` key of a row in ``table``.
    table : DataFrame, optional
        Frame indexed by (module, component) whose month columns are labelled
        with ``pd.Timestamp``. Defaults to the notebook-level
        ``repair_per_month``.

    Returns
    -------
    int
        Number of consecutive month pairs, among the last six months, where
        the earlier value is strictly greater than the later one.
    """
    if table is None:
        table = repair_per_month
    module, component = pair

    # Only real month columns take part, so derived columns appended later
    # (e.g. 'num_decrease') cannot leak into the window when the cell is re-run.
    month_columns = [col for col in table.columns if isinstance(col, pd.Timestamp)]
    if not month_columns:
        # No Timestamp labels found: fall back to using every column.
        month_columns = list(table.columns)

    # `.ix` was removed from pandas; `.loc` with an explicit column selector
    # is the label-based replacement.
    recent = table.loc[(module, component), month_columns].iloc[-6:].to_numpy(dtype=float)

    return int((recent[:-1] > recent[1:]).sum())

# Store one decrease count per (module, component) row, in index order.
repair_per_month['num_decrease'] = list(map(decrease_in_last_6_months, repair_per_month.index.values))

In [284]:
def log_value(module, component, key, table=None):
    """Return ``log(1 + repairs)`` for one (module, component) pair and month.

    Parameters
    ----------
    module, component : labels identifying the row in ``table``.
    key : anything ``pd.to_datetime`` accepts; converted to the column label.
    table : DataFrame, optional
        Frame indexed by (module, component) with Timestamp-labelled month
        columns. Defaults to the notebook-level ``repair_per_month``.
    """
    if table is None:
        table = repair_per_month
    month = pd.to_datetime(key)
    # `.ix` was removed from pandas; `.loc` is the label-based replacement.
    return np.log(1 + table.loc[(module, component), month])

years = ['2009/05', '2009/06', '2009/07', '2009/08', '2009/09', '2009/10', '2009/11', '2009/12']

# Add log(1 + repairs) columns 'log_1_month' .. 'log_7_month' for the first
# seven entries of `years`, computed column-wise instead of one row at a time.
# NOTE(review): the last entry ('2009/12') is listed but not used here, so the
# log features stop at 2009/11 -- confirm this is intentional.
for month_number, month_label in enumerate(years[:7], start=1):
    repair_per_month['log_%d_month' % month_number] = np.log(1 + repair_per_month[pd.to_datetime(month_label)])

In [286]:
repair_per_month['log_1_month'].head(1)

module_category  component_category
M1               P02                   0.693147
Name: log_1_month, dtype: float64

In [293]:
def linear_coefficiect(row, default_decay=0.91):
    """Fit a straight line through the seven log-repair values of one row.

    Parameters
    ----------
    row : mapping or Series
        Must provide the keys 'log_1_month' .. 'log_7_month'.
    default_decay : float, optional
        Fallback used when the fitted trend is flat or rising; its natural
        log is returned in that case.

    Returns
    -------
    float
        The fitted slope when it is negative, otherwise ``log(default_decay)``.
    """
    log_columns = ['log_%d_month' % month for month in range(1, 8)]
    y = np.array([row[name] for name in log_columns], dtype=float)
    x = np.arange(len(y))

    # np.polyfit returns coefficients from the highest degree down, so for a
    # degree-1 fit the first value is the slope and the second the intercept.
    slope, _intercept = np.polyfit(x, y, 1)

    if slope >= 0:
        return np.log(default_decay)
    return slope

# Row-wise trend estimate over the seven log-repair columns.
repair_per_month['linear_estimation'] = (
    repair_per_month[['log_%d_month' % month for month in range(1, 8)]]
    .apply(linear_coefficiect, axis=1)
)

In [294]:
# Convert the log-scale estimate into a multiplicative per-month factor.
repair_per_month['decay_coefficient'] = np.exp(repair_per_month['linear_estimation'])

# Move the factor half of the remaining distance towards 1.
repair_per_month['decay_coefficient_processed'] = (
    repair_per_month['decay_coefficient']
    + (1 - repair_per_month['decay_coefficient']) / 2
)

In [296]:
repair_per_month['decay_coefficient_processed'].head(2)

module_category  component_category
M1               P02                   0.964212
                 P04                   0.975848
Name: decay_coefficient_processed, dtype: float64

Extrapolate the next 19 months using each row's decay coefficient. The starting level depends on the number of month-over-month decreases in the last six months: if there are more than 3 decreases, start from the number of repairs in the last month (2009/12); otherwise, start from the average number of repairs over the last three months (2009/10 to 2009/12). The first prediction is this starting level multiplied by the decay coefficient, and each following month is the previous prediction multiplied by the decay coefficient again.

In [307]:
# `.ix` was removed from pandas; `.loc` is the label-based replacement.
repair_per_month.loc[('M1', 'P02'), :].num_decrease

0.0

In [316]:
def create_predictions(index, table=None, n_months=19):
    """Extrapolate monthly repairs for every (module, component) pair.

    Parameters
    ----------
    index : iterable of ``(module, component)`` tuples to predict for.
    table : DataFrame, optional
        Frame indexed by (module, component) that provides Timestamp columns
        for 2009-10, 2009-11 and 2009-12 plus 'num_decrease' and
        'decay_coefficient_processed'. Defaults to the notebook-level
        ``repair_per_month``.
    n_months : int, optional
        Number of future months to produce per pair (default 19).

    Returns
    -------
    defaultdict(list)
        Maps each pair to its predictions; every value is the previous one
        multiplied by the pair's decay coefficient.
    """
    if table is None:
        table = repair_per_month

    # Explicit Timestamps avoid re-parsing 'YYYY/MM' strings for every row.
    october, november, december = (pd.Timestamp(2009, month, 1) for month in (10, 11, 12))

    prediction_dict = defaultdict(list)

    for key in index:
        mod, comp = key

        # `.ix` was removed from pandas; `.loc` is the label-based replacement.
        row = table.loc[(mod, comp), :]
        decay_coefficient = row['decay_coefficient_processed']

        if row['num_decrease'] > 3:
            # More than 3 decreases: start from the last observed month.
            start = row[december]
        else:
            # Otherwise start from the mean of the last three months.
            start = (row[october] + row[november] + row[december]) / 3.

        series = prediction_dict[(mod, comp)]
        value = start
        for _ in range(n_months):
            value = value * decay_coefficient
            series.append(value)

    return prediction_dict

# Build the predictions for every (module, component) row of repair_per_month.
prediction_dict = create_predictions(repair_per_month.index.values)

### Submissions

In [317]:
# Placeholder value of 1.0 on every row of the output mapping.
output_mapping['predictions'] = np.ones(output_mapping.shape[0])

In [318]:
def prepare_submission(modules, components, output_mapping, predictions=None):
    """Write each pair's predictions into ``output_mapping['predictions']``.

    Parameters
    ----------
    modules, components : equal-length iterables; paired element-wise to form
        the (module, component) keys to fill in.
    output_mapping : DataFrame with 'module_category', 'component_category'
        and 'predictions' columns. It is modified in place.
    predictions : mapping, optional
        Maps ``(module, component)`` to a sequence of values, one per matching
        row of ``output_mapping`` in row order. Defaults to the notebook-level
        ``prediction_dict``.

    Returns
    -------
    DataFrame
        The same ``output_mapping`` object that was passed in.

    Raises
    ------
    ValueError
        If a pair's number of predictions differs from its number of rows.
    """
    if predictions is None:
        predictions = prediction_dict

    for mod, comp in zip(modules, components):
        mask = (output_mapping.module_category == mod) & (output_mapping.component_category == comp)
        values = predictions[(mod, comp)]

        # Fail loudly rather than let pandas broadcast or misalign the values.
        n_rows = int(mask.sum())
        if len(values) != n_rows:
            raise ValueError(
                'Expected %d predictions for %r but got %d' % (n_rows, (mod, comp), len(values))
            )

        output_mapping.loc[mask, 'predictions'] = values

    return output_mapping

In [319]:
# The pair list is built here so the cell runs on a fresh kernel; it is not
# defined by any earlier cell.
# NOTE(review): assumes the intended pairs are the distinct (module, component)
# rows of output_mapping -- confirm.
module_component_unique_pairs = output_mapping[['module_category', 'component_category']].drop_duplicates()

output_mapping = prepare_submission(module_component_unique_pairs['module_category'],
                                    module_component_unique_pairs['component_category'],
                                    output_mapping)

In [325]:
# The assignment aligns on the row index, so a row-count mismatch would leave
# NaN targets without any error; check it up front.
assert len(sample_sub) == len(output_mapping), 'sample submission and output mapping differ in length'
sample_sub['target'] = output_mapping.predictions

# Create the output folder if needed so the write works on a fresh checkout.
os.makedirs('./submissions', exist_ok=True)
sample_sub.to_csv('./submissions/ryan_locar_solution.csv', index=False)