In [1]:
# import useful stuff
import pandas as pd
from sklearn.tree import DecisionTreeClassifier as Tree
import re

# import custom functions
from metrics import *

# avoid undefined metric warning when calculating precision with 0 labels defined as 1
import warnings
warnings.filterwarnings('ignore')

### Data transformations (from data analysis)

In [2]:
def transform(df, fillna=False):
    """Clean a raw train/test frame and encode it for the classifiers.

    Steps, in order: drop unused columns, coerce the numeric columns,
    turn the S/N flags into booleans, handle missing values, one-hot
    encode the remaining categorical columns and drop constant columns
    (the name of each dropped constant column is printed).

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing every column referenced below. It is not
        modified; a new frame is returned.
    fillna : bool, default False
        If True, missing values are replaced with 0; otherwise every
        row that contains a missing value is dropped.

    Returns
    -------
    pandas.DataFrame
        The transformed frame.

    Raises
    ------
    KeyError
        If one of the expected columns is absent from `df`.
    """
    # remove columns; drop() returns a new frame, so the caller's frame is
    # left untouched by this and by the column assignments further down
    unused_cols = ['ult_fec_cli_1t', 'conyuemp', 'tipodom', 'cod_prov',
                   'pais_residencia', 'ncodpers', 'indrel', 'indrel_1mes',
                   'ind_empleado', 'fecha_alta', 'fecha_dato']
    df = df.drop(unused_cols, axis=1)

    # convert numerical vars to numbers; values that cannot be parsed
    # become NaN (convert_objects, used previously, was removed from pandas)
    numerical_vars = ['age', 'antiguedad', 'renta']
    for var in numerical_vars:
        df[var] = pd.to_numeric(df[var], errors='coerce')

    # convert S/N to boolean (anything other than 'S', including NaN, is False)
    for var in ['indfall', 'indresi', 'indext']:
        df[var] = df[var] == 'S'

    # handle missing values: fill with 0 or drop the affected rows
    if fillna:
        df = df.fillna(value=0)
    else:
        df = df.dropna()

    # one hot encode remaining categorical vars
    categorical_vars = ['segmento', 'sexo', 'tiprel_1mes', 'canal_entrada', 'nomprov']
    df = pd.get_dummies(df, columns=categorical_vars)

    # remove variables with one value, if any
    constant_cols = [col for col in df.columns if df[col].nunique() == 1]
    for col in constant_cols:
        print(col)

    return df.drop(constant_cols, axis=1)

In [3]:
# number of training rows to read; lower it (e.g. 100000) for a quick run
N_TRAIN_ROWS = 3000000
df_train = pd.read_csv('train_ver2.csv', nrows=N_TRAIN_ROWS)

In [4]:
# clean and encode the training data; fillna keeps its default (False),
# so rows containing missing values are dropped
df_train = transform(df_train)

In [5]:
# label columns are those whose name starts with 'ind_' and ends with 'ult1'
labels = [col for col in df_train.columns
          if col.startswith('ind_') and col.endswith('ult1')]

# split into labels (y) and the remaining feature columns (X),
# then release the combined dataframe
y = df_train[labels]
X = df_train[df_train.columns.difference(labels)]
del df_train

In [6]:
# order labels before running classifier: most rows equal to 1 first
ordered_labels = [(label, (y[label] == 1).sum()) for label in labels]

# Sort on the negated count (stable, so ties keep their current order).
# The loop variables deliberately avoid the name `y`: reusing it would shadow
# the label dataframe and, under Python 2 comprehension scoping, overwrite it.
labels = [name for name, _count in sorted(ordered_labels, key=lambda pair: -pair[1])]

### First Shot at Prediction

After all required conversions have been made, I can take a first shot at predicting. The first question to ask is: what am I predicting?

I'm predicting consumption of a certain product. I have a total of 24 booleans that tell whether or not a customer consumed each product. These are my labels for a One-vs-All classification model.



In [7]:
# load test data
X_test = pd.read_csv('test_ver2.csv')

# initialize results; the report keeps the ncodpers column, which
# transform() removes from the feature frame
classif_results = {}
report = X_test[['ncodpers']].copy()

# prepare test data for classifier (missing values are filled with 0
# rather than dropping rows)
X_test = transform(X_test, fillna=True)


In [None]:
# X_test must expose exactly the columns of X, in the same order, because the
# classifiers are fitted on X and then asked to predict on X_test.
# One-hot encoding (and constant-column removal) can leave the two frames with
# different column sets, and X is column-sorted while X_test is not.
# reindex drops the extra test columns, adds the ones missing from the test
# data as all-zero columns, and puts everything in X's column order.
X_test = X_test.reindex(columns=X.columns, fill_value=0)

In [None]:
# predict each product with a different classifier
for label in labels:
    # a label with a single class in the training data is skipped,
    # so it gets no entry in classif_results
    if y[label].nunique() != 1:
        # fixed seed so that repeated runs build the same tree
        clf = Tree(random_state=0)
        clf.fit(X, y[label])
        classif_results[label] = clf.predict(X_test)

In [None]:
# release the large frames that are no longer needed
del X, y, X_test

In [None]:
# transform results to expected output
def fn_name_labels(label, pred):
    """Return a list holding `label` where the prediction is truthy, '' elsewhere."""
    return [label if flag else '' for flag in pred]


def fn_join_columns(names):
    """Join one row of names with single spaces, removing the gaps left by ''."""
    return re.sub(r'\s+', ' ', ' '.join(names)).strip()


# Walk `labels` (already sorted by frequency) instead of the dict, so the
# order of the products in the output does not depend on dict ordering;
# labels that were skipped during fitting have no predictions and are left out.
cf_list = [fn_name_labels(label, classif_results[label])
           for label in labels if label in classif_results]

# add new column added products in report: one joined string per test row
report['added_products'] = [fn_join_columns(row) for row in zip(*cf_list)]

In [None]:
# show the products predicted for the first row (label-based lookup;
# .ix is no longer available in current pandas)
report.loc[0, 'added_products']

In [None]:
# write the report to disk: header row included (the default), index omitted
report.to_csv('round1b.csv', index=False)