In [16]:
# import useful stuff
import pandas as pd
from sklearn.tree import DecisionTreeClassifier
import re

# import cv functions
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.model_selection import train_test_split

# import custom functions
from metrics import *


# Avoid the undefined-metric warning raised when precision is calculated for a
# label that has 0 samples predicted as 1.
# NOTE: filterwarnings('ignore') without a category silences every warning
# from here on, not only that one.
import warnings
warnings.filterwarnings('ignore')

### Start from scratch

In [71]:
# Load only the first 500,000 rows of the training file.
df_train = pd.read_csv('train_ver2.csv', nrows=500000)

In [72]:
# Target columns are the ones whose name starts with 'ind_' and ends with 'ult1'.
labels = [
    column
    for column in df_train.columns
    if column.startswith('ind_') and column.endswith('ult1')
]

In [78]:
# Split between X and y.
# `Index - list` is not a set difference in current pandas, so the feature
# columns are selected with Index.difference (which returns them sorted).
feature_columns = df_train.columns.difference(labels + ['ncodpers', 'fecha_alta'])
X = df_train[feature_columns]
y = df_train[labels]

In [74]:
# Remove under-represented labels: a label is dropped when more than
# len(y) - 5000 of its values are 0.
# TODO: confirm whether this filtering is actually needed.
# The row count is taken from y itself instead of being hard-coded, zeros are
# counted directly (no KeyError when a label has no zeros), and labels that
# were already removed are skipped so the cell can be re-run safely.
min_non_zero = 5000
rare_labels = [
    label for label in labels
    if label in y.columns and (y[label] == 0).sum() > len(y) - min_non_zero
]
y = y.drop(rare_labels, axis=1)

In [79]:
# TODO: drop some features and convert the others.
# Preview the first rows of the feature frame.
X.head()

Unnamed: 0,age,antiguedad,canal_entrada,cod_prov,conyuemp,fecha_dato,ind_actividad_cliente,ind_empleado,ind_nuevo,indext,...,indrel_1mes,indresi,nomprov,pais_residencia,renta,segmento,sexo,tipodom,tiprel_1mes,ult_fec_cli_1t
0,35,6,KHL,29.0,,2015-01-28,1.0,N,0.0,N,...,1.0,S,MALAGA,ES,87218.1,02 - PARTICULARES,H,1.0,A,
1,23,35,KHE,13.0,,2015-01-28,0.0,N,0.0,S,...,1.0,S,CIUDAD REAL,ES,35548.74,03 - UNIVERSITARIO,V,1.0,I,
2,23,35,KHE,13.0,,2015-01-28,0.0,N,0.0,N,...,1.0,S,CIUDAD REAL,ES,122179.11,03 - UNIVERSITARIO,V,1.0,I,
3,22,35,KHD,50.0,,2015-01-28,0.0,N,0.0,N,...,1.0,S,ZARAGOZA,ES,119775.54,03 - UNIVERSITARIO,H,1.0,I,
4,23,35,KHE,50.0,,2015-01-28,1.0,N,0.0,N,...,1.0,S,ZARAGOZA,ES,,03 - UNIVERSITARIO,V,1.0,A,


In [69]:
# Run a decision tree straight away, with no transformations.
# On the raw features this fails: string columns such as the dates cannot be
# converted to float (see the ValueError in the output below).
clf = DecisionTreeClassifier()
cv = StratifiedShuffleSplit(n_splits=3)
scores = cross_val_score(clf, X, y, cv=cv)
# cross_val_score returns the per-split scores; the variable is `scores`.
print(scores.mean(), scores.std())

ValueError: could not convert string to float: '2015-07-22'

### Data transformations (from data analysis)

In [111]:
def transform(df, fillna=False):
    """Clean a raw customer frame and one-hot encode its categorical columns.

    Steps, in order: drop the unused columns, coerce the numerical columns to
    numbers, map the S/N flags to booleans, handle missing values, one-hot
    encode the categorical columns and drop every single-valued column.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw frame. It must contain the numerical, S/N flag and categorical
        columns named in the body. The frame passed in is not modified.
    fillna : bool, default False
        If True, missing values are replaced with 0; otherwise every row
        that contains a missing value is dropped.

    Returns
    -------
    pandas.DataFrame
        The transformed frame.

    Notes
    -----
    The name of each single-valued column that is removed is printed.
    """
    # remove columns; drop() returns a new frame, so the caller's data is left
    # untouched, and columns that are already absent are simply skipped
    unused_columns = ['ult_fec_cli_1t', 'conyuemp', 'tipodom', 'cod_prov',
                      'pais_residencia', 'ncodpers', 'indrel', 'indrel_1mes',
                      'ind_empleado', 'fecha_alta', 'fecha_dato']
    df = df.drop(unused_columns, axis=1, errors='ignore')

    # convert numerical vars to numbers; entries that cannot be parsed become NaN
    # (replaces the removed DataFrame.convert_objects(convert_numeric=True))
    numerical_vars = ['age', 'antiguedad', 'renta']
    for var in numerical_vars:
        df[var] = pd.to_numeric(df[var], errors='coerce')

    # convert S/N to boolean (anything that is not 'S', including NaN, is False)
    for var in ['indfall', 'indresi', 'indext']:
        df[var] = df[var] == 'S'

    # handle missing values: fill with 0 or drop the affected rows
    if fillna:
        df = df.fillna(value=0)
    else:
        df = df.dropna()

    # one hot encode remaining categorical vars
    categorical_vars = ['segmento', 'sexo', 'tiprel_1mes', 'canal_entrada', 'nomprov']
    df = pd.get_dummies(df, prefix=None, prefix_sep='_', dummy_na=False,
                        columns=categorical_vars, sparse=False, drop_first=False)

    # remove variables with one value, if any (their names are printed)
    single_valued = [col for col in df.columns if df[col].nunique() == 1]
    for col in single_valued:
        print(col)
    df = df.drop(single_valued, axis=1)

    return df

In [112]:
def split(df_train):
    """Separate a frame into feature columns and target (label) columns.

    A column is a label when its name starts with 'ind_' and ends with 'ult1'.

    Parameters
    ----------
    df_train : pandas.DataFrame
        Frame containing both features and labels. It is not modified.

    Returns
    -------
    X : pandas.DataFrame
        All non-label columns, in the sorted order given by Index.difference.
    y : pandas.DataFrame
        The label columns, in their original order.
    labels : list of str
        Names of the label columns.
    """
    # separate the labels
    labels = [col for col in df_train.columns
              if col.startswith('ind_') and col.endswith('ult1')]

    # create X and y; no `del df_train` here, since that would only unbind the
    # local name and never frees the caller's frame
    X = df_train[df_train.columns.difference(labels)]
    y = df_train[labels]
    return X, y, labels

In [113]:
# Reload a smaller sample of the training file: the first 50,000 rows.
df_train = pd.read_csv('train_ver2.csv', nrows=50000)

In [114]:
# Clean the training frame, then separate it into features, targets and the
# list of target column names. transform() prints the name of each
# single-valued column it removes.
X,y, labels = split(transform(df_train))

indresi
ind_ahor_fin_ult1
ind_aval_fin_ult1
ind_cder_fin_ult1
ind_ctop_fin_ult1
ind_ctpp_fin_ult1


In [115]:
# need to join all into one array
# can a dataframe store an array?

# label them with the name
#for label in labels:
#    y.ix[y[label]==1, label] = label
#    y.ix[y[label]==0, label] = ''    

In [116]:
# compact the labels
# results = y.apply(lambda x: re.sub('\s+', ' ', ' '.join(x)).strip(), axis=1)

In [117]:
# seems like a good way of joining

### Actual prediction

In [6]:
# load the test data
X_test = pd.read_csv('test_ver2.csv')

# set up the outputs: the per-label predictions and the report frame, which
# starts out holding only the 'ncodpers' column
classif_results = {}
report = X_test['ncodpers'].to_frame()

# prepare the test data for the classifier; missing values are filled with 0
# rather than dropping rows
X_test = transform(X_test, fillna=True)


In [7]:
# X_test must have exactly the columns of X, in the same order (needed due to
# one-hot encoding). Filtering X_test's own columns keeps X_test's ordering and
# leaves out dummy columns that exist only in the training data, so the frame
# is reindexed on X.columns instead: extra columns are dropped and missing
# ones are added as all-zero columns.
paired_columns = [col for col in X_test.columns if col in X.columns]
X_test = X_test.reindex(columns=X.columns, fill_value=0)

In [8]:
# predict each product with a different classifier
for label in labels:
    # skip targets that take a single value in the training data
    if y[label].nunique() == 1:
        continue
    # NOTE(review): `Tree` is not defined in the visible cells; presumably it
    # is provided by `from metrics import *` -- confirm.
    clf = Tree()
    clf.fit(X, y[label])
    classif_results[label] = clf.predict(X_test)

In [9]:
# release the frames that are no longer needed
del X, y, X_test

In [39]:
# transform results to expected output
def fn_name_labels(label, pred):
    """Return one entry per prediction: `label` where it is truthy, '' otherwise."""
    return [label if flag else '' for flag in pred]


def fn_join_columns(x):
    """Join the entries of `x` with single spaces, dropping the empty ones."""
    # raw string: '\s' is an invalid escape sequence in a normal string literal
    return re.sub(r'\s+', ' ', ' '.join(x)).strip()


# one list per predicted label, each as long as the test set
cf_list = [fn_name_labels(k, v) for k, v in classif_results.items()]

# add new column added_products in report; zip(*cf_list) walks the lists row
# by row. With no predictions at all, zip would yield nothing and the
# assignment would fail on a length mismatch, so every row gets ''.
if cf_list:
    report['added_products'] = list(map(fn_join_columns, zip(*cf_list)))
else:
    report['added_products'] = ''

In [41]:
# `.ix` has been removed from pandas; `.loc` does the same label-based lookup
# of row 0 here.
report.loc[0, 'added_products']

'ind_cco_fin_ult1 ind_ctma_fin_ult1'

In [42]:
# Write the report to round1.csv with a header row and without the index column.
report.to_csv('round1.csv', header=True, index=False)