In [111]:
# import useful stuff
import pandas as pd
from sklearn.tree import DecisionTreeClassifier
from sklearn.tree import DecisionTreeRegressor
import re
import numpy as np

from sklearn.feature_selection import SelectKBest
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.metrics import f1_score
from sklearn.metrics import recall_score

from sklearn.linear_model import LogisticRegression

from sklearn.decomposition import PCA
from sklearn.preprocessing import MinMaxScaler

from sklearn.metrics import accuracy_score

# avoid undefined metric warning when calculating precision with 0 labels defined as 1
import warnings
warnings.filterwarnings('ignore')

from sklearn.linear_model import SGDClassifier

from time import time

### Data transformations

1. Load data
2. Transform data
3. Create features and lables
4. Re-select features
5. Train
6. Cross-validate

#### Load Data

In [112]:
def gen_data(test=None):

    if test:
        df = pd.read_csv('test_ver2.csv')
    else: 
        df = pd.read_csv('train_ver2.csv')
    
    # separate the labels
    labels = []
    for col in df.columns:
        if col[:4] == 'ind_' and col[-4:] == 'ult1':
            labels.append(col)

    # create X and y delete dataframe
    X = df[df.columns.difference(labels)]
    y = df[labels].fillna(value=0) # NAs in labels will be considered 0
    del df

    return X,y, labels

In [113]:
X, y, labels =  gen_data()

In [114]:
threshold = X['ncodpers'].quantile(.9)

In [115]:
X = X[X['ncodpers'] > threshold]
y = y.loc[X.index]

In [116]:
X.shape, y.shape

((1364729, 24), (1364729, 24))

#### Transform data

In [117]:
def transform(df, fillna=True):
    """ This version includes variables considered relevant"""
    
    ### variables to be removed ###
    # remove cod_prov only, since it is redundant with nomprov
    # removed fecha_alta - redundant with antiguedad
    for col in ['cod_prov', 'fecha_alta', 'ult_fec_cli_1t', 'pais_residencia']:
        del df[col]    

    ### numerical_vars ###
    # convert numerical vars to int
    numerical_vars = ['age', 'antiguedad', 'renta']
    df[numerical_vars] = df[numerical_vars].convert_objects(convert_numeric=True)
    
    # change less or equal than 0 to nan
    for var in numerical_vars:
        df.ix[df[var] < 0, var] = np.nan

    ### boolean and categorical vars ###
    # convert S/N to boolean and remaining to number
    boolean_vars = ['indfall', 'ind_actividad_cliente', 'ind_nuevo', 'indresi', 'indext', 
                    'tipodom', 'conyuemp', 'ind_actividad_cliente']
    for var in ['indfall', 'indresi', 'indext', 'conyuemp']:
        df[var] = df[var] == 'S'
    df[boolean_vars] = df[boolean_vars].convert_objects(convert_numeric=True)
        
    # one hot encode categorical vars
    # 150 canais, 103 paises, 52 provincias
    categorical_vars = ['segmento', 'sexo', 'tiprel_1mes', 'canal_entrada', 'nomprov', 
                        'ind_empleado', 'indrel_1mes']
    df = pd.get_dummies(df, prefix=None, prefix_sep='_', dummy_na=False, 
                       columns=categorical_vars, sparse=False, drop_first=False)    

    
    ### handling null values ###
    if fillna:
        df = df.fillna(value=0)
    else:
        df = df.dropna()
        
    ### end ### 
            
    return df

In [118]:
X = transform(X)
y = y.loc[X.index]

#### Generate features and labels

I want to capture evolution. So it would be current month - past months. I can do this for past 6 months. That means only after some date I will be able to generate data.

At this point, there are two types of data in the dataset:
* Numerical: for these, I will only evaluate if it increased, or decreased, hence converting to three categories: decreased (-1), unchanged (0), increased(1).
* Boolean: Take current status minus previous status. If it has changed from 0 to 1, it will be 1, as in added. If changed from 1 to, it will be -1, as in removed. If it is 0, it means unchanged. 

In [119]:
# sort by ncodpers and fecha_dato
X = X.sort_values(['ncodpers', 'fecha_dato'])
y = y.loc[X.index]

In [120]:
# create differences
X_diff = X.drop(['fecha_dato'], axis=1).diff()
y_diff = y.diff()

# set index and ncodpers as column
X_diff['index'] = X_diff.index
X_diff['ncodpers'] = X['ncodpers']

In [121]:
first = X_diff.groupby('ncodpers').first().reset_index()['index']
last = X_diff.groupby('ncodpers').last().reset_index()['index']
len(first), len(last)

(162253, 162253)

In [122]:
# remove negative y
y_diff[y_diff < 0] = 0

In [123]:
# set training and validation set
X_train = X_diff[-X_diff.index.isin(first)]
X_val = X_diff[X_diff.index.isin(last)]
y_train = y_diff[-X_diff.index.isin(first)]
y_val = y_diff[X_diff.index.isin(last)]

In [124]:
X_train.shape, X_val.shape, y_train.shape, y_val.shape

((1202476, 105), (162253, 105), (1202476, 24), (162253, 24))

In [125]:
## evaluate
X_train.head()

Unnamed: 0,age,antiguedad,conyuemp,ind_actividad_cliente,ind_nuevo,indext,indfall,indrel,indresi,ncodpers,...,indrel_1mes_1,indrel_1mes_1.0,indrel_1mes_2,indrel_1mes_2.0,indrel_1mes_3,indrel_1mes_3.0,indrel_1mes_4,indrel_1mes_4.0,indrel_1mes_P,index
5058102,0.0,1.0,False,0.0,0.0,False,False,0.0,False,1362228,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5058102
6247377,0.0,1.0,False,0.0,0.0,False,False,0.0,False,1362228,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,6247377
6574995,0.0,1.0,False,0.0,0.0,False,False,0.0,False,1362228,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,6574995
7780192,1.0,1.0,False,0.0,0.0,False,False,0.0,False,1362228,...,0.0,-1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,7780192
8956038,0.0,1.0,False,0.0,0.0,False,False,0.0,False,1362228,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,8956038


In [126]:
y_train.head()

Unnamed: 0,ind_ahor_fin_ult1,ind_aval_fin_ult1,ind_cco_fin_ult1,ind_cder_fin_ult1,ind_cno_fin_ult1,ind_ctju_fin_ult1,ind_ctma_fin_ult1,ind_ctop_fin_ult1,ind_ctpp_fin_ult1,ind_deco_fin_ult1,...,ind_hip_fin_ult1,ind_plan_fin_ult1,ind_pres_fin_ult1,ind_reca_fin_ult1,ind_tjcr_fin_ult1,ind_valo_fin_ult1,ind_viv_fin_ult1,ind_nomina_ult1,ind_nom_pens_ult1,ind_recibo_ult1
5058102,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6247377,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6574995,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7780192,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
8956038,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


### Preprocess

In [127]:
# scale features
scaler = MinMaxScaler()
X_train = scaler.fit_transform(X_train)

In [128]:
X_val = scaler.transform(X_val)

In [129]:
y_train = y_train.values
y_val = y_val.values

### Classification

Create a deep neural network in tensorflow to predict as a classifier. Output is a softmax prediction, with the probability of belonging to each class

Will this work if there are multiple classes????

In [134]:
import tensorflow as tf

In [135]:
# define width of each layer
layer_width = {
    'fc1': 1000,
    'fc2': 600,
    'fc3': 300,
    'out': y_train.shape[1]
}

In [136]:
# fixed parameters
n_classes = len(np.unique(y_train))
init_std = 0.03

# weights and biases
weights = {
    'fc1': tf.Variable(tf.truncated_normal([X_train.shape[1],layer_width['fc1']], 
                                           stddev=init_std), trainable=True),
    'fc2': tf.Variable(tf.truncated_normal([layer_width['fc1'],layer_width['fc2']], 
                                           stddev=init_std), trainable=True),
    'fc3': tf.Variable(tf.truncated_normal([layer_width['fc2'],layer_width['fc3']], 
                                           stddev=init_std), trainable=True),
    'out': tf.Variable(tf.truncated_normal([layer_width['fc3'],layer_width['out']], 
                                           stddev=init_std), trainable=True)
}

biases = {
    'fc1': tf.Variable(tf.truncated_normal([layer_width['fc1']], 
                                           stddev=init_std), trainable=True),
    'fc2': tf.Variable(tf.truncated_normal([layer_width['fc2']], 
                                           stddev=init_std), trainable=True),
    'fc3': tf.Variable(tf.truncated_normal([layer_width['fc3']], 
                                           stddev=init_std), trainable=True),
    'out': tf.Variable(tf.truncated_normal([layer_width['out']], 
                                           stddev=init_std), trainable=True)
}

In [137]:
# create neural net
def neural_net(x, weights, biases):
    
    # lay1
    fc1 = tf.add(tf.matmul(x, weights['fc1']), biases['fc1'])
    fc1 = tf.nn.relu(fc1)

    # lay1
    fc2 = tf.add(tf.matmul(fc1, weights['fc2']), biases['fc2'])
    fc2 = tf.tanh(fc2)

    # lay1
    fc3 = tf.add(tf.matmul(fc2, weights['fc3']), biases['fc3'])
    fc3 = tf.tanh(fc3)

    # lay1
    out = tf.add(tf.matmul(fc3, weights['out']), biases['out'])
    
    return out

In [146]:
# learning parameters
batch_size = 100000
training_epochs = 5
decay = .01

In [147]:
# graph input

# instead of batch_size, use None to allow for variable input
x = tf.placeholder(tf.float32, shape=(None, X_train.shape[1]))
y = tf.placeholder(tf.int32, shape=(None))
global_step = tf.Variable(0, trainable=False)
learning_rate = tf.train.exponential_decay(5e-2, global_step, decay_steps=X_train.shape[0]/batch_size, 
                                           decay_rate=.96, staircase=True)
logits = neural_net(x, weights, biases)

In [192]:
# loss, optimizer, and variables initialization 
cross_entropy = tf.nn.softmax_cross_entropy_with_logits(
    logits, y, name='xentropy')
reg = .01
loss = (tf.reduce_mean(cross_entropy) + 
        reg * tf.nn.l2_loss(weights['fc1']) + 
        reg * tf.nn.l2_loss(weights['fc2']) + 
        reg * tf.nn.l2_loss(weights['fc3']) + 
        reg * tf.nn.l2_loss(weights['out']) + 
        reg * tf.nn.l2_loss(biases['fc1']) + 
        reg * tf.nn.l2_loss(biases['fc1']) + 
        reg * tf.nn.l2_loss(biases['fc1']) + 
        reg * tf.nn.l2_loss(biases['fc1'])) 

        
# optimizer
tf.scalar_summary(loss.op.name, loss)
optimizer = tf.train.GradientDescentOptimizer(learning_rate)
train_op = optimizer.minimize(loss, global_step=global_step)

#evaluation function
correct_prediction = tf.equal(tf.argmax(logits, 1), tf.argmax(logits,1 ))
accuracy = tf.reduce_mean(tf.cast(correct_prediction, tf.float32))

#init
init = tf.initialize_all_variables()

In [149]:
# launch graph
t0 = time()
sess = tf.Session()
sess.run(init)

# create session variables
feed_dict={
    x: None,
    y: None
}

num_samples = 50000

# sample train and validation set
np.random.seed(42)
indices_train = np.random.permutation(y_train.shape[0])[:num_samples]

# init scores
scores_train = []
scores_validate = []
losses = []
epochs = []

# training cycle
for epoch in range(1, training_epochs+1):

    # set size of batch
    total_batch = int(X_train.shape[0]/batch_size)+1

    # loop over batches
    for i in range(total_batch):                
        feed_dict[x] = X_train[i*batch_size:(i+1)*batch_size]
        feed_dict[y] = y_train[i*batch_size:(i+1)*batch_size]
        _, loss_value = sess.run([train_op, loss], feed_dict)

    # update last loss value
    losses.append(loss_value)

    # test model in training set
    feed_dict[x] = X_train[indices_train]
    feed_dict[y] = y_train[indices_train]
    acc = sess.run(accuracy, feed_dict) 
    scores_train.append(acc)

    # test model in validation set
    feed_dict[x] = X_val
    feed_dict[y] = y_val
    acc = sess.run(accuracy, feed_dict) 
    scores_validate.append(acc)

    print("Epoch: {:0>4}, Cost: {:.8f}, Acc@Training: {:.3f}, Acc@Validate: {:.3f}".format(
            (epoch), losses[-1], scores_train[-1], scores_validate[-1]))        
#     if epoch>1:
#         #if scores_train[-1] >= 1:
#         if (scores_validate[-1] < scores_validate[-2]) or (scores_validate[-1] > .998):
#             break

print("Optimization Finished! Time to complete: {:.2f}".format(time()-t0))

Epoch: 0001, Cost: 1.66181552, Acc@Training: 1.000, Acc@Validate: 1.000
Epoch: 0002, Cost: 1.61530209, Acc@Training: 1.000, Acc@Validate: 1.000
Optimization Finished! Time to complete: 212.14


In [189]:
# get probability distribution for new images
softmax_pred = tf.nn.top_k(tf.nn.softmax(logits), 7)
feed_dict[x] = X_val
feed_dict[y] = y_val
classes = sess.run(softmax_pred, feed_dict) 

In [None]:
for i in range(10):
    print(zip(classes.values[i], classes.indices[i]))

In [188]:
# y_diff.columns[[2, 21, 23, 16, 11]]

Index(['ind_cco_fin_ult1', 'ind_nomina_ult1', 'ind_recibo_ult1',
       'ind_pres_fin_ult1', 'ind_dela_fin_ult1'],
      dtype='object')