In [1]:
# import useful stuff
import pandas as pd
from sklearn.tree import DecisionTreeClassifier
from sklearn.tree import DecisionTreeRegressor
import re
import numpy as np

from sklearn.feature_selection import SelectKBest
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.metrics import f1_score

from sklearn.decomposition import PCA
from sklearn.preprocessing import MinMaxScaler

from sklearn.metrics import accuracy_score

# avoid undefined metric warning when calculating precision with 0 labels defined as 1
import warnings
warnings.filterwarnings('ignore')

from sklearn.linear_model import SGDClassifier

### Data transformations

1. Load data
2. Transform data
3. Create features and labels
4. Re-select features
5. Train
6. Cross-validate

#### Load Data

In [2]:
def gen_data(test=None):
    """Read one of the competition CSV files and split it into features and labels.

    Parameters
    ----------
    test : None, bool or array-like of int, optional
        * falsy scalar (default ``None``): read ``train_ver2.csv``.
        * truthy scalar: read ``test_ver2.csv``.
        * array-like (list, numpy array, ...): read ``train_ver2.csv`` and keep
          only the rows at these integer positions, in the given order.  The
          notebook calls ``gen_data(chunk)`` with an array of row numbers;
          evaluating such an array in ``if test:`` raises ``ValueError``.

    Returns
    -------
    X : pandas.DataFrame
        Every column that is not a label.  ``Index.difference`` returns the
        column names sorted, so the column order differs from the file.
    y : pandas.DataFrame
        The label columns, with missing values replaced by 0.
    labels : list of str
        Names of the label columns (``ind_*ult1``), in file order.
    """
    row_positions = None
    if np.ndim(test) > 0:
        # An array-like is a row selection on the training file, not a flag.
        row_positions = np.asarray(test)
        path = 'train_ver2.csv'
    elif test:
        path = 'test_ver2.csv'
    else:
        path = 'train_ver2.csv'

    df = pd.read_csv(path)
    if row_positions is not None:
        df = df.iloc[row_positions]

    # separate the labels
    labels = [col for col in df.columns
              if col.startswith('ind_') and col.endswith('ult1')]

    # create X and y, then release the full frame
    X = df[df.columns.difference(labels)]
    y = df[labels].fillna(value=0)  # NAs in labels will be considered 0
    del df

    return X, y, labels

In [3]:
# Load the training file (gen_data() without arguments reads train_ver2.csv)
# and split it into the feature frame X, the label frame y and the label names.
X, y, labels =  gen_data()

In [4]:
# (rows, columns) of the raw feature frame returned by gen_data()
X.shape

(13647309, 24)

#### Transform data

In [None]:
def transform(df, fillna=True):
    """Clean the raw feature frame and one-hot encode its categorical columns.

    The input frame is not modified; a new frame is returned.

    Steps
    -----
    1. Drop ``cod_prov`` (redundant with ``nomprov``) and ``fecha_alta``
       (redundant with ``antiguedad``).
    2. Coerce ``age``, ``antiguedad`` and ``renta`` to numbers.  Values that
       cannot be parsed become NaN, and so do negative values.
    3. Turn the ``'S'``/``'N'`` columns into booleans (``'S'`` is True, anything
       else, including missing, is False) and coerce the remaining indicator
       columns to numbers.
    4. One-hot encode the categorical columns (no column for missing values).
    5. Replace remaining NaN with 0, or drop rows that contain NaN.

    Parameters
    ----------
    df : pandas.DataFrame
        Feature frame containing every column named in this function.
    fillna : bool, default True
        True replaces NaN with 0; False drops every row that contains NaN.

    Returns
    -------
    pandas.DataFrame
        The transformed frame.  Its set of dummy columns depends on the
        categories present in ``df``.
    """
    ### variables to be removed ###
    # drop() returns a new frame, so the caller's frame keeps its columns
    df = df.drop(columns=['cod_prov', 'fecha_alta'])

    ### numerical_vars ###
    numerical_vars = ['age', 'antiguedad', 'renta']
    for var in numerical_vars:
        values = pd.to_numeric(df[var], errors='coerce')
        # negative values are treated as missing
        df[var] = values.mask(values < 0)

    ### boolean and categorical vars ###
    # convert S/N to boolean
    for var in ['indfall', 'indresi', 'indext', 'conyuemp']:
        df[var] = df[var] == 'S'
    # remaining indicator columns: make sure they are numeric
    for var in ['ind_actividad_cliente', 'ind_nuevo', 'tipodom']:
        df[var] = pd.to_numeric(df[var], errors='coerce')

    # one hot encode categorical vars
    # 150 canais, 103 paises, 52 provincias
    categorical_vars = ['segmento', 'sexo', 'tiprel_1mes', 'canal_entrada', 'nomprov',
                        'ind_empleado', 'indrel_1mes', 'ult_fec_cli_1t', 'pais_residencia']
    # dtype is pinned so the dummies are 0/1 integers on every pandas version
    df = pd.get_dummies(df, prefix=None, prefix_sep='_', dummy_na=False,
                        columns=categorical_vars, sparse=False, drop_first=False,
                        dtype=np.uint8)

    ### handling null values ###
    if fillna:
        df = df.fillna(value=0)
    else:
        df = df.dropna()

    return df

In [None]:
# Clean and encode the features, then keep only the label rows whose index
# is still present in X (transform() may drop rows when called with fillna=False).
X = transform(X)
y = y.loc[X.index]

#### Generate features and labels

I want to capture how each customer evolves over time, so each feature is computed as the current month's value minus the value in past months. I can do this for the past 6 months, which means features can only be generated for dates that have enough history.

At this point, there are two types of data in the dataset:
* Numerical: for these, I will only evaluate whether the value increased or decreased, hence converting to three categories: decreased (-1), unchanged (0), increased (1).
* Boolean: take the current status minus the previous status. If it has changed from 0 to 1, it will be 1, as in added. If it has changed from 1 to 0, it will be -1, as in removed. If it is 0, it means unchanged.

In [None]:
# Inspect the first row of the transformed features.  X is a DataFrame with
# string column names, so X[0] looks up a column called 0 and raises KeyError;
# positional row access needs .iloc.
X.iloc[0]

In [None]:
# tra

In [7]:
# Load the raw test file into a scratch frame.
df = pd.read_csv('test_ver2.csv')

In [None]:
# get 100,000 customer ids from randomly chosen rows
# (.ix has been removed from pandas; df comes straight from read_csv, so its
# index labels equal the row positions and .loc selects the same rows)
nrows = len(df)
indices = np.random.permutation(nrows)
ids = df.loc[indices[:100000], 'ncodpers']

In [5]:
# Reload the training data and rebuild X / y from scratch
# (same steps as the earlier load and transform cells).
X,y, labels = gen_data()
X = transform(X)
y = y.loc[X.index]

### Get Deltas

### Scale Numerical Features

In [6]:
# Rescale the numerical columns to the [0, 1] range.
# The fitted scaler is kept in `scaler` so later cells can reuse it.
numerical_vars = ['age', 'antiguedad', 'renta', 'ncodpers']
scaler = MinMaxScaler().fit(X[numerical_vars])
X[numerical_vars] = scaler.transform(X[numerical_vars])

### Initializing classifiers 

In [7]:
# Number of top-ranked features used for the quick cross-validated estimate.
n_cv_features = 12

# One entry per label: (label, untrained classifier, ranked features, score estimate)
models = []
for label in labels:
    # Rank every column by its univariate score against this label.
    selector = SelectKBest()
    selector.fit(X, y[label])
    # A score can be NaN (e.g. when a column is constant).  NaN does not
    # compare consistently, so sorting on the raw scores gives an unreliable
    # order; rank NaN as the worst score instead.
    feature_scores = np.where(np.isnan(selector.scores_), -np.inf, selector.scores_)
    ranking = np.argsort(-feature_scores, kind='mergesort')  # stable, best first
    features = list(X.columns[ranking])

    # Cross-validated F1 on the best features; mean - std is a conservative estimate.
    clf = SGDClassifier()
    cv = StratifiedShuffleSplit()
    cv_scores = cross_val_score(clf, X[features[:n_cv_features]], y[label],
                                cv=cv, scoring='f1')
    score_est = cv_scores.mean() - cv_scores.std()

    # Store a fresh, unfitted classifier together with the full ranking.
    models.append((label, SGDClassifier(), features, score_est))
    print("Label {}: {:.2f}".format(label, score_est))

Label ind_ahor_fin_ult1: 0.00
Label ind_aval_fin_ult1: 0.00
Label ind_cco_fin_ult1: 0.79
Label ind_cder_fin_ult1: 0.00
Label ind_cno_fin_ult1: 0.00
Label ind_ctju_fin_ult1: 0.61
Label ind_ctma_fin_ult1: 0.00
Label ind_ctop_fin_ult1: 0.00
Label ind_ctpp_fin_ult1: 0.00
Label ind_deco_fin_ult1: 0.00
Label ind_deme_fin_ult1: 0.00
Label ind_dela_fin_ult1: 0.49
Label ind_ecue_fin_ult1: 0.00
Label ind_fond_fin_ult1: 0.00
Label ind_hip_fin_ult1: 0.00
Label ind_plan_fin_ult1: 0.00
Label ind_pres_fin_ult1: 0.40
Label ind_reca_fin_ult1: 0.00
Label ind_tjcr_fin_ult1: 0.00
Label ind_valo_fin_ult1: 0.00
Label ind_viv_fin_ult1: 0.00
Label ind_nomina_ult1: 0.00
Label ind_nom_pens_ult1: 0.00
Label ind_recibo_ult1: 0.00


### Further training all classifiers

In [8]:
# Order the models from the highest to the lowest estimated score.
# The sort is stable, so ties keep the order they have in `models`.
ordered_models = sorted(models, key=lambda entry: entry[3], reverse=True)

In [9]:
# init scaler
scaler = MinMaxScaler()
n_features = 6

# NOTE(review): `chunks` is only defined in the cell that builds the random
# chunks, which appears further down in this notebook; on a fresh kernel that
# cell has to run first.  Confirm the intended cell order.

# loop chunks of data
for i, chunk in enumerate(chunks):
    print(i, len(chunk))
    X, y, labels = gen_data(chunk)
    X = transform(X)
    y = y.loc[X.index]
    print(X.shape)
    # the scaler is updated with this chunk before the chunk is scaled
    scaler.partial_fit(X[numerical_vars])
    X[numerical_vars] = scaler.transform(X[numerical_vars])

    # loop models/labels
    for j, model in enumerate(ordered_models):
        label, clf, features, score = model

        # exception: if first chunk, create new classifier
        if i == 0:
            clf = SGDClassifier()
            ordered_models[j] = (label, clf, features, score)

        print(label)

        # The one-hot columns depend on the categories present in the chunk
        # (the column count differs from chunk to chunk), so a selected
        # feature may be missing here.  A missing dummy column means no row
        # has that category, i.e. it is all zeros.
        X_sel = X.reindex(columns=features[:n_features], fill_value=0)

        # partial fit classifier on the selected features
        clf.partial_fit(X_sel, y[label], classes=[0, 1])

0 1705913
(1356361, 252)
ind_cco_fin_ult1
ind_ctju_fin_ult1
ind_dela_fin_ult1
ind_pres_fin_ult1
ind_ahor_fin_ult1
ind_aval_fin_ult1
ind_cder_fin_ult1
ind_cno_fin_ult1
ind_ctma_fin_ult1
ind_ctop_fin_ult1
ind_ctpp_fin_ult1
ind_deco_fin_ult1
ind_deme_fin_ult1
ind_ecue_fin_ult1
ind_fond_fin_ult1
ind_hip_fin_ult1
ind_plan_fin_ult1
ind_reca_fin_ult1
ind_tjcr_fin_ult1
ind_valo_fin_ult1
ind_viv_fin_ult1
ind_nomina_ult1
ind_nom_pens_ult1
ind_recibo_ult1
1 1705914
(1356678, 251)
ind_cco_fin_ult1
ind_ctju_fin_ult1
ind_dela_fin_ult1
ind_pres_fin_ult1
ind_ahor_fin_ult1
ind_aval_fin_ult1
ind_cder_fin_ult1
ind_cno_fin_ult1
ind_ctma_fin_ult1
ind_ctop_fin_ult1
ind_ctpp_fin_ult1
ind_deco_fin_ult1
ind_deme_fin_ult1
ind_ecue_fin_ult1
ind_fond_fin_ult1
ind_hip_fin_ult1
ind_plan_fin_ult1
ind_reca_fin_ult1
ind_tjcr_fin_ult1
ind_valo_fin_ult1
ind_viv_fin_ult1
ind_nomina_ult1
ind_nom_pens_ult1
ind_recibo_ult1
2 1705913
(1356230, 249)
ind_cco_fin_ult1
ind_ctju_fin_ult1
ind_dela_fin_ult1
ind_pres_fin_ult1
ind_ah

In [10]:
# reimport the data for remaining
nrows = 13647309  # number of rows in the training file (see X.shape above)
indices = np.random.permutation(nrows)

# divided data into 100 chunks, pick 5 for validation
# only the first will be used for feature selection.
chunks_count = 100
# np.array_split returns chunks_count arrays whose sizes differ by at most one
# and that together contain every index.  The manual slicing used
# len(indices)/chunks_count, which is a float in Python 3 and cannot be used
# as a slice bound.
chunks = np.array_split(indices, chunks_count)

In [11]:
# Score every trained classifier on the first 5 chunks and collect one F1
# value per chunk in the 4th slot of each model tuple.
for i, chunk in enumerate(chunks[:5]):
    X, y, labels = gen_data(chunk)
    X = transform(X)
    y = y.loc[X.index]
    print(X.shape)
    X[numerical_vars] = scaler.transform(X[numerical_vars])
    # loop models/labels
    for j, model in enumerate(ordered_models):
        label, clf, features, scores = model

        # exception: if first chunk, create array of scores
        if i == 0:
            scores = []
            ordered_models[j] = (label, clf, features, scores)

        # A selected one-hot column can be absent from this chunk (the column
        # count differs from chunk to chunk); an absent dummy is all zeros.
        X_sel = X.reindex(columns=features[:n_features], fill_value=0)

        # predict with the selected features and record the F1 score
        y_pred = clf.predict(X_sel)
        score = f1_score(y[label], y_pred)
        scores.append(score)

(108421, 236)
(108503, 235)
(108475, 238)
(108246, 234)
(108648, 236)


In [12]:
# Collapse the list of per-chunk scores of a model into their average.
def calc_mean_score(t):
    """Return the first three items of ``t`` followed by the mean of ``t[3]``.

    ``t`` is indexable with at least four items; ``t[3]`` is a non-empty
    sequence of numbers.
    """
    chunk_scores = t[3]
    average = sum(chunk_scores) / len(chunk_scores)
    return (t[0], t[1], t[2], average)

# Same tuples as ordered_models, with the score list replaced by its mean.
om = [calc_mean_score(model) for model in ordered_models]

In [24]:
# keep only the models whose mean f1 score is above 0.3
om = [model for model in om if model[3] > .3]


In [25]:
# predict for the test data (gen_data(test=True) reads test_ver2.csv)
X, y, labels = gen_data(test=True)
# keep the customer ids next to the predictions
results = pd.DataFrame(X['ncodpers'])
X = transform(X, fillna=True)
X[numerical_vars] = scaler.transform(X[numerical_vars])
for model in om:
    label, clf, features, scores = model
    # The test file may lack categories seen during training, so a selected
    # one-hot column can be missing; an absent dummy column is all zeros.
    X_sel = X.reindex(columns=features[:n_features], fill_value=0)
    results[label] = clf.predict(X_sel)

In [26]:
# check count for each results
# Index.difference is the supported way to exclude columns; `columns - [...]`
# is no longer a set difference in current pandas.
for col in results.columns.difference(['ncodpers']):
    print(results[col].value_counts())

1    929615
Name: ind_cco_fin_ult1, dtype: int64
0    926942
1      2673
Name: ind_ctju_fin_ult1, dtype: int64
0    894020
1     35595
Name: ind_dela_fin_ult1, dtype: int64
0    928412
1      1203
Name: ind_pres_fin_ult1, dtype: int64


In [27]:
# now I need to fetch the last row for these in the training_data
# (reads the whole training file again into `df`)
df = pd.read_csv('train_ver2.csv')

In [28]:
# Are all ids from the test file present in the 2016-05-28 rows of the
# training data?  True only when every test row matches a distinct id.
is_last_month = df['fecha_dato'] == '2016-05-28'
ids_from_train = df.loc[is_last_month, 'ncodpers']
ids_from_test = results['ncodpers']
shared_ids = set(ids_from_test).intersection(ids_from_train)
len(shared_ids) == len(ids_from_test)

True

In [29]:
# label columns are the ones named ind_*ult1
labels = [col for col in df.columns
          if col.startswith('ind_') and col.endswith('ult1')]
print(len(labels))

24


In [30]:
# Rows of the 2016-05-28 snapshot that belong to test customers,
# restricted to the id and the label columns.
in_test = df['ncodpers'].isin(ids_from_test)
is_last_month = df['fecha_dato'] == '2016-05-28'
df_last = df.loc[in_test & is_last_month, ['ncodpers'] + labels]
df_last.shape

(929615, 25)

In [31]:
# Align the two frames on the customer id: with ncodpers as the index of both,
# pandas matches rows by id, so no explicit sorting is needed.
df_last = df_last.set_index('ncodpers')
results = results.set_index('ncodpers')

In [32]:
# get the added products: predicted status minus the last known status
# (1 = added, 0 = unchanged, -1 = removed); rows are matched on the index
fresults = results.copy()
# Index.difference is the supported way to exclude columns; `columns - [...]`
# is no longer a set difference in current pandas.
for col in results.columns.difference(['ncodpers']):
    fresults[col] = results[col] - df_last[col]

In [33]:
# export
# clean memory
del df
del df_last
del results

# transform results to expected output: a cell becomes the product name when
# the product was added (value 1) and an empty string otherwise.  np.where
# replaces the removed .ix indexer and also maps missing values to ''.
product_cols = list(fresults.columns)
for label in product_cols:
    fresults[label] = np.where(fresults[label] == 1, label, '')


# concatenate results
def fn_join_columns(row):
    """Join the non-empty product names of one row with single spaces."""
    return ' '.join(name for name in row if name)


# add new column added products in report
fresults['added_products'] = fresults[product_cols].apply(fn_join_columns, axis=1)

# export
fresults['added_products'].to_csv('round5d.csv', header=True, index=True)


In [34]:
# Mean validation scores (4th item of each tuple) of the models kept in `om`.
list(zip(*om))[3]



(0.79187271794662695,
 0.51708392045849338,
 0.49185205436065599,
 0.42183351124274271)