In [2]:
# This is my personal data folder. Change to your own
dir = "/Users/piromast/Documents/Kaggle/SantanderData/"

In [3]:
# Import necessary stuff
from pandas import *
import numpy as np
import os.path
set_option("display.max_columns",50)

In [4]:
# Import data (first 1000 rows for now)
data=read_csv(os.path.join(dir,'train_ver2.csv'), nrows = 300000, skipinitialspace=True)

In [5]:
# Remove nans
#data.dropna(axis=1, how='all', inplace=True)
#data.dropna(how='any', inplace=True)
#data.head()

In [6]:
# Import description table (describing the meaning of each column)
description = read_csv(os.path.join(dir,'column_description.csv'),encoding = 'latin1')

In [7]:
# Select product column names and build indeces range
column_names = data.columns.values
product_names = [(i, s) for i, s in enumerate(column_names) if 'ind_' in s and '_ult1' in s]
product_indeces = [e[0] for e in product_names]
mInd = min(product_indeces)
MInd = max(product_indeces)
product_indeces_range = range(mInd,MInd)

In [8]:
# Build a dataframe only with products
product_df = data.iloc[:,product_indeces_range]
products_array = product_df.values

In [9]:
# Build a Series with number of products and use description as indeces
desclist=description.iloc[product_indeces_range,1].tolist()
num_products = Series(product_df.sum().tolist(),index=desclist)
num_products

Saving Account                  0.0
Guarantees                      1.0
Current Accounts           268266.0
Derivada Account               16.0
Payroll Account             16664.0
Junior Account               2150.0
Màs particular Account       7012.0
particular Account             39.0
particular Plus Account      1182.0
Short-term deposits           960.0
Medium-term deposits           99.0
Long-term deposits           6347.0
e-account                   18213.0
Funds                        1018.0
Mortgage                       28.0
Pensions                      349.0
Loans                          36.0
Taxes                        5073.0
Credit Card                  4928.0
Securities                   1395.0
Home Account                   13.0
Payroll                     10092.0
Pensions                    10884.0
dtype: float64

In [10]:
# Let's start scikit-learning
import sklearn.preprocessing, sklearn.pipeline

In [11]:
# Make dummy variable representation (onehot) for categorical variables
# Option 1: user label encoder on each column and onehot encoder on entire dataframe.
# This produces an encoding that will probably depend on the number of rows
pandas.options.mode.chained_assignment = None

le = sklearn.preprocessing.LabelEncoder()
oh = sklearn.preprocessing.OneHotEncoder()

# Select categorical variables
CatVariables = data.iloc[:,(description.iloc[:,2]==2).tolist()]

# Store label encoder for each column in a list (to undo encoding)
label_encoder_list = []

# Apply lable encoder to each column. Then apply onehot encoder to all the data array
for i, colname in enumerate(CatVariables):
    col = CatVariables[colname].tolist()
    
    CatVariables.iloc[:,i] = le.fit_transform(col)
    label_encoder_list.append(le)
    
DummyVariables = oh.fit_transform(CatVariables.values)

In [12]:
# # Make dummy variable representation (onehot) for categorical variables
# # Option 2: apply label encoder and onehot encoding on each column seperately
# # I use a pipeline here. All this doesn't work at the moment
# 
# pandas.options.mode.chained_assignment = None
# 
# le = sklearn.preprocessing.LabelEncoder()
# oh = sklearn.preprocessing.OneHotEncoder()
# encoder = sklearn.pipeline.Pipeline([('LabelEncoder',le),('OnehotEncoder',oh)])
# 
# # Select categorical variables
# CatVariables = data.iloc[:,(description.iloc[:,2]==2).tolist()]
# encoder_list = []
# for i, colname in enumerate(CatVariables):
#     le = sklearn.preprocessing.LabelEncoder()
#     oh = sklearn.preprocessing.OneHotEncoder()
#     #encoder = sklearn.pipeline.Pipeline([('LabelEncoder',le),('OnehotEncoder',oh)])
#     col = CatVariables[colname].tolist()
# 
#     CatVariables.iloc[:,i] = le.fit_transform(col)
# 
#     encoder_list.append(le)

In [31]:
# Option 3: Use pandas.get_dummies(). This seams to be the best option
CatVariables = data.iloc[:,(description.iloc[:,2]==2).tolist()]
DummyVariables=pandas.get_dummies(CatVariables)

In [32]:
# Build feature array including categorical features and continuous data
ContinuousVariables = data.iloc[:,(description.iloc[:,2]==1).tolist()].values
from sklearn import preprocessing
# Impute missing values
imputer = sklearn.preprocessing.Imputer(strategy="mean",axis=0)
ContinuousVariables = imputer.fit_transform(ContinuousVariables)

# Standardize continuous data
scaler = preprocessing.StandardScaler()
ContinuousVariables = scaler.fit_transform(ContinuousVariables)

# Build feature 
FeatureMatrix=hstack([DummyVariables, ContinuousVariables.astype(np.float64)])

FeatureMatrix = imputer.fit_transform(FeatureMatrix)

FeatureMatrix.shape

(300000, 188)

In [15]:
# Delete useless memory intensive stuff
del(DummyVariables)
del(ContinuousVariables)

In [16]:
from sklearn.linear_model import LogisticRegression
logreg = LogisticRegression(C=1e5, penalty='l1')
single_product = products_array[:,12]
logreg.fit(FeatureMatrix,single_product)
prediction = logreg.predict(FeatureMatrix)

In [17]:
from sklearn.metrics import classification_report
classification_report(single_product, prediction)

  'precision', 'predicted', average, warn_for)


'             precision    recall  f1-score   support\n\n        0.0       0.94      1.00      0.97    281787\n        1.0       0.00      0.00      0.00     18213\n\navg / total       0.88      0.94      0.91    300000\n'

In [18]:
from sklearn.model_selection import train_test_split, GridSearchCV
X_train, X_test, y_train, y_test = train_test_split(FeatureMatrix, single_product, test_size=0.4, random_state=0)
tuned_parameters = [{'C': [1, 10, 100, 1000, 1e5, 1e6]}]

scores = ['precision', 'recall']

for score in scores:
    print("# Tuning hyper-parameters for %s" % score)
    print()

    clf = GridSearchCV(logreg, tuned_parameters, cv=5,
                       scoring='%s_macro' % score)
    clf.fit(X_train, y_train)

    print("Best parameters set found on development set:")
    print()
    print(clf.best_params_)
    print()
    print("Grid scores on development set:")
    print()
    means = clf.cv_results_['mean_test_score']
    stds = clf.cv_results_['std_test_score']
    for mean, std, params in zip(means, stds, clf.cv_results_['params']):
        print("%0.3f (+/-%0.03f) for %r"
              % (mean, std * 2, params))
    print()

    print("Detailed classification report:")
    print()
    print("The model is trained on the full development set.")
    print("The scores are computed on the full evaluation set.")
    print()
    y_true, y_pred = y_test, clf.predict(X_test)
    print(classification_report(y_true, y_pred))
    print()

# Tuning hyper-parameters for precision



  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision'

Best parameters set found on development set:

{'C': 1}

Grid scores on development set:

0.470 (+/-0.000) for {'C': 1}
0.470 (+/-0.000) for {'C': 10}
0.470 (+/-0.000) for {'C': 100}
0.470 (+/-0.000) for {'C': 1000}
0.470 (+/-0.000) for {'C': 100000.0}
0.470 (+/-0.000) for {'C': 1000000.0}

Detailed classification report:

The model is trained on the full development set.
The scores are computed on the full evaluation set.

             precision    recall  f1-score   support

        0.0       0.94      1.00      0.97    112733
        1.0       0.00      0.00      0.00      7267

avg / total       0.88      0.94      0.91    120000


# Tuning hyper-parameters for recall



  'precision', 'predicted', average, warn_for)


Best parameters set found on development set:

{'C': 1}

Grid scores on development set:

0.500 (+/-0.000) for {'C': 1}
0.500 (+/-0.000) for {'C': 10}
0.500 (+/-0.000) for {'C': 100}
0.500 (+/-0.000) for {'C': 1000}
0.500 (+/-0.000) for {'C': 100000.0}
0.500 (+/-0.000) for {'C': 1000000.0}

Detailed classification report:

The model is trained on the full development set.
The scores are computed on the full evaluation set.

             precision    recall  f1-score   support

        0.0       0.94      1.00      0.97    112733
        1.0       0.00      0.00      0.00      7267

avg / total       0.88      0.94      0.91    120000




  'precision', 'predicted', average, warn_for)
