In [2]:
### Helpers: scatter plot of the data projected on its first two principal components, plus feature clean-up
import itertools

import numpy as np
import pandas as pd

from matplotlib import pyplot as plt
import seaborn as sns

from sklearn.feature_selection import VarianceThreshold
from sklearn.preprocessing import normalize
from sklearn.decomposition import PCA

def principal_component_analysis(x_train):
    """
    Plot the training examples projected on their 2 first principal components.

    Principal Component Analysis (PCA) identifies the combination
    of attributes (principal components, or directions in the feature space)
    that account for the most variance in the data. The 2 first components
    are computed from the column-normalized features and every example is
    drawn in that plane, one marker/colour per class.

    Parameters
    ----------
    x_train : pandas.DataFrame
        Training examples including a "TARGET" column with the class label.
        The caller's frame is not modified (the column is dropped on a copy).

    Returns
    -------
    None
        The figure is written to "pca.pdf" and "pca.png" in the current
        working directory and then displayed.
    """

    # Extract the variable to be predicted
    y_train = x_train["TARGET"]
    x_train = x_train.drop(labels="TARGET", axis=1)
    # Sorted class values are paired with the labels below in order, so the
    # smallest class value gets the "Satisfied customer" label.
    classes = np.sort(np.unique(y_train))
    labels = ["Satisfied customer", "Unsatisfied customer"]

    # Normalize each feature to unit norm (vector length)
    x_train_normalized = normalize(x_train, axis=0)

    # Run PCA
    pca = PCA(n_components=2)
    x_train_projected = pca.fit_transform(x_train_normalized)

    # Visualize
    fig = plt.figure(figsize=(10, 7))
    ax = fig.add_subplot(1, 1, 1)
    colors = [(0.0, 0.63, 0.69), 'black']
    markers = ["o", "D"]
    for class_ix, marker, color, label in zip(
            classes, markers, colors, labels):
        # Positional boolean mask: independent of the frame's index labels.
        in_class = np.asarray(y_train == class_ix)
        ax.scatter(x_train_projected[in_class, 0],
                   x_train_projected[in_class, 1],
                   marker=marker, color=color, edgecolor='whitesmoke',
                   linewidth=1, alpha=0.9, label=label)
    # Build the legend once, after every class has been drawn.
    ax.legend(loc='best')
    ax.set_title(
        "Scatter plot of the training data examples projected on the "
        "2 first principal components")
    ax.set_xlabel("Principal axis 1 - Explains %.1f %% of the variance" % (
        pca.explained_variance_ratio_[0] * 100.0))
    ax.set_ylabel("Principal axis 2 - Explains %.1f %% of the variance" % (
        pca.explained_variance_ratio_[1] * 100.0))

    # Save through the figure handle and BEFORE showing it: once plt.show()
    # returns, pyplot's current figure may be gone and plt.savefig() would
    # write an empty canvas.
    fig.savefig("pca.pdf", format='pdf')
    fig.savefig("pca.png", format='png')
    plt.show()


def remove_feat_constants(data_frame):
    """
    Remove the features that contain only one value, since they are
    worthless for prediction.

    Parameters
    ----------
    data_frame : pandas.DataFrame
        Feature matrix, one column per feature.

    Returns
    -------
    pandas.DataFrame
        A new frame without the zero variance columns; the input frame is
        left untouched. A short summary is printed to stdout.
    """
    print("")
    print("Deleting zero variance features...")
    # The VarianceThreshold selector is only fitted, never used to transform:
    # transforming would turn the pandas frame into a Numpy array and we
    # want to keep the frame, so the columns are dropped by name instead.
    n_features_originally = data_frame.shape[1]
    selector = VarianceThreshold()
    selector.fit(data_frame)
    # Boolean mask over the columns: True for the features the selector keeps
    keep_mask = selector.get_support()
    feat_names_delete = data_frame.columns[~keep_mask]
    # delete zero variance features from the original pd frame
    data_frame = data_frame.drop(labels=feat_names_delete, axis=1)
    # Print info
    n_features_deleted = len(feat_names_delete)
    # Plain float arithmetic: the np.float alias no longer exists in
    # current NumPy releases.
    print("  - Deleted %s / %s features (~= %.1f %%)" % (
        n_features_deleted, n_features_originally,
        100.0 * n_features_deleted / n_features_originally))
    return data_frame


def remove_feat_identicals(data_frame):
    """
    Find the features having the same values in the same order and remove
    all but the first of each group of redundant features.

    Parameters
    ----------
    data_frame : pandas.DataFrame
        Feature matrix, one column per feature.

    Returns
    -------
    pandas.DataFrame
        A new frame without the duplicated columns; the input frame is left
        untouched. A short summary is printed to stdout.
    """
    print("")
    print("Deleting identical features...")
    n_features_originally = data_frame.shape[1]
    # Go through all the combinations of features (each pair is compared
    # only once, the earlier column first).
    feat_names_delete = set()
    for feat_1, feat_2 in itertools.combinations(data_frame.columns, 2):
        # A column already marked as a duplicate needs no further checks:
        # everything equal to it is also equal to the earlier column it
        # duplicates and is caught when that column is compared.
        if feat_1 in feat_names_delete or feat_2 in feat_names_delete:
            continue
        if np.array_equal(data_frame[feat_1], data_frame[feat_2]):
            feat_names_delete.add(feat_2)
    # Delete the identical features
    data_frame = data_frame.drop(labels=list(feat_names_delete), axis=1)
    n_features_deleted = len(feat_names_delete)
    # Plain float arithmetic (the np.float alias no longer exists in current
    # NumPy releases); a frame without columns reports 0 %.
    if n_features_originally:
        pct_deleted = 100.0 * n_features_deleted / n_features_originally
    else:
        pct_deleted = 0.0
    print("  - Deleted %s / %s features (~= %.1f %%)" % (
        n_features_deleted, n_features_originally, pct_deleted))
    return data_frame


In [3]:
# if __name__ == "__main__":
#     x_train = remove_feat_constants(x_train)
#     x_train = remove_feat_identicals(x_train)

# principal_component_analysis(x_train)

In [4]:
###inspecting the most important feature, var3
from sklearn.neighbors import KNeighborsClassifier
training = pd.read_csv('/Users/bohun/Documents/kaggleproject/santandertrain.csv', index_col=0)

# Every column but the last one is used as a predictor
# (assumes TARGET is the last column of the file -- TODO confirm).
X = training.iloc[:,:-1]
y = training.TARGET

from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2, f_classif

##select the most important 220 features
selectK = SelectKBest(f_classif, k=220)
selectK.fit(X, y)
X_sel = selectK.transform(X)

features = X.columns[selectK.get_support()]

##treating -999999 as missing data, let's impute the new value with k-nearest neighbors.
# Rows where var3 carries the missing-value marker are the ones to predict.
var3_missing = training['var3'] == -999999
# Exclude var3 by name instead of by position: features[1:] only works when
# var3 happens to be the first selected feature, and otherwise would feed
# var3 itself to the model that is supposed to predict it.
predictors = features[features != 'var3']
X_train = training.loc[~var3_missing, predictors]
y_train = training.loc[~var3_missing, 'var3']
X_test = training.loc[var3_missing, predictors]
kclf = KNeighborsClassifier(n_neighbors=20)
kclf.fit(X_train, y_train)
yvar3 = kclf.predict(X_test)
yvar3
##we should replace -999999 with 2.

 189 192 220 222 234 238 244 248 261 262 303 307 315 319 327 349] are constant.


array([2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2])

In [5]:
# Merge the train and test sets, impute var3 and clean the features
import itertools
import numpy as np
import pandas as pd
from sklearn.feature_selection import VarianceThreshold
from sklearn.preprocessing import normalize
from sklearn.decomposition import PCA

x_train = pd.read_csv('/Users/bohun/Documents/kaggleproject/santandertrain.csv')
y_train = x_train["TARGET"]
# Plain array copy of the labels; later cells use it for splitting/fitting.
target = np.array(y_train)

x_train = x_train.drop(labels=["TARGET","ID"], axis=1)

x_test = pd.read_csv('/Users/bohun/Documents/kaggleproject/santandertest.csv')
# Keep the test IDs aside for the submission file.
y_ID = x_test["ID"]
x_test = x_test.drop(labels=["ID"], axis=1)

# Stack train on top of test so both go through the same feature clean-up.
# pd.concat replaces DataFrame.append, which newer pandas no longer provides;
# like append, it keeps the original row labels.
fulldata = pd.concat([x_train, x_test])
# Assign the result back instead of calling replace(inplace=True) on a
# column selection, which may act on a temporary copy.
fulldata['var3'] = fulldata['var3'].replace(-999999, 2)
fulldata = remove_feat_constants(fulldata)
fulldata = remove_feat_identicals(fulldata)


Deleting zero variance features...
  - Deleted 34 / 369 features (~= 9.2 %)

Deleting identical features...
  - Deleted 27 / 335 features (~= 8.1 %)


In [6]:
##logistic regression on the training set
# fulldata holds the training rows followed by the test rows; split it back
# at the number of training labels instead of a hard-coded row count.
n_train = len(target)
train = fulldata.iloc[:n_train]
test = fulldata.iloc[n_train:]

# NOTE(review): each set is scaled by its own column norms, so train and test
# do not share exactly the same scaling -- confirm this is intended.
train_normalized = normalize(train, axis=0)
test_normalized = normalize(test, axis=0)

print (train_normalized.shape, test_normalized.shape)

# fit_transform already fits the model; a separate fit() beforehand only
# repeats the decomposition.
pca = PCA(n_components=3)
train_projected = pca.fit_transform(train_normalized)
test_projected = pca.transform(test_normalized) # transformation of test set into our PCAs

print (train_projected.shape, test_projected.shape, target.shape)

import sklearn.linear_model as lm
logit = lm.LogisticRegression(C=1e5,  class_weight='balanced')
# Fit against `target` (the full label vector): `y_train` is rebound to a
# 70 % split by later cells, which breaks this cell when it is re-run.
logit.fit(train_projected, target)
logit.score(train_projected, target)

((76020, 308), (75818, 308))
((76020, 3), (75818, 3), (76020,))


0.20259142330965535

In [200]:
##SVM with balanced classes on top 3 PCs
from sklearn import svm
wclf = svm.SVC(kernel='poly', degree = 3, class_weight='balanced', C = 0.1)
wclf.fit(train_projected, y_train)
# Single-argument print(...) behaves the same on Python 2 and Python 3,
# whereas the print statement is a SyntaxError on Python 3.
print(wclf.n_support_)
print(wclf.score(train_projected, y_train))

[73012  3008]
0.0395685345962


In [28]:
##Gaussian Naive Bayes on top 3 PCs
from sklearn import svm
# sklearn.cross_validation / sklearn.grid_search only exist in old
# scikit-learn releases; prefer their replacement and fall back to them.
try:
    from sklearn.model_selection import GridSearchCV, train_test_split
except ImportError:
    from sklearn.cross_validation import train_test_split
    from sklearn.grid_search import GridSearchCV
from sklearn.metrics import classification_report
from sklearn.svm import SVC

# Stratified 70/30 hold-out split of the projected training data.
# No random_state is set, so the split differs from run to run.
X_train, X_test, y_train, y_test = train_test_split(train_projected, target , test_size=0.3, stratify = target)
from sklearn import naive_bayes
gnb = naive_bayes.GaussianNB()
gnb.fit(X_train, y_train)
# print(...) with one argument works on both Python 2 and Python 3.
print(gnb.score(X_train, y_train))
print(gnb.score(X_test, y_test))


##fitting gnb on the full data set##
gnb.fit(train_projected, target)


print(gnb.predict_proba(test_projected).shape)

0.952625248995
0.952775585372
(75818, 2)


In [5]:
from sklearn import svm
# sklearn.cross_validation / sklearn.grid_search only exist in old
# scikit-learn releases; prefer their replacement and fall back to them.
try:
    from sklearn.model_selection import GridSearchCV, train_test_split
except ImportError:
    from sklearn.cross_validation import train_test_split
    from sklearn.grid_search import GridSearchCV
from sklearn.metrics import classification_report
from sklearn.svm import SVC

# Stratified 70/30 hold-out split; the grid search only sees the 70 % part.
X_train, X_test, y_train, y_test = train_test_split(train_projected, target , test_size=0.3, stratify = target)

parameters = [{'kernel': ['rbf'], 'gamma': [1e-3, 1e-4],
                     'C': [1, 10, 100]},
                    {'kernel': ['poly'], 'degree': [3,4,5], 'C': [1e-4,1e-2,1e-1]}]
grclf = GridSearchCV(svm.SVC(class_weight='balanced'), parameters, cv=5, verbose = 2, n_jobs=12)
grclf.fit(X_train, y_train)

# print("") gives a blank line on Python 2 and 3 alike; a bare print() call
# prints "()" on Python 2.
print("Best parameters set found on development set:")
print("")
print(grclf.best_params_)
print("")
print("Grid scores on development set:")
print("")
# Newer scikit-learn exposes the per-candidate results as cv_results_;
# older releases only have grid_scores_.
if hasattr(grclf, "cv_results_"):
    grid_summary = zip(grclf.cv_results_["mean_test_score"],
                       grclf.cv_results_["std_test_score"],
                       grclf.cv_results_["params"])
else:
    grid_summary = [(mean_score, scores.std(), params)
                    for params, mean_score, scores in grclf.grid_scores_]
for mean_score, std_score, params in grid_summary:
    print("%0.3f (+/-%0.03f) for %r" % (mean_score, std_score * 2, params))

Fitting 5 folds for each of 15 candidates, totalling 75 fits


[Parallel(n_jobs=12)]: Done  17 tasks      | elapsed:  9.6min
[Parallel(n_jobs=12)]: Done  75 out of  75 | elapsed: 27.3min finished


Best parameters set found on development set:
()
{'kernel': 'rbf', 'C': 10, 'gamma': 0.001}
()
Grid scores on development set:
()
0.408 (+/-0.902) for {'kernel': 'rbf', 'C': 1, 'gamma': 0.001}
0.408 (+/-0.902) for {'kernel': 'rbf', 'C': 1, 'gamma': 0.0001}
0.776 (+/-0.737) for {'kernel': 'rbf', 'C': 10, 'gamma': 0.001}
0.776 (+/-0.737) for {'kernel': 'rbf', 'C': 10, 'gamma': 0.0001}
0.114 (+/-0.022) for {'kernel': 'rbf', 'C': 100, 'gamma': 0.001}
0.592 (+/-0.902) for {'kernel': 'rbf', 'C': 100, 'gamma': 0.0001}
0.592 (+/-0.902) for {'kernel': 'poly', 'C': 0.0001, 'degree': 3}
0.592 (+/-0.902) for {'kernel': 'poly', 'C': 0.0001, 'degree': 4}
0.592 (+/-0.902) for {'kernel': 'poly', 'C': 0.0001, 'degree': 5}
0.408 (+/-0.902) for {'kernel': 'poly', 'C': 0.01, 'degree': 3}
0.408 (+/-0.902) for {'kernel': 'poly', 'C': 0.01, 'degree': 4}
0.408 (+/-0.902) for {'kernel': 'poly', 'C': 0.01, 'degree': 5}
0.408 (+/-0.902) for {'kernel': 'poly', 'C': 0.1, 'degree': 3}
0.408 (+/-0.902) for {'kernel'

In [7]:
# Score the estimator selected by the grid search on the held-out split.
y_true = y_test
y_pred = grclf.predict(X_test)
report = classification_report(y_true, y_pred)
print(report)

             precision    recall  f1-score   support

          0       0.00      0.00      0.00     21904
          1       0.04      1.00      0.08       902

avg / total       0.00      0.04      0.00     22806



  'precision', 'predicted', average, warn_for)


In [14]:
# Best mean cross-validation score and the estimator that reached it.
# print(...) with one argument works on both Python 2 and Python 3.
print(grclf.best_score_)
print(grclf.best_estimator_)

0.77626188597
SVC(C=10, cache_size=200, class_weight='balanced', coef0=0.0,
  decision_function_shape=None, degree=3, gamma=0.001, kernel='rbf',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False)


In [30]:
# Class probabilities of the test set from the Gaussian NB model.
pred = gnb.predict_proba(test_projected)
# Sanity check: predictions, projected test set and IDs must have the same
# number of rows. print(...) works on both Python 2 and Python 3.
print(pred.shape)
print(test_projected.shape)
print(y_ID.shape)
# Second probability column of every row, taken with a slice instead of a
# Python loop over the rows.
submission = pd.DataFrame({"ID":y_ID, "TARGET":pred[:, 1]})
submission.to_csv("/Users/bohun/Documents/kaggleproject/submission.csv", index=False)
print(submission[1:10])

(75818, 2)
(75818, 3)
(75818,)
   ID        TARGET
1   5  8.124306e-02
2   6  8.170998e-02
3   7  8.414321e-02
4   9  8.231059e-02
5  11  8.405969e-02
6  12  3.344488e-15
7  15  8.441928e-02
8  16  3.520436e-02
9  17  7.783856e-02


In [10]:
###NARROWED GRID###
# New stratified 70/30 split; no random_state, so it differs from the split
# used by the first grid search.
X_train, X_test, y_train, y_test = train_test_split(train_projected, target , test_size=0.3, stratify = target)

narrowparameters = [{'kernel': ['rbf'], 'gamma': [1e-3, 1e-2],
                     'C': [0.1, 0.5, 5]}]
nrclf = GridSearchCV(svm.SVC(class_weight='balanced'), param_grid = narrowparameters, cv=5, verbose = 2, n_jobs=30)
nrclf.fit(X_train, y_train)
# print(...) with one argument works on both Python 2 and Python 3.
print(nrclf.best_score_)
print(nrclf.best_estimator_)


Fitting 5 folds for each of 6 candidates, totalling 30 fits
0.77626188597
SVC(C=5, cache_size=200, class_weight='balanced', coef0=0.0,
  decision_function_shape=None, degree=3, gamma=0.001, kernel='rbf',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False)
[CV] kernel=rbf, C=0.1, gamma=0.001 ..................................
[CV] kernel=rbf, C=0.1, gamma=0.001 ..................................
[CV] kernel=rbf, C=0.1, gamma=0.001 ..................................
[CV] kernel=rbf, C=0.1, gamma=0.001 ..................................
[CV] kernel=rbf, C=0.1, gamma=0.001 ..................................
[CV] kernel=rbf, C=0.1, gamma=0.01 ...................................
[CV] kernel=rbf, C=0.1, gamma=0.01 ...................................
[CV] kernel=rbf, C=0.1, gamma=0.01 ...................................
[CV] kernel=rbf, C=0.1, gamma=0.01 ...................................
[CV] kernel=rbf, C=0.1, gamma=0.01 .........................

[Parallel(n_jobs=30)]: Done  30 out of  30 | elapsed: 24.1min finished


In [20]:
# Summary of the first grid search (grclf), scored on the current X_test /
# y_test split. print(...) works on both Python 2 and Python 3.
print(grclf.best_score_)
print(grclf.best_estimator_)
y_true, y_pred = y_test, grclf.predict(X_test)
print(classification_report(y_true, y_pred))

0.77626188597
SVC(C=5, cache_size=200, class_weight='balanced', coef0=0.0,
  decision_function_shape=None, degree=3, gamma=0.001, kernel='rbf',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False)
             precision    recall  f1-score   support

          0       0.00      0.00      0.00     21904
          1       0.04      1.00      0.08       902

avg / total       0.00      0.04      0.00     22806



In [11]:
# Summary of the narrowed grid search (nrclf) on the held-out split.
# The stray "." after best_score_ was a syntax error; print(...) works on
# both Python 2 and Python 3.
print(nrclf.best_score_)
print(nrclf.best_estimator_)
y_true, y_pred = y_test, nrclf.predict(X_test)
print(classification_report(y_true, y_pred))

0.77626188597
SVC(C=5, cache_size=200, class_weight='balanced', coef0=0.0,
  decision_function_shape=None, degree=3, gamma=0.001, kernel='rbf',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False)
             precision    recall  f1-score   support

          0       0.00      0.00      0.00     21904
          1       0.04      1.00      0.08       902

avg / total       0.00      0.04      0.00     22806



  'precision', 'predicted', average, warn_for)


In [None]:
from sklearn import svm

# Refit the configuration chosen by the narrowed grid search on the whole
# training set, with probability estimates enabled. Only the non-default
# settings are spelled out: the rest of the pasted repr were library
# defaults, and decision_function_shape=None is rejected by newer
# scikit-learn releases.
final_svc = svm.SVC(C=5, gamma=0.001, kernel='rbf', class_weight='balanced',
                    probability=True)
pred = final_svc.fit(train_projected, target).predict_proba(test_projected)

# Sanity check: predictions, projected test set and IDs must have the same
# number of rows. print(...) works on both Python 2 and Python 3.
print(pred.shape)
print(test_projected.shape)
print(y_ID.shape)
# Second probability column of every row, taken with a slice instead of a
# Python loop over the rows.
submission = pd.DataFrame({"ID":y_ID, "TARGET":pred[:, 1]})
# NOTE(review): same output path as the Gaussian NB submission cell, so this
# overwrites that file.
submission.to_csv("/Users/bohun/Documents/kaggleproject/submission.csv", index=False)
print(submission[1:10])