In [192]:
import numpy as np
import pandas as pd
from sklearn import preprocessing as prep
from sklearn.base import TransformerMixin
from sklearn.cross_validation import train_test_split
from sklearn.linear_model import LogisticRegression as LR
from sklearn.svm import LinearSVC as LSVC

## Read in the data

In [193]:
data = pd.read_csv('data/train.csv')
    
pd.DataFrame(data.select_dtypes(include=['object'])).describe()

Unnamed: 0,v3,v22,v24,v30,v31,v47,v52,v56,v66,v71,v74,v75,v79,v91,v107,v110,v112,v113,v125
count,110864,113821,114321,54211,110864,114321,114318,107439,114321,114321,114321,114321,114321,114318,114318,114321,113939,59017,114244
unique,3,18210,5,7,3,10,12,122,3,9,3,4,18,7,7,3,22,36,90
top,C,AGDF,E,C,A,C,J,BW,A,F,B,D,C,A,E,A,F,G,BM
freq,110584,2386,55177,32178,88347,55425,11103,11351,70353,75094,113560,75087,34561,27079,27079,55688,21671,16252,5759


In [194]:
test = pd.read_csv('data/test.csv')

pd.DataFrame(test.select_dtypes(include=['object'])).describe()

Unnamed: 0,v3,v22,v24,v30,v31,v47,v52,v56,v66,v71,v74,v75,v79,v91,v107,v110,v112,v113,v125
count,110901,113916,114393,54051,110901,114393,114391,107475,114393,114393,114393,114393,114393,114391,114391,114393,113980,59035,114309
unique,3,18252,5,7,3,9,12,116,3,9,3,4,17,7,7,3,22,36,90
top,C,AGDF,E,C,A,C,J,BW,A,F,B,D,C,A,E,A,F,G,BM
freq,110640,2391,55179,32326,88590,55428,11349,11561,70375,75295,113668,75288,34705,26947,26947,55727,21717,16208,5855


## Fill in the NaN's

In [195]:
# taken from http://stackoverflow.com/questions/25239958/impute-categorical-missing-values-in-scikit-learn
#updated for ints

class DataFrameImputer(TransformerMixin):

    def __init__(self):
        """Impute missing values.

        Columns of dtype object are imputed with the most frequent value 
        in column.

        Columns of other types are imputed with mean of column.

        """
    def fit(self, X, y=None):

        self.fill = pd.Series([X[c].value_counts().index[0]
            if X[c].dtype == np.dtype('O') else X[c].mean() if X[c].dtype == np.dtype('float64') else X[c].median() 
                               for c in X],
            index=X.columns)

        return self

    def transform(self, X, y=None):
        return X.fillna(self.fill)
    
dataFilled = DataFrameImputer().fit_transform(data)
testFilled = DataFrameImputer().fit_transform(test)

## Transform categorical into integers

In [196]:
for col in dataFilled:
    if dataFilled[col].dtype == 'O':
        le = prep.LabelEncoder()
        le.fit(dataFilled[col])
        trns = le.transform(dataFilled[col])
        dataFilled.loc[:,col] = trns
        # !!! finish for the test set !!!
        try:
            trns = le.transform(testFilled[col])
            testFilled.loc[:,col] = trns
        except ValueError:
            pass

dataFilled.head()

ValueError: y contains new labels: ['A' 'AAAD' 'AAAI' ..., 'ZZT' 'ZZX' 'ZZY']

In [187]:
keys = set(dataFilled.keys())

keys.remove('ID')
keys.remove('target')

X = dataFilled[list(keys)]
scX = prep.scale(X)
y = dataFilled['target']

In [188]:
X_train, X_test, y_train, y_test = train_test_split(X, y)

lr = LR(n_jobs = 4,max_iter = 2000)
lr.fit(X_train,y_train)

lsvc = LSVC()
lsvc.fit(X_train,y_train)

LinearSVC(C=1.0, class_weight=None, dual=True, fit_intercept=True,
     intercept_scaling=1, loss='squared_hinge', max_iter=1000,
     multi_class='ovr', penalty='l2', random_state=None, tol=0.0001,
     verbose=0)

In [189]:
print "Linear regression score: %0.3f" % (lr.score(X_test, y_test))
print "SVM score: %0.3f" %(lsvc.score(X_test,y_test))

Linear regression score: 0.767
SVM score: 0.762


In [190]:
lr.predict_proba(X_test)

array([[ 0.13555987,  0.86444013],
       [ 0.31766775,  0.68233225],
       [ 0.46959273,  0.53040727],
       ..., 
       [ 0.0861855 ,  0.9138145 ],
       [ 0.36122978,  0.63877022],
       [ 0.11852804,  0.88147196]])

In [191]:
lsvc.predict(X_test)

array([1, 1, 1, ..., 1, 1, 1])