In [3]:
#=========================================LOGGING
import logging
# create logger
# Root logging is configured to write DEBUG-and-above records to loglog.log
# using a "timestamp; level;  message" layout; the notebook then logs through
# its own named logger, also set to DEBUG.
logging.basicConfig(
    filename='loglog.log',
    level=logging.DEBUG,
    format="%(asctime)s; %(levelname)s;  %(message)s",
)
logger = logging.getLogger("trainlo")
logger.setLevel(logging.DEBUG)

def info(msg):
    """Log ``msg`` at INFO level on ``logger`` as a single line.

    Every newline in ``msg`` is replaced by two spaces before logging, so one
    message occupies one line of the log file.
    """
    single_line = "  ".join(msg.split("\n"))
    logger.info(single_line)
#=========================================LOGGING

import sys
import pandas as pd
import numpy as np
from time import time
from scipy.optimize import fmin_powell
from sklearn.cross_validation import StratifiedKFold, KFold
from sklearn.metrics import accuracy_score
from sklearn.dummy import DummyClassifier, DummyRegressor
from sklearn.linear_model import LogisticRegression, LinearRegression, SGDClassifier, SGDRegressor, Perceptron
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier, RandomForestRegressor, ExtraTreesRegressor
from sklearn.svm import SVC, SVR
from xgboost.sklearn import XGBClassifier, XGBRegressor
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from persistent_cache import memo, PersistentDict as Perd
from sklearn.multiclass import OneVsRestClassifier
from sklearn.preprocessing import LabelBinarizer, LabelEncoder
from ml_metrics import quadratic_weighted_kappa

def eval_wrapper(yhat, y):
    """Score predictions against labels with ``quadratic_weighted_kappa``.

    ``y`` is converted to an integer array. ``yhat`` is rounded, clipped to
    the observed label range ``[min(y), max(y)]`` and converted to integers
    before both arrays are handed to ``quadratic_weighted_kappa``.
    """
    labels = np.array(y).astype(int)
    lowest, highest = np.min(labels), np.max(labels)
    rounded = np.round(np.array(yhat))
    predictions = np.clip(rounded, lowest, highest).astype(int)
    return quadratic_weighted_kappa(predictions, labels)


num_classes = 8  # not used in the visible cells; presumably the number of Response levels -- TODO confirm

print("Load the data using pandas")
train = pd.read_csv("train.csv")
test = pd.read_csv("test.csv")

# combine train and test so every feature is engineered identically on both.
# Row labels are kept as read, so labels may repeat between the two parts.
all_data = pd.concat([train, test])

# factorize categorical variables
all_data['Product_Info_2'] = pd.factorize(all_data['Product_Info_2'])[0]

# FEATURE ENGINEERING
all_data['bmi_ins_age'] = all_data.BMI * all_data.Ins_Age
# Count missing values per row, leaving Response out: it is missing for the
# test rows (they are later identified by the -1 filler), so counting it would
# shift nan_count by one between train and test.
all_data['nan_count'] = all_data.drop('Response', axis=1).isnull().sum(axis=1)
all_data['emp_inf_4_sq'] = all_data.Employment_Info_4 ** 2
all_data['fam_hist_4_sq'] = all_data.Family_Hist_4 ** 2
all_data['fam_hist_2_sq'] = all_data.Family_Hist_2 ** 2

# Sum the Medical_K* columns row-wise on the combined frame. Summing the
# train columns instead would be aligned by row label and hand test rows the
# totals of unrelated train rows. skipna=False keeps a missing keyword
# propagating to the total, as plain Series addition does.
mk = [col for col in train.columns if col.startswith("Medical_K")]
all_data['sum_keywords'] = all_data[mk].sum(axis=1, skipna=False)

# drop() targets rows by default and returns a new frame, so request the
# column axis explicitly and keep the result.
all_data = all_data.drop(['Medical_History_24', 'Medical_History_10'], axis=1)

print('Eliminate missing values')
# Use -1 for any others
all_data = all_data.fillna(-1)

# fix the dtype on the label column (rows without a label now hold -1)
all_data['Response'] = all_data['Response'].astype(int)

# Provide split column
# all_data['Split'] = np.random.randint(5, size=all_data.shape[0])

# split train and test: labelled rows have Response > 0, the rest carry the filler
train = all_data[all_data['Response'] > 0].copy()
test = all_data[all_data['Response'] < 1].copy()

Load the data using pandas
Eliminate missing values


In [8]:
# NOTE(review): DataFrame.drop acts on row labels unless a column axis is
# requested, and "Medical_Histo" looks like a truncated column name, so this
# call presumably raises rather than dropping anything. The value shown below
# this cell does not look like the result of this call -- TODO confirm the
# intent or remove the cell.
train.drop("Medical_Histo")

3

In [4]:
# Import the submodule explicitly: a bare `import scipy` is not guaranteed to
# make `scipy.stats` available as an attribute.
import scipy.stats


def corr(x, y):
    """Return the Pearson correlation coefficient between ``x`` and ``y``.

    Only the coefficient (the first element of the ``scipy.stats.pearsonr``
    result) is returned; the accompanying p-value is discarded.
    """
    return scipy.stats.pearsonr(x, y)[0]

In [10]:
# For every column, compare the Pearson correlation with Response of the raw
# column against that of its 2nd, 3rd and 4th powers, and list the columns
# where some power beats the raw column by more than 5% in absolute value.
# Printed row: the four coefficients (tab-separated), then the column name.
for c in train.columns:
    # Work in float so that powers of large integer columns cannot silently
    # wrap around in int64.
    values = train[c].astype(float)
    coefs = [corr(values ** power, train.Response) for power in (1, 2, 3, 4)]
    magnitudes = [abs(co) for co in coefs]
    if max(magnitudes) > 1.05 * magnitudes[0]:
        # Single pre-formatted string: prints identically as a Python 2
        # statement and as a Python 3 function call.
        print("%s %s" % ("\t".join("%.3f" % co for co in coefs), c))

0.012	-0.015	0.014	-0.014 Employment_Info_4
0.167	-0.178	0.166	-0.178 Family_Hist_2
0.157	-0.189	0.158	-0.184 Family_Hist_4
0.000	-0.000	0.000	-0.001 Id
-0.000	-0.002	-0.004	-0.005 Insurance_History_4
-0.006	-0.007	-0.007	-0.007 Insurance_History_7
0.007	0.009	0.009	0.010 Insurance_History_8
-0.006	-0.007	-0.007	-0.008 Insurance_History_9
0.036	0.042	0.045	0.047 Medical_History_11
-0.001	-0.001	-0.001	-0.001 Medical_History_25
-0.046	-0.054	-0.058	-0.061 Medical_History_8
-0.022	0.001	0.021	0.029 Product_Info_2


In [6]:
# List the columns whose cube correlates with Response more than 5% more
# strongly (in absolute value) than the raw column does.
# Printed row: raw correlation, cubed correlation, column name.
for c in train.columns:
    # Float avoids silent int64 wraparound when cubing large integer columns.
    values = train[c].astype(float)
    c1 = corr(values, train.Response)
    c2 = corr(values ** 3, train.Response)
    if abs(c2) > 1.05 * abs(c1):
        # Single pre-formatted string: prints identically as a Python 2
        # statement and as a Python 3 function call.
        print("%.3f   %.3f %s" % (c1, c2, c))

0.012   0.014 Employment_Info_4
-0.000   -0.004 Insurance_History_4
-0.006   -0.007 Insurance_History_7
0.007   0.009 Insurance_History_8
-0.006   -0.007 Insurance_History_9
0.036   0.045 Medical_History_11
-0.001   -0.001 Medical_History_25
-0.046   -0.058 Medical_History_8


In [7]:
# List the columns whose fourth power correlates with Response more than 5%
# more strongly (in absolute value) than the raw column does.
# Printed row: raw correlation, fourth-power correlation, column name.
for c in train.columns:
    # Float avoids silent int64 wraparound: the fourth power of a large
    # integer column can exceed the int64 range.
    values = train[c].astype(float)
    c1 = corr(values, train.Response)
    c2 = corr(values ** 4, train.Response)
    if abs(c2) > 1.05 * abs(c1):
        # Single pre-formatted string: prints identically as a Python 2
        # statement and as a Python 3 function call.
        print("%.3f   %.3f %s" % (c1, c2, c))

0.012   -0.014 Employment_Info_4
0.167   -0.178 Family_Hist_2
0.157   -0.184 Family_Hist_4
0.000   -0.001 Id
-0.000   -0.005 Insurance_History_4
-0.006   -0.007 Insurance_History_7
0.007   0.010 Insurance_History_8
-0.006   -0.008 Insurance_History_9
0.036   0.047 Medical_History_11
-0.001   -0.001 Medical_History_25
-0.046   -0.061 Medical_History_8
-0.022   0.029 Product_Info_2


In [None]:
# Pearson correlation with Response for BMI, for Ins_Age, and for their product.
(
    corr(train["BMI"], train["Response"]),
    corr(train["Ins_Age"], train["Response"]),
    corr(train["BMI"] * train["Ins_Age"], train["Response"]),
)

In [None]:
import pandas as pd
# Re-read both CSV files from disk.
# NOTE(review): this rebinds `train` and `test`, replacing any frames built by
# earlier cells; cells run after this one see the freshly read data.
train, test = (pd.read_csv(path) for path in ("train.csv", "test.csv"))

In [None]:
# Distinct values that occur in the Insurance_History_2 column.
set(train["Insurance_History_2"])

In [None]:
# Keep a handle on the BMI column of train.
tbmi = train["BMI"]

In [None]:
# Widen the display limits so wide frames are shown with all their columns
# (up to 999) while long frames are cut to 40 rows.
pd.set_option("display.max_columns", 999)
pd.set_option("display.max_rows", 40)

# Per-row count of missing values, stored on train as a new column.
train["NaN_count"] = train.isnull().sum(axis="columns")

# Show the rows whose Medical_History_1 is missing.
train[train["Medical_History_1"].isnull()]

In [17]:
# Correlation between the 8th power of Medical_History_8 and Response.
corr(train["Medical_History_8"] ** 8, train["Response"])

-0.063691052804812076

In [19]:
# Distinct values that occur in the Medical_History_8 column.
set(train["Medical_History_8"])

{1, 2, 3}

In [26]:
# Names of the train columns that start with "Medical_K".
mk = [name for name in train.columns if name.startswith("Medical_K")]

In [22]:
# Distinct values that occur in the Medical_Keyword_21 column.
set(train["Medical_Keyword_21"])

{0, 1}

In [30]:
# Display the Family_Hist_2 column of train.
train["Family_Hist_2"]

0    -1.000000
1     0.188406
2     0.304348
3     0.420290
4     0.463768
5    -1.000000
6     0.594203
7    -1.000000
8    -1.000000
9     0.797101
10   -1.000000
11    0.405797
12   -1.000000
13    0.420290
14   -1.000000
...
59366   -1.000000
59367   -1.000000
59368   -1.000000
59369   -1.000000
59370   -1.000000
59371   -1.000000
59372    0.681159
59373    0.275362
59374    0.405797
59375   -1.000000
59376    0.217391
59377    0.565217
59378    0.173913
59379   -1.000000
59380   -1.000000
Name: Family_Hist_2, Length: 59381, dtype: float64