In [17]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn import linear_model
from sklearn.cross_validation import train_test_split

In [18]:
def gini(solution, submission):
    """Return the un-normalized Gini score of `submission` against `solution`.

    The (actual, predicted) pairs are ranked by predicted value, highest
    first, with ties broken by the actual value. For every rank position the
    function takes the cumulative share of the actual total captured so far
    minus the share of rows seen so far, and returns the sum of those gaps.

    Raises IndexError when the inputs are empty and ZeroDivisionError when
    the actual values sum to zero.
    """
    ranked = sorted(zip(solution, submission),
                    key=lambda pair: (pair[1], pair[0]), reverse=True)
    n_rows = len(ranked)
    total_actual = float(sum([actual for actual, _ in ranked]))

    # Walk the ranking once, keeping a running total of the actual values.
    running = ranked[0][0]
    gaps = [float(running) / total_actual - float(1) / float(n_rows)]
    for position in range(1, n_rows):
        running = running + ranked[position][0]
        gaps.append(float(running) / total_actual
                    - float(position + 1) / float(n_rows))
    return sum(gaps)

def normalized_gini(solution, submission):
    """Return the Gini score of `submission` scaled by the best achievable score.

    Both inputs are converted to numpy arrays. The result is
    gini(solution, submission) divided by gini(solution, solution), i.e. the
    score the actual values obtain when they are used as their own prediction.
    """
    actual = np.array(solution)
    predicted = np.array(submission)
    model_score = gini(actual, predicted)
    perfect_score = gini(actual, actual)
    return model_score / perfect_score

In [55]:
# Load the training and test tables; index_col=0 makes the first CSV column
# the DataFrame index instead of a regular column.
# NOTE(review): this cell's execution count (55) is higher than every cell
# below it, and the head()/describe() outputs below still list Id as an
# ordinary column, so those outputs appear to predate index_col=0 -- confirm
# with Restart Kernel -> Run All.
train_df = pd.read_csv("data/train.csv", index_col=0)
test_df = pd.read_csv("data/test.csv", index_col=0)

In [20]:
# Preview the first five training rows.
train_df.head(n=5)

Unnamed: 0,Id,Hazard,T1_V1,T1_V2,T1_V3,T1_V4,T1_V5,T1_V6,T1_V7,T1_V8,...,T2_V6,T2_V7,T2_V8,T2_V9,T2_V10,T2_V11,T2_V12,T2_V13,T2_V14,T2_V15
0,1,1,15,3,2,N,B,N,B,B,...,2,37,1,11,6,Y,N,E,2,2
1,2,4,16,14,5,H,B,N,B,B,...,2,22,1,18,5,Y,Y,E,2,1
2,3,1,10,10,5,N,K,N,B,B,...,6,37,2,14,6,Y,Y,E,6,1
3,4,1,18,18,5,N,K,N,B,B,...,2,25,1,1,6,Y,N,C,2,6
4,5,1,13,19,5,N,H,N,B,B,...,1,22,1,2,7,N,N,E,1,1


In [36]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric
# columns of test_df.
test_df.describe()

Unnamed: 0,Id,T1_V1,T1_V2,T1_V3,T1_V10,T1_V13,T1_V14,T2_V1,T2_V2,T2_V4,T2_V6,T2_V7,T2_V8,T2_V9,T2_V10,T2_V14,T2_V15
count,51000.0,51000.0,51000.0,51000.0,51000.0,51000.0,51000.0,51000.0,51000.0,51000.0,51000.0,51000.0,51000.0,51000.0,51000.0,51000.0,51000.0
mean,51070.270725,9.711294,12.884667,3.191059,7.049843,13.992451,1.584314,57.699647,12.450843,10.22051,1.95202,33.488765,1.032686,12.509745,4.482784,2.455412,3.482608
std,29574.299063,5.171077,6.230218,1.739954,3.592225,4.661216,0.864759,23.48859,4.821524,4.843046,0.792789,5.818182,0.193955,7.32902,1.894225,1.255015,3.06552
min,6.0,1.0,1.0,1.0,2.0,5.0,0.0,1.0,1.0,1.0,1.0,22.0,1.0,1.0,1.0,1.0,1.0
25%,25336.75,6.0,7.0,2.0,3.0,10.0,1.0,40.0,9.0,6.0,2.0,31.0,1.0,6.0,3.0,2.0,1.0
50%,51023.5,9.0,14.0,3.0,8.0,15.0,1.0,56.0,11.0,10.0,2.0,34.0,1.0,14.0,4.0,2.0,2.0
75%,76730.25,14.0,18.0,4.0,12.0,20.0,2.0,78.0,15.0,14.0,2.0,40.0,1.0,18.0,6.0,3.0,5.0
max,101997.0,19.0,24.0,9.0,12.0,20.0,4.0,100.0,39.0,22.0,7.0,40.0,3.0,25.0,7.0,7.0,12.0


In [22]:
# Count the missing (NaN) values across the whole training frame;
# a result of 0 means there is nothing to impute.
train_df.isnull().sum().sum()

0

In [23]:
# Target column of the training data.
# NOTE(review): `labels` is not referenced again in the visible cells;
# insurance_Y_train is assigned the same column further down.
labels = train_df["Hazard"]

In [35]:
# Columns fed to the model. These are exactly the numeric feature columns that
# describe() reports; the letter-coded columns (e.g. T1_V4, T2_V11) are left out.
FEATURE_COLUMNS = (
    ["T1_V%d" % i for i in [1, 2, 3, 10, 13, 14]]
    + ["T2_V%d" % j for j in [1, 2, 4, 6, 7, 8, 9, 10, 14, 15]]
)

# .loc replaces the deprecated .ix indexer (removed in pandas 1.0); selection
# here is purely label-based, so the result is the same. Building the column
# list once keeps the train and test matrices aligned column-for-column.
insurance_X_train = train_df.loc[:, FEATURE_COLUMNS].values
insurance_Y_train = train_df.Hazard
insurance_X_test = test_df.loc[:, FEATURE_COLUMNS].values

In [25]:
# Hold out 20% of the labelled rows to estimate out-of-sample performance.
# Note that test_x/test_y are this hold-out split, not the rows of data/test.csv.
# A fixed random_state makes the split, and therefore the reported Gini
# scores, reproducible across kernel restarts.
train_x, test_x, train_y, test_y = train_test_split(
    insurance_X_train, insurance_Y_train, test_size=.2, random_state=0)

In [26]:
# Fit an ordinary least-squares regression on the training split.
clf = linear_model.LinearRegression().fit(train_x, train_y)
clf  # fit() returns the estimator itself; show it as the cell output

LinearRegression(copy_X=True, fit_intercept=True, normalize=False)

In [27]:
# Predict on both splits so train and hold-out scores can be compared.
predictor_train = clf.predict(train_x)
predictor_test = clf.predict(test_x)

In [28]:
# Peek at the first five predictions of each split.
# A single parenthesized argument prints identically on Python 2 and also
# runs on Python 3, where the bare print statement is a SyntaxError.
print(predictor_test[:5])
print(predictor_train[:5])

[ 4.1774566   3.68994828  5.20781928  5.42888019  3.52642517]
[ 2.71205837  4.71421743  4.446244    3.32614483  4.95278783]


In [29]:
# Normalized Gini on the training split, then on the 20% hold-out split.
# print(...) with one argument works on both Python 2 and Python 3.
print(normalized_gini(train_y, predictor_train))
print(normalized_gini(test_y, predictor_test))

0.206626066993
0.18415880745


### Using Linear Regression

- Normalized Gini on the training split: 0.206626066993
- Normalized Gini on the held-out 20% split (`test_x`/`test_y`, not `data/test.csv`): 0.18415880745

In [52]:
# Score the test-set feature matrix with the fitted model and keep the index
# of test_df so each prediction can be paired with its row id.
# NOTE(review): despite its name, `predictors` holds the model's predictions.
predictors = clf.predict(insurance_X_test)
test_ind = test_df.index
# print(...) with one argument works on both Python 2 and Python 3.
print(test_ind)

Int64Index([6, 7, 8, 9, 10, 11, 13, 14, 16, 17, 18, 20, 27, 28, 29, 30, 34, 35, 36, 37, 38, 40, 42, 46, 47, 48, 49, 51, 52, 53, 54, 55, 56, 57, 60, 61, 64, 65, 66, 68, 70, 71, 75, 76, 78, 80, 81, 82, 83, 85, 86, 87, 89, 90, 91, 92, 95, 97, 99, 101, 102, 103, 104, 107, 108, 111, 112, 115, 119, 121, 124, 128, 130, 131, 133, 134, 137, 138, 142, 143, 144, 145, 147, 148, 152, 154, 158, 159, 160, 161, 164, 165, 166, 167, 169, 170, 171, 175, 177, 178, ...], dtype='int64')


In [53]:
# Sanity check: there should be exactly one prediction per test row id.
# print(...) with one argument works on both Python 2 and Python 3.
print(len(predictors))
print(len(test_ind))

51000
51000


In [54]:
# Build an Id-indexed table of predicted Hazard values and write it to CSV.
preds = pd.DataFrame({"Id": test_ind, "Hazard": predictors}).set_index('Id')
preds.to_csv('linregression_benchmark.csv')