In [13]:
import pandas as pd
import numpy as np
import time
%matplotlib inline
import matplotlib.pyplot as plt
from sklearn.cross_validation import StratifiedKFold
from sklearn import grid_search, linear_model, svm

#
dum_cols = ['T1_V4', 'T1_V5', 'T1_V6', 'T1_V7', 'T1_V8', 'T1_V9', 
            'T1_V11', 'T1_V12', 'T1_V15', 'T1_V16', 'T1_V17', 
            'T2_V3', 'T2_V5', 'T2_V11', 'T2_V12', 'T2_V13']
dum_cols_names = ['Dum_'+a for a in dum_cols]

n_fold = 10


def gini(list_of_values):
  sorted_list = sorted(list(list_of_values))
  height, area = 0, 0
  for value in sorted_list:
    height += value
    area += height - value / 2.
  fair_area = height * len(list_of_values) / 2
  return (fair_area - area) / fair_area
  
def normalized_gini(y_pred, y):
    normalized_gini = gini(y_pred)/gini(y)
    return normalized_gini

In [14]:

# Load dataset
trn_data = pd.read_csv('../Kaggle/train.csv', sep=',')
trn_data_X_old = trn_data.loc[:,trn_data.columns != 'Hazard']
trn_data_Y = trn_data.loc[:,'Hazard']
n_row_trn, ncol_trn = trn_data_X_old.shape

tst_data_X_old = pd.read_csv('../Kaggle/test.csv', sep=',')
#tst_data.head()
n_row_tst, n_col_tst = tst_data_X_old.shape


## Convert Categorical variables into dummies

# Concatenate trn and test X dataset
agg_data_X = pd.concat([trn_data_X_old, tst_data_X_old], axis=0)
#print(trn_data_X.shape, tst_data_X.shape, agg_data_X.shape)

for cid in range(len(dum_cols)):
    print('Converting categorical column %s. Number of distinct items : %d' \
          %(dum_cols[cid], len(agg_data_X[dum_cols[cid]].value_counts())))
    just_dummy = pd.get_dummies(agg_data_X[dum_cols[cid]], prefix=dum_cols_names[cid])
    # Drop the last dummy column
    just_dummy.drop(just_dummy.columns[[just_dummy.shape[1]-1]], axis=1, inplace=True)
    print(' - Converted : %d dummy columns are generated.' %(just_dummy.shape[1]))
    # Concatenate dummy columns into dataset
    agg_data_X = pd.concat([agg_data_X, just_dummy], axis=1)
    # Drop the origin column
    agg_data_X.drop(dum_cols[cid], axis=1, inplace=True)

# Split trn and test X dataset
trn_data_X = agg_data_X.iloc[0:n_row_trn, :]
tst_data_X = agg_data_X.iloc[n_row_trn:, :]
#print(trn_data_X_old.shape, trn_data_X.shape, tst_data_X_old.shape, tst_data_X.shape)



Converting categorical column T1_V4. Number of distinct items : 8
 - Converted : 7 dummy columns are generated.
Converting categorical column T1_V5. Number of distinct items : 10
 - Converted : 9 dummy columns are generated.
Converting categorical column T1_V6. Number of distinct items : 2
 - Converted : 1 dummy columns are generated.
Converting categorical column T1_V7. Number of distinct items : 4
 - Converted : 3 dummy columns are generated.
Converting categorical column T1_V8. Number of distinct items : 4
 - Converted : 3 dummy columns are generated.
Converting categorical column T1_V9. Number of distinct items : 6
 - Converted : 5 dummy columns are generated.
Converting categorical column T1_V11. Number of distinct items : 12
 - Converted : 11 dummy columns are generated.
Converting categorical column T1_V12. Number of distinct items : 4
 - Converted : 3 dummy columns are generated.
Converting categorical column T1_V15. Number of distinct items : 8
 - Converted : 7 dummy columns a

In [64]:

# Separating Trn dataset into 10 fold datasets
trn_data_Y_idx = pd.Series(np.zeros(n_row_trn))

trn_data_Y_idx[trn_data_Y == 1] = 1
trn_data_Y_idx[(trn_data_Y > 1) & (trn_data_Y <= 5)] = 5
trn_data_Y_idx[(trn_data_Y > 6) & (trn_data_Y <= 10)] = 10
trn_data_Y_idx[(trn_data_Y > 10) & (trn_data_Y <= 20)] = 20
trn_data_Y_idx[(trn_data_Y > 20) & (trn_data_Y <= 30)] = 30

kf = StratifiedKFold(trn_data_Y_idx, n_folds = n_fold)

t1 = time.time()
lasso_reg(kf, trn_data_X, trn_data_Y, n_fold)
print('Duration %.2f sec' %(time.time() - t1))

#trn_data.loc[3,'Id']
#trn_data[1:10]['Hazard']
#trn_data.loc[1:10,'Hazard':]
#trn_data.iloc[3:10,4:]



Alpha: 0.000010 | RSS: 14.6840  | Variance Score: 0.0918 | NormGini:  0.3454 
Alpha: 0.000050 | RSS: 14.6827  | Variance Score: 0.0918 | NormGini:  0.3451 
Alpha: 0.000090 | RSS: 14.6819  | Variance Score: 0.0919 | NormGini:  0.3448 
Alpha: 0.000100 | RSS: 14.6817  | Variance Score: 0.0919 | NormGini:  0.3448 
Alpha: 0.000200 | RSS: 14.6806  | Variance Score: 0.0920 | NormGini:  0.3440 
Alpha: 0.000300 | RSS: 14.6801  | Variance Score: 0.0920 | NormGini:  0.3434 
Alpha: 0.000500 | RSS: 14.6798  | Variance Score: 0.0920 | NormGini:  0.3422 
Alpha: 0.000700 | RSS: 14.6795  | Variance Score: 0.0920 | NormGini:  0.3410 
Alpha: 0.000900 | RSS: 14.6795  | Variance Score: 0.0920 | NormGini:  0.3400 
Alpha: 0.000100 | RSS: 14.6817  | Variance Score: 0.0919 | NormGini:  0.3448 
Alpha: 0.001000 | RSS: 14.6797  | Variance Score: 0.0920 | NormGini:  0.3395 
Alpha: 0.001500 | RSS: 14.6815  | Variance Score: 0.0919 | NormGini:  0.3371 
Alpha: 0.003000 | RSS: 14.6903  | Variance Score: 0.0914 | NormG



In [49]:
# lss = ['a', 'b', 'c', 'd']
# lst = [a+'_1' for a in lss]
# print(lst)

# print(lss[0], lss[[1,2]])
# print(agg_data_X['T1_V4'].value_counts())
# print(len(agg_data_X['T1_V4'].value_counts()))


# lss.index('a')

param_results = pd.DataFrame(np.empty((0,4), float), columns=['Alpha', 'RSS', 'VarScore', 'NormGini'])
param_results.loc[len(param_results)] = [0.000001] + ['a',2,3]

a =[1,2,3]

print([3] + [ 3,2,4])

[3, 3, 2, 4]


In [63]:
# Lasso linear regression Model
def lasso_reg(kf, trn_data_X, trn_data_Y, n_fold):
    parameters = {'alpha':[0.00001, 0.00005, 0.00009, 0.0001, 0.0002, 0.0003, 0.0005, 0.0007, 0.0009, 0.0001, 0.001, 0.0015, 0.003, 0.004, 0.005, 0.006, 0.007, 0.008, 0.009, 0.01, 0.05, 0.1, 0.15]}
    param_results = pd.DataFrame(np.empty((0,4), float), columns=['Alpha', 'RSS', 'VarScore', 'NormGini'])
    
    # Training Predictive models
    for alpha_item in parameters['alpha']:
        clf = linear_model.Lasso(alpha = alpha_item, max_iter=2000)
        cv_out = np.empty((0,3), float)
        val_idx = 1
        for train, test in kf:
            #print('Fold %d. Training : Validation = %d : %d' %(val_idx, len(train), len(test)))
            clf.fit(trn_data_X.loc[train, :], trn_data_Y.loc[train])
            trn_data_predY = clf.predict(trn_data_X.loc[test, :])
            # mean square error
            
            #print(trn_data_predY.shape, trn_data_Y.loc[test].shape)
            rss                = np.mean((trn_data_predY - trn_data_Y.loc[test]) ** 2)
            varscore = clf.score(trn_data_X.loc[test, :],  trn_data_Y.loc[test])
            norm_gini =   normalized_gini(trn_data_predY,  trn_data_Y.loc[test])
            # Explained variance score: 1 is perfect prediction
            #print("RSS: %.2f  | Variance Score: %.2f | NormGini:  %.2f " %(rss, varscore, norm_gini))
            
            cv_out = np.append(cv_out, np.array([[rss, varscore, norm_gini]]), axis=0)
            val_idx = val_idx + 1
        
        avg_cv_out = cv_out.mean(axis=0)
        print('Alpha: %f | RSS: %.4f  | Variance Score: %.4f | NormGini:  %.4f '\
              %(alpha_item, avg_cv_out[0], avg_cv_out[1], avg_cv_out[2]))
        #print(np.insert(avg_cv_out, 0, alpha_item), [alpha_item], [alpha_item] + avg_cv_out, len([alpha_item] + avg_cv_out), param_results.shape)
        param_results.loc[len(param_results)] = np.insert(avg_cv_out, 0, alpha_item)
    
    best_param = param_results.idxmin()['NormGini']
    
    # Prediction
    
#     parameters = {'alpha':[0.01, 0.5]}
#     print('here?')
#     lsm = linear_model.Lasso()
#     clf = grid_search.GridSearchCV(lsm, parameters, cv=n_fold, n_jobs=2)
#     clf.fit(trn_data_X, trn_data_Y)
#     print(clf.best_params_, clf.best_score_, clf.best_estimator_)
#     print(clf.grid_scores_)

In [56]:

arr = np.empty((0,3), int)
arr = np.append(arr, np.array([[1,2,3]]), axis=0)
arr = np.append(arr, np.array([[4,5,6]]), axis=0)
print(arr)
print(arr.mean(axis=0))

np.insert(arr.mean(axis=0), 0, 0)

[[1 2 3]
 [4 5 6]]
[ 2.5  3.5  4.5]


array([ 0. ,  2.5,  3.5,  4.5])

In [258]:
param_results = pd.DataFrame(np.empty((0,3), float), columns=['Alpha', 'RSS', 'VarScore'])
param_results.loc[len(param_results)] = [1,2,3]
param_results.loc[len(param_results)] = [32423,2,3]
print(param_results)

param_results.idxmin()['RSS']

   Alpha  RSS  VarScore
0      1    2         3
1  32423    2         3


0