In [1]:
%matplotlib inline
from collections import Counter
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import sklearn as skl
import math

In [43]:
def get_info_D(n, feature):
    """Return the base-2 entropy, Info(D), of the values in ``feature``.

    n: denominator used to turn each distinct value's count into a
       probability (callers pass the number of items in ``feature``).
    feature: iterable of hashable values.

    An empty ``feature`` yields 0 because no term is accumulated.
    """
    entropy = 0
    # Only values that actually occur are counted, so prob is never 0
    # and the logarithm is always defined.
    for count in Counter(feature).values():
        prob = count / n
        entropy -= prob * math.log(prob, 2)
    return entropy

def get_gini_D(n, feature):
    """Return the Gini impurity, Gini(D), of the values in ``feature``.

    n: denominator used to turn each distinct value's count into a
       probability (callers pass the number of items in ``feature``).
    feature: iterable of hashable values.

    An empty ``feature`` yields 1 because nothing is subtracted.
    """
    impurity = 1
    # Gini(D) = 1 - sum(p_i ** 2) over the distinct values
    for count in Counter(feature).values():
        impurity -= math.pow(count / n, 2)
    return impurity

def get_info_gains(n, df):
    """Score every feature column of ``df`` as a candidate split.

    The last column of ``df`` is used as the class label. Every other column
    is treated as a categorical feature: rows are grouped by the column's
    distinct values and each group is weighted by ``len(group) / n``.

    Parameters
    ----------
    n : number
        Denominator for the partition weights and for Info(D); callers pass
        the number of rows in ``df``.
    df : pandas.DataFrame
        Feature columns followed by the class column.

    Returns
    -------
    tuple of three lists, each with one entry per feature column, in
    column order:
        * information gain, Info(D) - Info_A(D)
        * split information, SplitInfo_A(D)
        * weighted Gini index of the partition, Gini_A(D) -- the index
          itself, not the reduction Gini(D) - Gini_A(D), so lower is better
    """
    class_col = df.columns[-1]

    # Info(D) of the unsplit class column
    info_D = get_info_D(n, df[class_col])

    # Info_A(D), SplitInfo_A(D) and Gini_A(D), one entry per feature
    feature_cond_info = []
    feature_split_info = []
    feature_gini_indicies = []

    # Loop over attributes (everything except the class column)
    for feature_col in df.columns[:-1]:
        # Create summation bases
        feature_info = 0
        split_info = 0
        feature_gini = 0

        # Class labels of the rows sharing each distinct feature value
        for _, class_values in df.groupby(df[feature_col])[class_col]:
            num_keys = len(class_values)
            weight = num_keys / n

            split_info -= weight * math.log(weight, 2)
            feature_info += weight * get_info_D(num_keys, class_values)
            feature_gini += weight * get_gini_D(num_keys, class_values)

        feature_cond_info.append(feature_info)
        feature_split_info.append(split_info)
        feature_gini_indicies.append(feature_gini)

    # Gain(A) = Info(D) - Info_A(D)
    Gain_D = [info_D - info_A_D for info_A_D in feature_cond_info]

    # Return attribute info arrays
    return Gain_D, feature_split_info, feature_gini_indicies

def calc_gain_ratio(info_gains, split_info):
    """Return the gain ratio (information gain / split information) per feature.

    Parameters
    ----------
    info_gains : sequence of numbers
        Information gain of each feature.
    split_info : sequence of numbers
        Split information of each feature, in the same order.

    Returns
    -------
    list of float, one gain ratio per feature.

    Raises
    ------
    ValueError
        If the two sequences differ in length.

    A split information of zero (a feature with a single distinct value)
    produces a ratio of 0.0 instead of a ZeroDivisionError, since such a
    feature cannot separate the data.
    """
    if len(info_gains) != len(split_info):
        raise ValueError(
            "info_gains and split_info must have the same length "
            "({} != {})".format(len(info_gains), len(split_info))
        )

    return [
        gain / split if split != 0 else 0.0
        for gain, split in zip(info_gains, split_info)
    ]

def tup_index_val_list(info):
    """Pair every element of ``info`` with its position.

    Returns a list of ``(index, value)`` tuples in the original order, so
    the list can be sorted by value without losing track of where each
    value came from.
    """
    return [(position, info[position]) for position in range(len(info))]

def output(df, info_gains, gain_ratios, gini_indicies, top_n=5):
    """Print the best, worst and ranked features for three split criteria.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose column order matches the score lists; position ``i`` in
        each list is reported under the name ``df.columns[i]``.
    info_gains : list of numbers
        Information gain per feature (higher is better).
    gain_ratios : list of numbers
        Gain ratio per feature (higher is better).
    gini_indicies : list of numbers
        Gini index per feature (lower is better).
    top_n : int, optional
        How many features to list in the ranked sections. Clamped to the
        number of available scores so short inputs no longer raise
        IndexError. Defaults to 5.

    Each printed group of three lines is ordered: information gain,
    gain ratio, Gini index.
    """
    columns = df.columns

    print("Best splits")
    print(columns[info_gains.index(max(info_gains))])
    print(columns[gain_ratios.index(max(gain_ratios))])
    print(columns[gini_indicies.index(min(gini_indicies))])

    print()
    print("Worst Splits")
    print(columns[info_gains.index(min(info_gains))])
    print(columns[gain_ratios.index(min(gain_ratios))])
    print(columns[gini_indicies.index(max(gini_indicies))])

    print()
    print("Sorted")
    # (column position, score) pairs, best first for each criterion.
    # sorted() is stable, so ties keep their column order.
    rankings = (
        sorted(enumerate(info_gains), key=lambda pair: -pair[1]),
        sorted(enumerate(gain_ratios), key=lambda pair: -pair[1]),
        sorted(enumerate(gini_indicies), key=lambda pair: pair[1]),
    )

    # Never index past the shortest ranking
    shown = max(0, min([top_n] + [len(ranking) for ranking in rankings]))

    print("Top {}".format(shown))
    for rank in range(shown):
        print()
        for ranking in rankings:
            position, score = ranking[rank]
            print(columns[position], score)

    print()
    print("Bottom {}".format(shown))
    for rank in range(1, shown + 1):
        print()
        for ranking in rankings:
            position, score = ranking[-rank]
            print(columns[position], score)

In [8]:
trainlib = pd.read_csv("training.csv")
testlib = pd.read_csv("testing.csv")

# Combine libraries for data processing so both get identical feature engineering.
# DataFrame.append was removed in pandas 2.0; pd.concat is the supported equivalent.
all_data = pd.concat([trainlib, testlib])

# New variables taken from previous kaggle competition
# Split Product_Info_2 into its first and second characters, then integer-encode all three
all_data['Product_Info_2_char'] = all_data.Product_Info_2.str[0]
all_data['Product_Info_2_num'] = all_data.Product_Info_2.str[1]
all_data['Product_Info_2'] = pd.factorize(all_data['Product_Info_2'])[0]
all_data['Product_Info_2_char'] = pd.factorize(all_data['Product_Info_2_char'])[0]
all_data['Product_Info_2_num'] = pd.factorize(all_data['Product_Info_2_num'])[0]
all_data['BMI_Age'] = all_data['BMI'] * all_data['Ins_Age']
med_keyword_columns = all_data.columns[all_data.columns.str.startswith('Medical_Keyword_')]
all_data['Med_Keywords_Count'] = all_data[med_keyword_columns].sum(axis=1)
# Per-row count of missing values; must be taken before the NaNs are replaced below
all_data['countna'] = all_data.isnull().sum(axis=1)
all_data = all_data.fillna(-1)
all_data['Response'] = all_data['Response'].astype(int)

# Rows with a positive Response are used for training; the rest
# (presumably the testing.csv rows, whose missing Response became -1 -- confirm)
# are the rows to predict.
train_ohd = all_data[all_data['Response']>0].copy()
test_ohd = all_data[all_data['Response']<1].copy()

In [14]:
# Move the response column to the end of the frame
# (get_info_gains reads the class label from the last column)
cols = train_ohd.columns.tolist()  # every column name, in current order
cols.remove('Response')  # raises ValueError if the column is missing
train_ohd = train_ohd[cols + ['Response']]

In [15]:
# Split the label from the feature columns
target = train_ohd["Response"]
train_db = train_ohd.drop(columns=["Response"])
test_db = test_ohd.drop(columns=["Response"])

In [41]:
# Number of training rows; used as the denominator in get_info_gains
n = target.shape[0]

In [44]:
# Score every feature of the training frame, then derive the gain ratios.
# NOTE(review): train_ohd still contains the Id column, so it is scored as a
# feature and dominates the rankings below -- consider excluding it.
info_gains, split_info, gini_indicies = get_info_gains(n=n, df=train_ohd)
gain_ratios = calc_gain_ratio(info_gains=info_gains, split_info=split_info)

output(
    df=train_ohd,
    info_gains=info_gains,
    gain_ratios=gain_ratios,
    gini_indicies=gini_indicies,
)

Best splits
Id
Id
Id

Worst Splits
Medical_Keyword_44
Medical_Keyword_44
Medical_Keyword_45

Sorted
Top 5

Id 2.616780065604559
Id 0.18314898817189662
Id 0.0

BMI_Age 2.0528668717869367
BMI_Age 0.16090265225775663
BMI_Age 0.22118787455003652

BMI 0.7256017129926122
Medical_History_32 0.139683306354558
BMI 0.6424086001325987

Medical_History_15 0.25908939965983446
Medical_History_15 0.1196713313489524
Wt 0.7412373890626542

Wt 0.2580818055240641
Medical_Keyword_15 0.10749323919147963
Product_Info_4 0.7622367087289642

Bottom 5

Medical_Keyword_44 0.0002814021748429596
Medical_Keyword_44 0.0008035558175679701
Medical_Keyword_45 0.8072266433020672

Medical_Keyword_45 0.00035586226958050915
Medical_Keyword_45 0.0009186974884695719
Medical_Keyword_44 0.8072168659545849

Medical_Keyword_29 0.0005580852483113574
Medical_History_36 0.0011773692867803418
Medical_Keyword_6 0.8072015921526283

Medical_Keyword_32 0.0006401854305648769
Medical_History_26 0.0013117311465806825
Medical_Keyword_41 0.8

In [47]:
# Linear Regression
from sklearn import datasets, linear_model
from sklearn.metrics import mean_squared_error, r2_score

# Build the model and fit it on the training features in one step
# (fit returns the estimator itself)
regr = linear_model.LinearRegression().fit(train_db, target)

# Continuous predictions for the test rows
pred = regr.predict(test_db)
print(pred)

[7.36567285 6.08071657 7.22221344 ... 4.83641799 6.08147473 5.19269468]


In [48]:
def specify(x1,x2,x3,x4,x5,x6,x7, arr):
    '''
    Map each value in arr to an integer label from 1 to 8.

    A value receives label k (1-7) for the first cutoff xk, checked in the
    order x1..x7, that it is strictly below; a value below none of the
    cutoffs receives 8. Returns the labels as a list in input order.
    '''
    cutoffs = (x1, x2, x3, x4, x5, x6, x7)
    return [
        next(
            (label for label, cutoff in enumerate(cutoffs, start=1) if y < cutoff),
            8,
        )
        for y in arr
    ]

In [49]:
# Default cutoffs: midpoints between consecutive integer labels
# (preds is reassigned by the custom-cutoff cell that follows)
default_cutoffs = (1.5, 2.5, 3.5, 4.5, 5.5, 6.5, 7.5)
preds = specify(*default_cutoffs, pred)

In [50]:
# Custom cutoffs from previous kaggle submission
custom_cutoffs = (1.5, 2.9, 3.1, 4.5, 5.5, 6.1, 7.1)
preds = specify(*custom_cutoffs, pred)

In [51]:
# Create submission csv
# Ids are assigned sequentially from 20000, one per prediction.
# NOTE(review): presumably this matches the Id values in testing.csv -- confirm.
id_arr = list(range(20000, len(preds) + 20000))
res = pd.DataFrame({"Id": id_arr, "Response": preds})

# Id stays a regular column so that it is written out with index=False
res.to_csv("predictions.csv", index=False)

# Read the file back to check what was actually written
temp = pd.read_csv("predictions.csv")
print(temp)

         Id  Response
0     20000         8
1     20001         6
2     20002         8
3     20003         7
4     20004         5
5     20005         6
6     20006         7
7     20007         4
8     20008         5
9     20009         5
10    20010         6
11    20011         5
12    20012         7
13    20013         5
14    20014         6
15    20015         5
16    20016         7
17    20017         7
18    20018         5
19    20019         5
20    20020         5
21    20021         5
22    20022         7
23    20023         8
24    20024         5
25    20025         4
26    20026         7
27    20027         8
28    20028         7
29    20029         8
...     ...       ...
9970  29970         7
9971  29971         7
9972  29972         6
9973  29973         8
9974  29974         4
9975  29975         5
9976  29976         8
9977  29977         6
9978  29978         7
9979  29979         8
9980  29980         7
9981  29981         7
9982  29982         7
9983  2998