In [2]:
%matplotlib inline
from collections import Counter
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import sklearn as skl
import math
import random
from random import randint

In [45]:
# HW 5 Index analyses
def get_info_D(n, feature):
    info_D = 0
    feature_counts = Counter(feature)
    for val in feature_counts:
        p_i = feature_counts[val]/n
        info_D -= p_i*math.log(p_i, 2)
    return info_D

def get_gini_D(n, feature):
    gini_D = 1
    feature_counts = Counter(feature)
    for val in feature_counts:
        p_i = feature_counts[val]/n
        gini_D -= math.pow(p_i, 2)
    return gini_D

def get_info_gains(n, df):
    # Info(D) and Gini(D)
    class_data = df[df.columns[-1]]
    info_D = get_info_D(n, class_data)
    gini_D = get_gini_D(n, class_data)
    
    # Info_class(D), Split_Info(D), and Gini_class(D)
    feature_info_gain = []
    feature_split_info = []
    feature_gini_indicies = []
    
    # Loop over attributes
    for i in range(len(df.columns)-1):
        # Create summation bases
        feature_info = 0
        split_info = 0
        feature_gini = 0
        
        # Group by subsections
        col = df.groupby(df[df.columns[i]])[df.columns[-1]]
        for key in col: # key[0] = key, key[1] = table of key vs class
            # Extract number of elements in subsection
            num_keys = len(key[1])
            
            # Calculate respective info
            split_info -= num_keys/n * math.log(num_keys/n, 2)
            feature_info += num_keys/n * (get_info_D(num_keys, key[1]))
            feature_gini += num_keys/n * (get_gini_D(num_keys, key[1]))
            
        # Append into respective array index
        feature_info_gain.append(feature_info)
        feature_split_info.append(split_info)
        feature_gini_indicies.append(feature_gini)
        
    # Gain(class), 
    Gain_D = [info_D - info_A_D for info_A_D in feature_info_gain]
    delta_gini_D = [gini_D - gini_A_D for gini_A_D in feature_gini_indicies]
    
    # Return attribute info arrays
    return Gain_D, feature_split_info, feature_gini_indicies

def calc_gain_ratio(info_gains, split_info):
    if len(info_gains) != len(split_info):
        print("ERROR")
    else:
        feature_gain_ratio = []
        for i in range(len(info_gains)):
            feature_gain_ratio.append(info_gains[i]/split_info[i])
            
    return feature_gain_ratio

def tup_index_val_list(info):
    info_tups = []
    for e in range(len(info)):
        tup = (e, info[e])
        info_tups.append(tup)
    return info_tups

def output(df, info_gains, gain_ratios, gini_indicies):
    print("Best splits")
    max_info_gain = info_gains.index(max(info_gains))
    max_gain_ratio = gain_ratios.index(max(gain_ratios))
    min_gini_index = gini_indicies.index(min(gini_indicies))
    
    print(df.columns[max_info_gain])
    print(df.columns[max_gain_ratio])
    print(df.columns[min_gini_index])
    
    print()
    print("Worst Splits")
    min_info_gain = info_gains.index(min(info_gains))
    min_gain_ratio = gain_ratios.index(min(gain_ratios))
    max_gini_index = gini_indicies.index(max(gini_indicies))
    
    print(df.columns[min_info_gain])
    print(df.columns[min_gain_ratio])
    print(df.columns[max_gini_index])

    print()
    print("Sorted")
    info_gain_tups = tup_index_val_list(info_gains)
    gain_ratio_tups = tup_index_val_list(gain_ratios)
    gini_index_tups = tup_index_val_list(gini_indicies)
    
    info_gain_tups.sort(key=lambda tup: -tup[1])
    gain_ratio_tups.sort(key=lambda tup: -tup[1])
    gini_index_tups.sort(key=lambda tup: tup[1])
    
    print("Top 5")
    for i in range(5):
        print()
        print(df.columns[info_gain_tups[i][0]], info_gain_tups[i][1])
        print(df.columns[gain_ratio_tups[i][0]], gain_ratio_tups[i][1])
        print(df.columns[gini_index_tups[i][0]], gini_index_tups[i][1])
    
    print()
    print("Bottom 5")
    for i in range(1, 6):
        print()
        print(df.columns[info_gain_tups[-i][0]], info_gain_tups[-i][1])
        print(df.columns[gain_ratio_tups[-i][0]], gain_ratio_tups[-i][1])
        print(df.columns[gini_index_tups[-i][0]], gini_index_tups[-i][1])

In [218]:
trainlib = pd.read_csv("training.csv")
testlib = pd.read_csv("testing.csv")

# Combine libraries for data processing
all_data = trainlib.append(testlib)

# Split Product Info 2
all_data['Product_Info_2_char'] = all_data.Product_Info_2.str[0]
all_data['Product_Info_2_num'] = all_data.Product_Info_2.str[1]


categorical_cols =["Product_Info_1","Product_Info_2","Product_Info_3","Product_Info_5","Product_Info_6","Product_Info_7","Employment_Info_2","Employment_Info_3","Employment_Info_5","InsuredInfo_1","InsuredInfo_2","InsuredInfo_3","InsuredInfo_4","InsuredInfo_5","InsuredInfo_6","InsuredInfo_7","Insurance_History_1","Insurance_History_2","Insurance_History_3","Insurance_History_4","Insurance_History_7","Insurance_History_8","Insurance_History_9","Family_Hist_1","Medical_History_2","Medical_History_3","Medical_History_4","Medical_History_5","Medical_History_6","Medical_History_7","Medical_History_8","Medical_History_9","Medical_History_11","Medical_History_12","Medical_History_13","Medical_History_14","Medical_History_16","Medical_History_17","Medical_History_18","Medical_History_19","Medical_History_20","Medical_History_21","Medical_History_22","Medical_History_23","Medical_History_25","Medical_History_26","Medical_History_27","Medical_History_28","Medical_History_29","Medical_History_30","Medical_History_31","Medical_History_33","Medical_History_34","Medical_History_35","Medical_History_36","Medical_History_37","Medical_History_38","Medical_History_39","Medical_History_40","Medical_History_41","Product_Info_2_char","Product_Info_2_num"]
for i in categorical_cols:
    all_data[i] = pd.factorize(all_data[i])[0]

# Add BMI_Age
all_data['BMI_Age'] = all_data['BMI'] * all_data['Ins_Age']

# Drop unnecessary columns
all_data.drop(['Product_Info_2', 'Medical_Keyword_44', 'Medical_Keyword_45'], axis=1, inplace=True)

# Add Med Keyword Count
med_keyword_columns = all_data.columns[all_data.columns.str.startswith('Medical_Keyword_')]
all_data['Med_Keywords_Count'] = all_data[med_keyword_columns].sum(axis=1)
# Fill N/A with 0
for key in med_keyword_columns:
    all_data[key].fillna(0, inplace=True)
all_data['countna'] = all_data.apply(lambda x: sum(x.isnull()),1)

# Replace N/A with mean/median/mode
columns = all_data.columns
for col in columns:
    if col not in med_keyword_columns and col != 'Response':
        #fill = np.mode(all_data(col))
        fill = all_data[col].median()
        #fill = np.mean(all_data[col])
        all_data[col].fillna(fill, inplace=True)
all_data.fillna(0, inplace=True)

all_data['Response'] = all_data['Response'].astype(int)

train_ohd = all_data[all_data['Response']>0].copy()
test_ohd = all_data[all_data['Response']<1].copy()

In [240]:
# Set response column as last column in df
cols = list(train_ohd.columns.values) #Make a list of all of the columns in the df
cols.pop(cols.index('Response')) #Remove response from list
train_ohd = train_ohd[cols+['Response']]
drop_cols = ["Id", "Response","Product_Info_1","Product_Info_3","Product_Info_5","Product_Info_6","Product_Info_7","Employment_Info_2","Employment_Info_3","Employment_Info_5","InsuredInfo_1","InsuredInfo_2","InsuredInfo_3","InsuredInfo_4","InsuredInfo_5","InsuredInfo_6","InsuredInfo_7","Insurance_History_1","Insurance_History_2","Insurance_History_3","Insurance_History_4","Insurance_History_7","Insurance_History_8","Insurance_History_9","Family_Hist_1","Medical_History_2","Medical_History_3","Medical_History_4","Medical_History_5","Medical_History_6","Medical_History_7","Medical_History_8","Medical_History_9","Medical_History_11","Medical_History_12","Medical_History_13","Medical_History_14","Medical_History_16","Medical_History_17","Medical_History_18","Medical_History_19","Medical_History_20","Medical_History_21","Medical_History_22","Medical_History_23","Medical_History_25","Medical_History_26","Medical_History_27","Medical_History_28","Medical_History_29","Medical_History_30","Medical_History_31","Medical_History_33","Medical_History_34","Medical_History_35","Medical_History_36","Medical_History_37","Medical_History_38","Medical_History_39","Medical_History_40","Medical_History_41"]


# Separate out the response column
target = train_ohd["Response"]
#train_db = train_ohd.drop(["Response"], axis=1)
train_db = train_ohd.drop(drop_cols, axis=1)
rand_db = train_ohd[['Product_Info_2_char', 'Product_Info_2_num', 'BMI_Age', 'Med_Keywords_Count', 'countna', 'Wt', 'Ht']]
#test_db = test_ohd.drop(["Response"], axis=1)
test_db = test_ohd.drop(drop_cols, axis=1)
rand_tdb = test_ohd[['Product_Info_2_char', 'Product_Info_2_num', 'BMI_Age', 'Med_Keywords_Count', 'countna', 'Wt', 'Ht']]

In [238]:
rand_db.columns

Index(['Product_Info_2_char', 'Product_Info_2_num', 'BMI_Age',
       'Med_Keywords_Count', 'countna', 'Wt', 'Ht'],
      dtype='object')

In [46]:
# Run HW 5 on dataset
n = len(target)
info_gains, split_info, gini_indicies = get_info_gains(n, train_ohd)
gain_ratios = calc_gain_ratio(info_gains, split_info)

output(train_ohd, info_gains, gain_ratios, gini_indicies)

Best splits
Id
Medical_Keyword_3
Id

Worst Splits
Medical_Keyword_32
Medical_History_36
Medical_Keyword_32

Sorted
Top 5

Id 2.616780065604559
Medical_Keyword_3 0.1906185589363647
Id 0.0

BMI_Age 2.0527254132274284
Id 0.18314898817189662
BMI_Age 0.22121527251823342

BMI 0.72513926119984
BMI_Age 0.16091193602645032
BMI 0.6424547775536509

Medical_History_15 0.2589331477136225
Medical_Keyword_15 0.14802801787481362
Wt 0.7412999317087804

Wt 0.2577358760578501
Medical_History_32 0.14555941176345302
Product_Info_4 0.7623117638758649

Bottom 5

Medical_Keyword_32 0.00023605662678249928
Medical_History_36 0.0011773692867803416
Medical_Keyword_32 0.8072536632329728

Medical_Keyword_39 0.0003019036012936738
Medical_History_26 0.0013117311465806823
Medical_Keyword_41 0.8072478705843513

Medical_Keyword_6 0.00032528447829882623
Medical_History_25 0.0015938293530292657
Medical_Keyword_6 0.8072393125334555

Medical_Keyword_41 0.00037627348239199776
Medical_Keyword_32 0.0016623513408496968
Medical_

In [241]:
# SKLearn Logistic Regression
from sklearn import datasets, linear_model
from sklearn.metrics import mean_squared_error, r2_score

# Create linear regression object
regr = linear_model.LogisticRegression(C=1e5)

# Train the model using the training sets
#regr.fit(train_db, target)
regr.fit(rand_db, target)

# Make predictions using the testing set
#new_pred = regr.predict(test_db)
new_pred = regr.predict(rand_tdb)
Counter(new_pred)

Counter({1: 659, 2: 316, 5: 474, 6: 2332, 7: 13, 8: 6206})

In [162]:
# PCA
from sklearn.decomposition import PCA
pca = PCA(n_components=2, svd_solver='full')
train_db = pca.fit_transform(train_db)
test_db = pca.fit_transform(test_db)

In [122]:
for p in range(len(preds)):
    if preds[p] != new_pred[p]:
        print(p, preds[p], new_pred[p])

5 6 8
6 7 8
7 1 8
8 1 8
11 7 8
13 6 8
15 1 8
16 1 8
18 1 8
19 7 8
20 6 8
21 7 8
24 7 8
25 6 8
27 2 8
32 6 8
33 6 8
34 2 8
35 5 8
38 7 8
39 6 8
40 6 8
43 2 8
44 6 8
45 1 8
47 6 8
49 6 8
51 6 8
52 6 8
53 1 8
54 6 8
55 6 8
57 6 8
58 5 8
59 1 8
62 6 8
66 6 8
69 6 8
71 2 8
73 7 8
75 6 8
76 2 8
77 6 8
82 5 8
84 1 8
86 6 8
88 1 8
89 5 8
90 6 8
91 1 8
92 1 8
93 6 8
94 7 8
100 7 8
101 1 8
103 5 8
106 5 8
108 1 8
109 2 8
111 6 8
113 6 8
114 5 8
115 6 8
116 1 8
117 1 8
118 7 8
119 6 8
120 7 8
121 1 8
122 6 8
124 1 8
125 6 8
127 2 8
128 6 8
130 7 8
131 6 8
132 2 8
133 1 8
137 6 8
138 6 8
139 5 8
140 6 8
141 1 8
142 6 8
143 1 8
144 6 8
147 5 8
148 1 8
150 2 8
152 6 8
153 1 8
154 1 8
155 6 8
158 2 8
159 1 8
160 1 8
163 6 8
165 6 8
166 6 8
168 7 8
170 6 8
171 5 8
173 7 8
174 6 8
175 5 8
177 6 8
178 1 8
179 2 8
180 6 8
184 2 8
185 6 8
187 7 8
188 6 8
189 2 8
190 5 8
191 6 8
193 2 8
196 1 8
197 7 8
198 7 8
199 6 8
200 6 8
203 6 8
204 6 8
205 6 8
208 5 8
209 7 8
210 6 8
211 2 8
214 6 8
215 7 8
216 2 8
2

1778 7 8
1779 6 8
1780 7 8
1781 5 8
1782 7 8
1783 6 8
1784 1 8
1785 6 8
1786 7 8
1787 2 8
1790 6 8
1791 1 8
1793 7 8
1794 6 8
1795 5 8
1796 1 8
1797 7 8
1799 7 8
1803 7 8
1804 5 8
1805 6 8
1807 2 8
1809 6 8
1810 6 8
1812 5 8
1813 2 8
1815 7 8
1819 1 8
1821 1 8
1822 5 8
1824 2 8
1825 6 8
1827 6 8
1828 1 8
1829 2 8
1832 1 8
1833 1 8
1835 2 8
1840 6 8
1844 1 8
1849 2 8
1853 6 8
1854 7 8
1856 7 8
1858 2 8
1859 6 8
1860 7 8
1861 4 8
1862 7 8
1864 6 8
1865 6 8
1866 5 8
1867 1 8
1871 6 8
1872 6 8
1874 5 8
1875 1 8
1876 1 8
1877 2 8
1879 6 8
1880 1 8
1883 5 8
1886 2 8
1887 7 8
1888 5 8
1893 6 8
1894 2 8
1895 1 8
1896 6 8
1898 2 8
1900 2 8
1901 6 8
1902 1 8
1904 6 8
1905 7 8
1906 1 8
1907 1 8
1909 7 8
1911 5 8
1912 6 8
1913 6 8
1914 6 8
1915 6 8
1916 7 8
1918 6 8
1921 5 8
1922 7 8
1924 5 8
1925 6 8
1926 3 8
1927 6 8
1929 2 8
1930 1 8
1935 6 8
1936 5 8
1938 6 8
1940 1 8
1941 7 8
1942 6 8
1943 5 8
1944 2 8
1945 7 8
1948 1 8
1949 2 8
1950 5 8
1951 2 8
1952 7 8
1954 6 8
1957 5 8
1959 7 8
1960 1 8
1

3594 7 8
3595 6 8
3596 5 8
3597 2 8
3598 7 8
3601 1 8
3602 6 8
3605 6 8
3606 6 8
3607 7 8
3608 6 8
3610 7 8
3611 6 8
3615 1 8
3617 6 8
3618 2 8
3619 1 8
3621 6 8
3622 6 8
3623 5 8
3625 2 8
3629 6 8
3630 6 8
3631 6 8
3632 2 8
3634 2 8
3635 6 8
3636 5 8
3638 1 8
3641 6 8
3642 1 8
3643 1 8
3645 1 8
3646 1 8
3647 5 8
3650 7 8
3651 6 8
3652 2 8
3653 2 8
3654 7 8
3656 2 8
3657 6 8
3663 5 8
3664 2 8
3665 5 8
3666 6 8
3667 6 8
3669 5 8
3670 1 8
3671 6 8
3672 2 8
3673 6 8
3674 7 8
3675 6 8
3677 2 8
3678 1 8
3679 5 8
3680 7 8
3681 6 8
3683 5 8
3684 2 8
3685 6 8
3686 1 8
3688 6 8
3689 6 8
3690 5 8
3691 6 8
3693 5 8
3694 6 8
3695 2 8
3697 1 8
3698 7 8
3699 4 8
3701 7 8
3702 1 8
3703 6 8
3704 1 8
3705 7 8
3706 2 8
3707 6 8
3708 7 8
3709 6 8
3710 6 8
3711 5 8
3713 6 8
3714 2 8
3715 1 8
3718 7 8
3719 6 8
3720 5 8
3721 7 8
3722 6 8
3723 6 8
3725 6 8
3726 5 8
3727 2 8
3728 6 8
3729 7 8
3730 2 8
3733 7 8
3734 6 8
3738 2 8
3740 1 8
3743 6 8
3744 6 8
3748 2 8
3749 1 8
3750 7 8
3752 5 8
3753 2 8
3756 6 8
3

5531 5 8
5533 5 8
5534 1 8
5537 5 8
5538 6 8
5540 6 8
5541 6 8
5542 1 8
5543 6 8
5544 5 8
5545 7 8
5546 1 8
5547 7 8
5548 7 8
5550 2 8
5552 6 8
5553 7 8
5554 6 8
5555 6 8
5557 6 8
5558 6 8
5561 6 8
5564 2 8
5566 5 8
5567 6 8
5570 6 8
5574 5 8
5577 7 8
5580 6 8
5581 7 8
5582 2 8
5584 6 8
5586 1 8
5588 5 8
5589 6 8
5590 1 8
5591 2 8
5592 1 8
5593 2 8
5594 2 8
5596 1 8
5598 7 8
5601 6 8
5604 1 8
5605 7 8
5607 5 8
5608 6 8
5610 5 8
5611 6 8
5612 1 8
5613 2 8
5614 5 8
5616 6 8
5620 7 8
5623 2 8
5624 2 8
5626 6 8
5627 1 8
5630 2 8
5631 1 8
5633 1 8
5636 1 8
5638 6 8
5639 5 8
5641 2 8
5642 2 8
5643 6 8
5645 2 8
5648 7 8
5649 2 8
5650 6 8
5652 6 8
5657 6 8
5662 6 8
5664 6 8
5665 2 8
5666 1 8
5667 7 8
5669 2 8
5671 7 8
5673 7 8
5675 6 8
5678 6 8
5679 6 8
5680 6 8
5681 2 8
5682 2 8
5683 6 8
5684 6 8
5685 6 8
5688 6 8
5691 5 8
5695 7 8
5697 2 8
5700 1 8
5702 1 8
5704 7 8
5706 6 8
5709 7 8
5712 5 8
5713 5 8
5714 6 8
5716 3 8
5717 7 8
5718 2 8
5719 6 8
5722 6 8
5724 2 8
5725 2 8
5726 7 8
5727 5 8
5

7616 1 8
7618 6 8
7619 6 8
7620 6 8
7625 7 8
7626 6 8
7629 1 8
7630 1 8
7631 6 8
7632 6 8
7633 5 8
7639 6 8
7640 1 8
7643 1 8
7647 2 8
7648 5 8
7650 2 8
7651 6 8
7652 7 8
7654 2 8
7656 6 8
7658 6 8
7659 5 8
7660 2 8
7661 7 8
7663 1 8
7664 2 8
7666 6 8
7667 6 8
7668 7 8
7669 1 8
7673 5 8
7674 6 8
7675 5 8
7676 1 8
7677 6 8
7680 5 8
7683 1 8
7686 1 8
7687 5 8
7688 7 8
7689 6 8
7690 7 8
7691 7 8
7699 5 8
7700 2 8
7701 3 8
7702 5 8
7703 7 8
7707 6 8
7710 6 8
7711 2 8
7713 2 8
7714 6 8
7715 7 8
7716 6 8
7717 2 8
7718 6 8
7720 7 8
7721 1 8
7722 1 8
7723 6 8
7724 5 8
7725 6 8
7726 5 8
7728 1 8
7729 2 8
7730 1 8
7731 6 8
7732 6 8
7734 1 8
7736 5 8
7738 7 8
7739 7 8
7741 6 8
7742 2 8
7743 6 8
7745 6 8
7746 6 8
7747 1 8
7748 6 8
7751 6 8
7752 6 8
7753 1 8
7754 1 8
7757 1 8
7759 5 8
7760 5 8
7761 1 8
7763 7 8
7765 6 8
7766 5 8
7769 6 8
7770 6 8
7774 7 8
7775 6 8
7776 2 8
7777 6 8
7779 7 8
7780 6 8
7782 5 8
7785 6 8
7787 6 8
7788 6 8
7789 2 8
7790 6 8
7791 6 8
7793 6 8
7794 6 8
7795 6 8
7797 6 8
7

9216 7 8
9219 6 8
9225 6 8
9227 1 8
9228 5 8
9231 7 8
9233 6 8
9235 6 8
9236 6 8
9237 2 8
9238 5 8
9239 5 8
9240 6 8
9241 6 8
9244 7 8
9245 5 8
9246 1 8
9247 1 8
9248 5 8
9249 7 8
9250 6 8
9251 1 8
9253 5 8
9254 6 8
9255 7 8
9258 2 8
9259 5 8
9261 6 8
9263 5 8
9264 7 8
9265 2 8
9268 6 8
9269 6 8
9270 6 8
9271 1 8
9272 7 8
9273 2 8
9276 1 8
9278 6 8
9279 2 8
9280 2 8
9283 6 8
9285 6 8
9288 6 8
9290 6 8
9291 6 8
9292 6 8
9293 1 8
9294 6 8
9296 7 8
9297 6 8
9298 6 8
9299 6 8
9301 1 8
9302 7 8
9303 2 8
9304 7 8
9305 6 8
9306 6 8
9307 6 8
9308 6 8
9310 2 8
9311 2 8
9313 6 8
9314 1 8
9317 2 8
9319 6 8
9320 1 8
9321 6 8
9322 2 8
9323 1 8
9325 5 8
9326 1 8
9328 7 8
9329 6 8
9330 6 8
9331 2 8
9332 7 8
9333 5 8
9335 6 8
9336 5 8
9337 2 8
9340 6 8
9341 6 8
9343 4 8
9344 7 8
9345 6 8
9346 6 8
9347 7 8
9348 1 8
9349 6 8
9351 2 8
9353 5 8
9354 6 8
9357 6 8
9359 6 8
9360 6 8
9364 7 8
9365 7 8
9367 7 8
9368 2 8
9371 1 8
9374 6 8
9375 7 8
9377 6 8
9379 6 8
9381 6 8
9383 6 8
9384 7 8
9385 6 8
9386 6 8
9

In [104]:
# Create submission csv
preds = new_pred
res = pd.DataFrame()
id_arr = [x for x in range(20000, len(preds)+20000)]
res["Id"] = id_arr
res.set_index("Id")
res["Response"] = preds
res.to_csv("predictions.csv", index=False)
temp = pd.read_csv("predictions.csv")
#print(temp)

In [276]:
# Logistic Regression One vs Rest, requires one-hot encoded target values
class LogisticRegressionOVR(object):
    def __init__(self, eta=0.1, n_iter=50):
        self.eta = eta
        self.n_iter = n_iter

    def fit(self, X, y):
        X = np.insert(X, 0, 1, axis=1)
        self.w = []
        self.b = []
        m = X.shape[0]

        for i in np.unique(y):
            y_copy = np.where(y == i, 1, 0)
            w = np.zeros(X.shape[1])

            for _ in range(self.n_iter):
                output = X.dot(w)
                errors = y_copy - self._sigmoid(output)
                w += self.eta / m * errors.dot(X)
            self.w.append((w, i))
        return self
    
    def _predict_one(self, x):
        return max((x.dot(w), c) for w, c in self.w)[1]
    
    def predict(self, X):
        return [self._predict_one(i) for i in np.insert(X, 0, 1, axis=1)]

    def score(self, X, y):
        return sum(self.predict(X) == y) / len(y)

    def _sigmoid(self, x):
        return 1 / (1 + np.exp(-x))
    
    def test(self):
        x = np.array([[1,1,1,0,0,0],
              [1,0,1,0,0,0],
              [1,1,1,0,0,0],
              [0,0,1,1,1,0],
              [0,0,1,1,0,0],
              [0,0,1,1,1,0]])
        y = np.array([1,1,1,0,0,0])
        
        clf = self.fit(x,y)
        x = np.array([[1, 1, 0, 0, 0, 0]])
        print(clf.predict(x))


In [277]:
model = LogisticRegressionOVR(eta=0.00000001, n_iter=500)
#clf = model.fit(train_db.as_matrix(), target.as_matrix())
clf = model.fit(rand_db.as_matrix(), target.as_matrix())
#res = clf.predict(test_db.as_matrix())
res = clf.predict(rand_tdb.as_matrix())
LogisticRegressionOVR(n_iter=200).test()
Counter(res)

[1]


Counter({8: 10000})

In [252]:
# Logistic Regression One vs Rest, requires one-hot encoded target values

# Sources:
# https://gist.github.com/yusugomori/4462221
# http://deeplearning.net/tutorial/logreg.html#logreg

#numpy.seterr(all='ignore')
def sigmoid(x):
    return 1. / (1 + np.exp(-x))

def softmax(x):
    e = np.exp(x - np.max(x))  # prevent overflow
    if e.ndim == 1:
        return e / np.sum(e, axis=0)
    else:  
        return e / np.array([np.sum(e, axis=1)]).T  # ndim = 2


class LogisticRegression(object):
    def __init__(self, input, label, n_in, n_out):
        self.x = input
        self.y = label
        self.W = np.zeros((n_in, n_out))  # initialize W 0
        #self.W = np.random.uniform(-0.0005,0.0005,(n_in,n_out))
        self.b = np.zeros(n_out)          # initialize bias 0

        # self.params = [self.W, self.b]

    def train(self, lr=0.1, input=None, L2_reg=0.00):
        if input is not None:
            self.x = input

        #p_y_given_x = sigmoid(np.dot(self.x, self.W) + self.b)
        p_y_given_x = softmax(np.dot(self.x, self.W) + self.b)
        d_y = self.y - p_y_given_x
        
        self.W += lr * np.dot(self.x.T, d_y) - lr * L2_reg * self.W
        self.b += lr * np.mean(d_y, axis=0)
        
        # cost = self.negative_log_likelihood()
        # return cost

    def negative_log_likelihood(self):
        #sigmoid_activation = sigmoid(np.dot(self.x, self.W) + self.b)
        sigmoid_activation = softmax(np.dot(self.x, self.W) + self.b)

        cross_entropy = - np.mean(np.sum(self.y * np.log(sigmoid_activation) + (1 - self.y) 
                                         * np.log(1 - sigmoid_activation), axis=1))
        return cross_entropy


    def predict(self, x):
        #return sigmoid(np.dot(x, self.W) + self.b)
        return softmax(np.dot(x, self.W) + self.b)


# Example/Testing for LR
def test_lr(learning_rate=0.01, n_epochs=200):
    # training data
    x = np.array([[1,1,1,0,0,0],
                  [1,0,1,0,0,0],
                  [1,1,1,0,0,0],
                  [0,0,1,1,1,0],
                  [0,0,1,1,0,0],
                  [0,0,1,1,1,0]])
    y = np.array([[1, 0],
                  [1, 0],
                  [1, 0],
                  [0, 1],
                  [0, 1],
                  [0, 1]])


    # construct LogisticRegression
    classifier = LogisticRegression(input=x, label=y, n_in=6, n_out=2)

    # train
    for epoch in range(n_epochs):
        classifier.train(lr=learning_rate)
        cost = classifier.negative_log_likelihood()
        print('Training epoch %d, cost is ' % epoch, cost)
        learning_rate *= 0.95


    # test
    x = np.array([1, 1, 0, 0, 0, 0])
    print(classifier.predict(x))
    print("Pred", np.argmax(classifier.predict(x)))

# Our Classification
def test_dbs(learning_rate=0.01, n_epochs=200):
    #x = train_db.as_matrix()
    x = rand_db.as_matrix()
    y = pd.get_dummies(target).as_matrix()
    
    classifier = LogisticRegression(input=x, label=y, n_in=len(x[0]), n_out=8)
    
    for epoch in range(n_epochs):
        classifier.train(lr=learning_rate)
        cost = classifier.negative_log_likelihood()
        print('Training epoch %d, cost is ' % epoch, cost)
        learning_rate *= 0.95
        
    #x = test_db.as_matrix()
    x = rand_tdb.as_matrix()
    #print(np.argmax(classifier.predict(x)))
    return classifier.predict(x)

if __name__ == "__main__":
    #test_lr()
    pred_prob = test_dbs(learning_rate=0.00000000001, n_epochs=500)
    print(pred_prob)

Training epoch 0, cost is  3.014160340608614
Training epoch 1, cost is  3.0141594386404407
Training epoch 2, cost is  3.0141585817729886
Training epoch 3, cost is  3.0141577677509965
Training epoch 4, cost is  3.014156994431988
Training epoch 5, cost is  3.01415625978063
Training epoch 6, cost is  3.0141555618633737
Training epoch 7, cost is  3.0141548988433655
Training epoch 8, cost is  3.0141542689756076
Training epoch 9, cost is  3.0141536706023655
Training epoch 10, cost is  3.014153102148803
Training epoch 11, cost is  3.0141525621188374
Training epoch 12, cost is  3.014152049091199
Training epoch 13, cost is  3.014151561715691
Training epoch 14, cost is  3.0141510987096334
Training epoch 15, cost is  3.0141506588544886
Training epoch 16, cost is  3.0141502409926506
Training epoch 17, cost is  3.014149844024401
Training epoch 18, cost is  3.014149466905012
Training epoch 19, cost is  3.014149108641997
Training epoch 20, cost is  3.0141487682924977
Training epoch 21, cost is  3.014

Training epoch 177, cost is  3.0141423037767576
Training epoch 178, cost is  3.0141423036738977
Training epoch 179, cost is  3.014142303576181
Training epoch 180, cost is  3.01414230348335
Training epoch 181, cost is  3.01414230339516
Training epoch 182, cost is  3.0141423033113806
Training epoch 183, cost is  3.014142303231789
Training epoch 184, cost is  3.014142303156178
Training epoch 185, cost is  3.014142303084347
Training epoch 186, cost is  3.014142303016108
Training epoch 187, cost is  3.0141423029512806
Training epoch 188, cost is  3.0141423028896943
Training epoch 189, cost is  3.014142302831188
Training epoch 190, cost is  3.014142302775607
Training epoch 191, cost is  3.014142302722804
Training epoch 192, cost is  3.014142302672642
Training epoch 193, cost is  3.014142302624988
Training epoch 194, cost is  3.0141423025797165
Training epoch 195, cost is  3.014142302536709
Training epoch 196, cost is  3.0141423024958507
Training epoch 197, cost is  3.0141423024570364
Trainin

Training epoch 365, cost is  3.014142301719694
Training epoch 366, cost is  3.0141423017196876
Training epoch 367, cost is  3.014142301719682
Training epoch 368, cost is  3.014142301719675
Training epoch 369, cost is  3.01414230171967
Training epoch 370, cost is  3.0141423017196636
Training epoch 371, cost is  3.014142301719659
Training epoch 372, cost is  3.014142301719654
Training epoch 373, cost is  3.0141423017196494
Training epoch 374, cost is  3.014142301719645
Training epoch 375, cost is  3.0141423017196405
Training epoch 376, cost is  3.0141423017196365
Training epoch 377, cost is  3.014142301719633
Training epoch 378, cost is  3.0141423017196294
Training epoch 379, cost is  3.0141423017196254
Training epoch 380, cost is  3.0141423017196227
Training epoch 381, cost is  3.014142301719619
Training epoch 382, cost is  3.0141423017196165
Training epoch 383, cost is  3.014142301719614
Training epoch 384, cost is  3.014142301719611
Training epoch 385, cost is  3.0141423017196085
Trai

In [253]:
# Check Predictions
preds = [np.argmax(x)+1 for x in pred_prob]
Counter(preds)

Counter({8: 10000})

FAILURE OF DNN - FIND WHY LOSS IS SO GREAT AND MAYBE WE CAN USE

In [44]:
import numpy as np
import sys
import matplotlib.pyplot as plt

#Computes Matrix Multiplication
# Returns Z = A x W + b, cache
# A - nxd / n -> num of training data points
# W - dxd'/ d -> num of i/p features
# b - d'  / d'-> num of o/p features
# Z - nxd'
def Affine_Forward(A,W,b):
    Z = np.matmul(A,W)
    Z = Z+np.array([b]*A.shape[0])
    return Z,(A,W,b)

def Affine_Backwards(dZ,cache):
    A=cache[0]
    W=cache[1]
    b=cache[2]

    dA = np.zeros(A.shape)
    dW = np.zeros(W.shape)
    db = np.zeros(b.shape)

    W_T = W.transpose()
    dA=np.matmul(dZ,W_T)

    A_T = A.transpose()
    dW = np.matmul(A_T,dZ)

    for j in range(b.shape[0]):
        db[j]+=sum(dZ[:,j])

    return dA,dW,db

def ReLU_Forward(Z):
    Z_cache = np.array(Z)
    return np.maximum(Z,0,Z),Z_cache

def ReLU_Backward(dA,Z_cache):
    dZ = np.zeros(dA.shape)
    dZ = np.where(Z_cache>0.0,dA,0)
    return dZ

def Cross_Entropy(F,y):
    L_sub = 0
    n = F.shape[0]

    #print("y", y)
    for i in range(n):
        #print("F[i]",i, F[i])
        L_sub += F[i,int(y[i]-1)] - np.log(sum(np.exp(F[i])))

    L = -1.0/n * L_sub

    dF = np.zeros(F.shape)

    for i in range(F.shape[0]):
        for j in range(F.shape[1]):
            if(j==int(y[i])):
                match=1
            else:
                match=0
            if n ==0:
                print("N")
            if sum(np.exp(F[i])) == 0:
                #print("F", F)
                print("F[i]", F[i])
                #print("exp", np.exp(F[i]))
            
            dF[i,j] = -1.0/n*(match-(np.exp(F[i,j])/sum(np.exp(F[i]))))
    return L,dF

#Learning Rate
eta = 0.1
#NeuronCount at Layer1,2,3
Neurons=[64,8]
#Batch Size
b_size = 100
#Weights
W1 = []
W2 = []
#bias
b1 = []
b2 = []

#Four Layer Neural Net
#X      - input batch of n datapoints with d observations each
#W_all  - Weights for all 4 layers
#b_all  - bias for all 4 layers
#y      - training labels/correct actions
#test   - select Train/Test mode for network
def FourLvlNN(X,y,test,min_loss):
    global W1
    global W2
    
    global b1
    global b2

    Z1,cache_a1 = Affine_Forward(X,W1,b1)
    #print("Z1", Z1)
    #print("X", X)
    A1,cache_r1 = ReLU_Forward(Z1)
    #print("A1", A1)
    F,cache_a2  = Affine_Forward(A1,W2,b2)
    #print("F", F)

    if(test==True):
        classification = np.zeros((F.shape[0],))
        classification = np.argmax(F,axis=1)
        return classification
    
    loss,dF     = Cross_Entropy(F,y)
    #print("L", loss)
    #print("dF", dF)
    dA1,dW2,db2 = Affine_Backwards(dF,cache_a2)
    dZ1         = ReLU_Backward(dA1,cache_r1)
    #print("dZ1", dZ1)
    dX,dW1,db1  = Affine_Backwards(dZ1,cache_a1)

    if(loss<min_loss):
        min_loss=loss
        np.save("Weights.npy",np.asarray([W1,W2]))
        np.save("Bias.npy",np.asarray([b1,b2]))

    #Gradient Descent
    W1 = W1 - eta*dW1
    W2 = W2 - eta*dW2
    #print("1", dW1)
    #print("2", dW2)
    #print("3", dW3)

    return loss

def load_DataSet():
    data=np.loadtxt("expert_policy.txt")
    return data

def bAndW_init(num_ip_features):
    global W1
    global W2
    
    global b1
    global b2

    W1 = np.random.uniform(0,0.005,(num_ip_features,Neurons[0]))
    W2 = np.random.uniform(0,0.005,(Neurons[0],Neurons[1]))
    
    b1 = np.zeros((Neurons[0],))
    b2 = np.zeros((Neurons[1],))
    
def normalize(X):
    data = X.copy()
    for feature in data:
        mean = np.mean(feature)
        std = np.std(feature)
        feature -= mean
        feature /= std
    #print("D",data)
    #print("X",X)
    return data

def normalize_dataset(data):
    for feature in range(len(data)-1):
        mean = np.mean(data[feature])
        std = np.std(data[feature])
        data[feature] -= mean
        data[feature] /= std
    return data

def MiniBatchGD(data,epoch):
    global W1
    global W2
    
    global b1
    global b2
    print("Running MiniBatchGCD with",epoch,"epochs\n")
    num_ip_features = data.shape[1]-1
    #Global score for 'b's and 'W's
    bAndW_init(num_ip_features)

    loss_list=[]
    min_loss = 100

    for cycle in range(epoch):
        np.random.shuffle(data)
        for i in range(data.shape[0]//b_size):
            #displayProgess(i,data.shape[0]//b_size)
            X = data[i*b_size:(i+1)*b_size,:-1]
            #print("X",X)
            #X = normalize(X)
            y = data[i*b_size:(i+1)*b_size,-1]
            loss = FourLvlNN(X,y,False,min_loss)
        print("\nEpoch",cycle,"Loss",loss,"\n")
        loss_list.append(loss)

    np.save("LossCurve.npy",np.asarray(loss_list))

def evaluateAccuracy(data):
    global W1
    global W2
    
    global b1
    global b2

    W_all = np.load("Weights.npy")
    b_all = np.load("Bias.npy")
    W1 = W_all[0]
    W2 = W_all[1]
    b1 = b_all[0]
    b2 = b_all[1]

    X = data[:,:-1]
    y = data[:,-1]
    classifications =FourLvlNN(X,y,True,-1)
    correct=0
    for i in range(len(y)):
        if(classifications[i]==int(y[i])):
            correct+=1
    accuracy = correct/len(y)*100
    print("Testing Accuracy:",accuracy)

def displayLossCurve():
    loss_array=np.load("LossCurve.npy")
    plt.plot(loss_array)
    plt.xlabel("Epochs")
    plt.ylabel("Loss")
    plt.show()

def displayProgess(i,N):
    sys.stdout.write('\r')
    sys.stdout.write("Batch %d/%d in progress"%(i+1,N))
    bars = int((i/N*100)//10)
    sys.stdout.write(" [%-10s]"%('='*bars+'>',))

def main():
    data = train_ohd.as_matrix()
    MiniBatchGD(data,10)
    displayLossCurve()
    evaluateAccuracy(data)

main()

Running MiniBatchGCD with 10 epochs






Epoch 0 Loss nan 


Epoch 1 Loss nan 


Epoch 2 Loss nan 



KeyboardInterrupt: 