Opening notes

In [1]:
import pandas as pd
import tensorflow as tf
import numpy as np

In [196]:
#First row is headers, so just simple import on the csv data using pandas
train_csv = pd.read_csv("au_train.csv")
test_csv = pd.read_csv("au_test.csv")

#Remove "education" column from each, as the next column is a numerical representation of it
train_csv.drop('education', axis=1, inplace=True)
test_csv.drop('education', axis=1, inplace=True)

#Remove period from last character in class for test cases
test_csv['class'] = test_csv['class'].str.replace('.', '')

Taking a look at the data, there are 14 variable columns, as well as the "class", or target column.
We want to convert object  columns to discrete data. We can do this by hand, but an easier way is to use pandas builtin Categorical functionality.

In [3]:
#Convert object columns to discrete numerical values
for col in train_csv:
    if train_csv[col].dtype == np.dtype('object'):
        temp_col = pd.Categorical(train_csv[col])
        temp_col = temp_col.codes
        train_csv[col] = temp_col

#Do the same for the test data
for col in test_csv:
    if test_csv[col].dtype == np.dtype('object'):
        temp_col = pd.Categorical(test_csv[col])
        temp_col = temp_col.codes
        test_csv[col] = temp_col

Now we can load the data for tensorflow and start making the model

In [4]:
#Pop the class columns off each dataset to save as targets for each
train_y = train_csv.pop('class')
test_y = test_csv.pop('class')

#Load train csv into tensor, then shuffle and create batches
train_dataset = tf.data.Dataset.from_tensor_slices((train_csv.values, train_y.values))
train_dataset = train_dataset.shuffle(len(train_csv)).batch(50)

#Load test csv into tensor - need to create batch for evaluation function to work later
test_dataset = tf.data.Dataset.from_tensor_slices((test_csv.values, test_y.values))
test_dataset = test_dataset.batch(len(test_csv))

In [5]:
model = tf.keras.Sequential([
    tf.keras.layers.Dense(500, activation='relu'),
    tf.keras.layers.Dense(250, activation='relu'),
    tf.keras.layers.Dense(1)
])

model.compile(optimizer='adam',
    loss=tf.keras.losses.BinaryCrossentropy(from_logits=True),
    metrics=['accuracy'])

In [6]:
#model.fit(train_dataset, epochs=5)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<tensorflow.python.keras.callbacks.History at 0x1c5c8450c10>

In [7]:
#model.evaluate(test_dataset)



[19.49567413330078, 0.7966341376304626]

Notes on results

Notes on naive bayes, why naive bayes, start by getting the 3 categories (probabilities of each class, each var, each var for each class

In [130]:
from collections import defaultdict
import math

In [202]:
#Reset datasets
#First row is headers, so just simple import on the csv data using pandas
train_csv = pd.read_csv("au_train.csv")
test_csv = pd.read_csv("au_test.csv")

#Remove "education" column from each, as the next column is a numerical representation of it
train_csv.drop('education', axis=1, inplace=True)
test_csv.drop('education', axis=1, inplace=True)

#Remove period from last character in class for test cases
test_csv['class'] = test_csv['class'].str.replace('.', '')

In [203]:
def get_continuous_probability(mean, var, val):
    return 1/(math.sqrt(2*math.pi)*var) * math.e**-((val-mean)/var**2)

def get_dataset_probabilities(ds):
    #Get # of rows in dataset
    total = len(ds)
    feature_probs = {}
    
    for col in ds:
        #Don't want to count target col
        if col == 'class':
            continue

        #If column is continuous, use Gaussian Distribution to get probability
        if ds[col].dtype == np.dtype('int64'):
            feature_probs[col] = {'mean': np.mean(ds[col]), 'var': np.var(ds[col])}#get_continuous_probability(ds[col])

        #If column is object, count each item and divide by # of rows
        elif ds[col].dtype == np.dtype('object'):
            temp = ds[col].value_counts().to_dict()
            for i in temp.keys():
                temp[i] = temp[i]/total

            feature_probs[col] = temp
    
    return feature_probs

In [204]:
#Get class counts: get the total counts of each class, then divide by # of rows
total = len(train_csv)
class_probs = train_csv['class'].value_counts().to_dict()
for i in class_probs.keys():
    class_probs[i] = class_probs[i]/total

#Feature counts
feature_probs = get_dataset_probabilities(train_csv)
#Features based on class
feature_class_probs = {}
for k in class_probs.keys():
    #Just get the rows belonging to the specific class
    current_class = train_csv.loc[train_csv['class'] == k]
    feature_class_probs[k] = get_dataset_probabilities(current_class)
    
#Get list of keys for iteration
key_list = list(feature_probs.keys())

Now we have probabilities, let's iterate over each row and guess

In [208]:
def test_dataset(ds):
    total_right = 0
    for i, row in ds.iterrows():
        
        #Iterate over each class to see which class has greatest probability
        most_likely = defaultdict(float)
        for cls in list(class_probs.keys()):
            
            #Reset likelihood and prior to 1
            likelihood = 1
            prior = 1

            #Iterate over each column in row
            for col_idx, item in enumerate(row):
                #If item is the target class, ignore
                if item in list(class_probs.keys()):
                    continue
                
                #If int (continuous), get the probability with mean and variance
                if type(item) == int:
                    #Likelihood
                    mean = feature_class_probs[cls][key_list[col_idx]]['mean']
                    var = feature_class_probs[cls][key_list[col_idx]]['var']
                    likelihood *= get_continuous_probability(mean, var, item)

                    #Prior
                    mean = feature_probs[key_list[col_idx]]['mean']
                    var = feature_probs[key_list[col_idx]]['var']
                    prior *= get_continuous_probability(mean, var, item)

                #If str (object), get probability that was saved earlier
                elif type(item) == str:
                    #Key doesn't exist for that class
                    try:
                        likelihood *= feature_class_probs[cls][key_list[col_idx]][item]
                    except KeyError:
                        likelihood *= 0   
                    prior = feature_probs[key_list[col_idx]][item]

            most_likely[cls] = (likelihood * class_probs[cls]) / prior

        selection = max(most_likely, key=most_likely.get)
        if selection == row[-1]:
            total_right+=1
    
    return total_right

In [209]:
print(test_dataset(train_csv) / len(train_csv))

0.7591904425539756


In [210]:
print(test_dataset(test_csv) / len(test_csv))

0.7637737239727289


Analysis on results