In [1]:
from pandas import read_excel, DataFrame, IndexSlice
from random import choices
from sklearn.preprocessing import LabelBinarizer, normalize
from sklearn.model_selection import train_test_split

In [2]:
# Relative path of the survey workbook.
data_path = 'data/Take home exam dataset.xlsx'

# The sheet is read with a two-row header (top level: question id, second
# level: answer option). The two columns whose top-level label is unnamed
# are discarded.
df_groups = read_excel(data_path, header=[0, 1]).drop(
    columns=['Unnamed: 0_level_0', 'Unnamed: 1_level_0'],
    level=0,
)

# Column names for the per-person table: the group label followed by Q1..Q15.
headers = ['GROUP'] + [f'Q{number}' for number in range(1, 16)]
df_groups

Unnamed: 0_level_0,Q1,Q1,Q1,Q2,Q2,Q2,Q3,Q3,Q3,Q4,...,Q12,Q13,Q13,Q13,Q14,Q14,Q14,Q15,Q15,Q15
Unnamed: 0_level_1,Yes,No,Undecided,Yes,No,Undecided,Yes,No,Undecided,Yes,...,Undecided,Yes,No,Undecided,Yes,No,Undecided,Yes,No,Undecided
0,94,6,0,11,89,0,0,100,0,11,...,0,89,11,0,100,0,0,56,38,6
1,72,28,0,28,72,0,46,54,0,61,...,0,61,39,0,50,50,0,56,44,0
2,89,11,0,17,83,0,6,94,0,50,...,0,89,11,0,94,6,0,28,55,17
3,28,72,0,0,100,0,94,6,0,6,...,0,94,6,0,50,44,6,0,78,22


In [3]:
def standardize_dataset(df_groups, possible_answers=(1, 0, 0.5), num_people=20, seed=None):
    """Expand per-group answer percentages into simulated individual respondents.

    For each of the four groups ('g1'..'g4'), `num_people` synthetic people are
    generated. Every answer to Q1..Q15 is drawn at random, weighted by that
    group's Yes / No / Undecided percentages in `df_groups`.

    Parameters
    ----------
    df_groups : DataFrame
        Two-level columns (question id, answer option) holding percentages.
        The k-th row (by position) holds the distribution of the k-th group.
    possible_answers : sequence of length 3
        Values recorded for a Yes, No and Undecided draw, in that order.
    num_people : int
        Number of respondents simulated per group.
    seed : optional
        When given, draws come from a private ``random.Random(seed)`` so the
        result is reproducible. When None, the module-level ``choices`` (and
        therefore the global random state) is used.

    Returns
    -------
    list of list
        One row per person: ``[group_label, answer_Q1, ..., answer_Q15]``,
        ordered group by group.
    """
    question_ids = [f'Q{number}' for number in range(1, 16)]
    group_labels = ['g1', 'g2', 'g3', 'g4']
    answer_options = ('Yes', 'No', 'Undecided')

    if seed is None:
        draw = choices
    else:
        from random import Random
        draw = Random(seed).choices

    people = []
    for group_num, group in enumerate(group_labels):
        # The weights depend only on (group, question), so they are looked up
        # once per group instead of once per simulated person. `.iloc` makes
        # the row lookup explicitly positional.
        group_weights = [
            [df_groups[question][option].iloc[group_num] / 100 for option in answer_options]
            for question in question_ids
        ]
        for _ in range(num_people):
            answers = [group]
            for weights in group_weights:
                # choices() returns a one-element list; extend() flattens it.
                answers.extend(draw(possible_answers, weights))
            people.append(answers)
    return people

In [4]:
# Simulate the respondents with the default numeric answer coding and
# tabulate them, one row per person, under the GROUP/Q1..Q15 column names.
people = standardize_dataset(df_groups)
df = DataFrame(data=people, columns=headers)
df

Unnamed: 0,GROUP,Q1,Q2,Q3,Q4,Q5,Q6,Q7,Q8,Q9,Q10,Q11,Q12,Q13,Q14,Q15
0,g1,1,0,0,0.5,0.5,0,0,0,0,0,1,1,0,1.0,1.0
1,g1,1,0,0,0.5,0.5,0,0,0,1,0,1,1,1,1.0,0.0
2,g1,1,0,0,0.0,1.0,0,0,0,0,0,1,1,1,1.0,1.0
3,g1,1,0,0,0.5,0.0,1,0,0,1,0,1,1,1,1.0,1.0
4,g1,1,1,0,0.5,0.5,0,0,0,0,0,1,1,1,1.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
75,g4,1,0,1,0.0,1.0,1,0,0,1,0,1,1,1,0.5,0.0
76,g4,0,0,1,0.0,1.0,1,0,0,1,0,1,1,1,0.0,0.0
77,g4,0,0,1,0.0,1.0,1,0,0,1,0,1,1,1,1.0,0.0
78,g4,1,0,1,0.0,1.0,1,0,0,1,0,1,1,1,1.0,0.0


In [5]:
# One-hot encode the group labels (one indicator column per group).
group_lb = LabelBinarizer()
Y = group_lb.fit_transform(df['GROUP'].values)

# Every column after GROUP is a question feature.
FEATURES = df.columns[1:]
# normalize() is called with its defaults, which in scikit-learn rescale each
# row (respondent) to unit L2 norm rather than scaling per column.
X_data = normalize(df[FEATURES].values)
X_data

array([[0.42640143, 0.        , 0.        , ..., 0.        , 0.42640143,
        0.42640143],
       [0.39223227, 0.        , 0.        , ..., 0.39223227, 0.39223227,
        0.        ],
       [0.37796447, 0.        , 0.        , ..., 0.37796447, 0.37796447,
        0.37796447],
       ...,
       [0.        , 0.        , 0.35355339, ..., 0.35355339, 0.35355339,
        0.        ],
       [0.33333333, 0.        , 0.33333333, ..., 0.33333333, 0.33333333,
        0.        ],
       [0.34815531, 0.        , 0.34815531, ..., 0.34815531, 0.34815531,
        0.17407766]])

In [6]:
# Data Split into 70% training 30% testing
# random_state is pinned so the same rows land in train/test on every run.
# The simulated respondents are themselves drawn at random upstream, so this
# fixes only the partitioning, not the data.
X_train, X_test, y_train, y_test = train_test_split(
    X_data, Y, test_size=0.3, random_state=1
)
X_train.shape

(56, 15)

# Classifier 1 - Artificial Neural Network Method

In [7]:
# TensorFlow is used through its 1.x compatibility module: the cells below
# build a graph from placeholders and execute it inside a tf.Session.
import tensorflow.compat.v1 as tf
from numpy import mean, argmax
# Switch off TF2 behaviour before any graph ops are created.
tf.disable_v2_behavior()

Instructions for updating:
non-resource variables are not supported in the long term


In [8]:
# Parameters
learning_rate = 0.01  # step size handed to the optimizer
training_epochs = 100  # number of full passes over the training rows

# Neural Network Parameters
n_hidden_1 = 20  # units in the first hidden layer
n_hidden_2 = 16  # units in the second hidden layer

n_input = X_train.shape[1] # number of input features (columns of X_train; 15 for the (56, 15) split shown above)
n_classes = y_train.shape[1] # classes to predict (columns of the one-hot label matrix)

In [9]:
# Inputs
# X takes rows of n_input features and y the matching rows of n_classes
# label indicators; the leading None leaves the number of rows open.
X = tf.placeholder("float", shape=[None, n_input])
y = tf.placeholder("float", shape=[None, n_classes])
# Dictionary of Weights and Biases
# All parameters are initialised from random-normal draws. The shapes chain
# n_input -> n_hidden_1 -> n_hidden_2 -> n_classes.
weights = {
  'h1': tf.Variable(tf.random_normal([n_input, n_hidden_1])),
  'h2': tf.Variable(tf.random_normal([n_hidden_1, n_hidden_2])),
  'out': tf.Variable(tf.random_normal([n_hidden_2, n_classes]))
}
# One bias vector per layer, sized to that layer's output width.
biases = {
  'b1': tf.Variable(tf.random_normal([n_hidden_1])),
  'b2': tf.Variable(tf.random_normal([n_hidden_2])),
  'out': tf.Variable(tf.random_normal([n_classes]))
}

In [10]:
# Model Forward Propagation step
def forward_propagation(x):
    """Feed `x` through two ReLU hidden layers and return the output-layer scores.

    Reads the module-level `weights` and `biases` dictionaries. The output
    layer is purely linear: no activation is applied to the returned tensor.
    """
    hidden_1 = tf.nn.relu(tf.add(tf.matmul(x, weights['h1']), biases['b1']))
    hidden_2 = tf.nn.relu(tf.add(tf.matmul(hidden_1, weights['h2']), biases['b2']))
    # Fully connected output layer (raw scores, one per class).
    return tf.matmul(hidden_2, weights['out']) + biases['out']

In [11]:
# Model Outputs
# yhat holds the raw class scores for each input row; ypredict is the index
# of the highest-scoring class in each row.
yhat = forward_propagation(X)
ypredict = tf.argmax(yhat, axis=1)

In [12]:
# Back Propagation
# Loss: mean softmax cross-entropy between the one-hot labels and the raw
# output scores. The _v2 op replaces the deprecated variant; the labels come
# straight from a placeholder, so there is nothing upstream for label
# gradients to reach and the training step is unchanged.
cost = tf.reduce_mean(
    tf.nn.softmax_cross_entropy_with_logits_v2(labels=y, logits=yhat)
)
optimizer = tf.train.AdamOptimizer(learning_rate=learning_rate)
train_op = optimizer.minimize(cost)

Instructions for updating:

Future major versions of TensorFlow will allow gradients to flow
into the labels input on backprop by default.

See `tf.nn.softmax_cross_entropy_with_logits_v2`.



In [13]:
from datetime import datetime

init = tf.global_variables_initializer()

# Integer class ids of the one-hot labels. They do not change between
# epochs, so they are computed once up front.
train_labels = argmax(y_train, axis=1)
test_labels = argmax(y_test, axis=1)

startTime = datetime.now()
# The context manager closes the session on exit; no explicit close() needed.
with tf.Session() as sess:
    sess.run(init)
    # Running the model through epochs
    for epoch in range(training_epochs):
        # One optimizer step per training row (batch size 1), in stored order.
        for i in range(len(X_train)):
            sess.run(train_op, feed_dict={X: X_train[i: i + 1], y: y_train[i: i + 1]})

        # ypredict depends only on X, so the labels are not fed here; they are
        # compared against the predictions on the numpy side.
        train_accuracy = mean(train_labels == sess.run(ypredict, feed_dict={X: X_train}))
        test_accuracy = mean(test_labels == sess.run(ypredict, feed_dict={X: X_test}))

        print(f"Epoch = {epoch + 1}, train accuracy = {100. * train_accuracy:.2f}, test accuracy = {100. * test_accuracy:.2f}")
print("Time taken:", datetime.now() - startTime)

Epoch = 1, train accuracy = 39.29, test accuracy = 33.33
Epoch = 2, train accuracy = 55.36, test accuracy = 45.83
Epoch = 3, train accuracy = 69.64, test accuracy = 58.33
Epoch = 4, train accuracy = 73.21, test accuracy = 62.50
Epoch = 5, train accuracy = 80.36, test accuracy = 75.00
Epoch = 6, train accuracy = 87.50, test accuracy = 79.17
Epoch = 7, train accuracy = 87.50, test accuracy = 79.17
Epoch = 8, train accuracy = 91.07, test accuracy = 79.17
Epoch = 9, train accuracy = 91.07, test accuracy = 79.17
Epoch = 10, train accuracy = 92.86, test accuracy = 79.17
Epoch = 11, train accuracy = 94.64, test accuracy = 75.00
Epoch = 12, train accuracy = 96.43, test accuracy = 79.17
Epoch = 13, train accuracy = 94.64, test accuracy = 75.00
Epoch = 14, train accuracy = 96.43, test accuracy = 75.00
Epoch = 15, train accuracy = 96.43, test accuracy = 79.17
Epoch = 16, train accuracy = 96.43, test accuracy = 75.00
Epoch = 17, train accuracy = 94.64, test accuracy = 75.00
Epoch = 18, train accur

# Classifier 2 - Statistical Probabilistic Classification

In [14]:
# Re-simulate the respondents, this time recording the answers as text labels.
# Note that `people` and `df` from the neural-network section are overwritten.
people = standardize_dataset(
    df_groups,
    possible_answers=['Yes', 'No', 'Undecided'],
)
df = DataFrame(data=people, columns=headers)
df

Unnamed: 0,GROUP,Q1,Q2,Q3,Q4,Q5,Q6,Q7,Q8,Q9,Q10,Q11,Q12,Q13,Q14,Q15
0,g1,Yes,No,No,Undecided,No,No,No,No,No,No,Yes,Yes,Yes,Yes,Yes
1,g1,Yes,No,No,Undecided,No,Yes,No,No,Yes,No,Yes,Yes,Yes,Yes,No
2,g1,Yes,No,No,No,Undecided,No,No,No,Yes,No,Yes,Yes,Yes,Yes,No
3,g1,Yes,No,No,Yes,Yes,No,No,No,Yes,No,Yes,Yes,Yes,Yes,Yes
4,g1,Yes,Yes,No,No,Yes,Yes,No,No,Yes,No,Yes,Yes,Yes,Yes,Yes
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
75,g4,No,No,Yes,No,Yes,Yes,No,No,Yes,No,Yes,Yes,Yes,Yes,No
76,g4,No,No,Yes,Yes,Yes,Yes,No,No,Yes,No,Yes,Yes,Yes,No,No
77,g4,No,No,Yes,No,Yes,Yes,No,No,Yes,No,Yes,Yes,Yes,No,No
78,g4,No,No,Yes,No,Yes,Yes,No,No,Yes,No,Yes,Yes,Yes,No,Undecided


In [15]:
# True group label of every simulated respondent.
Y = df['GROUP'].values

# Question columns are everything after GROUP. The answers are kept as raw
# text; no scaling or normalisation is applied in this section.
FEATURES = df.columns[1:]
X_data = df.loc[:, FEATURES].values
X_data

array([['Yes', 'No', 'No', ..., 'Yes', 'Yes', 'Yes'],
       ['Yes', 'No', 'No', ..., 'Yes', 'Yes', 'No'],
       ['Yes', 'No', 'No', ..., 'Yes', 'Yes', 'No'],
       ...,
       ['No', 'No', 'Yes', ..., 'Yes', 'No', 'No'],
       ['No', 'No', 'Yes', ..., 'Yes', 'No', 'Undecided'],
       ['No', 'No', 'Yes', ..., 'Yes', 'No', 'No']], dtype=object)

In [16]:
# Score-based classifier: for every respondent, each group accumulates the
# percentage of its members that gave the same answer, question by question;
# the group with the highest total is the prediction.
# NOTE(review): the score is a sum of percentages, not a product of
# probabilities (likelihood) - confirm this is the intended scoring rule.
groups = ['g1', 'g2', 'g3', 'g4']
correct = 0
for person, group in zip(X_data, Y):
    scores = [0] * len(groups)
    for question, answer_given in zip(headers[1:], person):
        # Per-group percentages for this question/answer pair, in row order.
        group_percentages = df_groups[question][answer_given]
        for j, probability in enumerate(group_percentages):
            scores[j] += probability
    # Index of the largest score; the first one wins on a tie.
    best = max(range(len(scores)), key=scores.__getitem__)
    if groups[best] == group:
        correct += 1
# Fraction of respondents assigned to their true group.
accuracy = correct / len(X_data)
accuracy

0.6