# Notre Dame Football Game Prediction

## 0. Load data

### 0.1. Load training data from file

In [6]:
import pandas as pd
dfTrain = pd.read_csv("./datasets/dataset-football/dataset-football-train.txt", delimiter='\t')
dfTrain

Unnamed: 0,ID,Date,Opponent,Is_Home_or_Away,Is_Opponent_in_AP25_Preseason,Media,Label
0,1,9/5/15,Texas,Home,Out,NBC,Win
1,2,9/12/15,Virginia,Away,Out,ABC,Win
2,3,9/19/15,Georgia Tech,Home,In,NBC,Win
3,4,9/26/15,UMass,Home,Out,NBC,Win
4,5,10/3/15,Clemson,Away,In,ABC,Lose
5,6,10/10/15,Navy,Home,Out,NBC,Win
6,7,10/17/15,USC,Home,In,NBC,Win
7,8,10/31/15,Temple,Away,Out,ABC,Win
8,9,11/7/15,PITT,Away,Out,ABC,Win
9,10,11/14/15,Wake Forest,Home,Out,NBC,Win


### 0.2. Load test data from file

In [7]:
dfTest = pd.read_csv("./datasets/dataset-football/dataset-football-test.txt", delimiter='\t')
dfTest

Unnamed: 0,ID,Date,Opponent,Is_Home_or_Away,Is_Opponent_in_AP25_Preseason,Media,Label
0,25,9/2/17,Temple,Home,Out,NBC,Win
1,26,9/9/17,Georgia,Home,In,NBC,Lose
2,27,9/16/17,Boston College,Away,Out,ESPN,Win
3,28,9/23/17,Michigan State,Away,Out,FOX,Win
4,29,9/30/17,Miami Ohio,Home,Out,NBC,Win
5,30,10/7/17,North Carolina,Away,Out,ABC,Win
6,31,10/21/17,USC,Home,In,NBC,Win
7,32,10/28/17,North Carolina State,Home,Out,NBC,Win
8,33,11/4/17,Wake Forest,Home,Out,NBC,Win
9,34,11/11/17,Miami Florida,Away,In,ABC,Lose


### 0.3. Find feature names

In [8]:
featureNames = dfTrain.columns[3:-1] # Skip "ID", "Date", "Opponent"; and "Label"
nFeatures = featureNames.shape[0]
print("Number of features:", nFeatures)
featureNames

Number of features: 3


Index(['Is_Home_or_Away', 'Is_Opponent_in_AP25_Preseason', 'Media'], dtype='object')

### 0.4. Select training instances (feature values and label) from training data

In [9]:
trainingInstances = []
for instance in dfTrain.to_numpy():
    featureValues = list(instance[3:-1])
    label = instance[-1]
    trainingInstances.append([featureValues,label])
nTrainingInstances = len(trainingInstances)
print("Number of training instances:", nTrainingInstances)
trainingInstances

Number of training instances: 24


[[['Home', 'Out', 'NBC'], 'Win'],
 [['Away', 'Out', 'ABC'], 'Win'],
 [['Home', 'In', 'NBC'], 'Win'],
 [['Home', 'Out', 'NBC'], 'Win'],
 [['Away', 'In', 'ABC'], 'Lose'],
 [['Home', 'Out', 'NBC'], 'Win'],
 [['Home', 'In', 'NBC'], 'Win'],
 [['Away', 'Out', 'ABC'], 'Win'],
 [['Away', 'Out', 'ABC'], 'Win'],
 [['Home', 'Out', 'NBC'], 'Win'],
 [['Away', 'Out', 'NBC'], 'Win'],
 [['Away', 'In', 'FOX'], 'Lose'],
 [['Away', 'Out', 'ABC'], 'Lose'],
 [['Home', 'Out', 'NBC'], 'Win'],
 [['Home', 'Out', 'NBC'], 'Lose'],
 [['Home', 'Out', 'NBC'], 'Lose'],
 [['Home', 'Out', 'ESPN'], 'Win'],
 [['Away', 'Out', 'ABC'], 'Lose'],
 [['Home', 'In', 'NBC'], 'Lose'],
 [['Home', 'Out', 'NBC'], 'Win'],
 [['Home', 'Out', 'CBS'], 'Lose'],
 [['Home', 'Out', 'NBC'], 'Win'],
 [['Home', 'In', 'NBC'], 'Lose'],
 [['Away', 'In', 'ABC'], 'Lose']]

### 0.5. Select test instances from test data

In [10]:
testInstances = []
for instance in dfTest.to_numpy():
    featureValues = list(instance[3:-1])
    label = instance[-1]
    testInstances.append([featureValues,label])
nTestInstances = len(testInstances)
print("Number of test instances:", nTestInstances)

Number of test instances: 12


## 1. ID3 model using Information Gain

### 1.1. Define information entropy function by distribution of value frequencies

In [11]:
import math
import numpy as np

# Examples:
# Suppose distr_freq = [2, 2, 2]. Should return -1/3*log_2(1/3) * 3 = 1.585.
# Suppose distr_freq = [6, 0, 0, 0, 0]. Should return 0.

def Entropy(distr_freq):
    ret = 0.0
    temp=[]
    distr_freq = [i for i in distr_freq if i != 0]
    ### TODO 1: ret = ________________ ###
    for i in range(len(distr_freq)):
        p=distr_freq[i]/sum(distr_freq)
        temp.append(-p * math.log(p,2))
    
    ret = sum(temp)
    return ret


### 1.2. Define Information Gain function by a set of instances and a particular feature

In [12]:
# Given a set of instances [instanceIDs] at the node,
# find the information gain if the node is split by the values of feature [featureID].
def IG(instanceIDs, featureID):
    
    label2freq, value2label2freq = {}, {}
    
    for instanceID in instanceIDs:
        featureValues, label = trainingInstances[instanceID]
        
        if not label in label2freq:
            label2freq[label] = 0
        label2freq[label] += 1
        
        featureValue = featureValues[featureID]
        
        if not featureValue in value2label2freq:
            value2label2freq[featureValue] = {}
        if not label in value2label2freq[featureValue]:
            value2label2freq[featureValue][label] = 0
        value2label2freq[featureValue][label] += 1
        
    N = len(instanceIDs)
    
    distr_freq = []
    for label,freq in label2freq.items():
        distr_freq.append(freq)
        
    # This is the information entropy at the node H(Y)
    H_Y = Entropy(distr_freq)
    
    H_Y_cond_X = 0.0
    for value, label2freq in value2label2freq.items():
        distr_freq = []
        for label, freq in label2freq.items():
            distr_freq.append(freq)
        # for each value of the feature [featureID],
        # calculate the conditional entropy H(Y|X) at the child nodes (if the node is split by X)
        
        ### TODO 2: H_Y_cond_X = ________________ ###
        H_Y_cond_X += (sum(distr_freq)/N) * Entropy(distr_freq)
    return H_Y - H_Y_cond_X

### 1.3. Define Gain Ratio function by a set of instances and a particular feature

In [13]:
def GainRatio(instanceIDs, featureID):
    
    ret = IG(instanceIDs, featureID)
    if ret == 0: return ret
    
    # calculate the splitInfo to calculate Gain Ratio based on Information Gain
    
    ### TODO 3: splitInfo = ________________ ###
    label2freq, value2label2freq = {}, {}
    
    for instanceID in instanceIDs:
        featureValues, label = trainingInstances[instanceID]
        
        if not label in label2freq:
            label2freq[label] = 0
        label2freq[label] += 1
        
        featureValue = featureValues[featureID]
        
        if not featureValue in value2label2freq:
            value2label2freq[featureValue] = {}
        if not label in value2label2freq[featureValue]:
            value2label2freq[featureValue][label] = 0
        value2label2freq[featureValue][label] += 1
    
    splitInfo = 0 
    for value, label2freq in value2label2freq.items():
        distr_freq = []
        for label, freq in label2freq.items():
            distr_freq.append(freq)
            
        splitInfo += Entropy(distr_freq)   
        
    return ret/splitInfo

### 1.4. Define a function (an iterative algorithm) to construct a decision tree

A decision tree node has the following components:

**instanceIDs**: the set of training instances on the node;
    
**featureIDs**: the set of available features for splitting; those that are not in this set have been used on the path from the root node to this node;
    
**level**: the node is at which level of the tree; 0 is for root node;
    
**subTree**: its child nodes (sub-trees) formed by the best feature;
    
**bestFeatureID**: the best feature for splitting into child nodes;
    
**majorityLabel**: the decision on this node (the most frequent label of the instances on this node);
    
**parentMajorityLabel**: the decision on this node's parent.

In [14]:
def ConstructDecisionTree(tree, modelName):

    instanceIDs, featureIDs, level, _, _, _, parentMajorityLabel = tree

    # Calculate the distribution of label frequencies on this node
    label2freq = {}
    for instanceID in instanceIDs:
        label = trainingInstances[instanceID][1]
        if not label in label2freq:
            label2freq[label] = 0
        label2freq[label] += 1
    # Sort the label frequencies from the highest to lowest
    label_freq = sorted(label2freq.items(), key=lambda x:-x[1])
    
    # Determine the decision on this node
    majorityLabel = label_freq[0][0]
    # When there are multiple labels that have the highest frequency - use the decision on the parent node.
    if len(label_freq) > 1 and label_freq[0][1] == label_freq[1][1]:
        majorityLabel = parentMajorityLabel
    # Set the [majorityLabel]
    tree[5] = majorityLabel
    print("\t"*level, "Decision:", majorityLabel, "<--", label_freq)
    
    if len(label_freq) == 1: return
    if len(featureIDs) == 0: return

    # Calculate the score of splitting criterion
    # for each feature [featureID]
    featureID2criterionScore = {}
    for featureID in featureIDs:
        if modelName == "ID3":
            ### TODO 4: criterionScore = ________________ ###
            criterionScore = IG(instanceIDs, featureID)
        
        elif modelName == "C4.5":
            ### TODO 5: criterionScore = ________________ ###
            criterionScore = GainRatio(instanceIDs, featureID)

        featureID2criterionScore[featureID] = criterionScore
    # Sort the feature's criterion score from the highest to lowest
    featureID_criterionScore = sorted(featureID2criterionScore.items(), key=lambda x:-x[1])
    
    # Find the best feature and the best criterion score
    bestFeatureID = featureID_criterionScore[0][0]
    bestCriterionScore = featureID_criterionScore[0][1]
    
    # Terminate when there is not effective feature
    if bestCriterionScore == 0.0: return
    
    tree[4] = bestFeatureID

    # Remove the best feature from feature candidates for the sub-trees
    subFeatureIDs = set()
    for featureID in featureIDs:
        if featureID == bestFeatureID: continue
        subFeatureIDs.add(featureID)
    
    # Go to the next level: Splitting the node into child nodes (i.e., sub-trees)
    level += 1
    value2subInstanceIDs = {}
    for instanceID in instanceIDs:
        value = trainingInstances[instanceID][0][bestFeatureID]
        if not value in value2subInstanceIDs:
            value2subInstanceIDs[value] = set()
        value2subInstanceIDs[value].add(instanceID)
        
    for value,subInstanceIDs in value2subInstanceIDs.items():
        tree[3][value] = [subInstanceIDs, subFeatureIDs, level, {}, None, None, majorityLabel]
        print("\t"*level, "Level", level, "=>", featureNames[bestFeatureID], ":", value,
              "( criterion:", np.round(bestCriterionScore, 4), ")")
        print("\t"*level, "{} is {:<6}".format(
            featureNames[bestFeatureID], value))
        print("\t"*level, "Criterion: {}".format(
            np.round(bestCriterionScore, 4)))
        ConstructDecisionTree(tree[3][value], modelName)

### 1.5. Construct an ID3 tree from the root node

In [15]:
tree = [set(range(nTrainingInstances)), set(range(nFeatures)), 0, {}, None, None, None]
print("Level", 0)
ConstructDecisionTree(tree, "ID3")

Level 0
 Decision: Win <-- [('Win', 14), ('Lose', 10)]
	 Level 1 => Media : NBC ( criterion: 0.189 )
	 Decision: Win <-- [('Win', 10), ('Lose', 4)]
		 Level 2 => Is_Opponent_in_AP25_Preseason : Out ( criterion: 0.0617 )
		 Decision: Win <-- [('Win', 8), ('Lose', 2)]
			 Level 3 => Is_Home_or_Away : Home ( criterion: 0.0341 )
			 Decision: Win <-- [('Win', 7), ('Lose', 2)]
			 Level 3 => Is_Home_or_Away : Away ( criterion: 0.0341 )
			 Decision: Win <-- [('Win', 1)]
		 Level 2 => Is_Opponent_in_AP25_Preseason : In ( criterion: 0.0617 )
		 Decision: Win <-- [('Win', 2), ('Lose', 2)]
	 Level 1 => Media : ABC ( criterion: 0.189 )
	 Decision: Lose <-- [('Lose', 4), ('Win', 3)]
		 Level 2 => Is_Opponent_in_AP25_Preseason : Out ( criterion: 0.2917 )
		 Decision: Win <-- [('Win', 3), ('Lose', 2)]
		 Level 2 => Is_Opponent_in_AP25_Preseason : In ( criterion: 0.2917 )
		 Decision: Lose <-- [('Lose', 2)]
	 Level 1 => Media : FOX ( criterion: 0.189 )
	 Decision: Lose <-- [('Lose', 1)]
	 Level 1 =>

### 1.6. Define a function to traverse the decision tree

In [99]:
def Traverse(tree):
    
    instanceIDs, featureIDs, level, subTree, bestFeatureID, majorityLabel, parentMajorityLabel = tree
    print("\t"*level, "Level", level, "(", majorityLabel, ")")
    
    if tree[4] == None: return
    
    print("\t"*level, featureNames[bestFeatureID], ":")
    
    for value, branch in sorted(subTree.items(), key=lambda x:x[0]):
        print("\t"*level, value)
        Traverse(branch)

### 1.7. Traverse the ID3 tree from the root node

In [100]:
Traverse(tree)

Level 0 ( Win )
 Media :
 ABC
	 Level 1 ( Lose )
	 Is_Opponent_in_AP25_Preseason :
	 In
		 Level 2 ( Lose )
	 Out
		 Level 2 ( Win )
 CBS
	 Level 1 ( Lose )
 ESPN
	 Level 1 ( Win )
 FOX
	 Level 1 ( Lose )
 NBC
	 Level 1 ( Win )
	 Is_Opponent_in_AP25_Preseason :
	 In
		 Level 2 ( Win )
	 Out
		 Level 2 ( Win )
		 Is_Home_or_Away :
		 Away
			 Level 3 ( Win )
		 Home
			 Level 3 ( Win )


### 1.8. Define a function to predict the label based on features using the decision tree

In [101]:
def Predict(tree, featureValues):
    
    instanceIDs, featureIDs, level, subTree, bestFeatureID, majorityLabel, parentMajorityLabel = tree
    if tree[4] == None: return majorityLabel
    
    value = featureValues[bestFeatureID]
    
    if not value in subTree: return majorityLabel
    return Predict(subTree[value], featureValues)

### 1.9. Define a function to make prediction for each test instance in the set and calculate the Confusion Matrix

In [102]:
def GetConfusionMatrix(testInstances):
    
    TP, FP, TN, FN = 0, 0, 0, 0

    for featureValues, truthLabel in testInstances:

        predictLabel = Predict(tree, featureValues)

        print(featureValues, "Actual label:", truthLabel, "| Predicted label:", predictLabel)

        ### TODO 6: ###
        ### if ________________: TP += 1 ###
        if truthLabel == 'Win' and predictLabel == 'Win': TP+=1
        
        ### if ________________: FP += 1 ###
        if truthLabel == 'Lose' and predictLabel == 'Win': FP+=1
        
        ### if ________________: FN += 1 ###
        if truthLabel == 'Win' and predictLabel == 'Lose': FN+=1

        ### if ________________: TN += 1 ###
        if truthLabel == 'Lose' and predictLabel == 'Lose': TN+=1

    return TP, FP, TN, FN

### 1.10. Make prediction with ID3 and find the Confusion Matrix

In [103]:
TP, FP, TN, FN = GetConfusionMatrix(testInstances)

['Home', 'Out', 'NBC'] Actual label: Win | Predicted label: Win
['Home', 'In', 'NBC'] Actual label: Lose | Predicted label: Win
['Away', 'Out', 'ESPN'] Actual label: Win | Predicted label: Win
['Away', 'Out', 'FOX'] Actual label: Win | Predicted label: Lose
['Home', 'Out', 'NBC'] Actual label: Win | Predicted label: Win
['Away', 'Out', 'ABC'] Actual label: Win | Predicted label: Win
['Home', 'In', 'NBC'] Actual label: Win | Predicted label: Win
['Home', 'Out', 'NBC'] Actual label: Win | Predicted label: Win
['Home', 'Out', 'NBC'] Actual label: Win | Predicted label: Win
['Away', 'In', 'ABC'] Actual label: Lose | Predicted label: Lose
['Home', 'Out', 'NBC'] Actual label: Win | Predicted label: Win
['Away', 'In', 'ABC'] Actual label: Lose | Predicted label: Lose


### 1.11. Define a function to calcualte Accuracy, Precision, Recall, and F1 score based on the Confusion Matrix

In [104]:
def GetMetrics(TP, FP, TN, FN):
    print("Confusion matrix:")
    print("\t", "-"*15)
    print("\t", "| TP", TP, "| FN", FN, "|")
    print("\t", "| FP", FP, "| TN", TN, "|")
    print("\t", "-"*15)

    accuracy = 1.0*(TP+TN) / nTestInstances
    ### TODO 7: ###
    ### precision = ________________ ###
    precision = TP/(TP+FP)

    ### recall = ________________ ###
    recall = TP/(TP+FN)

    ### f1 = ________________ ###
    f1 = (2*TP)/(2*TP+FP+FN)

    print("Accuracy:", np.round(accuracy, 4))
    print("Precision:", np.round(precision, 4))
    print("Recall:", np.round(recall, 4))
    print("F1:", np.round(f1, 4))

### 1.12. Calculate Accuracy, Precision, Recall, and F1 score

In [105]:
GetMetrics(TP, FP, TN, FN)

Confusion matrix:
	 ---------------
	 | TP 8 | FN 1 |
	 | FP 1 | TN 2 |
	 ---------------
Accuracy: 0.8333
Precision: 0.8889
Recall: 0.8889
F1: 0.8889


In [72]:
print(testInstances)
print(featureNames)
print(trainingInstances)

[[['Home', 'Out', 'NBC'], 'Win'], [['Home', 'In', 'NBC'], 'Lose'], [['Away', 'Out', 'ESPN'], 'Win'], [['Away', 'Out', 'FOX'], 'Win'], [['Home', 'Out', 'NBC'], 'Win'], [['Away', 'Out', 'ABC'], 'Win'], [['Home', 'In', 'NBC'], 'Win'], [['Home', 'Out', 'NBC'], 'Win'], [['Home', 'Out', 'NBC'], 'Win'], [['Away', 'In', 'ABC'], 'Lose'], [['Home', 'Out', 'NBC'], 'Win'], [['Away', 'In', 'ABC'], 'Lose']]
Index(['Is_Home_or_Away', 'Is_Opponent_in_AP25_Preseason', 'Media'], dtype='object')
[[['Home', 'Out', 'NBC'], 'Win'], [['Away', 'Out', 'ABC'], 'Win'], [['Home', 'In', 'NBC'], 'Win'], [['Home', 'Out', 'NBC'], 'Win'], [['Away', 'In', 'ABC'], 'Lose'], [['Home', 'Out', 'NBC'], 'Win'], [['Home', 'In', 'NBC'], 'Win'], [['Away', 'Out', 'ABC'], 'Win'], [['Away', 'Out', 'ABC'], 'Win'], [['Home', 'Out', 'NBC'], 'Win'], [['Away', 'Out', 'NBC'], 'Win'], [['Away', 'In', 'FOX'], 'Lose'], [['Away', 'Out', 'ABC'], 'Lose'], [['Home', 'Out', 'NBC'], 'Win'], [['Home', 'Out', 'NBC'], 'Lose'], [['Home', 'Out', 'NBC'

## 2. C4.5 model using Gain Ratio

### 2.1. Construct a C4.5 tree from the root node

In [106]:
tree = [set(range(nTrainingInstances)), set(range(nFeatures)), 0, {}, None, None, None]
print("Level", 0)
ConstructDecisionTree(tree, "C4.5")

Level 0
 Decision: Win <-- [('Win', 14), ('Lose', 10)]
  Level 1 => Media : NBC ( criterion: 0.1023 )
  Decision: Win <-- [('Win', 10), ('Lose', 4)]
   Level 2 => Is_Home_or_Away : Home ( criterion: 0.0407 )
   Decision: Win <-- [('Win', 9), ('Lose', 4)]
    Level 3 => Is_Opponent_in_AP25_Preseason : Out ( criterion: 0.0305 )
    Decision: Win <-- [('Win', 7), ('Lose', 2)]
    Level 3 => Is_Opponent_in_AP25_Preseason : In ( criterion: 0.0305 )
    Decision: Win <-- [('Win', 2), ('Lose', 2)]
   Level 2 => Is_Home_or_Away : Away ( criterion: 0.0407 )
   Decision: Win <-- [('Win', 1)]
  Level 1 => Media : ABC ( criterion: 0.1023 )
  Decision: Lose <-- [('Lose', 4), ('Win', 3)]
   Level 2 => Is_Opponent_in_AP25_Preseason : Out ( criterion: 0.3004 )
   Decision: Win <-- [('Win', 3), ('Lose', 2)]
   Level 2 => Is_Opponent_in_AP25_Preseason : In ( criterion: 0.3004 )
   Decision: Lose <-- [('Lose', 2)]
  Level 1 => Media : FOX ( criterion: 0.1023 )
  Decision: Lose <-- [('Lose', 1)]
  Level 1

### 2.2. Traverse the C4.5 tree from the root node

In [107]:
Traverse(tree)

Level 0 ( Win )
 Media :
 ABC
	 Level 1 ( Lose )
	 Is_Opponent_in_AP25_Preseason :
	 In
		 Level 2 ( Lose )
	 Out
		 Level 2 ( Win )
 CBS
	 Level 1 ( Lose )
 ESPN
	 Level 1 ( Win )
 FOX
	 Level 1 ( Lose )
 NBC
	 Level 1 ( Win )
	 Is_Home_or_Away :
	 Away
		 Level 2 ( Win )
	 Home
		 Level 2 ( Win )
		 Is_Opponent_in_AP25_Preseason :
		 In
			 Level 3 ( Win )
		 Out
			 Level 3 ( Win )


### 2.3. Make prediction with C4.5 and find the Confusion Matrix

In [108]:
TP, FP, TN, FN = GetConfusionMatrix(testInstances)

['Home', 'Out', 'NBC'] Actual label: Win | Predicted label: Win
['Home', 'In', 'NBC'] Actual label: Lose | Predicted label: Win
['Away', 'Out', 'ESPN'] Actual label: Win | Predicted label: Win
['Away', 'Out', 'FOX'] Actual label: Win | Predicted label: Lose
['Home', 'Out', 'NBC'] Actual label: Win | Predicted label: Win
['Away', 'Out', 'ABC'] Actual label: Win | Predicted label: Win
['Home', 'In', 'NBC'] Actual label: Win | Predicted label: Win
['Home', 'Out', 'NBC'] Actual label: Win | Predicted label: Win
['Home', 'Out', 'NBC'] Actual label: Win | Predicted label: Win
['Away', 'In', 'ABC'] Actual label: Lose | Predicted label: Lose
['Home', 'Out', 'NBC'] Actual label: Win | Predicted label: Win
['Away', 'In', 'ABC'] Actual label: Lose | Predicted label: Lose


### 2.4. Calculate Accuracy, Precision, Recall, and F1 score

In [109]:
GetMetrics(TP, FP, TN, FN)

Confusion matrix:
	 ---------------
	 | TP 8 | FN 1 |
	 | FP 1 | TN 2 |
	 ---------------
Accuracy: 0.8333
Precision: 0.8889
Recall: 0.8889
F1: 0.8889


## 3. Naive Bayes with NO zero correction

### 3.1. Calculate frequency distribution of (a) labels and (b) feature values under a particular label, in the training data.

In [162]:
label2freq = {}
label2value2freq_list = [{} for i in range(nFeatures)]

for instanceID in range(nTrainingInstances):
    
    featureValues,label = trainingInstances[instanceID]
    if not label in label2freq:
        label2freq[label] = 0
    label2freq[label] += 1
    
    for featureID in range(nFeatures):
        value = featureValues[featureID]        
        if not label in label2value2freq_list[featureID]:
            label2value2freq_list[featureID][label] = {}
        if not value in label2value2freq_list[featureID][label]:
            label2value2freq_list[featureID][label][value] = 0
        label2value2freq_list[featureID][label][value] += 1
print(label2value2freq_list[featureID][label][value],featureID,label,value)
print(label2value2freq_list)


4 2 Lose ABC
[{'Win': {'Home': 10, 'Away': 4}, 'Lose': {'Away': 5, 'Home': 5}}, {'Win': {'Out': 12, 'In': 2}, 'Lose': {'In': 5, 'Out': 5}}, {'Win': {'NBC': 10, 'ABC': 3, 'ESPN': 1}, 'Lose': {'ABC': 4, 'FOX': 1, 'NBC': 4, 'CBS': 1}}]


### 3.2. Calculate P(H): Prior probability

In [180]:
label2prob = {}
total = (label2freq['Lose']+label2freq['Win'])
for label, freq in label2freq.items():
    ### TODO 8: label2prob[label] = ________________ ###
    label2prob[label] = (label2freq[label]/total)
print(label2prob[label])



0.4166666666666667


### 3.3. Calculate P(X|H): Likelihood

In [181]:
label2value2prob_list = [{} for i in range(nFeatures)]

for featureID in range(nFeatures):
    for label in label2prob:

             
        label2value2prob_list[featureID][label] = {}

        ### TODO 9: for each value, label2value2prob_list[featureID][label][value] = ________________ ###

        each_featvalues = list(label2value2freq_list[featureID][label].keys())
        for value in each_featvalues:
            label2value2prob_list[featureID][label][value] = label2value2freq_list[featureID][label][value] / label2freq[label]            

### 3.4. Define a function to predict the label based on features by performing Bayesian Inference with training instances

In [182]:
def PredictBayes(trainingInstances, featureValues):
    
    label2posteriori = {}
    
    for label, prob in label2prob.items():
        
        posteriori = prob # Prior probability P(H)
        
        for featureID in range(nFeatures):
            
            if not label in label2value2prob_list[featureID]:
                posteriori = 0
                break
                
            value = featureValues[featureID]
            
            if not value in label2value2prob_list[featureID][label]:
                posteriori = 0
                break
                
            prob_likelihood = label2value2prob_list[featureID][label][value]
            
            ### TODO 10: posteriori = ________________ ###
            posteriori = prob_likelihood * label2prob[label]

        label2posteriori[label] = posteriori
        
    label_posteriori = sorted(label2posteriori.items(), key=lambda x:-x[1])
    
    return label_posteriori[0][0]

### 3.5. Define a function to make prediction for each test instance in the set and calculate the Confusion Matrix

In [183]:
def GetConfusionMatrixBayes(testInstances):
    
    TP, FP, TN, FN = 0, 0, 0, 0

    for featureValues, truthLabel in testInstances:

        predictLabel = PredictBayes(trainingInstances, featureValues)

        print(featureValues, "Actual label:", truthLabel, "| Predicted label:", predictLabel)

        ### TODO (the same as TODO 6): ###
        ### if ________________: TP += 1 ###
        if truthLabel == 'Win' and predictLabel == 'Win': TP+=1

        ### if ________________: FP += 1 ###
        if truthLabel == 'Lose' and predictLabel == 'Win': FP+=1

        ### if ________________: FN += 1 ###
        if truthLabel == 'Win' and predictLabel == 'Lose': FN+=1

        ### if ________________: TN += 1 ###
        if truthLabel == 'Lose' and predictLabel == 'Lose': TN+=1

    return TP, FP, TN, FN

### 3.6. Make prediction using Naive Bayes and find the Confusion Matrix

In [184]:
TP, FP, TN, FN = GetConfusionMatrixBayes(testInstances)

['Home', 'Out', 'NBC'] Actual label: Win | Predicted label: Win
['Home', 'In', 'NBC'] Actual label: Lose | Predicted label: Win
['Away', 'Out', 'ESPN'] Actual label: Win | Predicted label: Win
['Away', 'Out', 'FOX'] Actual label: Win | Predicted label: Lose
['Home', 'Out', 'NBC'] Actual label: Win | Predicted label: Win
['Away', 'Out', 'ABC'] Actual label: Win | Predicted label: Lose
['Home', 'In', 'NBC'] Actual label: Win | Predicted label: Win
['Home', 'Out', 'NBC'] Actual label: Win | Predicted label: Win
['Home', 'Out', 'NBC'] Actual label: Win | Predicted label: Win
['Away', 'In', 'ABC'] Actual label: Lose | Predicted label: Lose
['Home', 'Out', 'NBC'] Actual label: Win | Predicted label: Win
['Away', 'In', 'ABC'] Actual label: Lose | Predicted label: Lose


### 3.7. Calculate Accuracy, Precision, Recall, and F1 score

In [185]:
GetMetrics(TP, FP, TN, FN)

Confusion matrix:
	 ---------------
	 | TP 7 | FN 2 |
	 | FP 1 | TN 2 |
	 ---------------
Accuracy: 0.75
Precision: 0.875
Recall: 0.7778
F1: 0.8235
