In [1]:
import pandas as pd
import numpy as np

# Read the training table from the Excel workbook
df = pd.read_excel(io="data/data.xlsx")

# Show the loaded frame as this cell's output
df

Unnamed: 0,age,income,student,credit_rating,buys_computer
0,< = 30,high,no,fair,no
1,< = 30,high,no,excellent,no
2,31 ... 40,high,no,fair,yes
3,> 40,medium,no,fair,yes
4,> 40,low,yes,fair,yes
5,> 40,low,yes,excellent,no
6,31 ... 40,low,yes,excellent,yes
7,< = 30,medium,no,fair,no
8,< = 30,low,yes,fair,yes
9,> 40,medium,yes,fair,yes


In [2]:
# A1
# Prior probability of each class

labels = df['buys_computer']
print(f"There are {labels.nunique()} classes for this problem.")

# Values that mark the positive and the negative class
pos = "yes"
neg = "no"

# Size of the data set
total = len(df)

# How many rows fall into each class
num_NO = int((labels == neg).sum())
num_YES = int((labels == pos).sum())

# Prior = class count / total count
print(f"Probability of P({pos}) is: {num_YES / total}")
print(f"Probability of P({neg}) is: {num_NO / total}")

There are 2 classes for this problem.
Probability of P(yes) is: 0.6428571428571429
Probability of P(no) is: 0.35714285714285715


In [3]:
# A2
# Calculate class conditional density for various features and classes

# Class conditional density table for a given attribute
def class_conditional_density_table(df, feature, target="buys_computer",
                                    pos_label="yes", neg_label="no"):
    """Tabulate P(feature value | class) for the two target classes.

    Parameters
    ----------
    df : pandas.DataFrame
        Data containing both the `feature` column and the `target` column.
    feature : str
        Name of the column whose values are conditioned on the class.
    target : str, default "buys_computer"
        Name of the class-label column.
    pos_label, neg_label : default "yes" / "no"
        Values of `target` that identify the positive and negative class.
        They are passed explicitly so the function does not depend on
        variables defined in another cell, and so the labels shown in the
        table are always the ones actually used for filtering.

    Returns
    -------
    pandas.DataFrame
        One row per (class, feature value) pair: all negative-class rows
        first, then the positive-class rows, each group listing the feature
        values in order of first appearance in `df`. Columns are
        "<feature> condition class/label", "Feature class count",
        "Target class label", "Target class count" and
        "Conditional Probability".

    Notes
    -----
    If a class has no rows in `df`, its conditional probabilities are
    reported as NaN instead of raising ZeroDivisionError.
    """
    # Distinct values of the feature, in order of first appearance
    values = list(df[feature].unique())

    columns = [feature + " condition class/label",
               "Feature class count",
               "Target class label",
               "Target class count",
               "Conditional Probability"]

    rows = []
    # Negative class first, then positive, to keep the table layout stable
    for label in (neg_label, pos_label):
        # Feature values of the rows belonging to this class
        in_class = df.loc[df[target] == label, feature]
        class_total = len(in_class)
        for value in values:
            hits = int((in_class == value).sum())
            # A class with no rows has no defined conditional probability
            prob = hits / class_total if class_total else float("nan")
            rows.append((value, hits, label, class_total, prob))

    # Passing `columns` keeps the header even when there are no rows
    return pd.DataFrame(rows, columns=columns)

In [4]:
# Class-conditional probabilities of each `age` value
class_conditional_density_table(df, feature="age")

Unnamed: 0,age condition class/label,Feature class count,Target class label,Target class count,Conditional Probability
0,< = 30,3,no,5,0.6
1,31 ... 40,0,no,5,0.0
2,> 40,2,no,5,0.4
3,< = 30,2,yes,9,0.222222
4,31 ... 40,4,yes,9,0.444444
5,> 40,3,yes,9,0.333333


In [5]:
# Class-conditional probabilities of each `income` value
class_conditional_density_table(df, feature="income")

Unnamed: 0,income condition class/label,Feature class count,Target class label,Target class count,Conditional Probability
0,high,2,no,5,0.4
1,medium,2,no,5,0.4
2,low,1,no,5,0.2
3,high,2,yes,9,0.222222
4,medium,4,yes,9,0.444444
5,low,3,yes,9,0.333333


In [6]:
# Class-conditional probabilities of each `student` value
class_conditional_density_table(df, feature="student")

Unnamed: 0,student condition class/label,Feature class count,Target class label,Target class count,Conditional Probability
0,no,4,no,5,0.8
1,yes,1,no,5,0.2
2,no,3,yes,9,0.333333
3,yes,6,yes,9,0.666667


In [7]:
# Class-conditional probabilities of each `credit_rating` value
class_conditional_density_table(df, feature="credit_rating")

Unnamed: 0,credit_rating condition class/label,Feature class count,Target class label,Target class count,Conditional Probability
0,fair,2,no,5,0.4
1,excellent,3,no,5,0.6
2,fair,6,yes,9,0.666667
3,excellent,3,yes,9,0.333333


In [8]:
# A3
# Test for independence between the 4 given features

# We use chi-square test for checking correlation
# https://medium.com/@ritesh.110587/correlation-between-categorical-variables-63f6bd9bf2f7
# Importing required libraries
import os as os
from itertools import product
import scipy.stats as ss

# Keep only the first four columns of the frame (the feature columns)
df1 = df[df.columns[:4]]

# The same four feature names on both sides of the pairing
var1 = ('age', 'income', 'student', 'credit_rating')
var2 = ('age', 'income', 'student', 'credit_rating')

# Every ordered (var1, var2) pair, self-pairs included
var_prod = list(product(var1, var2, repeat = 1))

# One (var1, var2, p-value) record per pair of distinct features;
# element [1] of the chi2_contingency result is the p-value
result = [
    (left, right, ss.chi2_contingency(pd.crosstab(df1[left], df1[right]))[1])
    for left, right in var_prod
    if left != right
]

# Note: the 'coeff' column holds the p-value of each test
chi_test_output = pd.DataFrame(result, columns = ['var1', 'var2', 'coeff'])

# Reshape the long list of records into a feature-by-feature matrix
chi_test_output.pivot(index='var1', columns='var2', values='coeff')
# A p-value <= 0.05 is read as the two variables being related

var2,age,credit_rating,income,student
var1,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
age,,0.943335,0.504981,0.818731
credit_rating,0.943335,,0.694486,1.0
income,0.504981,0.694486,,0.058816
student,0.818731,1.0,0.058816,


In [9]:
# A4
# Make a Naive-Bayes classifier

from sklearn.naive_bayes import GaussianNB

# One-hot encode the categorical feature columns with get_dummies
dftemp = pd.get_dummies(data=df1)
# Class labels the model has to predict
target = df.loc[:, 'buys_computer']

# Gaussian Naive-Bayes estimator
model = GaussianNB()

# Fit on the encoded features; the fitted estimator is the cell output
model.fit(X=dftemp, y=target)


In [10]:
# Accuracy on the last three rows of the data set. These rows were also
# part of the data the model was fitted on, so this is not a held-out score.
model.score(dftemp.iloc[-3:], target.iloc[-3:])

1.0

In [11]:
# A5
# Text classification dataset
# MultinomialNB is better for classification with discrete features 
# (e.g. word counts for text classification)

from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB

# Fixed seed so the undersampling and the train/test split (and therefore
# the reported accuracy) are the same on every run
RANDOM_STATE = 42

# NOTE: this rebinds `df`; the frame loaded at the top of the notebook is
# no longer reachable under that name after this cell has run
df = pd.read_csv("data/lab3.csv")

# Undersample ham down to the number of spam messages so both classes are
# the same size (assumes there are at least as many ham rows as spam rows)
ham = df[df['Category'] == 'ham']
spam = df[df['Category'] == 'spam']
ham = ham.sample(spam.shape[0], random_state=RANDOM_STATE)

# Balanced data set with a fresh 0..n-1 index
dataset = pd.concat([ham, spam], ignore_index=True)

# 70/30 split; stratifying keeps both classes equally represented in the
# training and the test part. The raw text gets its own names so that
# X_train / X_test only ever refer to the numeric matrices below.
msg_train, msg_test, y_train, y_test = train_test_split(
    dataset['Message'], dataset['Category'],
    test_size=0.3, random_state=RANDOM_STATE, stratify=dataset['Category'])

# Convert to numeric vector format for model
# Integer feature counts work better than fractional counts like tf-idf
from sklearn.feature_extraction.text import CountVectorizer
ctvector = CountVectorizer()
# The vocabulary is learned from the training messages only and then
# reused unchanged for the test messages
X_train = ctvector.fit_transform(msg_train).toarray()
X_test = ctvector.transform(msg_test).toarray()

# MultinomialNB
multinb = MultinomialNB()

# Fit on the word counts; the fitted estimator is the cell output
multinb.fit(X_train, y_train)

In [12]:
# Mean accuracy of the fitted MultinomialNB on the test split
multinb.score(X=X_test, y=y_test)

0.9732739420935412