In [None]:
import numpy as np
import pandas as pd
import matplotlib.pylab as plt
import seaborn as sns
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer

# Integer codes for the two article categories.
CATEGORY_CODES = {'tech': 0, 'entertainment': 1}


def load_split(csv_path):
    """Read one CSV split indexed by ``ArticleId`` and encode its labels.

    The ``Category`` column is replaced by its integer code from
    ``CATEGORY_CODES``. The mapped column is assigned back to the frame instead
    of calling ``replace(..., inplace=True)`` on ``frame['Category']``: that
    chained in-place form operates on a column selection, which pandas warns
    about and which stops updating the parent frame under Copy-on-Write.

    Categories that are not keys of ``CATEGORY_CODES`` become NaN.
    """
    frame = pd.read_csv(csv_path, index_col='ArticleId')
    frame['Category'] = frame['Category'].map(CATEGORY_CODES)
    return frame


# Training split: article text and encoded labels.
df = load_split('A3/train.csv')
X = df['Text']
Y = df['Category']

# Test split: same layout as the training split.
df_test = load_split('A3/test.csv')
X_test = df_test['Text']
Y_test = df_test['Category']

# All article text, training rows first and test rows after them.
temp_complete_test = pd.concat([X, X_test])


In [None]:
# Bag-of-words term counts for every article (training rows first, then test).
# NOTE(review): the vocabulary is learned from train + test text together, so
# words that occur only in test articles also get a column. Confirm this is
# intended for the task.
vectorizer = CountVectorizer()
vector = vectorizer.fit_transform(temp_complete_test)

# One row per article, one column per vocabulary term. get_feature_names_out()
# returns the terms in the same order as the columns of the count matrix.
prep_X_count = pd.DataFrame(
    vector.toarray(),
    columns=vectorizer.get_feature_names_out(),
    index=temp_complete_test.index,
)

# temp_complete_test is pd.concat([X, X_test]), so the first len(X) rows are the
# training articles. Deriving the split point from X avoids a hard-coded row
# count that silently breaks when the dataset size changes.
n_train = len(X)
prep_X_train_count = prep_X_count.iloc[:n_train]
prep_X_test_count = prep_X_count.iloc[n_train:]

print(prep_X_train_count)

# Total occurrences of each term per split, most frequent first.
train_sum = prep_X_train_count.sum().sort_values(ascending=False)
test_sum = prep_X_test_count.sum().sort_values(ascending=False)

# Bar chart of the 50 most frequent terms in the training split.
train_top_50 = train_sum.head(50)
# NOTE(review): sns.set() re-applies seaborn's default theme, so the
# 'whitegrid' style chosen on the previous line is overridden. Pass
# style='whitegrid' to sns.set() if the white grid is actually wanted.
sns.set_style('whitegrid')
sns.set(rc={'figure.figsize': (20, 8)})
plot = sns.barplot(x=train_top_50.index, y=train_top_50.values)
plot.set_xlabel('Top 50 words')
plot.set_ylabel('Word count')
plot.set_title('Term frequency in training set')
plt.xticks(rotation=90)
plt.show()

In [None]:
# Training article ids split by class label (0 = tech, 1 = entertainment).
train_label_group = Y.groupby(Y)
train_tech = train_label_group.get_group(0).index
train_entertain = train_label_group.get_group(1).index

# Look the ids up in the training rows only. prep_X_count also holds the test
# articles (appended after the training rows), so a label-based lookup on the
# full frame would also pull in any test article that shares an ArticleId.
train_counts = prep_X_count.iloc[:len(Y)]

# Total occurrences of each term per class, most frequent first.
train_tech_sum = train_counts.loc[train_tech].sum().sort_values(ascending=False)
train_entertain_sum = train_counts.loc[train_entertain].sum().sort_values(ascending=False)

train_top_50_tech = train_tech_sum.head(50)
train_top_50_entertain = train_entertain_sum.head(50)

# Side-by-side bar charts, one panel per class.
fig, ax = plt.subplots(ncols=2, figsize=(25, 8))
panels = [
    (train_top_50_tech, 'Term frequency in tech category'),
    (train_top_50_entertain, 'Term frequency in entertainment category'),
]
for axis, (top_words, title) in zip(ax, panels):
    sns.barplot(x=top_words.index, y=top_words.values, ax=axis)
    axis.set_xlabel('Top 50 words')
    axis.set_ylabel('Word count')
    axis.set_title(title)
    # Rotate only the word labels; without axis='x' the count labels on the
    # y axis would be rotated as well.
    axis.tick_params(axis='x', labelrotation=90)
plt.show()

In [None]:
# Bar chart of how many training articles fall into each class.
sns.set(rc={'figure.figsize': (10, 5)})

class_names = ['Tech', 'Entertainment']
class_sizes = [train_tech.size, train_entertain.size]

plot = sns.barplot(x=class_names, y=class_sizes)
plot.set(
    xlabel='Articles',
    ylabel='Article count',
    title='Class distribution',
)
plt.show()

In [None]:
# TF-IDF features learned from, and computed for, the training text only.
vectorizer = TfidfVectorizer()
vector = vectorizer.fit_transform(X)

# Report the vocabulary size instead of printing the whole term -> index
# dictionary, which floods the cell output with one entry per distinct term.
print(f'vector vocabulary size - {len(vectorizer.vocabulary_)}\n')

# One row per training article, one column per vocabulary term.
# get_feature_names_out() returns the terms in matrix column order.
prep_X = pd.DataFrame(
    vector.toarray(),
    columns=vectorizer.get_feature_names_out(),
    index=X.index,
)
print(prep_X.shape)


In [None]:
# TF-IDF features for every article (training rows first, then test rows).
# NOTE(review): the vectorizer is fitted on train + test text together, so the
# test articles influence both the vocabulary and the IDF weights used for the
# training features. Confirm this is acceptable for the task; the usual
# alternative is to fit on X only and transform X_test with the fitted object.
vectorizer = TfidfVectorizer()
vector = vectorizer.fit_transform(temp_complete_test)

# One row per article, one column per vocabulary term.
# get_feature_names_out() returns the terms in matrix column order.
temp_complete_test_prep = pd.DataFrame(
    vector.toarray(),
    columns=vectorizer.get_feature_names_out(),
    index=temp_complete_test.index,
)
print(temp_complete_test_prep.head())

# temp_complete_test is pd.concat([X, X_test]), so the first len(X) rows are the
# training articles. Deriving the split point from X avoids a hard-coded row
# count that silently breaks when the dataset size changes.
n_train = len(X)
prep_X_train = temp_complete_test_prep.iloc[:n_train]
prep_X_test = temp_complete_test_prep.iloc[n_train:]

# Task 2(a)

## Naive Bayes

In [None]:
from sklearn.naive_bayes import MultinomialNB
import math

# Multinomial Naive Bayes on the raw term counts of the training articles.
# alpha=1.0 is Laplace smoothing; fit_prior=True learns the class priors from
# the data because the two classes are not perfectly balanced.
nb_classifier = MultinomialNB(alpha=1.0, fit_prior=True)
nb_classifier.fit(prep_X_train_count, Y)

# Vocabulary terms in the same order as the feature columns.
vocabulary_terms = np.array(prep_X_train_count.columns)

# Per-class log-likelihood of each term, i.e. log P(x_i | y).
class_log_likelihoods = nb_classifier.feature_log_prob_
log_like_class0 = class_log_likelihoods[0]
log_like_class1 = class_log_likelihoods[1]

# Term indices ordered from most to least likely within each class.
# Sorting the negated values ascending yields a descending order.
max_index_class0 = np.argsort(-log_like_class0)
max_index_class1 = np.argsort(-log_like_class1)

# The 20 most likely terms for class 0 and for class 1.
top20_class0 = vocabulary_terms[max_index_class0[:20]]
top20_class1 = vocabulary_terms[max_index_class1[:20]]

print(top20_class0)
print(top20_class1)

# Likelihood ratios between the two classes:
#   ratio_class0 = P(x_i | y = 0) / P(x_i | y = 1)
#   ratio_class1 = P(x_i | y = 1) / P(x_i | y = 0)
likelihood_class0 = np.exp(log_like_class0)
likelihood_class1 = np.exp(log_like_class1)
ratio_class0 = likelihood_class0 / likelihood_class1
ratio_class1 = likelihood_class1 / likelihood_class0

# Term indices ordered from largest to smallest ratio (same negation trick).
max_index_ratio_class0 = np.argsort(-ratio_class0)
max_index_ratio_class1 = np.argsort(-ratio_class1)

# The 20 terms with the largest ratio in favour of each class.
top20_ratio_class0 = vocabulary_terms[max_index_ratio_class0[:20]]
top20_ratio_class1 = vocabulary_terms[max_index_ratio_class1[:20]]

print(top20_ratio_class0)
print(top20_ratio_class1)


##########################################################################
#                              TO BE DELETED                             #
##########################################################################
# Test code (Get parameters of the classifier)
#nb_classifier.get_params(deep=True)

# Test code (Prediction)
#print(nb_classifier.predict(prep_X_test_count))

# Get 
#nb_classifier.score(prep_X_test_count, Y_test)

# [instance 1:[probability of 0  probability of 1]]
# [instance 2:[probability of 0 probability of 1]]
#print(nb_classifier.predict_proba(prep_X_test_count))
##########################################################################
#                              TO BE DELETED                             #
##########################################################################

## (i)
The top-20 most identifiable words, i.e. the words that are most likely to occur in articles of class ```tech``` and ```entertainment```, are:
|Rank|tech| |Rank| entertainment |
|:---:|:---:|---|:---:|:---:|
|1|said||1|said|
|2|people||2|film|
|3|new||3|best|
|4|mobile||4|year|
|5|mr||5|music|
|6|one||6|also|
|7|also||7|us|
|8|would||8|new|
|9|could||9|one|
|10|technology||10|show|
|11|use||11|first|
|12|users||12|awards|
|13|net||13|tv|
|14|software||14|last|
|15|games||15|uk|
|16|us||16|actor|
|17|music||17|number|
|18|many||18|band|
|19|year||19|mr|
|20|phone||20|star|

Note that in the code, we can compare the likelihoods on the log scale because the logarithm is monotonically increasing, so it preserves their ordering.

## (ii)
The top-20 words that maximise the following quantity

$$
\frac{\mathbb{P}(X_w = 1 \mid Y = y)}{\mathbb{P}(X_w = 1 \mid Y \neq y)}
$$

are:

|Rank|Y = tech| |Rank|Y = entertainment|
|:---:|:---:|---|:---:|:---:|
|1|users||1|actress|
|2|software||2|singer|
|3|microsoft||3|oscar|
|4|mobile||4|stars|
|5|broadband||5|aviator|
|6|virus||6|band|
|7|firms||7|nominated|
|8|pc||8|rock|
|9|spam||9|festival|
|10|phones||10|album|
|11|gadget||11|nominations|
|12|net||12|charles|
|13|consumer||13|chart|
|14|mobiles||14|foxx|
|15|gadgets||15|oscars|
|16|machines||16|starring|
|17|windows||17|singles|
|18|technologies||18|jamie|
|19|systems||19|swank|
|20|pcs||20|comedy|

The lists in (ii), the top-20 words that maximise the likelihood ratio, describe the two classes better than the lists in (i), the top-20 words that are most likely to occur in each class. This is because (i) only calculates the likelihood of a word within one class and does not compare it with the likelihood in the other class, so we do not know whether the word is more likely to belong to `tech` or to `entertainment`. For example, "said" is ranked first in both lists of (i). In contrast, (ii) calculates the relative likelihood: if the ratio is greater than 1, the word is more likely to occur in the numerator class than in the denominator class. Therefore, the lists in (ii) better describe each class.