# Module loading

In [None]:
import numpy as np
import pandas as pd
import random
import matplotlib.pyplot as plt
import matplotlib
%matplotlib inline
from IPython.core.pylabtools import figsize
import re
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from nltk.stem import WordNetLemmatizer
from nltk import pos_tag, word_tokenize
from nltk.corpus import wordnet
import nltk
import string
from string import punctuation
from sklearn.decomposition import TruncatedSVD, NMF
from sklearn.utils.extmath import randomized_svd
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('averaged_perceptron_tagger')
nltk.download('omw-1.4')
from sklearn.metrics import auc, roc_curve, plot_roc_curve, plot_confusion_matrix, accuracy_score, recall_score, precision_score, f1_score
from sklearn import datasets, metrics, model_selection, svm
from sklearn.model_selection import GridSearchCV 
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from nltk.stem import PorterStemmer
from sklearn.pipeline import Pipeline
from tempfile import mkdtemp
from joblib import Memory
from sklearn.multiclass import OneVsRestClassifier, OneVsOneClassifier

In [None]:
# Locate data.csv in the notebook's working directory with pathlib instead of the
# `!pwd` shell magic, which only works under IPython on POSIX systems.
from pathlib import Path

pwd = [str(Path.cwd())]  # kept as a one-element list so `pwd[0]` still works
data_path = str(Path.cwd() / 'data.csv')
df = pd.read_csv(data_path)
df.head()

# Question 1

In [None]:
# Show (number of rows, number of columns) of the loaded frame.
df.shape

### How many rows (samples) and columns (features) are present in the dataset?
- There are 3150 rows and 8 features presented in the dataset.

In [None]:
# Word count per document: split each text on whitespace, then count the pieces.
tokens_per_doc = df['full_text'].str.split()
df['totalwords'] = tokens_per_doc.str.len()
df['totalwords'].head()

### The total number of alpha-numeric characters per data point (row) in the feature full text

In [None]:
# (a) The total number of alpha-numeric characters per data point (row) in the feature full text
# Count the characters directly: `totalwords` holds whitespace-separated word counts,
# which is not the quantity named in this plot's title.
figsize(9,5)
alnum_counts = df['full_text'].str.count(r'[A-Za-z0-9]').dropna()
plt.hist(alnum_counts, bins=50, color='blue', edgecolor = 'black')
plt.ylabel('number of documents')
plt.xlabel('alpha-numeric characters per document')
plt.title('The total number of alpha-numeric characters per data point (row) in the feature full text')
plt.show()

### The column leaf label – class on the x-axis

In [None]:
#(b) The column leaf label – class on the x-axis
nparr=df['leaf_label'].to_numpy()
values, counts = np.unique(nparr, return_counts=True)
figsize(12,5)

plt.bar(values,counts, color='blue', edgecolor = 'black')
# The x-axis carries the class names and the y-axis the number of rows per class.
plt.ylabel('number of samples')
plt.xlabel('leaf_label')
plt.xticks(rotation=45, ha='right')
plt.title('With column leaf label on the x-axis')
plt.show()

### The column root label – class on the x-axis

In [None]:
#(c) The column root label – class on the x-axis.
#• Interpret Plots: Provide qualitative interpretations of the histograms.
nparr=df['root_label'].to_numpy()
values, counts = np.unique(nparr, return_counts=True)
figsize(12,5)

plt.bar(values,counts, color='blue', edgecolor = 'black')
# The x-axis carries the class names and the y-axis the number of rows per class.
plt.ylabel('number of samples')
plt.xlabel('root_label')
plt.title('With column root label on the x-axis')
plt.show()

### Qualitative interpretations:
- From the above graphs, we conclude that we have an equal number of samples (rows) for each leaf label.
- On the other hand, there's a slight difference between the number of samples for the columns of root label, which stems from the root label "sports" having one extra leaf label.
- Another observation is that document length varies widely between samples, with some outliers at over 7000 words.

# Question 2


In [None]:
# Fix the seed so the split, and every number derived from it, is reproducible.
train, test = train_test_split(df[["full_text","root_label"]], test_size=0.2, random_state=42)
# Peek at the training text as it is at this point (the cleaning function is defined in the next cell).
train['full_text'].head()

In [None]:
np.random.seed(42)
random.seed(42)
#define cleaning function
def clean(text):
    """Return a cleaned copy of a raw document string.

    Steps, in order: drop lines that start with an http(s) URL, turn `<br />` into a
    space, decode the `&quot;`, `&#39;` and `&amp;` entities, replace newlines and
    carriage returns with spaces, expand a standalone " u " to " you ", remove
    backticks, squeeze repeated `!` / `?`, drop non-ASCII characters, strip remaining
    HTML tags and finally collapse runs of spaces.

    Parameters
    ----------
    text : str
        Raw document text.

    Returns
    -------
    str
        The cleaned text (possibly empty).
    """
    text = re.sub(r'^https?:\/\/.*[\r\n]*', '', text, flags=re.MULTILINE)
    text = re.sub(r"<br />", " ", text)
    text = re.sub(r"&quot;", "\"", text)
    # &#39; is the HTML entity for an apostrophe, not a double quote
    text = re.sub(r"&#39;", "'", text)
    text = re.sub(r"\n", " ", text)
    text = re.sub(r" u ", " you ", text)
    text = re.sub(r"`", "", text)
    text = re.sub(r"(!)\1+", r"!", text)
    text = re.sub(r"(\?)\1+", r"?", text)
    text = re.sub(r"&amp;", "and", text)
    text = re.sub(r"\r", " ", text)
    text = text.encode('ascii', 'ignore').decode('ascii')
    text = re.sub(r"<.*?>", "", text)
    # Collapse spaces last: the substitutions above can leave double spaces behind.
    text = re.sub(r" +", " ", text)
    return text

#clean raw text
def _clean_column(texts):
    """Apply clean() to every string of a Series, then strip digits and punctuation."""
    cleaned = texts.map(clean)
    cleaned = cleaned.str.replace(r'\d+', '', regex=True) # for digits
    cleaned = cleaned.str.replace(r'[^\w\s]', '', regex=True) # for punctuation
    return cleaned

# clean() returns a new string, so its result has to be assigned back. train/test were
# produced from df by the earlier split, so they are cleaned explicitly as well.
df['full_text'] = _clean_column(df['full_text'])
train = train.assign(full_text=_clean_column(train['full_text']))
test = test.assign(full_text=_clean_column(test['full_text']))

print("\n")
print("Training samples size:", train.shape)
print("Testing samples size:", test.shape)

### Report the number of training and testing samples.

A:

The training samples contain 2520 rows and 2 features.

The testing samples contain 630 rows and 2 features.

# Question 3

In [None]:
def penn2morphy(penntag):
    """Convert a Penn Treebank POS tag to the WordNet POS letter.

    Only the first two characters of the tag are looked at: NN* -> 'n', JJ* -> 'a',
    VB* -> 'v', RB* -> 'r'. Anything else, including a value that cannot be sliced or
    hashed, falls back to 'n'.
    """
    morphy_tag = {'NN':'n', 'JJ':'a',
                  'VB':'v', 'RB':'r'}
    try:
        return morphy_tag[penntag[:2]]
    except (KeyError, TypeError):
        # Unknown tag prefix, or not a string-like tag: default to noun. Only these two
        # errors are caught so that unrelated failures are not silently hidden.
        return 'n'
def lemmatize_sent(text):
    """Lemmatize a document and return it as a single space-joined string.

    The text is tokenized and POS-tagged; every token is lower-cased and lemmatized
    by the module-level `wnl` object, using the WordNet POS that penn2morphy derives
    from the token's Penn tag.
    """
    tagged_tokens = pos_tag(nltk.word_tokenize(text))
    lemmas = []
    for token, penn_tag in tagged_tokens:
        lemmas.append(wnl.lemmatize(token.lower(), pos=penn2morphy(penn_tag)))
    return ' '.join(lemmas)


In [None]:
wnl = WordNetLemmatizer()
vec = CountVectorizer(stop_words='english', min_df=3)
tfidf = TfidfTransformer()

# Lemmatize every document of both splits (one joined string per document).
X_train_lemma = [lemmatize_sent(doc) for doc in train['full_text']]
X_test_lemma = [lemmatize_sent(doc) for doc in test['full_text']]

# Vocabulary and IDF weights are learned on the training split only.
X_train_vec = vec.fit_transform(X_train_lemma)
X_train_tfidf = tfidf.fit_transform(X_train_vec)
X_test_vec = vec.transform(X_test_lemma)
X_test_tfidf = tfidf.transform(X_test_vec)

print("X_train shape:", X_train_tfidf.shape)
print("X_test shape:", X_test_tfidf.shape)

### What are the pros and cons of lemmatization versus stemming? How do these processes affect the dictionary size?

A: 
The main advantage of lemmatization is that it takes into consideration the context of the word to determine which is the intended meaning the user is looking for. This process allows to decrease noise and speed up the user’s task. However,because lemmatization involves deriving the meaning of a word from something like a dictionary, it's very time consuming.

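Stemming usually refers to a process of chopping off the last few characters of a word. It operates on a single word without knowledge of the context and is not a well-defined linguistic process, so it often produces stems that are misspelled or that conflate words with different meanings. On the other hand, it is much faster than lemmatization.

Both processes shrink the dictionary, because several inflected forms collapse onto a single entry; stemming usually shrinks it more aggressively, since it also merges words that a lemmatizer would keep apart.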

### min df means minimum document frequency. How does varying min df change the TF-IDF matrix?

A: 

When we increase the min df, it will decrease the column counts of the matrix, because the goal of MIN_DF is to ignore words that have very few occurrences to be considered meaningful. 

### Should I remove stopwords before or after lemmatizing? Should I remove punctuations before or after lemmatizing? Should I remove numbers before or after lemmatizing?

A: 

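We should remove stopwords after lemmatizing, but remove punctuation and numbers before lemmatizing (see the cleaning step above). Stop words are kept until after lemmatizing because in certain cases they do contribute meaning, and if an application is sensitive to such meanings (for example the POS tagging that the lemmatizer relies on), they should not be eliminated beforehand.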

### Report the shape of the TF-IDF-processed train and test matrices. The number of rows should match the results of Question 2. The number of columns should roughly be in the order of $k × 10^3 $.

A: 

The train matrices contains 2520 rows and 14224 columns and the test set contains 630 rows and 14224 columns.

# Question 4

In [None]:
# --- explained variance ratio of the top-k LSI components, for a range of k ---
ks = [1, 10, 50, 100, 200, 500, 1000, 2000]
ratios = []
for k in ks:
  svd_tmp = TruncatedSVD(n_components=k, random_state=42)
  svd_tmp.fit(X_train_tfidf)
  ratio = svd_tmp.explained_variance_ratio_.sum()
  print("explained_variance_ratio when k =", k, ":", ratio)
  ratios.append(ratio)


fig = plt.figure()
ax = fig.add_subplot(1, 1, 1)
ax.plot(ks, ratios)

ax.set_ylabel('explained_variance_ratio')
ax.set_xlabel('k')
plt.title('explained_variance_ratio versus different k')
plt.show()

# --- k = 50 reductions: LSI (truncated SVD) and NMF, both fitted on the training split ---
svd = TruncatedSVD(n_components=50, random_state=42)
nmf = NMF(n_components=50, init='random', random_state=42)

print("\n")
# LSI
X_train_LSI = svd.fit_transform(X_train_tfidf)
X_test_LSI = svd.transform(X_test_tfidf)
# NMF
X_train_NMF = nmf.fit_transform(X_train_tfidf)
X_test_NMF = nmf.transform(X_test_tfidf)

# data shape
print("LSI:")
print('training data shape = ', X_train_LSI.shape)
print('test data shape = ', X_test_LSI.shape)
print("NMF:")
print('training data shape = ', X_train_NMF.shape)
print('test data shape = ', X_test_NMF.shape)


def reconstruction_error(X, X_hat):
  """Return the squared Frobenius norm of X - X_hat (sum of squared residuals)."""
  # np.asarray makes the difference a plain ndarray so that the square is element-wise.
  return np.sum(np.square(np.asarray(X - X_hat)))


# calculate the reconstruction residual MSE error
print("\n")
print("the reconstruction residual MSE error:")
print("When using LSI:")
# The reduced data times svd.components_ maps the 50-d representation back to term space.
# Test rows are reconstructed with the basis fitted on the training split (the same way the
# NMF errors below are computed) instead of with a second SVD fitted on the test split.
print('training error:',reconstruction_error(X_train_tfidf, X_train_LSI.dot(svd.components_)))
print('test error:',reconstruction_error(X_test_tfidf, X_test_LSI.dot(svd.components_)))

print("When using NMF:")
print('training error:',reconstruction_error(X_train_tfidf, X_train_NMF.dot(nmf.components_)))
print('test error:',reconstruction_error(X_test_tfidf, X_test_NMF.dot(nmf.components_)))


### What does the explained variance ratio plot look like? What does the plot’s concavity suggest?

A:

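The plot looks like a concave-down curve: as k increases, the explained variance ratio keeps growing, but more and more slowly. This makes sense, since the ratio measures how much of the variance of the TF-IDF matrix is captured by the top k components. The leading components capture the most variance, so each additional component contributes less (diminishing returns).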

### With k = 50 found in the previous sections, calculate the reconstruction residual MSE error when using LSI and NMF – they both should use the same k = 50. Which one is larger, the $\|X - WH\|_F^2$ in NMF or the $\|X - U_k\Sigma_kV_k^T\|_F^2$ in LSI and why?

A:

As you can see, both the training error and test error in NMF are larger than those in LSI. 

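It makes sense because the truncated SVD used by LSI is the best rank-k approximation of the training matrix in the Frobenius norm (Eckart–Young theorem), whereas NMF additionally constrains W and H to be non-negative and is fitted by an iterative solver that may stop at a local optimum. Therefore, its reconstruction error will be lower than that of NMF, as we expect.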

# Question 5

In [None]:
# Binary targets from the root label: climate -> 0, sports -> 1.
map_root = dict(sports=1, climate=0)
y_train, y_test = (split['root_label'].map(map_root) for split in (train, test))

In [None]:
def print_roc_curve(X_test_LSI, y_test, model, plt_title, colar):
  """Draw the ROC curve of a fitted `model` on the given test data.

  The curve is drawn in colour `colar` on a new figure, together with a dashed red
  diagonal labelled 'Chance', and the figure is titled `plt_title` and shown.
  """
  _, roc_ax = plt.subplots()
  plot_roc_curve(model, X_test_LSI, y_test, ax=roc_ax, color=colar)
  chance_style = dict(linestyle='--', lw=2, color='r', label='Chance', alpha=.5)
  roc_ax.plot([0, 1], [0, 1], **chance_style)
  roc_ax.set_title(plt_title)
  plt.show()

def print_cf_matrix(X_test_LSI, y_test, model, plt_title, display_labels):
  """Plot the confusion matrix of a fitted `model` on the given test data.

  `display_labels` are the class names shown on both axes; the figure is titled
  `plt_title` and shown.
  """
  plot_confusion_matrix(
      model,
      X_test_LSI,
      y_test,
      display_labels=display_labels,
  )
  # the layout is tightened first, then the title is attached
  plt.tight_layout()
  plt.title(plt_title)
  plt.show()

def print_result(model_title, y_test, y_pred):
  """Print accuracy, recall, precision and F1 of binary predictions under a title line."""
  print(model_title, ":")
  leading_metrics = (
      ("Accuracy:", accuracy_score),
      ("Recall:", recall_score),
      ("Precision:", precision_score),
  )
  for label, metric in leading_metrics:
    print(label, metric(y_test, y_pred))
  # the last line carries an extra newline to separate consecutive reports
  print("F1-Score:", f1_score(y_test, y_pred), "\n")

def print_multi_result(model_title, y_test, y_pred):
  """Print accuracy plus weighted-average recall, precision and F1 for multiclass predictions."""
  weighted = dict(average='weighted')
  print(model_title, ":")
  print("Accuracy:", accuracy_score(y_test, y_pred))
  for label, metric in (("Recall:", recall_score), ("Precision:", precision_score)):
    print(label, metric(y_test, y_pred, **weighted))
  # the last line carries an extra newline to separate consecutive reports
  print("F1-Score:", f1_score(y_test, y_pred, **weighted), "\n")

# Class names passed as display_labels to the confusion-matrix plots; listed in the
# order of the integer codes in map_root (climate = 0, sports = 1).
root_labels = ['climate', 'sports']

### Train one SVM with γ = 1000 (hard margin), another with γ = 0.0001 (soft margin)

In [None]:
# build the svm with given gamma
hard_svm = svm.SVC(kernel='linear', C=1000, random_state=42)
soft_svm = svm.SVC(kernel='linear', C=0.0001, random_state=42)
harder_svm = svm.SVC(kernel='linear', C=100000, random_state=42)

# fit the model
hard_svm.fit(X_train_LSI, y_train)
soft_svm.fit(X_train_LSI, y_train)
harder_svm.fit(X_train_LSI, y_train)

# predict the test data
hard_y_pred = hard_svm.predict(X_test_LSI)
soft_y_pred = soft_svm.predict(X_test_LSI)
harder_y_pred = harder_svm.predict(X_test_LSI)

###  Plot the ROC curve, report the confusion matrix and calculate the accuracy, recall, precision and F-1 score of both SVM classifiers on the testing set. Which one performs better? What about for γ = 100000?

A:

As you can find in the following sections, the hard margin one performs better in this dataset. And the one with γ = 100000 performs even better than the hard margin one a little bit.

In [None]:
# One ROC figure per SVM, in the order hard / soft / gamma = 100000.
for fitted_svm, roc_title, curve_color in (
    (hard_svm, 'ROC curve for the hard margin SVM', 'b'),
    (soft_svm, 'ROC curve for the soft margin SVM', 'g'),
    (harder_svm, 'ROC curve for the SVM with gamma = 100000', 'g'),
):
  print_roc_curve(X_test_LSI, y_test, fitted_svm, roc_title, curve_color)

In [None]:
# One confusion matrix per SVM, in the order hard / soft / gamma = 100000.
for fitted_svm, cm_title in (
    (hard_svm, 'Hard SVM'),
    (soft_svm, 'Soft SVM'),
    (harder_svm, 'SVM with gamma = 100000'),
):
  print_cf_matrix(X_test_LSI, y_test, fitted_svm, cm_title, display_labels=root_labels)

In [None]:
# Accuracy / recall / precision / F1 of each SVM on the test split.
for report_title, predictions in (
    ("Hard SVM", hard_y_pred),
    ("Soft SVM", soft_y_pred),
    ("SVM with gamma = 100000", harder_y_pred),
):
  print_result(report_title, y_test, predictions)

### What happens for the soft margin SVM? Why is the case? Analyze in terms of the confusion matrix

A:

From the confusion matrix of the soft margin SVM, you can see that it assigns all the test data to the "sports" category. This is probably because the penalty is so small (γ = 0.0001) that misclassified training points are barely punished: the margin becomes extremely wide, and the model ends up predicting the larger class for everything.

#### Does the ROC curve reflect the performance of the soft-margin SVM? Why?

A:

No. It doesn't reflect the performance.

The ROC curve is a plot of True Positive Rate (TPR) on the y-axis vs. False Positive Rate (FPR) on the x-axis.

$TPR = \frac{True Positive}{True Positive + False Negative}$

$FPR = \frac{False Positive}{False Positive + True Negative}$

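Since the model assigns all the test data to the "sports" label at its default decision threshold, its operating point has TPR and FPR both **equal to 1**. The ROC curve, however, is obtained by sweeping the threshold over the decision scores, so it can still look good even though the classifier at its default threshold is useless.
Therefore, the ROC curve is not useful to evaluate the performance of the soft-margin SVM.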

### Use cross-validation to choose γ (use average validation accuracy to compare): Using a 5-fold cross-validation, find the best value of the parameter γ in the range $\{10^k \mid -3 \le k \le 6, k \in \mathbb{Z}\}$. Again, plot the ROC curve and report the confusion matrix and calculate the accuracy, recall, precision and F-1 score of this best SVM.

After using the cross-validation, we can find out the best γ to use here is 100.

In [None]:
# 5-fold grid search over the penalty value of a linear-kernel SVC, scored by accuracy.
svc = svm.SVC(random_state=42)
parameters = {
    'C': [0.001, 0.01, 0.1, 1, 10, 100, 1000, 10000, 100000, 1000000],
    'kernel': ['linear'],
}

clf = GridSearchCV(svc, parameters, cv=5, scoring='accuracy').fit(X_train_LSI, y_train)
# the refitted best model predicts the test split
best_y_pred = clf.best_estimator_.predict(X_test_LSI)

In [None]:
# Report the selected penalty, the mean validation accuracy of every candidate,
# and the test-set ROC curve / confusion matrix / scores of the best model.
best_svm = clf.best_estimator_
mean_val_scores = clf.cv_results_['mean_test_score']

print('Best Value of γ:',clf.best_params_['C'])
for c_value, mean_acc in zip(parameters['C'], mean_val_scores):
    print(f'γ: {c_value}\t',f'average validation accuracy: {mean_acc}')

print_roc_curve(X_test_LSI, y_test, best_svm, 'ROC curve for the BEST SVM', 'b')
print_cf_matrix(X_test_LSI, y_test, best_svm, 'BEST SVM', display_labels=root_labels)
print_result("BEST SVM", y_test, best_y_pred)

# Question 6

### Train a logistic classifier without regularization (you may need to come up with some way to approximate this if you use sklearn.linear model.LogisticRegression); plot the ROC curve and report the confusion matrix and calculate the accuracy, recall precision and F-1 score of this classifier on the testing set.

In [None]:
# Logistic regression with the penalty switched off, fitted on the LSI features.
log_wor = LogisticRegression(
    penalty='none',
    random_state=9527,
    max_iter=100000,
).fit(X_train_LSI, y_train)
log_wor_y_pred = log_wor.predict(X_test_LSI)

In [None]:
# ROC curve, confusion matrix and scores of the unregularized model on the test split.
log_wor_title = 'Logistic Regression without regularization'
print_roc_curve(X_test_LSI, y_test, log_wor, 'ROC curve for the ' + log_wor_title, 'b')
print_cf_matrix(X_test_LSI, y_test, log_wor, log_wor_title, display_labels=root_labels)
print_result(log_wor_title, y_test, log_wor_y_pred)

### Using 5-fold cross-validation on the dimension-reduced-by-SVD training data, find the optimal regularization strength in the range $\{10^k \mid -5 \le k \le 5, k \in \mathbb{Z}\}$ for logistic regression with L1 regularization and logistic regression with L2 regularization, respectively.

In [None]:
# Candidate C values shared by both searches.
parameters = {'C': [0.00001, 0.0001, 0.001, 0.01, 0.1, 1, 10, 100, 1000, 10000, 100000]}

def _search_logistic(penalty):
    """Run the 5-fold accuracy grid search for one penalty; return (base model, fitted search)."""
    base_model = LogisticRegression(penalty=penalty, random_state=9527, solver='liblinear', max_iter=100000)
    search = GridSearchCV(base_model, parameters, cv=5, scoring='accuracy')
    search.fit(X_train_LSI, y_train)
    return base_model, search

# L1 regularization
log_l1, clf_l1 = _search_logistic('l1')
l1_best_y_pred = clf_l1.best_estimator_.predict(X_test_LSI)
# L2 regularization
log_l2, clf_l2 = _search_logistic('l2')
l2_best_y_pred = clf_l2.best_estimator_.predict(X_test_LSI)

### Compare the performance (accuracy, precision, recall and F-1 score) of 3 logistic classifiers: w/o regularization, w/ L1 regularization and w/ L2 regularization (with the best parameters you found from the part above), using test data.

A:

When using the best parameter we found, we can find that we get the exact same performance of 3 different logistic classifiers.

In [None]:
# For each penalty: the selected C, followed by the mean validation accuracy of every candidate.
for header, search in (
    ('Best Value of regularization strength in L1 regularization:', clf_l1),
    ('\nBest Value of regularization strength in L2 regularization:', clf_l2),
):
    print(header,search.best_params_['C'])
    for c_value, mean_acc in zip(parameters['C'], search.cv_results_['mean_test_score']):
        print(f'reg. strength(C):{c_value}\t',f'average validation accuracy: {mean_acc}')

In [None]:
# Test-set scores of the three logistic classifiers, side by side.
for report_title, predictions in (
    ("Logistic Regression without regularization", log_wor_y_pred),
    ("Logistic Regression with L1 regularization", l1_best_y_pred),
    ("Logistic Regression with L2 regularization", l2_best_y_pred),
):
  print_result(report_title, y_test, predictions)

### How does the regularization parameter affect the test error? How are the learnt coefficients affected? Why might one be interested in each type of regularization?

A:

From our experiment, we didn't see much differences in the accuracy or error when using the best parameter in different regularization. However, the best parameter of l2 regularization is much higher than l1 regularization. 

We can think of that all these models can reach a limit when we find the best parameter to use. Therefore, the different bias (regularizations) we add to the model doesn't affect the performance.

### Both logistic regression and linear SVM are trying to classify data points using a linear decision boundary. What is the difference between their ways to find this boundary? Why do their performances differ? Is this difference statistically significant?

A:

Logistic regression maximizes the conditional probability likelihood to find the decision boundary. SVM use geometric and deterministic method to separate the hyperplane and find the vectors of the margin. 

Therefore, SVM yields a deterministic hyperplane that depends only on the points near the margin, which tends to improve accuracy, reduce the error rate and make the model generalize well. Logistic regression, on the other hand, is more prone to overfitting, since its decision boundary depends on every point in the dataset; this makes it easier to reach a high accuracy, but at the cost of generality.

Lastly, the difference on this dataset is not statistically significant, since it is only approximately 0.008. We believe there is not much difference once the best parameters have been found for each model.


# Question 7

### Evaluate and profile a Naïve Bayes classifier: Train a GaussianNB classifier; plot the ROC curve and report the confusion matrix and calculate the accuracy, recall, precision and F-1 score of this classifier on the testing set.

In [None]:
# Gaussian naive Bayes on the LSI features.
gnb = GaussianNB().fit(X_train_LSI, y_train)
gnb_y_pred = gnb.predict(X_test_LSI)

In [None]:
# ROC curve, confusion matrix and scores of the naive Bayes model on the test split.
gnb_title = 'GaussianNB classifiers'
print_roc_curve(X_test_LSI, y_test, gnb, 'ROC curve for the ' + gnb_title, 'b')
print_cf_matrix(X_test_LSI, y_test, gnb, gnb_title, display_labels=root_labels)
print_result(gnb_title, y_test, gnb_y_pred)

# Question 8

In [None]:
# Training text column that is fed to the pipeline grid search below.
X_train = train.loc[:, 'full_text']
print(X_train)

In [None]:
def stemming_sent(text):
    """Stem every token of a document and return the stems as one space-joined string.

    The text is tokenized with word_tokenize and each token is passed through a
    PorterStemmer. Stems are separated by exactly one space.
    """
    porter = PorterStemmer()
    return ' '.join(porter.stem(word) for word in word_tokenize(text))

In [None]:
cachedir = mkdtemp()
memory = Memory(cachedir, verbose=87)

# A callable `analyzer` has to return the list of features of one document.
# lemmatize_sent / stemming_sent return a single joined string, which CountVectorizer
# would walk character by character, so they are wrapped here: the normalized text is
# handed to the default word analyzer, which splits it into word tokens and drops the
# English stop words (the vectorizer's own stop_words setting is not applied when the
# analyzer is a callable).
_word_analyzer = CountVectorizer(stop_words='english').build_analyzer()

def lemma_analyzer(doc):
    """Word tokens of one document after lemmatization."""
    return _word_analyzer(lemmatize_sent(doc))

def stem_analyzer(doc):
    """Word tokens of one document after stemming."""
    return _word_analyzer(stemming_sent(doc))

estimators = [
    ('vect', CountVectorizer(stop_words='english')),
    ('tfidf', TfidfTransformer()),
    ('reduce_dim', None),
    ('clf', None),
    ]
pipeline = Pipeline(estimators, memory=memory)
param_grid = [{
        'vect__min_df': [3,5],
        'vect__analyzer': [lemma_analyzer, stem_analyzer],
        'reduce_dim': [TruncatedSVD(n_components=5, random_state=42),
                       TruncatedSVD(n_components=30, random_state=42),
                       TruncatedSVD(n_components=80, random_state=42), 
                       NMF(n_components=5, init='random', random_state=42),
                       NMF(n_components=30, init='random', random_state=42),
                       NMF(n_components=80, init='random', random_state=42)], 
        'clf': [svm.SVC(kernel='linear',C=100,random_state=42),
                LogisticRegression(penalty='l1', C=100, random_state=42, solver='liblinear',max_iter=100000),
                LogisticRegression(penalty='l2', C=100, random_state=42, solver='liblinear',max_iter=100000),
                GaussianNB()]
    }]
CV = GridSearchCV(pipeline, cv=5, param_grid=param_grid, scoring='accuracy')
CV.fit(X_train, y_train)

In [None]:
# Parameter combination with the best mean cross-validated accuracy.
CV.best_params_

### What are the 5 best combinations? Report their performances on the testing set.

In [None]:
# Print rank, parameters and mean validation accuracy of every combination ranked in
# the top 5 (ties included), best first.
cv_ranks = CV.cv_results_["rank_test_score"]
for idx in np.argsort(cv_ranks, kind="stable"):
  if cv_ranks[idx] <= 5:
    print(cv_ranks[idx])
    print(CV.cv_results_["params"][idx])
    print(CV.cv_results_["mean_test_score"][idx])

### What are the 5 best combinations? Report their performances on the testing set.

(Rank)

- 1 (3 models with same accuracy)

  {'clf': SVC(C=100, kernel='linear', random_state=42), 'reduce_dim': TruncatedSVD(n_components=30, random_state=42), 'vect__analyzer': <function stemming_sent at 0x0000020686664EE0>, 'vect__min_df': 3}

  {'clf': SVC(C=100, kernel='linear', random_state=42), 'reduce_dim': TruncatedSVD(n_components=30, random_state=42), 'vect__analyzer': <function stemming_sent at 0x0000020686664EE0>, 'vect__min_df': 5}

  {'clf': LogisticRegression(C=10, max_iter=100000, penalty='l1', random_state=42, solver='liblinear'), 'reduce_dim': TruncatedSVD(n_components=30, random_state=42), 'vect__analyzer': <function stemming_sent at 0x0000020686664EE0>, 'vect__min_df': 3}

  accuracy: 0.826984126984127

- 4

  {'clf': LogisticRegression(C=10, max_iter=100000, penalty='l1', random_state=42, solver='liblinear'), 'reduce_dim': TruncatedSVD(n_components=30, random_state=42), 'vect__analyzer': <function stemming_sent at 0x0000020686664EE0>, 'vect__min_df': 5}

  accuracy: 0.8265873015873015

- 5 (2 models with same accuracy)

  {'clf': LogisticRegression(C=100, max_iter=100000, random_state=42, solver='liblinear'), 'reduce_dim': TruncatedSVD(n_components=30, random_state=42), 'vect__analyzer': <function stemming_sent at 0x0000020686664EE0>, 'vect__min_df': 3}

  {'clf': LogisticRegression(C=100, max_iter=100000, random_state=42, solver='liblinear'), 'reduce_dim': TruncatedSVD(n_components=30, random_state=42), 'vect__analyzer': <function stemming_sent at 0x0000020686664EE0>, 'vect__min_df': 5}

  accuracy: 0.8218253968253968



# Question 9

In [None]:
# Seeded split for the multiclass (leaf label) task, so the results below are reproducible.
train_multi, test_multi = train_test_split(df[["full_text","leaf_label"]], test_size=0.2, random_state=42)
print("train_multi shape:", train_multi.shape)
print("test_multi shape:", test_multi.shape)

In [None]:
X_train_multi = train_multi['full_text']
X_test_multi = test_multi['full_text']

# fresh lemmatizer / vectorizer / TF-IDF transformer for the leaf-label task
wnl = WordNetLemmatizer()
vec = CountVectorizer(stop_words='english', min_df=3)
tfidf = TfidfTransformer()

# one lemmatized string per document
X_train_lemma_m = [lemmatize_sent(doc) for doc in X_train_multi]
X_test_lemma_m = [lemmatize_sent(doc) for doc in X_test_multi]

# counts -> TF-IDF; everything is fitted on the training split only
X_train_vec_m = vec.fit_transform(X_train_lemma_m)
X_train_tfidf_m = tfidf.fit_transform(X_train_vec_m)
X_test_vec_m = vec.transform(X_test_lemma_m)
X_test_tfidf_m = tfidf.transform(X_test_vec_m)

# 50-component LSI reduction
svd = TruncatedSVD(n_components=50, random_state=42)
X_train_LSI_m = svd.fit_transform(X_train_tfidf_m)
X_test_LSI_m = svd.transform(X_test_tfidf_m)

### Perform Naive Bayes classification and multiclass SVM classification (with both One VS One and One VS the rest methods described above) and report the confusion matrix and calculate the accuracy, recall, precision and F-1 score of your classifiers. How did you resolve the class imbalance issue in the One VS the rest model?

- We use the library sklearn.multiclass to implement the one vs rest model. Basically, It takes the original binary svm model as input.

In [None]:
# Integer code of each leaf label: its position in this list.
map_row_to_class = {
    label: code
    for code, label in enumerate([
        "chess", "cricket", "hockey", "soccer", "football",
        "%22forest%20fire%22", "flood", "earthquake", "drought",
    ])
}
y_train_multi, y_test_multi = (
    split['leaf_label'].map(map_row_to_class) for split in (train_multi, test_multi)
)

print(y_train_multi)

# Gaussian naive Bayes on the multiclass LSI features
gnb_multi = GaussianNB().fit(X_train_LSI_m, y_train_multi)
gnb_multi_y_pred = gnb_multi.predict(X_test_LSI_m)

In [None]:
# Axis names for the confusion matrices, in the order of the integer class codes.
leaf_labels = [
    "chess", "cricket", "hockey", "soccer", "football",
    "%22forest%20fire%22", "flood", "earthquake", "drought",
]
gnb_multi_title = 'Multiclass Classification with GaussianNB'

plot_confusion_matrix(gnb_multi, X_test_LSI_m, y_test_multi, display_labels=leaf_labels)
plt.xticks(rotation=90)
plt.title(gnb_multi_title)
plt.show()

print_multi_result(gnb_multi_title, y_test_multi, gnb_multi_y_pred)

In [None]:
# One-vs-one and one-vs-rest wrappers around the same linear SVM, fitted on the LSI features.
svm_ovo = OneVsOneClassifier(svm.LinearSVC(random_state=42)).fit(X_train_LSI_m, y_train_multi)
svm_ovr = OneVsRestClassifier(svm.LinearSVC(random_state=42)).fit(X_train_LSI_m, y_train_multi)

# test-set predictions of both strategies
svm_ovo_pred, svm_ovr_pred = (
    fitted.predict(X_test_LSI_m) for fitted in (svm_ovo, svm_ovr)
)

In [None]:
ovo_title = 'Multiclass Classification with One VS One SVM'
ovr_title = 'Multiclass Classification with One VS Rest SVM'

# confusion matrices first, one figure per strategy
for multi_svm, cm_title in ((svm_ovo, ovo_title), (svm_ovr, ovr_title)):
  plot_confusion_matrix(multi_svm, X_test_LSI_m, y_test_multi, display_labels=leaf_labels)
  plt.xticks(rotation=90)
  plt.title(cm_title)
  plt.show()

# then the weighted scores
for report_title, predictions in ((ovo_title, svm_ovo_pred), (ovr_title, svm_ovr_pred)):
  print_multi_result(report_title, y_test_multi, predictions)

### Do you observe any structure in the confusion matrix? Are there distinct visible blocks on the major diagonal? What does this mean?

A:

Yes, there are distinct visible blocks on the major diagonal. This means that the model performs reasonably well: it predicts each label correctly with good accuracy.

### Based on your observation from the previous part, suggest a subset of labels that should be merged into a new larger label and recompute the accuracy and plot the confusion matrix. How did the accuracy change in One VS One and One VS the rest?

A:

Yes, we get a high error rate on the label **"soccer"** and **"football"** based on the performance. Maybe we can merge these two labels together.

After merging the labels, we can find that the accuracy get increased in both the One VS One and One VS the rest.

There is approximately a **0.1** increase in both the models.

In [None]:
# Same coding as before, except that "soccer" and "football" now share code 3
# and the labels after them move up by one.
map_row_to_class = dict([
    ("chess", 0), ("cricket", 1), ("hockey", 2),
    ("soccer", 3), ("football", 3),
    ("%22forest%20fire%22", 4), ("flood", 5), ("earthquake", 6), ("drought", 7),
])
y_train_multi, y_test_multi = (
    split['leaf_label'].map(map_row_to_class) for split in (train_multi, test_multi)
)
print(y_train_multi)

# refit naive Bayes on the merged labels
gnb_multi = GaussianNB().fit(X_train_LSI_m, y_train_multi)
gnb_multi_y_pred = gnb_multi.predict(X_test_LSI_m)

In [None]:
# Axis names for the merged-label confusion matrices (8 classes).
leaf_labels = [
    "chess", "cricket", "hockey", "soccer and football",
    "%22forest%20fire%22", "flood", "earthquake", "drought",
]
merged_gnb_title = 'Multiclass Classification with GaussianNB'

plot_confusion_matrix(gnb_multi, X_test_LSI_m, y_test_multi, display_labels=leaf_labels)
plt.xticks(rotation=90)
plt.title(merged_gnb_title)
plt.show()

print_multi_result(merged_gnb_title + " using new labels", y_test_multi, gnb_multi_y_pred)

In [None]:
# Refit both multiclass strategies on the merged labels.
base_svm_params = dict(random_state=42)
svm_ovo = OneVsOneClassifier(svm.LinearSVC(**base_svm_params))
svm_ovr = OneVsRestClassifier(svm.LinearSVC(**base_svm_params))
for multi_svm in (svm_ovo, svm_ovr):
  multi_svm.fit(X_train_LSI_m, y_train_multi)

# test-set predictions
svm_ovo_pred = svm_ovo.predict(X_test_LSI_m)
svm_ovr_pred = svm_ovr.predict(X_test_LSI_m)

In [None]:
merged_ovo_title = 'Multiclass Classification with One VS One SVM [new labels]'
merged_ovr_title = 'Multiclass Classification with One VS Rest SVM [new labels]'

# confusion matrices first, one figure per strategy
for multi_svm, cm_title in ((svm_ovo, merged_ovo_title), (svm_ovr, merged_ovr_title)):
  plot_confusion_matrix(multi_svm, X_test_LSI_m, y_test_multi, display_labels=leaf_labels)
  plt.xticks(rotation=90)
  plt.title(cm_title)
  plt.show()

# then the weighted scores
for report_title, predictions in ((merged_ovo_title, svm_ovo_pred), (merged_ovr_title, svm_ovr_pred)):
  print_multi_result(report_title, y_test_multi, predictions)

### Does class imbalance impact the performance of the classification once some classes are merged? Provide a resolution for the class imbalance and recompute the accuracy and plot the confusion matrix in One VS One and One VS the rest?.

A:

We set the parameter **class_weight='balanced'** from the sklearn.svm.LinearSVC. It assigns the weights of each class label so that we can implement the balanced class we want.

As you can see, the accuracy of one vs rest increases after dealing with the imbalance issue. However, the accuracy of one vs one does not improve.

There are several possible explanations. One of them is that the model already had a high accuracy and was not influenced much by the class imbalance. Therefore, dealing with the issue does not bring a large improvement to the models; it only improves the one vs rest model a little.

In [None]:
# Same two strategies, with class weights balanced inside each underlying LinearSVC.
balanced_params = dict(random_state=42, class_weight='balanced')
svm_ovo = OneVsOneClassifier(svm.LinearSVC(**balanced_params))
svm_ovr = OneVsRestClassifier(svm.LinearSVC(**balanced_params))
for multi_svm in (svm_ovo, svm_ovr):
  multi_svm.fit(X_train_LSI_m, y_train_multi)

# test-set predictions
svm_ovo_pred = svm_ovo.predict(X_test_LSI_m)
svm_ovr_pred = svm_ovr.predict(X_test_LSI_m)

In [None]:
# Confusion matrices and summary metrics for the class-weighted SVMs.
balanced_runs = [
    (svm_ovo, svm_ovo_pred, 'Multiclass Classification with One VS One SVM [class balanced]'),
    (svm_ovr, svm_ovr_pred, 'Multiclass Classification with One VS Rest SVM [class balanced]'),
]

# Draw one figure per strategy first ...
for classifier, predictions, heading in balanced_runs:
    plot_confusion_matrix(classifier, X_test_LSI_m, y_test_multi, display_labels=leaf_labels)
    plt.xticks(rotation=90)
    plt.title(heading)
    plt.show()

# ... then print the metrics for each strategy, in the same order.
for classifier, predictions, heading in balanced_runs:
    print_multi_result(heading, y_test_multi, predictions)

# Question 10


### (a) Why are GLoVE embeddings trained on the ratio of co-occurrence probabilities rather than the probabilities themselves?

A:

Relevant relations between words can be extracted using the ratio of co-occurrence probabilities. In other words, specific properties and correlations can be extracted from the ratio, while irrelevant words, which relate to both or neither of the words in the ratio, can be effectively identified. Using the raw co-occurrence probabilities instead of their ratios loses this property, making it potentially harder to differentiate between relevant and irrelevant words, as shown in the "ice" versus "steam" example in the paper.

### (b) In the two sentences: “James is running in the park.” and “James is running for the presidency.”, would GLoVE embeddings return the same vector for the word running in both cases? Why or why not?

A:

I would expect the GLoVE embeddings of the two to be different for any context window larger than 0, since both syntactic and semantic information greatly depends on the context (i.e., the preceding and following words), which is different between the two given cases, as the target word has completely different meanings between them.

### (c) What do you expect for the values of
$\lVert \text{GLoVE["queen"]} - \text{GLoVE["king"]} - \text{GLoVE["wife"]} + \text{GLoVE["husband"]} \rVert_2$,
$\lVert \text{GLoVE["queen"]} - \text{GLoVE["king"]} \rVert_2$ and $\lVert \text{GLoVE["wife"]} - \text{GLoVE["husband"]} \rVert_2$? Compare these values.

A:

I would expect ||GLoVE["queen"] - GLoVE["king"] - GLoVE["wife"] + GLoVE["husband"]||2 to be equivalent to the loss of the statement "A king is to a queen as a husband is to a wife", as formulated in section 4.1 of the paper, and since the relation of word analogies hold in this case, I expect the value to be somewhat relatively low.  ||GLoVE["queen"] - GLoVE["king"] ||2 and ||GLoVE["wife"] - GLoVE["husband"]||2, on the other hand, would be equivalent to the difference between "Queen and King" and the difference between "Wife and Husband", respectively, as their GLoVE embedded vectors. I expect both of these distances to be substantially larger than the first case (distance measure with all four terms), while being of similar magnitude, as to cancel out during the first case. In a vector sense, we could also characterize ||GLoVE["queen"] - GLoVE["king"] - GLoVE["wife"] + GLoVE["husband"]||2 as the magnitude of the difference between vectors GLoVE["queen"] - GLoVE["king"] and GLoVE["wife"] - GLoVE["husband"], which we would see the terms with similar magnitude (ones that satisfy the analogy relation) cancel out, leaving us with the sum of the differences between the word pairs.
Interestingly, as the code snippet below shows, the actual GLoVE embeddings deviate from this expectation: the difference between the embeddings for "queen" and "king" is substantially larger than expected, potentially due to additional meanings and usages associated with the respective words, which in turn increases the value of the four-word expression. We believe this result amounts to the GLoVE embedding telling us that queen-to-king is actually not a great analogy for wife-to-husband.

### (d) Given a word, would you rather stem or lemmatize the word before mapping it to its GLoVE embedding?

A:

Assuming that performance is a non-issue (stemming may be required over lemmatization if this assumption does not hold), I would prefer to lemmatize the words instead of stemming them. Because GLoVE works on co-occurrence, the specific context and meaning of each word should be preserved so that the model obtains a better understanding of the underlying relations between words within a vocabulary. Stemming, on the other hand, may destroy this precious information in exchange for simplicity and performance, which is likely to be undesirable.

# Question 11


In [None]:
# `!pwd` is IPython shell capture: `pwd` receives the command's output lines,
# so pwd[0] is the current working directory as text.
pwd = !pwd
# Path of the 300-dimensional GloVe vector file inside that working directory.
data_path = str(pwd[0]) + '/glove.6B.300d.txt'

In [None]:
# Parse the GloVe text file at `data_path` into a {token: vector} lookup.
# Every non-empty line is a token followed by its whitespace-separated components.
embeddings_dict = {}
dimension_of_glove = 300
# Name the encoding explicitly: the vocabulary contains non-ASCII tokens, and the
# platform's default codec is not guaranteed to be UTF-8.
with open(data_path, 'r', encoding='utf-8') as f:
  for line in f:
    values = line.split()
    if not values:
      # Tolerate blank lines (e.g. a trailing newline) instead of failing on values[0].
      continue
    word = values[0]
    vector = np.asarray(values[1:], "float32")
    embeddings_dict[word] = vector

### (a) Describe a feature engineering process that uses GLoVE word embeddings to represent each document.

In [None]:
# Numerical check for Question 10(c); requires embeddings_dict from the loading cell above.
# qk is the queen-minus-king offset, wh the wife-minus-husband offset.
qk, wh = (
    embeddings_dict[left] - embeddings_dict[right]
    for left, right in (("queen", "king"), ("wife", "husband"))
)
def dist(vec):
  """Return the Euclidean (L2) norm of a 1-D vector.

  Parameters
  ----------
  vec : array-like of numbers
      The vector whose length is measured.

  Returns
  -------
  float
      sqrt(sum(e**2 for e in vec)).

  Notes
  -----
  The previous implementation returned the sum of squares, i.e. the *squared*
  norm, which is not the quantity ||.||_2 discussed in Question 10(c).
  """
  return np.linalg.norm(vec)
# Report dist() for the queen-king offset, the wife-husband offset,
# and the difference between the two offsets, in that order.
for offset in (qk, wh, qk - wh):
  print(dist(offset))

In [None]:
# Peek at the first 100 tokens of the embedding vocabulary (dict insertion order).
print([*embeddings_dict][:100])

In [None]:
# Show the vector stored for the first token in the dictionary.
print([*embeddings_dict.values()][0])

In [None]:
# Show the raw text of the first document.
first_document = df['full_text'][0]
print(first_document)

In [None]:
import re

# Tokenizer preview: each match is either a run of word characters or a single
# character that is neither a word character nor whitespace (i.e. punctuation).
token_pattern = re.compile(r"\w+|[^\w\s]", re.UNICODE)
token_pattern.findall(df['full_text'][0])

In [None]:
def GLoVE(rows,edict):
  """Embed each document as the mean of the GloVe vectors of its tokens.

  Parameters
  ----------
  rows : iterable of str
      Documents to embed. Each one is split into word tokens and single
      punctuation characters; tokens are lower-cased before lookup.
  edict : dict
      Maps a token to its embedding vector. All vectors are expected to have
      the same length.

  Returns
  -------
  embeddings : list of list
      For every document, the vectors of the tokens found in `edict`, in
      document order. Tokens missing from `edict` are skipped.
  sums : list of numpy.ndarray
      For every document, the element-wise mean of those vectors. A document
      with no token in `edict` gets an all-zero vector instead of the NaN
      vector a 0/0 division would produce.
  """
  # The vector length depends only on the dictionary, so read it once here
  # rather than materialising every value of the dictionary for each document.
  dim = len(next(iter(edict.values()), ()))
  embeddings = []
  sums = []
  for row in rows:
    tokens = re.findall(r"\w+|[^\w\s]", row, re.UNICODE)
    found = []
    total = np.zeros(dim)
    for token in tokens:
      # A single lookup; out-of-vocabulary tokens are the only case skipped,
      # so unrelated errors are no longer silently swallowed.
      vector = edict.get(token.lower())
      if vector is None:
        continue
      total = total + vector
      found.append(vector)
    embeddings.append(found)
    sums.append(total / len(found) if found else total)
  return embeddings,sums

In [None]:
# Embed the whole corpus with the 300-d vectors: per-token vectors for each
# document plus one averaged vector per document.
embeddings, sums = GLoVE(df['full_text'], embeddings_dict)

### (b) Select a classifier model, train and evaluate it with your GLoVE-based feature. If you are doing any cross-validation, please make sure to use a limited set of options so that your code finishes running in a reasonable amount of time.

In [None]:
# Classify documents from their averaged 300-d GloVe vectors (the second value
# returned by GLoVE) using L2-regularised logistic regression.
from sklearn.linear_model import LogisticRegression

e300tr, s300tr = GLoVE(train['full_text'], embeddings_dict)
e300te, s300te = GLoVE(test["full_text"], embeddings_dict)

l2c = LogisticRegression(
    penalty='l2',
    C=10,
    solver='liblinear',
    max_iter=100000,
    random_state=9527,
)
# fit() returns the estimator, so training and prediction can be chained.
y_pred = l2c.fit(s300tr, y_train).predict(s300te)
print_result("300d", y_test, y_pred)

Here we observe that by aggregating all GLoVE embeddings into a single vector, taking the mean of the embedding vectors of every word in the document, we still retain enough information for our classification model (logistic regression with L2 regularization) to effectively differentiate between the classes, achieving an accuracy of around 96.19%.

# Question 12


### Plot the relationship between the dimension of the pre-trained GLoVE embedding and the resulting accuracy of the model in the classification task. Describe the observed trend. Is this trend expected? Why or why not? In this part use the different sets of GLoVE vectors from the link.

In [None]:
# IPython shell capture: `pwd` receives the output lines of the shell's `pwd`
# command, so pwd[0] is the current working directory as text.
pwd = !pwd

In [None]:
def load_glove_vectors(path):
  """Read a GloVe text file into a {token: float32 vector} dictionary.

  Parameters
  ----------
  path : str
      Location of a text file in which every non-empty line holds a token
      followed by its whitespace-separated vector components.

  Returns
  -------
  dict
      Maps each token to a numpy float32 array of its components.
  """
  table = {}
  # Explicit encoding: the vocabulary contains non-ASCII tokens and the
  # platform's default codec is not guaranteed to be UTF-8.
  with open(path, 'r', encoding='utf-8') as f:
    for line in f:
      values = line.split()
      if not values:
        # Tolerate blank lines instead of failing on values[0].
        continue
      table[values[0]] = np.asarray(values[1:], "float32")
  return table


# One dictionary per embedding size; `data_path` is reassigned for each file,
# exactly as before, so it ends up pointing at the 200-d file.
data_path = str(pwd[0]) + '/glove.6B.50d.txt'
embeddings_dict50 = load_glove_vectors(data_path)
data_path = str(pwd[0]) + '/glove.6B.100d.txt'
embeddings_dict100 = load_glove_vectors(data_path)
data_path = str(pwd[0]) + '/glove.6B.200d.txt'
embeddings_dict200 = load_glove_vectors(data_path)

In [None]:
# Sanity check on the four embedding tables (50-d, 100-d, 200-d, 300-d).
glove_tables = (embeddings_dict50, embeddings_dict100, embeddings_dict200, embeddings_dict)

# Vocabulary size of each table ...
for table in glove_tables:
  print(len(table))

# ... followed by the length of the first vector in each table.
for table in glove_tables:
  print(len(list(table.values())[0]))

In [None]:
# Embed the full corpus once per GloVe dimensionality (50, 100, 200, 300).
embeddings50, sums50 = GLoVE(df['full_text'], embeddings_dict50)
embeddings100, sums100 = GLoVE(df['full_text'], embeddings_dict100)
embeddings200, sums200 = GLoVE(df['full_text'], embeddings_dict200)
embeddings, sums = GLoVE(df['full_text'], embeddings_dict)

# Print the averaged vector of the first document at each dimensionality.
for document_means in (sums50, sums100, sums200, sums):
  print(document_means[0])

In [None]:
# Repeat the GloVe + logistic-regression experiment for 200, 100 and 50
# dimensions; the 300-d result computed earlier is printed again for reference.
def _new_l2_classifier():
  """Build an unfitted logistic regression with the settings shared by every run."""
  return LogisticRegression(
      penalty='l2',
      C=10,
      solver='liblinear',
      max_iter=100000,
      random_state=9527,
  )


print_result("300d", y_test, y_pred)

# 200 dimensions
e200tr, s200tr = GLoVE(train['full_text'], embeddings_dict200)
e200te, s200te = GLoVE(test["full_text"], embeddings_dict200)
l2c200 = _new_l2_classifier()
y_pred200 = l2c200.fit(s200tr, y_train).predict(s200te)
print_result("200d", y_test, y_pred200)

# 100 dimensions
e100tr, s100tr = GLoVE(train['full_text'], embeddings_dict100)
e100te, s100te = GLoVE(test["full_text"], embeddings_dict100)
l2c100 = _new_l2_classifier()
y_pred100 = l2c100.fit(s100tr, y_train).predict(s100te)
print_result("100d", y_test, y_pred100)

# 50 dimensions
e50tr, s50tr = GLoVE(train['full_text'], embeddings_dict50)
e50te, s50te = GLoVE(test["full_text"], embeddings_dict50)
l2c50 = _new_l2_classifier()
y_pred50 = l2c50.fit(s50tr, y_train).predict(s50te)
print_result("50d", y_test, y_pred50)

In [None]:
# Test accuracy as a function of the GloVe embedding dimension.
glove_dimensions = [50, 100, 200, 300]
test_accuracies = [
    accuracy_score(y_test, predictions)
    for predictions in (y_pred50, y_pred100, y_pred200, y_pred)
]
plt.plot(glove_dimensions, test_accuracies)
plt.xlabel("Dimension")
plt.ylabel("Accuracy")

We observe that the accuracy of our classification model is related to the dimension of the GLoVE embedding: higher-dimensional embeddings yield better accuracy. This is to be expected, as a higher-dimensional vector is able to preserve more information about the given document, especially after the word embeddings are averaged over all words in a document, where the information in individual embeddings may be muddled or lost.

# Question 13

### Compare and contrast the two visualizations. Are there clusters formed in either or both of the plots? 

A: 

There are clusters formed in the GLoVE-based Embeddings Vectors but not in the randomized vector. 

In [None]:
!pip install umap-learn
!pip install umap-learn[plot]
import umap
import umap.plot

In [None]:
# Normalized GLoVE-based embeddings of the documents with their binary labels.
# Scale every document vector to unit L2 length so the plot really shows
# normalized embeddings; all-zero rows are left as they are.
glove_doc_vectors = np.asarray(s300te, dtype=float)
glove_row_norms = np.linalg.norm(glove_doc_vectors, axis=1, keepdims=True)
glove_row_norms[glove_row_norms == 0] = 1.0
glove_doc_vectors = glove_doc_vectors / glove_row_norms

# Fixed random_state so the 2-D layout is the same on every run.
fit = umap.UMAP(random_state=9527)
# Named glove_umap rather than `map`, which would shadow the builtin for all later cells.
glove_umap = fit.fit_transform(glove_doc_vectors)

scatter = plt.scatter(glove_umap[:,0], glove_umap[:,1], c = y_test)
plt.title("GLoVE-based Embeddings Vectors")
plt.show()


In [None]:
# Generate a set of normalized random vectors of the same dimension as GLoVE.

np.random.seed(9527)
dataset = np.random.rand(len(s300te),len(s300te[0]))
# Scale every random vector to unit L2 length so the data matches the
# "normalized" description in the title.
dataset = dataset / np.linalg.norm(dataset, axis=1, keepdims=True)

# Use a separate, seeded UMAP instance instead of refitting the estimator that
# was fitted on the GLoVE vectors.
random_umap = umap.UMAP(random_state=9527)
rand = random_umap.fit_transform(dataset)

# Labels are drawn at random, so any colour structure in the plot is chance.
rand_labels = np.random.randint(2, size = rand.shape[0])
plt.scatter(rand[:,0], rand[:,1], c = rand_labels)
plt.title("Normalized Random Vectors")
plt.show()

