# **CMPUT 501 Project**

## ML Approach - Task 2

Load modules

In [1]:
from collections import Counter
import re
import time

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

import gensim
import gensim.downloader as api

import nltk
from nltk.corpus import stopwords
from nltk import word_tokenize

from scipy.stats import describe
import seaborn as sns

from sklearn.svm import NuSVR

from sklearn.metrics import classification_report
from sklearn.metrics import mean_squared_error

import warnings
# Silence every warning category for the rest of the session.
# NOTE(review): a blanket "ignore" also hides deprecation/future warnings raised by
# the libraries used below — consider narrowing the filter to specific categories.
warnings.filterwarnings("ignore")

# Apply seaborn's default plotting theme.
sns.set()


Load data

In [2]:
# The files "train.csv", "dev.csv" and "test.csv" from the data folder must first be
# uploaded to the current runtime's session storage (file icon on the left).
# Every column is read as a string; numeric columns are converted where they are used.
train_df, dev_df, test_df = (
    pd.read_csv(split_file, dtype=str)
    for split_file in ("train.csv", "dev.csv", "test.csv")
)
# For development, a copy of the dev split replaces the test split.
test_df = dev_df.copy()

In [3]:
# Targets for the first headline: the "meanGrade1" column (loaded as strings)
# converted to floats, for the training and the evaluation split respectively.
y_train1, y_test1 = (
    split["meanGrade1"].astype(float) for split in (train_df, test_df)
)


In [4]:
# Targets for the second headline: the "meanGrade2" column (loaded as strings)
# converted to floats, for the training and the evaluation split respectively.
y_train2, y_test2 = (
    split["meanGrade2"].astype(float) for split in (train_df, test_df)
)

Feature Engineering

In [5]:
# Build the edited headline ("new1"): the "<word/>" marker in the original headline
# is replaced by the edit word.
# NOTE: run this cell once per data load. It overwrites "original1" in place, and once
# the markers are gone a re-run finds nothing to substitute, so "new1" would simply
# equal the cleaned original headline.
train_df["new1"] = train_df.apply(
    lambda x: re.sub(r"<.+/>", x["edit1"], x["original1"]), axis=1
)
# Strip the "<" and "/>" markers from the original headline but keep the marked word.
# regex=True is passed explicitly: from pandas 2.0 on, str.replace treats the pattern
# as a literal string by default, which would silently leave the markers in place.
train_df["original1"] = train_df["original1"].str.replace(
    r"<(.+)/>", r"\g<1>", regex=True
)

# Same two steps for the evaluation split.
test_df["new1"] = test_df.apply(
    lambda x: re.sub(r"<.+/>", x["edit1"], x["original1"]), axis=1
)
test_df["original1"] = test_df["original1"].str.replace(
    r"<(.+)/>", r"\g<1>", regex=True
)

# Load the English stop-word list as a set for fast membership tests.
nltk.download('stopwords')
stop_words = set(stopwords.words("english"))

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [6]:
# Build the edited headline ("new2"): the "<word/>" marker in the original headline
# is replaced by the edit word.
# NOTE: run this cell once per data load. It overwrites "original2" in place, and once
# the markers are gone a re-run finds nothing to substitute, so "new2" would simply
# equal the cleaned original headline.
train_df["new2"] = train_df.apply(
    lambda x: re.sub(r"<.+/>", x["edit2"], x["original2"]), axis=1
)
# Strip the "<" and "/>" markers from the original headline but keep the marked word.
# regex=True is passed explicitly: from pandas 2.0 on, str.replace treats the pattern
# as a literal string by default, which would silently leave the markers in place.
train_df["original2"] = train_df["original2"].str.replace(
    r"<(.+)/>", r"\g<1>", regex=True
)

# Same two steps for the evaluation split.
test_df["new2"] = test_df.apply(
    lambda x: re.sub(r"<.+/>", x["edit2"], x["original2"]), axis=1
)
test_df["original2"] = test_df["original2"].str.replace(
    r"<(.+)/>", r"\g<1>", regex=True
)

# Load the English stop-word list as a set for fast membership tests
# (the download is a no-op when the corpus is already present).
nltk.download('stopwords')
stop_words = set(stopwords.words("english"))

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Word Embedding

In [7]:
#Ref: https://towardsdatascience.com/using-word2vec-to-analyze-news-headlines-and-predict-article-success-cdeda5f14751
#Ref: https://code.google.com/archive/p/word2vec/

#The below step requires some time, downloads 1662.8 MB
# Fetch the "word2vec-google-news-300" vectors through gensim's downloader API.
# `model` is kept at module level and is the embedding lookup used by the
# headline-vectorisation code later in the notebook.
model = api.load("word2vec-google-news-300")



In [8]:
def document_vector(word2vec_model, doc):
    """Return the mean word vector of a tokenised document.

    Args:
        word2vec_model: Keyed-vector model. It must expose its vocabulary as
            ``key_to_index`` (gensim >= 4) or ``vocab`` (gensim < 4), support
            indexing with a list of tokens (``model[tokens]``) and provide
            ``vector_size``.
        doc: Iterable of string tokens.

    Returns:
        A 1-D array holding the element-wise mean of the vectors of all
        in-vocabulary tokens. When no token is in the vocabulary, an all-zero
        vector of length ``vector_size`` is returned so that every document
        still maps to a fixed-size feature vector.
    """
    # Use the model that was passed in (not a module-level global). gensim 4
    # renamed ``vocab`` to ``key_to_index``; accept whichever is available.
    vocab = getattr(word2vec_model, "key_to_index", None)
    if vocab is None:
        vocab = word2vec_model.vocab
    # remove out-of-vocabulary words
    known_words = [word for word in doc if word in vocab]
    if not known_words:
        # Nothing to average: indexing the model with an empty list fails.
        return np.zeros(word2vec_model.vector_size, dtype=np.float32)
    return np.mean(word2vec_model[known_words], axis=0)

def preprocess(text):
    """Tokenise a headline into lowercase, purely alphabetic, non-stop-word tokens.

    The text is lowercased and split with ``word_tokenize``; tokens found in
    the module-level ``stop_words`` set and tokens containing any
    non-alphabetic character are discarded.
    """
    tokens = word_tokenize(text.lower())
    # Apply both filters in a single pass: stop-word check, then alphabetic check.
    return [
        token
        for token in tokens
        if token not in stop_words and token.isalpha()
    ]

def to_vector(df, column):
    """Convert every headline in ``df[column]`` into its document vector.

    Each headline is tokenised with ``preprocess`` and then averaged into one
    vector by ``document_vector`` using the module-level ``model``.

    Returns:
        A numpy array built from the per-headline vectors, in row order.
    """
    # Tokenise all headlines first, then embed each token list.
    tokenised = [preprocess(headline) for headline in df[column].tolist()]
    return np.array([document_vector(model, tokens) for tokens in tokenised])

In [9]:
nltk.download('punkt')
# Embed the original and the edited first headline for both splits.
X_train_orig1, X_train_new1 = (
    to_vector(train_df, col) for col in ("original1", "new1")
)
X_test_orig1, X_test_new1 = (
    to_vector(test_df, col) for col in ("original1", "new1")
)

# Sanity check: one vector per row of the corresponding frame.
assert len(X_train_orig1) == len(X_train_new1) == len(train_df)
assert len(X_test_orig1) == len(X_test_new1) == len(test_df)

# Feature row = original-headline vector followed by edited-headline vector.
X_train1 = [
    np.concatenate((orig_vec, new_vec), axis=None)
    for orig_vec, new_vec in zip(X_train_orig1, X_train_new1)
]
X_test1 = [
    np.concatenate((orig_vec, new_vec), axis=None)
    for orig_vec, new_vec in zip(X_test_orig1, X_test_new1)
]


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


In [10]:
# No-op when the tokenizer data is already present.
nltk.download('punkt')
# Embed the original and the edited second headline for both splits.
X_train_orig2 = to_vector(train_df, "original2")
X_train_new2 = to_vector(train_df, "new2")
X_test_orig2 = to_vector(test_df, "original2")
X_test_new2 = to_vector(test_df, "new2")

#Check to make sure that the sizes match
assert len(X_train_orig2) == len(X_train_new2) == len(train_df)
assert len(X_test_orig2) == len(X_test_new2) == len(test_df)

#Combine the vectors from the original and edited headline
# Pairing the two task-2 arrays directly keeps this cell independent of the
# task-1 arrays (the loop bound previously came from X_train_orig1).
X_train2 = [
    np.concatenate((orig_vec, new_vec), axis=None)
    for orig_vec, new_vec in zip(X_train_orig2, X_train_new2)
]
X_test2 = [
    np.concatenate((orig_vec, new_vec), axis=None)
    for orig_vec, new_vec in zip(X_test_orig2, X_test_new2)
]


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


Utility Function

In [11]:
# Reduce raw predictions to a single decimal place.
def roundPred(predictions):
    """Truncate every prediction to one decimal place.

    Each value is multiplied by 10, converted with ``int`` (which drops the
    fractional part, i.e. truncates toward zero rather than rounding to the
    nearest value) and divided by 10 again.

    Args:
        predictions: Iterable of numeric predictions.

    Returns:
        A new list with one truncated float per input value.
    """
    return [int(value * 10) / 10 for value in predictions]

SVM

In [12]:
#NuSVR
# Fit one regressor per headline and collect its truncated predictions next to the
# gold grades in `sub_df`. The estimator uses scikit-learn's default hyperparameters.
nu_svr = NuSVR()
nu_svr.fit(X_train1, y_train1)

predictions1 = roundPred(nu_svr.predict(X_test1))

# Columns are declared up front to fix their order; "labelp" stays empty here and
# is filled later in the notebook.
sub_df = pd.DataFrame(columns=["id1", "pred1", "actual1","id2", "pred2", "actual2", "labelp", "label"])

# First assignment into the empty frame: the rows (and index) come from test_df.
sub_df["label"] = test_df["label"]
sub_df["id1"] = test_df["id"]
sub_df["pred1"] = predictions1
sub_df["actual1"] = test_df["meanGrade1"]
assert len(sub_df) == len(test_df)

# Refit the same estimator object on the second-headline data; from here on
# `nu_svr` holds the second fit, not the first.
nu_svr.fit(X_train2, y_train2)
predictions2 = roundPred(nu_svr.predict(X_test2))

# "id2" is filled from the same test_df["id"] column as "id1".
sub_df["id2"] = test_df["id"]
sub_df["pred2"] = predictions2
sub_df["actual2"] = test_df["meanGrade2"]
assert len(sub_df) == len(test_df)


In [13]:
def label(x):
    """Derive the class label for one row from its two predicted grades.

    The result is 0 when 'pred1' equals 'pred2', 1 when 'pred1' is greater,
    and 2 otherwise. The value is also stored under the row's 'labelp' key
    before being returned.
    """
    first, second = x['pred1'], x['pred2']
    x['labelp'] = 0 if first == second else (1 if first > second else 2)
    return x['labelp']


# Compute the predicted class for every row from its two predicted grades.
sub_df['labelp'] = sub_df.apply(label, axis=1)

# Save ids, predictions, gold grades and both labels; the index is not written.
sub_df.to_csv("task2_labeling_results.csv", index=False)

In [14]:
# Keep only the gold label and the predicted label, side by side, and save them
# without the index.
final_df = pd.DataFrame(
    {
        "actual": sub_df["label"],
        "predicted": sub_df["labelp"],
    }
)
final_df.to_csv('task2_classification_results.csv', index=False)

In [15]:
# Evaluate from the file that was just written rather than from `final_df`.
# In memory the gold "label" column is still string-typed (the CSVs were loaded
# with dtype=str) while the predicted labels are ints; reading the CSV back lets
# pandas infer one consistent dtype for both columns.
# `classification_report` is imported in the notebook's import cell.
df = pd.read_csv('task2_classification_results.csv')

gt = df["actual"]
pred = df["predicted"]
print("\nclassification_report", classification_report(gt, pred))



classification_report               precision    recall  f1-score   support

           0       0.11      0.15      0.12       256
           1       0.55      0.53      0.54      1079
           2       0.53      0.49      0.51      1020

    accuracy                           0.47      2355
   macro avg       0.40      0.39      0.39      2355
weighted avg       0.49      0.47      0.48      2355

