# COMP47670 - Assignment 2 - Text Classification
**Student Name: Meleesha Mayola Dsouza, Nikil Mohan**<br>
**Student Number: 18200024, 18200037**

In [5]:
# Import the required libraries
import os
from bs4 import BeautifulSoup
import requests
import csv
import pandas as pd
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import matplotlib
import matplotlib.pyplot as plt
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn import linear_model
from sklearn.model_selection import cross_val_score
from sklearn.svm import SVC  
from sklearn.metrics import *
%matplotlib inline

## Task 1: Selecting the review categories and scraping the data from the website

We will be performing the following steps under task 1:<br/>
Step 1: Choosing the review categories. We have chosen the following 
    <br/>&emsp;&emsp;&emsp;&emsp;_Health and Medical_
    <br/>&emsp;&emsp;&emsp;&emsp;_Automotive_<br/>
Step 2: Scraping the user reviews from the websites using the python package Beautiful Soup<br/>
Step 3: Assigning class labels as 'positive' or 'negative' based on the user provided star ratings<br/>
Step 4: Storing the user reviews in separate csv files for different review categories<br/>
Step 5: Importing from the csv files to dataframe for easy data science manipulations

In [6]:
# For specifying the website, we append the individual webpages to the base website URL
endpoint = 'http://mlg.ucd.ie/modules/yalp/'
# The categories chosen have been represented in the form of a list
categories = ['hotels_travel_list', 'automotive_list.html']

# The extract_page function is created to extract the overall data present in the webpage
def extract_page(url):
    response = requests.get(url)
    data = response.text
    # We use the BeautifulSoup package to parse the data that is collected for webscraping
    soup = BeautifulSoup(data,'html.parser')
    return soup

# The extract_reviews function is created to extract the reviews from the webpoge
def extract_reviews(url):
    soup = extract_page(url)
    # We use the 'div' as the identifier as all the reviews are present within this tag
    reviews_block = soup.findAll("div", { "class" : "review" })
    # We create a list called review_list to store all the reviews that are present in the category
    review_list =[]
    for reviews in reviews_block:
        review = {}
        # We use the 'img' as the identifier as all the ratings are present within this tag
        star = reviews.find('img')
        review["comments"] = reviews.find("p", { "class" : "text" }).get_text()
        # We use the 'alt' attribute as the identifier as the rating present in this attribute can be easily obtained
        # We use the concept of any rating having a value of 4 or 5 as Positive
        # We use the concept of any rating having a value of 1, 2 or 3 as Negative
        review["rating"] = 'positive' if int(star.get('alt').split('-')[0]) >= 4 else 'negative'
        review_list.append(review)        
    return review_list

# The extract_data function is created to extract the data from each of the review links
# This function in turn calls the extract_page and the extract_reviews
def extract_data(url):
    soup = extract_page(url)
    links = soup.find_all('a')
    review_list =[]
    for link in links:
        # We identify the links of each of the reviews by extracting the values present in the href attribute
        url = endpoint + link.get('href')
        review = extract_reviews(url)
        # We concatenate the reviews from all the links
        review_list = review_list+review
    return review_list

# The create_database function is created to write the reviews and ratings to csv files for easier processing
# The function will be called each time for each category
def create_database(category_name, reviews):
    header = reviews[0].keys()
    with open(category_name, 'w',newline='', encoding="utf-8") as output_file:
        dict_writer = csv.DictWriter(output_file, header)
        dict_writer.writeheader()
        dict_writer.writerows(reviews)

We create a database for Health and Medical reviews so that we can process them.

In [7]:
# We access the page for the reviews of the category 'Health and Medical'
# We scrape the data using the functions defined by us
# This data is stored in separate files in the csv format
url = endpoint + categories[0]
review_list = extract_data(url)
create_database('Health_Medical.csv', review_list)

We create a database for Automotive reviews so that we can process them.

In [8]:
# We access the page for the reviews of the category 'Automotive'
# We scrape the data using the functions defined by us
# This data is stored in separate files in the csv format
url = endpoint + categories[1]
review_list = extract_data(url)
create_database('Automotive.csv', review_list)

In [9]:
# The dataframe dataset_A has the data obtained from csv containing Health and Medical reviews
dataset_A = pd.read_csv('Health_Medical.csv')
# We check the dimensions of the Health and Medical review data that we have
print('Dimensions of the Health and Medical dataset: '+str(dataset_A.shape))
print('\n')
# We also check the count of reviews in the Health and Medical dataset for each label, i.e. postive or negative
print(dataset_A.groupby('rating')['rating'].count())
# A preview of the data is shown
dataset_A.head()

Dimensions of the Health and Medical dataset: (1430, 2)


rating
negative    637
positive    793
Name: rating, dtype: int64


Unnamed: 0,comments,rating
0,I have used this service before and was satisf...,negative
1,First time I tried using them the driver was n...,negative
2,Driver picked me up on time at 6AM and even kn...,positive
3,First time using this taxi and limo company as...,positive
4,Great experience. Driver was early and was ver...,positive


In [10]:
# The dataframe dataset_B has the data obtained from csv containing Automotive reviews
dataset_B = pd.read_csv('Automotive.csv')
# We check the dimensions of the Automotive review data that we have
print('Dimensions of the Automotive dataset: '+str(dataset_B.shape))
print('\n')
# We also check the count of reviews in the Automotive dataset for each label, i.e. postive or negative
print(dataset_B.groupby('rating')['rating'].count())
# A preview of the data is shown
dataset_B.head()

Dimensions of the Automotive dataset: (1455, 2)


rating
negative    482
positive    973
Name: rating, dtype: int64


Unnamed: 0,comments,rating
0,I arrived at 3 PM and the dealership closed at...,positive
1,I dropped my car off on a Wednesday morning fo...,negative
2,My parents have been buying cars off of Donna ...,positive
3,I recently bought another car from Donna Dunni...,positive
4,I had to schedule an appointment due to the ai...,positive


## Task 2: Applying the pre-processing steps and building the classifier model

To perform the preprocessing steps, we have used nltk packages. Some of the preprocessing steps done are as follows
<br/>&emsp;&emsp;&emsp;&emsp;Tokenization
<br/>&emsp;&emsp;&emsp;&emsp;Removal of punctuations
<br/>&emsp;&emsp;&emsp;&emsp;Conversion to lower case
<br/>&emsp;&emsp;&emsp;&emsp;Removal of stop words
<br/><br/>
From the reviews in this category, apply appropriate preprocessing steps to create a numeric representation of the data, suitable for classification.
Build a classification model using a classifier of your choice, to distinguish between “positive” and “negative” reviews.
Test the predictions of the classification model using an appropriate evaluation strategy. Report and discuss the evaluation results in your notebook.

We then use this preprocessed data to build the classifier models. We build a differnet classifier for each category of reviews to classify the positive and negative reviews. After building this classifier, we use an evaluation stategy to test the predictions for each of the classifiers. The evaluation strategy used in this assignment is the 10-fold cross-validation.

In [11]:
# The preprocess_reviews function is used to handle the pre-processing of all the reviews in the webpage
# In turn, it calls the preprocess_text function to pre-process the reviews one by one
def preprocess_reviews(dataset):
    for indx, comments in enumerate(dataset['comments']):
        dataset['comments'][indx] = preprocess_text(comments)

# The preprocess_text function is called to handle the preprocessing steps for each review individually
def preprocess_text(text):
    # The data that is passed as the input is first converted to lowercase to standardize it
    # We then tokenize this data into individual tokens
    tokenized_words = word_tokenize(text.lower())
    # We remove the words which are not alpha-numeric like punctuations
    normalised_words = [word for word in tokenized_words if word.isalnum()]
    # We remove the stopwords in the corpus like 'is', 'an', etc. as they do not contribute to the efficiency of the classifier
    return ' '.join([word for word in normalised_words if word.lower() not in stopwords.words('english')])

In [12]:
# We perform all the preprocessing steps on the reviews of the Health and Medical reviews
preprocess_reviews(dataset_A)
# We have displayed a few of the entries to show the difference between the data after preprocessing
# The data before preprocessing for the Health and Medical datatset has already been displayed previously
dataset_A.head()

Unnamed: 0,comments,rating
0,used service satisfied june 20 prearranged via...,negative
1,first time tried using driver nowhere found ca...,negative
2,driver picked time 6am even knocked door inste...,positive
3,first time using taxi limo company used others...,positive
4,great experience driver early friendly got ear...,positive


In [13]:
# We perform all the preprocessing steps on the reviews of the Automotive reviews
preprocess_reviews(dataset_B)
# We have displayed a few of the entries to show the difference between the data after preprocessing
# The data before preprocessing for the Automotive dataset has already been displayed previously
dataset_B.head()

Unnamed: 0,comments,rating
0,arrived 3 pm dealership closed 6 thought left ...,positive
1,dropped car wednesday morning diagnostic john ...,negative
2,parents buying cars donna dunnivan decade alwa...,positive
3,recently bought another car donna dunnivan 1 c...,positive
4,schedule appointment due airbag recall used we...,positive


### Building the classifier for the Health and Medical Review data

We first apply standard text pre-processing steps to generate the document-term matrix for the reviews of Health and Medical category. For this category of reviews, we use the Logistic Regression classifier.

In [14]:
tfidf_vectorizer = TfidfVectorizer()
X_A = tfidf_vectorizer.fit_transform(dataset_A['comments'])
# We set the target value for the classifier
target_A = dataset_A['rating']
X_A.shape

(1430, 10618)

In [15]:
# We partition the data to use for the classifier. We use a split of 70% training and 30% test data
data_A_train, data_A_test, target_A_train, target_A_test = train_test_split(X_A, target_A, test_size=0.3)
print("The training set for Health and Medical reviews has %d examples" % data_A_train.shape[0] )
print("The test set for Health and Medical reviews has %d examples" % data_A_test.shape[0] )

The training set for Health and Medical reviews has 1001 examples
The test set for Health and Medical reviews has 429 examples


In [16]:
# For the purpose of classification for the Health and Medical reviews, we use Logistic Regression
model_A = linear_model.LogisticRegression(solver='liblinear')
model_A.fit(data_A_train, target_A_train)
print(model_A)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='warn',
          n_jobs=None, penalty='l2', random_state=None, solver='liblinear',
          tol=0.0001, verbose=0, warm_start=False)


In [17]:
# After training the model, we perform the actual prediction
predicted_A = model_A.predict(data_A_test)
# We build a confusion matrix so that we can see the classifiers performancee
cm_A = confusion_matrix(target_A_test, predicted_A,labels=['positive','negative'])
print(cm_A)

[[211  16]
 [ 41 161]]


In [27]:
# We use metrics to check the performance of the Logistic regression clasifier on the Health and Medical data
CategoryA_accuracy_LogReg = accuracy_score(target_A_test, predicted_A)
print("Accuracy of the Logistic Regression classifier = %.2f" % CategoryA_accuracy_LogReg )
# We indicate that we are interested in the Positive class here, which is labelled as "positive"
print("Precision (Positive) = %.2f" % precision_score(target_A_test, predicted_A, pos_label='positive') )
print("Recall (Positive) = %.2f" % recall_score(target_A_test, predicted_A, pos_label='positive') )
print("F1 (Positive) = %.2f" % f1_score(target_A_test, predicted_A, pos_label='positive') )

Accuracy of the Logistic Regression classifier = 0.87
Precision (Positive) = 0.84
Recall (Positive) = 0.93
F1 (Positive) = 0.88


In [19]:
# A summary of the statisitcs is geenrated using scikit-learn's built in methods
print(classification_report(target_A_test, predicted_A, target_names=["negative","positive"]))

              precision    recall  f1-score   support

    negative       0.91      0.80      0.85       202
    positive       0.84      0.93      0.88       227

   micro avg       0.87      0.87      0.87       429
   macro avg       0.87      0.86      0.87       429
weighted avg       0.87      0.87      0.87       429



A better evaluation mechanism would be to use the k-fold cross-validation approach. In the standard split, we are ignoring a part of the dataset. In this cross validation process, every portion of the data has the ability to be part of the training set at one point.

In [20]:
# Cross validation for the Logistic Regression approach used for classifying Health and Medical reviews
acc_scores_A =  cross_val_score(model_A, X_A, target_A, cv=10, scoring="accuracy")
print("Logistic Regression classifier for Health and Medical reviews: Mean cross-validation accuracy = %.2f" % acc_scores_A.mean() )

Logistic Regression classifier for Health and Medical reviews: Mean cross-validation accuracy = 0.87


### Building the classifier for the Automotive Review data

We apply standard text pre-processing steps to generate the document-term matrix for the reviews of Automotive category.  For this category of reviews, we use the K-Nearest Neighbor (KNN) classifier.

In [21]:
tfidf_vectorizer = TfidfVectorizer()
X_B = tfidf_vectorizer.fit_transform(dataset_B['comments'])
# We set the target value for the classifier
target_B = dataset_B['rating']
X_B.shape

(1455, 7843)

In [22]:
# We partition the data to use for the classifier. We use a split of 70% training and 30% test data
data_B_train, data_B_test, target_B_train, target_B_test = train_test_split(X_B, target_B, test_size=0.3)
print("The training set for Automotive reviews has %d examples" % data_B_train.shape[0] )
print("The test set for Automotive reviews has %d examples" % data_B_test.shape[0] )

The training set for Automotive reviews has 1018 examples
The test set for Automotive reviews has 437 examples


In [23]:
# For the purpose of classification for the Automotive reviews, we use KNN where n=3
model_B = KNeighborsClassifier(n_neighbors=3)
model_B.fit(data_B_train, target_B_train)
print(model_B)

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=None, n_neighbors=3, p=2,
           weights='uniform')


In [24]:
# After training the model, we perform the actual prediction
predicted_B = model_B.predict(data_B_test)
# We build a confusion matrix so that we can see the classifiers performancee
cm_B = confusion_matrix(target_B_test, predicted_B,labels=['positive','negative'])
print(cm_B)

[[256  31]
 [ 60  90]]


In [28]:
# We use metrics to check the performance of the KNN clasifier on the Automotive data
CategoryB_accuracy_KNN = accuracy_score(target_B_test, predicted_B)
print("Accuracy of the KNN classifier = %.2f" % CategoryB_accuracy_KNN )
# We indicate that we are interested in the Positive class here, which is labelled as "positive"
print("Precision (Positive) = %.2f" % precision_score(target_B_test, predicted_B, pos_label='positive') )
print("Recall (Positive) = %.2f" % recall_score(target_B_test, predicted_B, pos_label='positive') )
print("F1 (Positive) = %.2f" % f1_score(target_B_test, predicted_B, pos_label='positive') )

Accuracy of the KNN classifier = 0.79
Precision (Positive) = 0.81
Recall (Positive) = 0.89
F1 (Positive) = 0.85


A better evaluation mechanism would be to use the k-fold cross-validation approach. In the standard split, we are ignoring a part of the dataset. In this cross validation process, every portion of the data has the ability to be part of the training set at one point.

In [26]:
# Cross validation for the KNN approach used for classifying Automotive reviews
acc_scores_B =  cross_val_score(model_B, X_B, target_B, cv=10, scoring="accuracy")
print("KNN classifier for Automotive reviews: Mean cross-validation accuracy = %.2f" % acc_scores_B.mean() )

KNN classifier for Automotive reviews: Mean cross-validation accuracy = 0.73


## Discussion of results and evaluation strategy

## Task 3: Performance of classification models across categories

In this section, we will use the classification model trained on some data from one category to test it with the remaining data from the same category, as well as, test it with the entire data from the other category. We have already trained two different classifiers on the two categories of reviews chosen. The two classifiers used in the previous task will be used to check the performance across categories.

For the Health and Medical review category, we have chosen Linear Regression Classifier with a train-test split of 30% as test data.
Using this classifier model on the test data of the Health and Medical Category itself, we have an accuracy of $$$$$$$$
We will now use this classifier model to test the data of the Automotive Category



In [49]:
dataset_C=pd.concat([dataset_A,dataset_B])

In [51]:
tfidf_vectorizer = TfidfVectorizer()
X_C = tfidf_vectorizer.fit_transform(dataset_C['comments'])
# We set the target value for the classifier
target_C = dataset_C['rating']
X_C.shape

(2885, 13690)

In [58]:
X_category_A = X_C[:1430]
y_category_A = target_C[:1430]
X_category_B = X_C[1431:]
y_category_B = target_C[1431:]

In [60]:
# We partition the data to use for the classifier. We use a split of 70% training and 30% test data
X_train_category_A, X_test_category_A, y_train_category_A, y_test_category_A = train_test_split(X_category_A, 
                                                                                                y_category_A, 
                                                                                                test_size=0.3)
print("The training set for Health and Medical reviews has %d examples" % X_train_category_A.shape[0] )
print("The test set for Health and Medical reviews has %d examples" % X_test_category_A.shape[0] )

The training set for Health and Medical reviews has 1001 examples
The test set for Health and Medical reviews has 429 examples


In [61]:
# For the purpose of classification for the Health and Medical reviews, we use Logistic Regression
logreg_A = linear_model.LogisticRegression(solver='liblinear')
logreg_A.fit(X_train_category_A, y_train_category_A)
print(logreg_A)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='warn',
          n_jobs=None, penalty='l2', random_state=None, solver='liblinear',
          tol=0.0001, verbose=0, warm_start=False)


In [62]:
# After training the model, we perform the actual prediction
Prediction_same_LogReg = logreg_A.predict(X_test_category_A)
# We build a confusion matrix so that we can see the classifiers performancee
cm_A = confusion_matrix(y_test_category_A, Prediction_same_LogReg,labels=['positive','negative'])
print(cm_A)

[[230  12]
 [ 39 148]]


In [63]:
# We use metrics to check the performance of the Logistic regression clasifier on the Health and Medical data
accuracy_same_LogReg = accuracy_score(y_test_category_A, Prediction_same_LogReg)
print("Accuracy of the Logistic Regression classifier = %.2f" % accuracy_same_LogReg )
# We indicate that we are interested in the Positive class here, which is labelled as "positive"
print("Precision (Positive) = %.2f" % precision_score(y_test_category_A, Prediction_same_LogReg, pos_label='positive') )
print("Recall (Positive) = %.2f" % recall_score(y_test_category_A, Prediction_same_LogReg, pos_label='positive') )
print("F1 (Positive) = %.2f" % f1_score(y_test_category_A, Prediction_same_LogReg, pos_label='positive') )

Accuracy of the Logistic Regression classifier = 0.88
Precision (Positive) = 0.86
Recall (Positive) = 0.95
F1 (Positive) = 0.90


In [64]:
# A summary of the statisitcs is generated using scikit-learn's built in methods
print(classification_report(y_test_category_A, Prediction_same_LogReg, target_names=["negative","positive"]))

              precision    recall  f1-score   support

    negative       0.93      0.79      0.85       187
    positive       0.86      0.95      0.90       242

   micro avg       0.88      0.88      0.88       429
   macro avg       0.89      0.87      0.88       429
weighted avg       0.89      0.88      0.88       429



In [69]:
# After training the model, we perform the actual prediction
Prediction_diff_LogReg = logreg_A.predict(X_category_B)
# We build a confusion matrix so that we can see the classifiers performancee
cm_A = confusion_matrix(y_test_category_A, Prediction_same_LogReg,labels=['positive','negative'])
print(cm_A)

[[230  12]
 [ 39 148]]


In [71]:
# We use metrics to check the performance of the Logistic regression clasifier on the Automotive data
accuracy_diff_LogReg = accuracy_score(y_category_B, Prediction_diff_LogReg)
print("Accuracy of the Logistic Regression classifier = %.2f" % accuracy_diff_LogReg )
# We indicate that we are interested in the Positive class here, which is labelled as "positive"
print("Precision (Positive) = %.2f" % precision_score(y_category_B, Prediction_diff_LogReg, pos_label='positive') )
print("Recall (Positive) = %.2f" % recall_score(y_category_B, Prediction_diff_LogReg, pos_label='positive') )
print("F1 (Positive) = %.2f" % f1_score(y_category_B, Prediction_diff_LogReg, pos_label='positive') )

Accuracy of the Logistic Regression classifier = 0.84
Precision (Positive) = 0.96
Recall (Positive) = 0.79
F1 (Positive) = 0.87




For the Automotive review category, we have chosen K-Nearest Neighbor Classifier with K=3 and a train-test split of 30% as test data.
Using this classifier model on the test data of the Health and Medical Category, we have an accuracy of $$$$$$$$