# COMP47670 - Assignment 2 - Text Classification
**Student Name: Meleesha Mayola Dsouza, Nikil Mohan**<br>
**Student Number: 18200024, 18200037**

In [1]:
# Import the required libraries
import os
from bs4 import BeautifulSoup
import requests
import csv
import pandas as pd
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import matplotlib
import matplotlib.pyplot as plt
%matplotlib inline

## Task 1: Selecting the review categories and scraping the data

For the purpose of this assignment, we have chosen the following categories: 
1. Health and Medical
2. Automotive


In [5]:
# Base URL of the review website; the name of an individual webpage is appended to it
endpoint = 'http://mlg.ucd.ie/modules/yalp/'
# Index pages of the chosen categories: [0] Health and Medical, [1] Automotive
categories = [
    'health_medical_list.html',
    'automotive_list.html',
]

# The extract_page function downloads a webpage and returns its parsed HTML
def extract_page(url, timeout=30):
    """Download ``url`` and parse the response body as HTML.

    Parameters
    ----------
    url : str
        Address of the page to fetch.
    timeout : float, optional
        Seconds to wait for the server before giving up (default 30).

    Returns
    -------
    BeautifulSoup
        The response text parsed with the 'html.parser' backend.

    Raises
    ------
    requests.exceptions.RequestException
        If the request fails, times out, or the server answers with an
        HTTP error status.
    """
    # Without a timeout, requests.get can wait indefinitely on a stalled server
    response = requests.get(url, timeout=timeout)
    # Fail loudly on 4xx/5xx instead of silently parsing an error page
    response.raise_for_status()
    # We use the BeautifulSoup package to parse the data that is collected for webscraping
    return BeautifulSoup(response.text, 'html.parser')

# The extract_reviews function is created to extract the reviews from the webpage
def extract_reviews(url):
    """Scrape every review found on the page at ``url``.

    Parameters
    ----------
    url : str
        Address of a page containing ``<div class="review">`` blocks.

    Returns
    -------
    list of dict
        One dict per review with the keys 'comments' (the review text) and
        'rating' ('positive' for a rating of 4 or 5, 'negative' for 1, 2 or 3).
        Review blocks without a text paragraph, without a rating image, or
        with a rating that cannot be parsed as an integer are skipped.
    """
    soup = extract_page(url)
    # We create a list called review_list to store all the reviews that are present in the category
    review_list = []
    # We use the 'div' as the identifier as all the reviews are present within this tag
    for block in soup.find_all("div", {"class": "review"}):
        text_tag = block.find("p", {"class": "text"})
        # We use the 'img' as the identifier as all the ratings are present within this tag
        star_tag = block.find("img")
        if text_tag is None or star_tag is None:
            # Malformed review block: nothing usable to record
            continue
        try:
            # The 'alt' attribute starts with the numeric rating, followed by '-'
            stars = int((star_tag.get("alt") or "").split("-")[0])
        except ValueError:
            # Missing or non-numeric rating
            continue
        review_list.append({
            "comments": text_tag.get_text(),
            "rating": "positive" if stars >= 4 else "negative",
        })
    return review_list

# The extract_data function is created to extract the data from each of the review links
# This function in turn calls the extract_page and the extract_reviews
def extract_data(url):
    """Collect the reviews from every page linked on the category page ``url``.

    Parameters
    ----------
    url : str
        Address of a category index page whose anchors point to review pages.

    Returns
    -------
    list of dict
        The concatenation, in link order, of the lists returned by
        extract_reviews for each linked page.
    """
    soup = extract_page(url)
    review_list = []
    # href=True keeps only the anchors that actually carry an href attribute,
    # so a bare <a> tag cannot break the URL concatenation below
    for link in soup.find_all('a', href=True):
        # The href values are appended to the base website URL to get each review page
        page_url = endpoint + link['href']
        # extend() grows the list in place instead of rebuilding it for every link
        review_list.extend(extract_reviews(page_url))
    return review_list

# The create_database function is created to write the reviews and ratings to csv files for easier processing
# The function will be called each time for each category
def create_database(category_name, reviews):
    """Write ``reviews`` to the csv file ``category_name``.

    Parameters
    ----------
    category_name : str
        Path of the csv file to create (an existing file is overwritten).
    reviews : list of dict
        Rows to write; the keys of the first dict become the header row.

    Raises
    ------
    ValueError
        If ``reviews`` is empty, because no header can be derived. The file
        is not created or modified in that case.
    """
    if not reviews:
        raise ValueError("no reviews to write to {!r}".format(category_name))
    header = list(reviews[0].keys())
    # newline='' lets the csv module control the line endings itself
    with open(category_name, 'w', newline='', encoding="utf-8") as output_file:
        dict_writer = csv.DictWriter(output_file, fieldnames=header)
        dict_writer.writeheader()
        dict_writer.writerows(reviews)


Create the database for category A: Health and Medical reviews

In [6]:
# Build the address of the 'Health and Medical' category page,
# scrape every review linked from it with the functions defined above,
# and store the result in its own csv file
url = ''.join([endpoint, categories[0]])
review_list = extract_data(url=url)
create_database(category_name='Health_Medical.csv', reviews=review_list)

Create the database for category B: Automotive reviews

In [7]:
# Build the address of the 'Automotive' category page,
# scrape every review linked from it with the functions defined above,
# and store the result in its own csv file
url = ''.join([endpoint, categories[1]])
review_list = extract_data(url=url)
create_database(category_name='Automotive.csv', reviews=review_list)

In [14]:
# dataset_A holds the 'Health and Medical' reviews read back from the csv file
dataset_A = pd.read_csv(filepath_or_buffer='Health_Medical.csv')
# Print the (rows, columns) shape, then preview the first five reviews
print(dataset_A.shape)
dataset_A.head(n=5)

(1450, 2)


Unnamed: 0,comments,rating
0,I have so many good things to say about this p...,positive
1,I found them to be highly skilled and an exper...,positive
2,Where do I even begin? This office has been so...,positive
3,I went in because I had toothache and needed a...,positive
4,Found a new dental office. This place is amazi...,positive


In [15]:
# dataset_B holds the 'Automotive' reviews read back from the csv file
dataset_B = pd.read_csv(filepath_or_buffer='Automotive.csv')
# Print the (rows, columns) shape, then preview the first five reviews
print(dataset_B.shape)
dataset_B.head(n=5)

(1455, 2)


Unnamed: 0,comments,rating
0,I arrived at 3 PM and the dealership closed at...,positive
1,I dropped my car off on a Wednesday morning fo...,negative
2,My parents have been buying cars off of Donna ...,positive
3,I recently bought another car from Donna Dunni...,positive
4,I had to schedule an appointment due to the ai...,positive


In [None]:
# Filled on first use so the NLTK stop-word list is fetched once, not once per token
_ENGLISH_STOPWORDS = None


def _english_stopwords():
    """Return the NLTK English stop words as a frozenset, loading them on first call."""
    global _ENGLISH_STOPWORDS
    if _ENGLISH_STOPWORDS is None:
        _ENGLISH_STOPWORDS = frozenset(stopwords.words('english'))
    return _ENGLISH_STOPWORDS


def preprocess_text(text):
    """Tokenize ``text`` and drop non-alphabetic tokens and English stop words.

    Parameters
    ----------
    text : str
        Raw review text. Any non-string value (for example the NaN that
        pandas produces for an empty csv field) is treated as empty text.

    Returns
    -------
    str
        The surviving tokens joined by single spaces. Tokens keep their
        original casing; only the stop-word comparison is case-insensitive.
    """
    if not isinstance(text, str):
        return ''
    # tokenizes the text into individual tokens
    tokenized_words = word_tokenize(text)
    stop_words = _english_stopwords()
    # isalpha() keeps purely alphabetic tokens, which removes punctuation
    # as well as numbers and mixed tokens such as "n't"
    return ' '.join(
        word
        for word in tokenized_words
        if word.isalpha() and word.lower() not in stop_words
    )
    

Task 2: Cross-validation

Task 3: Hold-out testing, i.e. train on data from one category and test on data from the other.

In [None]:
def preprocess_reviews(dataset):
    """Replace every entry of ``dataset['comments']`` with its preprocessed text.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Frame with a 'comments' column of review texts. The column is
        overwritten on the frame that is passed in; nothing is returned.
    """
    # Assign the whole column in one step. Element-wise
    # dataset['comments'][i] = ... is chained assignment: pandas may write to a
    # temporary copy (SettingWithCopyWarning), and it only works when the
    # index labels are exactly 0..n-1.
    dataset['comments'] = dataset['comments'].map(preprocess_text)

In [None]:
# Run the text preprocessing over the 'comments' column of dataset_A (Health and Medical reviews)
preprocess_reviews(dataset_A)

In [None]:
# Preview the first rows of dataset_A to inspect the comments after preprocessing
dataset_A.head()

In [None]:
# Run the text preprocessing over the 'comments' column of dataset_B (Automotive reviews)
preprocess_reviews(dataset_B)

In [None]:
# Preview the first rows of dataset_B to inspect the comments after preprocessing
dataset_B.head()