In [3]:
import pandas as pd
import torch
import re
import gzip
import json
import csv
from transformers import pipeline

# testing out using zip file from https://mcauleylab.ucsd.edu/public_datasets/gdrive/googlelocal/#subsets

path_zip = "../../data/review-Alabama_10.json.gz"
output_file = "classified_reviews.csv"
# test out with the zip file and write the output to a csv file

max_reviews = 10  # number of reviews to process, I am doing only 10 for testing
processed = 0

classifier= pipeline("zero-shot-classification", model="facebook/bart-large-mnli")
labels = ["relevant", "advertisement", "spam", "rant"] 

with open(output_file, "w", newline='', encoding='utf-8') as out_csv:
    writer = csv.writer(out_csv)
    writer.writerow(["text", "label", "score"])

    with gzip.open(path_zip, 'rt', encoding='utf-8') as f:
        for i, line in enumerate(f):
            review = json.loads(line)
            text = review.get('text', '')
            if not text or text.lower() in ['null', 'none']:
                continue

            text = re.sub(r'\s+', ' ', text).strip()

            result = classifier(text, candidate_labels=labels)
            best_index = result['scores'].index(max(result['scores']))
            best_label = result['labels'][best_index]
            best_score = result['scores'][best_index]

            writer.writerow([text, best_label, best_score])

            if i < 10:
                print(f"Text: {text[:200]}...\nLabel: {best_label}, Score: {best_score:.4f}\n")

            processed += 1
            if processed >= max_reviews:
                break

Device set to use cpu


Text: Very Personable staff! Beautiful and clean environment. I will definitely become a regular customer!!...
Label: relevant, Score: 0.9147

Text: Best clothing intown...
Label: relevant, Score: 0.9260

Text: Not friendly at all, as I ask questions about a seat it was like I was nothing more than a bother.... I am all for supporting local businesses. However, the attitude did it for me on this one!...
Label: relevant, Score: 0.8970

Text: They have beautiful baby and children's clothing, shoes, cloth diapers, jewelry including amber necklaces. Cribs and bedding. Hair bows galore! I love going in to shop for my granddaughter occasionall...
Label: relevant, Score: 0.7574

Text: Cute shop, but the lack of boy clothes is sad. There were multiple racks of girl clothes and maybe 2 racks of boy clothes. Check out All About Baby in Huntsville instead, many more boy clothing option...
Label: relevant, Score: 0.8216

Text: Great local shop for all your baby needs. The staff is friendly and ver

In [6]:
import pandas as pd
import torch
import re
import gzip
import json
import csv
from transformers import pipeline

path_zip = "../../data/review-Alabama_10.json.gz"
output_file = "classified_reviews.csv"

max_reviews = 10
processed = 0

classifier = pipeline("zero-shot-classification", model="facebook/bart-large-mnli")
labels = ["relevant", "advertisement", "spam", "rant"]

# Sentiment classifier to check if rating matches with the review
sentiment_classifier = pipeline("sentiment-analysis")  # fine-tuned for positive/negative sentiment

with open(output_file, "w", newline='', encoding='utf-8') as out_csv:
    writer = csv.writer(out_csv)
    writer.writerow(["text", "rating", "label", "score", "rating_flag", "final_prediction"])

    with gzip.open(path_zip, 'rt', encoding='utf-8') as f:
        for line in f:
            review = json.loads(line)
            text = review.get('text', '')

            if not text or text.lower() in ['null', 'none']:
                continue

            text = re.sub(r'\s+', ' ', text).strip()
            rating = review.get('rating', None)

            # main classification
            result = classifier(text, candidate_labels=labels)
            best_index = result['scores'].index(max(result['scores']))
            best_label = result['labels'][best_index]
            best_score = result['scores'][best_index]

            # sentiment analysis
            sentiment_result = sentiment_classifier(text)[0]  # returns dict with 'label' and 'score'
            best_sentiment = sentiment_result['label'].lower()  # 'positive' or 'negative'

            # determine rating_flag
            rating_flag = False
            if rating is not None:
                if rating >= 4 and best_sentiment != "positive":
                    rating_flag = True
                elif rating <= 2 and best_sentiment != "negative":
                    rating_flag = True
            # if rating doesnt match with review, then rating_flag is True

            if best_label == "relevant" and not rating_flag:
                final_prediction = "relevant"
            else:
                final_prediction = "irrelevant"

            writer.writerow([text, rating, best_label, best_score, rating_flag, final_prediction])

            processed += 1
            if processed >= max_reviews:
                break

Device set to use cpu
No model was supplied, defaulted to distilbert/distilbert-base-uncased-finetuned-sst-2-english and revision 714eb0f (https://huggingface.co/distilbert/distilbert-base-uncased-finetuned-sst-2-english).
Using a pipeline without specifying a model name and revision in production is not recommended.
Device set to use cpu
