In [1]:
# Install and import necessary libraries
!pip install accelerate datasets emoji pandas sklearn torch torchvision transformers xformers
from datasets import load_dataset
from functools import partial
from multiprocessing import Pool
from sklearn.model_selection import train_test_split
from transformers import pipeline, AutoModelForCausalLM, AutoTokenizer, Trainer, TrainingArguments, TextDataset, DataCollatorForLanguageModeling
import multiprocessing as mp
import os
import pandas as pd
import torch

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting accelerate
  Using cached accelerate-0.20.3-py3-none-any.whl (227 kB)
Collecting datasets
  Using cached datasets-2.13.1-py3-none-any.whl (486 kB)
Collecting emoji
  Using cached emoji-2.5.1.tar.gz (356 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting sklearn
  Using cached sklearn-0.0.post5.tar.gz (3.7 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting transformers
  Downloading transformers-4.30.2-py3-none-any.whl (7.2 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.2/7.2 MB[0m [31m98.9 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting xformers
  Downloading xformers-0.0.20-cp310-cp310-manylinux2014_x86_64.whl (109.1 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m109.1/109.1 MB[0m [31m2.2 MB/s[0m eta [36m0:00:00[0m
Collecting dill<0.3.7,>=0.3.0 (from datasets)
  Downloading dill-0.3.6-py3-none-a

In [2]:
# Set device
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

In [3]:
# Load and process data
def load_and_process_data():
    if os.path.exists("bad_reviews.csv"):
       bad_reviews = pd.read_csv("bad_reviews.csv")
    else:
      dataset = load_dataset("amazon_us_reviews", "Digital_Video_Games_v1_00")
      data_df = pd.DataFrame.from_dict(dataset['train'])
      bad_reviews = data_df.loc[(data_df["star_rating"] == 1) & (data_df['review_body'].str.len() < 128)].copy()
      if bad_reviews.empty:
          print("No reviews meet the criteria. Please check your filtering process.")
          return None
      bad_reviews.to_csv("bad_reviews.csv")
      return bad_reviews

In [4]:
# Sentiment analysis
def sentiment_analysis(bad_reviews):
    # Check if cached results exist
    if os.path.exists("sentiment_analysis_results.csv"):
        # Load cached results
        bad_reviews = pd.read_csv("sentiment_analysis_results.csv")
    else:
        # Perform sentiment analysis
        model_name = "finiteautomata/bertweet-base-sentiment-analysis"
        sentiment_pipeline = pipeline("text-classification", model=model_name, device=device)

        sentiments = sentiment_pipeline(bad_reviews['review_body'].tolist())

        sentiments_df = pd.DataFrame(sentiments)
        bad_reviews.reset_index(inplace=True, drop=True)
        sentiments_df.reset_index(inplace=True, drop=True)
        # Add sentiment label and score to bad_reviews DataFrame
        bad_reviews['sentiment_label'] = sentiments_df['label']
        bad_reviews['sentiment_score'] = sentiments_df['score']

        # Filter bad_reviews based on sentiment label and score
        bad_reviews = bad_reviews.loc[(bad_reviews['sentiment_label'].isin(['NEGATIVE', 'NEU'])) & (bad_reviews['sentiment_score'] > 0.7)]

        # Cache results
        bad_reviews.to_csv("sentiment_analysis_results.csv")

        return bad_reviews

In [5]:
# Save and load data
def save_and_load_data(bad_reviews):
    bad_reviews.to_csv("cleaner_data.csv")
    cleaned_data = pd.read_csv("cleaner_data.csv")
    return cleaned_data

In [6]:
def load_and_train_model(cleaned_data):
    if cleaned_data.empty:
        print("The cleaned data is empty. Please check your data cleaning process.")
        return None
    else:
        tokenizer = AutoTokenizer.from_pretrained("gpt2")
        tokenizer.add_special_tokens({'pad_token': '[PAD]'})

        # Check if trained model file exists
        if os.path.exists("trained_model.pt"):
            # Load trained model from file
            model = AutoModelForCausalLM.from_pretrained("trained_model.pt")
        else:
            # Train model
            model = AutoModelForCausalLM.from_pretrained("gpt2")

            # Split data into training and test sets
            train, test = train_test_split(cleaned_data.review_body, test_size = 0.15)
            train.to_csv("train.csv")
            test.to_csv("test.csv")

            # Load datasets
            train_dataset = TextDataset(tokenizer=tokenizer, file_path="train.csv", block_size=128)
            test_dataset = TextDataset(tokenizer=tokenizer, file_path="test.csv", block_size=128)
            data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

            # Define training arguments
            training_args = TrainingArguments(
                output_dir="logs",
                overwrite_output_dir=True,
                num_train_epochs=3,
                per_device_train_batch_size=32,
                per_device_eval_batch_size=64,
                eval_steps = 400,
                save_steps=800,
                warmup_steps=500,
                prediction_loss_only=True,
            )

            # Train the model
            trainer = Trainer(
                model=model,
                args=training_args,
                data_collator=data_collator,
                train_dataset=train_dataset,
                eval_dataset=test_dataset,
            )
            trainer.train()
            trainer.save_model("trained_model.pt")

        return model

In [7]:
# Generate text
def generate_text(model):
    if model is not None:
        tokenizer = AutoTokenizer.from_pretrained("gpt2")
        generator = pipeline('text-generation', model=model.to(device), tokenizer=tokenizer, device=device.index)

        prompt1 = "I HATE MY JOB"
        prompt2 = "Nobody understands me"

        print(generator(prompt1, max_length=150, num_return_sequences=3))
        print(generator(prompt2, max_length=150))

In [8]:
bad_reviews = load_and_process_data()
bad_reviews = sentiment_analysis(bad_reviews)
if bad_reviews is not None:
    cleaned_data = save_and_load_data(bad_reviews)
    model = load_and_train_model(cleaned_data)
    generate_text(model)

Downloading builder script: 0.00B [00:00, ?B/s]

Downloading metadata: 0.00B [00:00, ?B/s]

Downloading readme: 0.00B [00:00, ?B/s]

Downloading and preparing dataset amazon_us_reviews/Digital_Video_Games_v1_00 to /root/.cache/huggingface/datasets/amazon_us_reviews/Digital_Video_Games_v1_00/0.1.0/17b2481be59723469538adeb8fd0a68b0ba363bbbdd71090e72c325ee6c7e563...


Downloading data:   0%|          | 0.00/27.4M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/145431 [00:00<?, ? examples/s]

Dataset amazon_us_reviews downloaded and prepared to /root/.cache/huggingface/datasets/amazon_us_reviews/Digital_Video_Games_v1_00/0.1.0/17b2481be59723469538adeb8fd0a68b0ba363bbbdd71090e72c325ee6c7e563. Subsequent calls will reuse this data.


  0%|          | 0/1 [00:00<?, ?it/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/949 [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/540M [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/338 [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt: 0.00B [00:00, ?B/s]

Downloading (…)solve/main/bpe.codes: 0.00B [00:00, ?B/s]

Downloading (…)in/added_tokens.json:   0%|          | 0.00/22.0 [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/167 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/665 [00:00<?, ?B/s]

Downloading (…)olve/main/vocab.json: 0.00B [00:00, ?B/s]

Downloading (…)olve/main/merges.txt: 0.00B [00:00, ?B/s]

Downloading (…)/main/tokenizer.json: 0.00B [00:00, ?B/s]

Downloading model.safetensors:   0%|          | 0.00/548M [00:00<?, ?B/s]

Downloading (…)neration_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

Token indices sequence length is longer than the specified maximum sequence length for this model (5222 > 1024). Running this sequence through the model will result in indexing errors


Step,Training Loss


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


[{'generated_text': 'I HATE MY JOB!\n\n"But why?"\n\n"Because I get your vote. I was just so\n\nconfusably wrong."\n"Oh that\'s you! That\'d be me!"\n\n"Makes no sense!"\n\n"H-HA! It\'s a\n\n\nstatement-that\'s a\n\n\nmistake and is all\n\nbecause you do what\n\nI have to tell her\n\ndoesn\'t.\n\n"Why? Why doesn\'t you\n\nget\nthat? Where did you vote\n\nnow? Where have you voted,\n\nwhere? You didn\'t pay your\n\nchecks!"\n\n"Yes. I\'ve paid\n\nthe checks'}, {'generated_text': 'I HATE MY JOB: The Hateful Eight (2007) [audio]\n\n[video]\n\n[audio]\n\n[audio]\n\n\n[audio]\n[audio]]\n\n[audio][/url]\n\n[link]'}, {'generated_text': 'I HATE MY JOB HAWKES JORDYS JOB JOBJAS JJOBJAS JOBB JOBJAS JOBJASJBJOBJADARJAS JOBJACJJABJABJJABACJJAJASJABJACIJJABABJAASJABJACJJAACJABJAACJAAJABJAABJACJJABJAJACCJJABJAACJACJAATJABJAJACJACIJJJAJABJACJAAS JJACJJAJJABJAJACJJSAJACJAJAADJAIJJOJOIJJAJ'}]
[{'generated_text': "Nobody understands me! But I can't help but look at you, the one that is the one that made 