In [2]:
# Install and import necessary libraries
!pip install accelerate datasets emoji pandas sklearn torch torchvision transformers xformers
from datasets import load_dataset
from functools import partial
from multiprocessing import Pool
from sklearn.model_selection import train_test_split
from transformers import pipeline, AutoModelForCausalLM, AutoTokenizer, Trainer, TrainingArguments, TextDataset, DataCollatorForLanguageModeling
import multiprocessing as mp
import os
import pandas as pd
import torch

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting accelerate
  Using cached accelerate-0.20.3-py3-none-any.whl (227 kB)
Collecting datasets
  Using cached datasets-2.13.1-py3-none-any.whl (486 kB)
Collecting emoji
  Using cached emoji-2.5.1.tar.gz (356 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting sklearn
  Using cached sklearn-0.0.post5.tar.gz (3.7 kB)
  [1;31merror[0m: [1msubprocess-exited-with-error[0m
  
  [31m×[0m [32mpython setup.py egg_info[0m did not run successfully.
  [31m│[0m exit code: [1;36m1[0m
  [31m╰─>[0m See above for output.
  
  [1;35mnote[0m: This error originates from a subprocess, and is likely not a problem with pip.
  Preparing metadata (setup.py) ... [?25l[?25herror
[1;31merror[0m: [1mmetadata-generation-failed[0m

[31m×[0m Encountered error while generating package metadata.
[31m╰─>[0m See above for output.

[1;35mnote[0m: This is an issue with the pac

ModuleNotFoundError: ignored

In [None]:
# Select the compute device: the first CUDA GPU when one is available,
# otherwise the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda:0")
else:
    device = torch.device("cpu")

In [None]:
# Load and process data
def load_and_process_data():
    """Return the short one-star reviews, using a CSV cache when one exists.

    If ``bad_reviews.csv`` exists in the working directory it is read and
    returned. Otherwise the ``Digital_Video_Games_v1_00`` configuration of
    the ``amazon_us_reviews`` dataset is loaded, the rows of its ``train``
    split with ``star_rating == 1`` and a ``review_body`` shorter than 128
    characters are kept, and the result is written to ``bad_reviews.csv``.

    Returns:
        pandas.DataFrame | None: The filtered reviews, or ``None`` when the
        freshly loaded dataset has no row matching the filter.
    """
    cache_path = "bad_reviews.csv"
    if os.path.exists(cache_path):
        # Return on the cache-hit path as well; previously the only return
        # statement lived in the other branch, so a cache hit yielded None.
        return pd.read_csv(cache_path)

    dataset = load_dataset("amazon_us_reviews", "Digital_Video_Games_v1_00")
    data_df = pd.DataFrame.from_dict(dataset['train'])
    is_short_one_star = (data_df["star_rating"] == 1) & (data_df['review_body'].str.len() < 128)
    bad_reviews = data_df.loc[is_short_one_star].copy()
    if bad_reviews.empty:
        print("No reviews meet the criteria. Please check your filtering process.")
        return None

    # index=False keeps the cached file's columns identical to the in-memory
    # frame (no extra "Unnamed: 0" column when the cache is read back).
    bad_reviews.to_csv(cache_path, index=False)
    return bad_reviews

In [None]:
# Sentiment analysis
def sentiment_analysis(bad_reviews):
    """Score reviews with a sentiment classifier and keep the confident non-positive ones.

    If ``sentiment_analysis_results.csv`` exists it is read and returned and
    ``bad_reviews`` is ignored. Otherwise every ``review_body`` is classified
    with ``finiteautomata/bertweet-base-sentiment-analysis``, the predicted
    label and score are attached as ``sentiment_label`` / ``sentiment_score``,
    rows labelled ``NEG`` or ``NEU`` with a score above 0.7 are kept, and the
    result is written to ``sentiment_analysis_results.csv``.

    Args:
        bad_reviews: DataFrame with a ``review_body`` column, or ``None``.
            The caller's frame is not modified.

    Returns:
        pandas.DataFrame | None: The filtered, scored reviews. When there is
        no cache and the input is ``None`` or empty, the input is returned
        unchanged and nothing is cached.
    """
    cache_path = "sentiment_analysis_results.csv"
    # Check if cached results exist
    if os.path.exists(cache_path):
        # Return here too; previously the only return statement sat in the
        # other branch, so a cache hit produced None.
        return pd.read_csv(cache_path)

    # Nothing to classify (the loader returns None when no review matched).
    if bad_reviews is None or bad_reviews.empty:
        return bad_reviews

    # Work on a fresh frame so the caller's DataFrame is left untouched.
    # Missing bodies (e.g. empty strings read back from CSV as NaN) cannot
    # be fed to the tokenizer, so they are dropped first.
    scored = bad_reviews.dropna(subset=['review_body']).reset_index(drop=True)
    if scored.empty:
        return scored

    # Perform sentiment analysis
    model_name = "finiteautomata/bertweet-base-sentiment-analysis"
    sentiment_pipeline = pipeline("text-classification", model=model_name, device=device)
    sentiments_df = pd.DataFrame(sentiment_pipeline(scored['review_body'].astype(str).tolist()))

    # Both frames carry a fresh 0..n-1 index, so the assignment aligns row by row.
    scored['sentiment_label'] = sentiments_df['label']
    scored['sentiment_score'] = sentiments_df['score']

    # This model's labels are POS / NEG / NEU; the previous 'NEGATIVE'
    # literal never matched, which silently discarded every negative review.
    is_confident_non_positive = scored['sentiment_label'].isin(['NEG', 'NEU']) & (scored['sentiment_score'] > 0.7)
    filtered = scored.loc[is_confident_non_positive]

    # Cache results (without the index, so a reload has the same columns)
    filtered.to_csv(cache_path, index=False)
    return filtered

In [None]:
# Save and load data
def save_and_load_data(bad_reviews):
    """Write the reviews to ``cleaner_data.csv`` and return them re-read from that file.

    Args:
        bad_reviews: DataFrame to persist.

    Returns:
        pandas.DataFrame: The contents of ``cleaner_data.csv`` as parsed by
        ``pandas.read_csv``, with the same columns as ``bad_reviews``.
    """
    output_path = "cleaner_data.csv"
    # index=False: otherwise the row index is written as an unnamed first
    # column and comes back as a spurious "Unnamed: 0" data column.
    bad_reviews.to_csv(output_path, index=False)
    cleaned_data = pd.read_csv(output_path)
    return cleaned_data

In [None]:
def _write_text_corpus(texts, file_path):
    """Write each text on its own line of a UTF-8 plain-text file."""
    with open(file_path, "w", encoding="utf-8") as corpus:
        for text in texts:
            # Collapse embedded whitespace/newlines so one review stays on one line.
            corpus.write(" ".join(str(text).split()) + "\n")


def load_and_train_model(cleaned_data):
    """Return a GPT-2 causal language model fine-tuned on the review texts.

    If ``trained_model.pt`` exists it is loaded with
    ``AutoModelForCausalLM.from_pretrained`` and returned without training.
    Otherwise the ``review_body`` column is split 85/15 into train and test
    texts, both are written to plain-text files, the ``gpt2`` checkpoint is
    trained on them with ``Trainer``, and the result is saved to
    ``trained_model.pt``.

    Args:
        cleaned_data: DataFrame with a ``review_body`` column, or ``None``.

    Returns:
        The loaded or newly trained model, or ``None`` when ``cleaned_data``
        is ``None`` or empty.
    """
    if cleaned_data is None or cleaned_data.empty:
        print("The cleaned data is empty. Please check your data cleaning process.")
        return None

    # Check if trained model file exists
    if os.path.exists("trained_model.pt"):
        # Load trained model from file
        return AutoModelForCausalLM.from_pretrained("trained_model.pt")

    tokenizer = AutoTokenizer.from_pretrained("gpt2")
    # Reuse the end-of-text token for padding. Adding a brand-new '[PAD]'
    # token grows the vocabulary past the model's embedding matrix unless
    # the embeddings are resized as well.
    tokenizer.pad_token = tokenizer.eos_token

    # Train model
    model = AutoModelForCausalLM.from_pretrained("gpt2")

    # Split data into training and test sets
    review_texts = cleaned_data.review_body.dropna().astype(str)
    train, test = train_test_split(review_texts, test_size=0.15)

    # TextDataset tokenizes the raw file contents, so write bare text: a CSV
    # dump would add the row index, the header and CSV quoting to the corpus.
    _write_text_corpus(train, "train.txt")
    _write_text_corpus(test, "test.txt")

    # Load datasets. overwrite_cache=True because the split is regenerated on
    # every call; a cached feature file from an earlier run would not match it.
    train_dataset = TextDataset(tokenizer=tokenizer, file_path="train.txt", block_size=128, overwrite_cache=True)
    test_dataset = TextDataset(tokenizer=tokenizer, file_path="test.txt", block_size=128, overwrite_cache=True)
    data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

    # Define training arguments
    # NOTE(review): eval_steps only takes effect once an evaluation strategy
    # is enabled; none is set here, so periodic evaluation is presumably not
    # scheduled -- confirm whether that is intended.
    training_args = TrainingArguments(
        output_dir="logs",
        overwrite_output_dir=True,
        num_train_epochs=3,
        per_device_train_batch_size=32,
        per_device_eval_batch_size=64,
        eval_steps=400,
        save_steps=800,
        warmup_steps=500,
        prediction_loss_only=True,
    )

    # Train the model
    trainer = Trainer(
        model=model,
        args=training_args,
        data_collator=data_collator,
        train_dataset=train_dataset,
        eval_dataset=test_dataset,
    )
    trainer.train()
    # "trained_model.pt" is created as a directory by save_model; the name is
    # kept so that models saved by earlier runs are still picked up above.
    trainer.save_model("trained_model.pt")

    return model

In [None]:
# Generate text
def generate_text(model):
    """Print text-generation pipeline output for two fixed prompts.

    Does nothing when ``model`` is ``None``. Otherwise the model is moved to
    the notebook-level ``device``, wrapped in a ``text-generation`` pipeline
    together with the ``gpt2`` tokenizer, and the pipeline result for each
    prompt is printed.

    Args:
        model: The language model to generate with, or ``None``.
    """
    if model is None:
        return

    gpt2_tokenizer = AutoTokenizer.from_pretrained("gpt2")
    text_generator = pipeline(
        'text-generation',
        model=model.to(device),
        tokenizer=gpt2_tokenizer,
        device=device.index,
    )

    # (prompt, generation options) pairs, run in this order.
    generation_requests = (
        ("I HATE MY JOB", {"max_length": 150, "num_return_sequences": 3}),
        ("Nobody understands me", {"max_length": 150}),
    )
    for prompt_text, options in generation_requests:
        print(text_generator(prompt_text, **options))

In [None]:
# Run the stages in order. Each stage may return None to signal "nothing to
# work with", so check before handing the result to the next stage.
bad_reviews = load_and_process_data()
if bad_reviews is not None:
    # Only classify when the loader actually produced reviews; passing None
    # straight through would fail inside sentiment_analysis.
    bad_reviews = sentiment_analysis(bad_reviews)
if bad_reviews is not None:
    cleaned_data = save_and_load_data(bad_reviews)
    model = load_and_train_model(cleaned_data)
    generate_text(model)