**Fine-tuning GPT-2**

In [None]:
import numpy as np
import pandas as pd
import torch
import logging
from tqdm import tqdm
import math
import argparse
import os
!git clone https://github.com/huggingface/transformers
!pip install transformers/
from transformers import GPT2Tokenizer, GPT2LMHeadModel
from transformers.optimization import AdamW, get_linear_schedule_with_warmup

In [None]:
# Run configuration, declared once as (flag, type, default) rows.
_ARG_SPECS = (
    ('--seed', int, 88888),
    ("--model_name", str, "gpt2-medium"),
    ("--max_seq_length", int, 512),
    ("--train_batch_size", int, 4),
    ("--valid_batch_size", int, 4),
    ("--num_train_epochs", int, 4),
    ("--warmup", float, 0.1),
    ("--learning_rate", float, 5e-5),
)

parser = argparse.ArgumentParser()
for _flag, _kind, _default in _ARG_SPECS:
    parser.add_argument(_flag, default=_default, type=_kind)

# parse_known_args() returns unrecognised command-line arguments separately
# instead of raising; they are discarded here.
args, _ = parser.parse_known_args()

In [None]:
# Load the IDEST spreadsheet and keep only the two columns used for fine-tuning.
excel_file = '/content/IDEST_database.xlsx'
idest_df = pd.read_excel(excel_file)
selected_cols = ['text_english', 'tags']
# Take an explicit copy: assigning a column on a slice of idest_df triggers
# SettingWithCopyWarning and leaves it ambiguous which frame gets modified.
df_selected = idest_df[selected_cols].copy()
# Replace every ';' in the tag strings with ' and' (presumably ';' separates
# multiple tags -- confirm against the spreadsheet). regex=False because ';'
# is meant literally.
df_selected['tags'] = df_selected['tags'].str.replace(';', ' and', regex=False)
df_selected

In [None]:
def combinetext(
    prompt,
    story,
    prefix='From the first-person perspective, write an emotional narrative with maximum 200 words about: ',
    max_words=300,
):
    """Build one training string per (prompt, story) pair.

    Each result has the form ``prefix + prompt + ' <sep> ' + story`` where the
    prompt has trailing whitespace stripped and the story is whitespace-
    normalised and cut to its first ``max_words`` words.

    Args:
        prompt: Iterable of prompt/tag strings (list, pandas Series, ...).
        story: Iterable of story strings, same length as ``prompt``.
        prefix: Instruction text placed before every prompt.
        max_words: Maximum number of whitespace-separated story words kept.

    Returns:
        A list of combined strings, in input order.

    Raises:
        ValueError: If ``prompt`` and ``story`` differ in length.
    """
    # Materialise both inputs so pairing is positional. Indexing a pandas
    # Series with ``series[i]`` is label-based and fails (or picks the wrong
    # row) when the index is not 0..n-1, e.g. after filtering.
    prompts = list(prompt)
    stories = list(story)
    if len(prompts) != len(stories):
        raise ValueError(
            f"prompt and story must have the same length "
            f"(got {len(prompts)} and {len(stories)})"
        )
    # NOTE(review): the default prefix asks for "maximum 200 words" while the
    # default cap is 300 words; both values are kept as in the original.
    return [
        prefix + tag.rstrip() + ' <sep> ' + " ".join(text.split()[:max_words])
        for tag, text in zip(prompts, stories)
    ]

def cleanpunctuation(s):
    """Re-attach detached contraction suffixes and expand ``<newline>`` markers.

    Applies a fixed sequence of literal substring replacements to ``s`` and
    returns the resulting string.
    """
    # The rules run strictly in this order. The repeated " 'm" rule is kept on
    # purpose: it cleans up a leading space that the " ' m" rule can leave
    # behind (e.g. "I  ' m" -> "I 'm" -> "I'm").
    replacements = (
        (" n't", "n't"),
        (" 's", "'s"),
        (" 're", "'re"),
        (" 've", "'ve"),
        (" 'll", "'ll"),
        (" 'am", "'am"),
        (" 'm", "'m"),
        (" ' m", "'m"),
        (" 'm", "'m"),
        (" ' ve", "'ve"),
        (" ' s", "'s"),
        ("<newline>", "\n"),
    )
    for detached, attached in replacements:
        s = s.replace(detached, attached)
    return s

In [None]:
# One training string per story: combine tag and story text, then tidy the
# detached contractions and <newline> markers in each combined string.
fullData = [
    cleanpunctuation(sample)
    for sample in combinetext(df_selected['tags'], df_selected['text_english'])
]

In [None]:
# Notebook display: show the assembled training strings for a visual check.
fullData

In [None]:
# Load the checkpoint chosen by --model_name (default "gpt2-medium") so the
# parsed option actually takes effect instead of a second hard-coded name.
model = GPT2LMHeadModel.from_pretrained(args.model_name)
tokenizer = GPT2Tokenizer.from_pretrained(args.model_name)
# Use the end-of-sequence token as padding token so batches can be padded.
tokenizer.pad_token = tokenizer.eos_token
def create_labels(inputs):
    """Add language-modelling labels to a tokenised batch, in place.

    Args:
        inputs: Mapping with ``'input_ids'`` and ``'attention_mask'``, each a
            sequence of equally long per-example sequences.

    Returns:
        None. ``inputs['labels']`` is set to a list of lists: a copy of the
        input ids in which every position whose attention-mask entry is 0 is
        replaced by -100.
    """
    # Mask position by position instead of assuming all padding sits at the
    # end: the previous "first real_len ids + trailing -100s" construction
    # produced wrong labels for left-padded batches.
    labels = []
    for ids, attention_mask in zip(inputs['input_ids'], inputs['attention_mask']):
        labels.append([
            token_id if keep else -100
            for token_id, keep in zip(ids, attention_mask)
        ])
    inputs['labels'] = labels


In [None]:
class StoryDataset:
    """Index-addressable view over a tokenised batch that carries labels.

    ``inputs`` must provide ``'input_ids'``, ``'attention_mask'`` and
    ``'labels'``; the three sequences are stored by reference, not copied.
    """

    def __init__(self, inputs):
        self.ids = inputs['input_ids']
        self.attention_mask = inputs['attention_mask']
        self.labels = inputs['labels']

    def __len__(self):
        """Return the number of examples."""
        return len(self.ids)

    def __getitem__(self, item):
        """Return ``[input_ids, attention_mask, labels]`` for example ``item``
        as a list of ``torch.long`` tensors."""
        columns = (self.ids, self.attention_mask, self.labels)
        return [torch.tensor(column[item], dtype=torch.long) for column in columns]

In [None]:
# Optimisation hyper-parameters.
num_train_epochs = args.num_train_epochs
learning_rate = args.learning_rate
weight_decay = 0
adam_epsilon = 1e-8
# NOTE(review): this fixed step count is what the scheduler receives; the
# parsed --warmup value (0.1) is not used anywhere -- confirm which is intended.
warmup_steps = 500

# Parameters whose name contains one of these fragments go into the second
# group, whose weight decay is always 0.0.
no_decay = ["bias", "LayerNorm.weight"]


def _is_decay_exempt(param_name):
    """Return True when ``param_name`` contains any ``no_decay`` fragment."""
    return any(fragment in param_name for fragment in no_decay)


# Split the parameters into the two groups, keeping their original order.
_decayed, _exempt = [], []
for _name, _param in model.named_parameters():
    (_exempt if _is_decay_exempt(_name) else _decayed).append(_param)

optimizer_grouped_parameters = [
    {"params": _decayed, "weight_decay": weight_decay},
    {"params": _exempt, "weight_decay": 0.0},
]
optimizer = AdamW(optimizer_grouped_parameters, lr=learning_rate, eps=adam_epsilon)

In [None]:
from sklearn.model_selection import KFold
# Number of cross-validation folds; also used in the progress message below.
n_splits = 5
kfold = KFold(n_splits=n_splits, shuffle=True, random_state=42)

# Use the GPU when one is available and fall back to the CPU otherwise,
# instead of crashing on a hard-coded .to('cuda').
device = 'cuda' if torch.cuda.is_available() else 'cpu'

train_batch_size = args.train_batch_size
valid_batch_size = args.valid_batch_size

print("***** Running training *****")
print("  Num Epochs = {}".format(num_train_epochs))
model.to(device)

# NOTE(review): `model` and `optimizer` are created once, before this loop, so
# every fold continues from the weights left by the previous fold and later
# folds validate on stories already trained on -- confirm this is intended.
for fold, (train_idx, val_idx) in enumerate(kfold.split(fullData)):
    print(f"Fold {fold+1}/{n_splits}")
    train_text = [fullData[i] for i in train_idx]
    valid_text = [fullData[i] for i in val_idx]

    # Tokenise each split and attach labels (create_labels works in place).
    inputs_train = tokenizer(train_text, padding=True, truncation=True, max_length=args.max_seq_length)
    inputs_valid = tokenizer(valid_text, padding=True, truncation=True, max_length=args.max_seq_length)
    create_labels(inputs_train)
    create_labels(inputs_valid)

    traindata = StoryDataset(inputs_train)
    train_dataloader = torch.utils.data.DataLoader(
        traindata,
        shuffle=False,
        batch_size=train_batch_size)

    validdata = StoryDataset(inputs_valid)
    valid_dataloader = torch.utils.data.DataLoader(
        validdata,
        shuffle=False,
        batch_size=valid_batch_size)

    training_steps_per_epoch = len(train_dataloader)
    total_num_training_steps = int(training_steps_per_epoch * num_train_epochs)

    # A fresh schedule per fold, wrapped around the shared optimizer.
    scheduler = get_linear_schedule_with_warmup(
        optimizer, num_warmup_steps=warmup_steps, num_training_steps=total_num_training_steps
    )

    for epoch in range(num_train_epochs):
        print(f"Start epoch{epoch+1} of {num_train_epochs}")
        train_loss = 0
        epoch_iterator = tqdm(train_dataloader, desc='Iteration')
        model.train()
        model.zero_grad()
        for input_ids, attention_mask, labels in epoch_iterator:
            output = model(
                input_ids=input_ids.to(device),
                attention_mask=attention_mask.to(device),
                labels=labels.to(device),
            )
            batch_loss = output[0]
            batch_loss.backward()
            optimizer.step()
            scheduler.step()
            model.zero_grad()
            train_loss += batch_loss.item()
            epoch_iterator.set_description('(batch loss=%g)' % batch_loss.item())
        # The running total is divided by the number of batches, so the value
        # printed here is the mean of the per-batch losses.
        print(f'Average train loss per example={train_loss/training_steps_per_epoch} in epoch{epoch+1}')

        print(f'Starting evaluate after epoch {epoch+1}')
        eval_loss = []
        model.eval()
        with torch.no_grad():
            for input_ids, attention_mask, labels in tqdm(valid_dataloader, desc="eval"):
                output = model(
                    input_ids=input_ids.to(device),
                    attention_mask=attention_mask.to(device),
                    labels=labels.to(device),
                )
                eval_loss.append(output[0].item())
        eval_loss = np.mean(eval_loss)
        # Perplexity is the exponential of the mean validation loss.
        perplexity = math.exp(eval_loss)
        print(f'Average valid loss per example={eval_loss} in epoch{epoch+1}')
        print(f'Perplextiy for valid dataset in epoch{epoch+1} is {perplexity}')

In [None]:
def generate_story(prompt,target,k=0,p=0.9,output_length=200,temperature=0.7,num_return_sequences=3,repetition_penalty=1.0):
    """Sample stories for ``prompt`` and print them next to a reference story.

    Uses the module-level ``model`` and ``tokenizer``. As a side effect the
    model is moved to the CPU and put into eval mode.

    Args:
        prompt: Text the generation starts from.
        target: Reference story, printed for comparison only.
        k: Forwarded to ``model.generate`` as ``top_k``.
        p: Forwarded as ``top_p``.
        output_length: Forwarded as ``max_length``.
        temperature: Forwarded as ``temperature``.
        num_return_sequences: Number of samples to draw and print.
        repetition_penalty: Forwarded as ``repetition_penalty``.

    Returns:
        None; the prompt, the target and every generated text are printed.
    """
    print("====prompt====\n")
    print(prompt+"\n")
    print('====target story is as below===\n')
    print(target+"\n")
    encoded_prompt = tokenizer.encode(prompt, add_special_tokens=False, return_tensors="pt")
    # The prompt tensor is created on the CPU, so the model is moved there too.
    model.to('cpu')
    model.eval()
    output_sequences = model.generate(
        input_ids=encoded_prompt,
        max_length=output_length,
        temperature=temperature,
        top_k=k,
        top_p=p,
        repetition_penalty=repetition_penalty,
        do_sample=True,
        num_return_sequences=num_return_sequences,
        # State the padding id explicitly (EOS) rather than leaving it unset.
        pad_token_id=tokenizer.eos_token_id,
    )
    if len(output_sequences.shape) > 2:
        output_sequences.squeeze_()
    eos_token = tokenizer.eos_token
    for generated_sequence_idx, generated_sequence in enumerate(output_sequences):
        print("=== GENERATED SEQUENCE {} ===".format(generated_sequence_idx + 1))
        text = tokenizer.decode(generated_sequence.tolist(), clean_up_tokenization_spaces=True)
        # Keep only the text before the first EOS token. partition() leaves the
        # text untouched when no EOS is present, whereas slicing with
        # str.find() (-1 on a miss) silently dropped the last character.
        text = text.partition(eos_token)[0]
        print(text)


In [None]:
# Qualitative check: sample stories for the tag "planets" and print them next
# to the story part (everything after '<sep>') of training string 118.
prompt='From the first-person perspective, write an emotional narrative with maximum 200 words about: planets'
_reference = fullData[118]
_separator = '<sep>'
target = _reference[_reference.find(_separator) + len(_separator):]
generate_story(prompt, target)

In [None]:
def generate_story_inportmodel(prompt,k=0,p=0.9,output_length=219,temperature=0.7,num_return_sequences=1,repetition_penalty=1.0):
    """Sample text for ``prompt``, print it, and return the first sample.

    Uses the module-level ``model`` and ``tokenizer``. As a side effect the
    model is moved to the CPU and put into eval mode.

    Args:
        prompt: Text the generation starts from.
        k: Forwarded to ``model.generate`` as ``top_k``.
        p: Forwarded as ``top_p``.
        output_length: Forwarded as ``max_length``.
        temperature: Forwarded as ``temperature``.
        num_return_sequences: Number of samples to draw.
        repetition_penalty: Forwarded as ``repetition_penalty``.

    Returns:
        The first generated text (cut at the first EOS token) as a ``str``,
        or ``None`` if no sequence was produced. When more than one sequence
        is requested, all of them are printed but only the first is returned,
        so callers expecting a single string keep working.
    """
    print("====prompt====\n")
    print(prompt+"\n")
    encoded_prompt = tokenizer.encode(prompt, add_special_tokens=False, return_tensors="pt")
    # The prompt tensor is created on the CPU, so the model is moved there too.
    model.to('cpu')
    model.eval()
    output_sequences = model.generate(
        input_ids=encoded_prompt,
        max_length=output_length,
        temperature=temperature,
        top_k=k,
        top_p=p,
        repetition_penalty=repetition_penalty,
        do_sample=True,
        num_return_sequences=num_return_sequences,
        # State the padding id explicitly (EOS) rather than leaving it unset.
        pad_token_id=tokenizer.eos_token_id,
    )
    if len(output_sequences.shape) > 2:
        output_sequences.squeeze_()
    eos_token = tokenizer.eos_token
    texts = []
    for generated_sequence_idx, generated_sequence in enumerate(output_sequences):
        print("=== GENERATED SEQUENCE {} ===".format(generated_sequence_idx + 1))
        text = tokenizer.decode(generated_sequence.tolist(), clean_up_tokenization_spaces=True)
        # Keep only the text before the first EOS token. partition() leaves the
        # text untouched when no EOS is present, whereas slicing with
        # str.find() (-1 on a miss) silently dropped the last character.
        text = text.partition(eos_token)[0]
        print(text)
        texts.append(text)
    # Previously a `return` inside the loop skipped every sequence after the
    # first without printing it; the return value itself is unchanged.
    return texts[0] if texts else None

In [None]:
from google.colab import files

In [None]:
# Generate one story per tag and collect the results as (tag, story) rows.
# NOTE(review): `tags` is not defined anywhere in the visible notebook; it
# must be provided by another cell (the previously commented-out alternative
# iterated over df_selected['tags']) -- confirm before running.
rows = []
for tag in tags:
    prompt = 'From the first-person perspective, write an emotional narrative with maximum 200 words about: ' + tag
    story = generate_story_inportmodel(prompt)
    rows.append({"tag": tag, "story": story})

# Build the frame once from the collected rows: DataFrame.append was removed
# in pandas 2.0, and appending row by row copied the whole frame each time.
gpt2_df = pd.DataFrame(rows, columns=["tag", "story"])

# Save the DataFrame to a CSV file
csv_file_path = "GPT2-new-finetune.csv"
gpt2_df.to_csv(csv_file_path, index=False)

# Download the CSV file to your local machine
files.download(csv_file_path)

In [None]:
# Post-process the generated stories: remove the prompt, i.e. every stretch of
# text that ends in '<sep>'. '.' does not match newlines, so only text on the
# same line as a '<sep>' marker is removed.
import re
_up_to_sep = re.compile(r".*?<sep>")
for row_label, generated in enumerate(gpt2_df['story']):
  gpt2_df.loc[row_label, "story"] = _up_to_sep.sub("", generated)
print(gpt2_df)

**Sentiment analysis**

Sentiment analysis with sliding window approach

IDEST data example

In [None]:
# Data preprocessing for the sentiment analysis.
import nltk
from nltk.tokenize import sent_tokenize

story = idest_df['text_english']

# Names of the story types, keyed by the string form of each numeric code so
# that they match the 'StoryType' column after astype(str).
sentiment_map = {
    '1': 'Constant',
    '2': 'Tragedy',
    '3': 'Rags-to-riches',
    '4': 'Man-in-a-hole ',
    '5': 'Icarus',
    '6': 'Oedipus',
    '7': 'Cinderella',
    '8': 'No clear story',
}
idest_df['StoryType'] = idest_df['StoryType'].astype(str).replace(sentiment_map)

# Split every story into sentences with NLTK.
nltk.download('punkt')
stories = list(story)
input_sentences = [sent_tokenize(text) for text in stories]

In [None]:
# Sentiment analysis using the sliding window approach: every story is cut
# into overlapping windows of consecutive sentences, each window is scored by
# `model`, and one row per window is collected in `sentiment_data`.
# NOTE(review): the only `model` / `tokenizer` assigned in the visible notebook
# are the GPT-2 language model and its tokenizer. The indexing below
# (probabilities[0][1].item()) needs 2-D logits of shape (1, num_classes), so
# presumably a sequence-classification model and matching tokenizer were
# loaded in a cell that is not shown -- confirm before running.
! pip install torch
sentiment_data = []

# Process the stories one by one; story_index is used to name the story.

for story_index, story in enumerate(input_sentences):
    sentences = list(story)

    # Define window size and stride (in sentences)
    window_size = 2
    stride = 1

    # Create sliding windows of sentences. A story with fewer than
    # window_size sentences yields no window and therefore no output rows.
    windows = [sentences[i:i + window_size] for i in range(0, len(sentences) - window_size + 1, stride)]

    # Tokenize and encode the windows, each padded/truncated to 512 tokens.
    # NOTE(review): `window` is a list of sentence strings, not one string;
    # how encode_plus interprets a list depends on the tokenizer -- confirm
    # that the sentences are encoded as intended.
    encoded_windows = []
    for window in windows:
        encoded = tokenizer.encode_plus(
            window,
            add_special_tokens=True,
            max_length=512,
            padding='max_length',
            truncation=True,
            return_attention_mask=True,
            return_tensors='pt'
        )
        encoded_windows.append(encoded)

    # Perform inference on the windows, one window per forward pass.
    # The input tensors are never moved to another device, so the model has
    # to be on the same device as them.
    model.eval()
    sentiment_labels = []
    sentiment_probabilities = []

    with torch.no_grad():
      for encoded_window in encoded_windows:
        input_ids = encoded_window['input_ids']
        attention_mask = encoded_window['attention_mask']

        outputs = model(input_ids, attention_mask=attention_mask)
        logits = outputs.logits
        # argmax without `dim` indexes the flattened tensor; this equals the
        # class index only when `logits` has a single row.
        predicted_label = torch.argmax(logits).item()

        # Softmax over dim 1, then the entry at index 1 of the first row
        # (presumably the probability of the positive class -- confirm).
        probabilities = torch.softmax(logits, dim=1)
        predicted_probability = probabilities[0][1].item()


        # Store the raw predicted label index (no mapping to a name is done)
        sentiment_labels.append(predicted_label)

        # Store the sentiment probability in the list
        sentiment_probabilities.append(predicted_probability)

    # Append one record (dict) per window to sentiment_data
    for i, window in enumerate(windows[:min(len(sentiment_labels), len(windows))]):
        sentiment_data.append({
            'Story': f"Story {story_index + 1}",
            'Sentences': ' '.join(window),
            'Sentiment Label': sentiment_labels[i],
            'Sentiment Probability': sentiment_probabilities[i]
        })

# Convert the data to a pandas DataFrame for tabular representation
human_story_sentiment = pd.DataFrame(sentiment_data)

# Print the number of rows (one per window) in the DataFrame
print(len(human_story_sentiment))



In [None]:
# Per-story mean of the window-level 'Sentiment Probability' values.
_story_mean = human_story_sentiment.groupby('Story')['Sentiment Probability'].mean()
grouped_human = _story_mean.reset_index()

# Label a story 1 when its mean probability is at least 0.5, otherwise 0.
grouped_human['Sentiment Label'] = grouped_human['Sentiment Probability'].ge(0.5).astype(int)

In [None]:
# Attach each story's story-level label to all of its window rows. Both frames
# have a 'Sentiment Label' column, so the merge suffixes them: '_x' is the
# window-level label, '_y' the story-level label from grouped_human.
merged_human = human_story_sentiment.merge(grouped_human[['Story', 'Sentiment Label']], on='Story', how='left')

# Split the window rows by story-level label. .copy() makes the two frames
# independent of merged_human: later cells add a 'Sliding Window' column to
# them, which on a plain boolean-mask slice raises SettingWithCopyWarning.
_story_label = merged_human['Sentiment Label_y']
stories_with_sentiment_label_1 = merged_human[_story_label.eq(1)].copy()
stories_with_sentiment_label_0 = merged_human[_story_label.eq(0)].copy()


In [None]:
# Positive stories: number the windows of each story 1, 2, 3, ... in row order.
_window_rank = stories_with_sentiment_label_1.groupby('Story').cumcount()
stories_with_sentiment_label_1['Sliding Window'] = _window_rank + 1
stories_with_sentiment_label_1

In [None]:
import matplotlib.pyplot as plt

# Mean 'Sentiment Probability' at each window position over the positive stories.
_positive_by_window = stories_with_sentiment_label_1.groupby('Sliding Window')
average_probability_by_window = _positive_by_window['Sentiment Probability'].mean()

# Line plot of the per-window mean on a fixed 0..1.1 y-axis.
fig, ax = plt.subplots(figsize=(10, 6))
ax.plot(average_probability_by_window.index, average_probability_by_window.values, marker='o', linestyle='-')
ax.set_xlabel('Sliding Window')
ax.set_ylabel('Average Sentiment Probability')
ax.set_title('Positive Human Stories - Average Sentiment Probability for Each Sliding Window')
ax.grid(True)
ax.set_ylim(0, 1.1)

plt.show()

In [None]:
# Negative stories: number the windows of each story 1, 2, 3, ... in row order.
_window_rank = stories_with_sentiment_label_0.groupby('Story').cumcount()
stories_with_sentiment_label_0['Sliding Window'] = _window_rank + 1
stories_with_sentiment_label_0

In [None]:
# Mean 'Sentiment Probability' at each window position over the negative stories.
_negative_by_window = stories_with_sentiment_label_0.groupby('Sliding Window')
average_probability_by_window = _negative_by_window['Sentiment Probability'].mean()

# Line plot of the per-window mean on a fixed 0..1.1 y-axis.
fig, ax = plt.subplots(figsize=(10, 6))
ax.plot(average_probability_by_window.index, average_probability_by_window.values, marker='o', linestyle='-')
ax.set_xlabel('Sliding Window')
ax.set_ylabel('Average Sentiment Probability')
ax.set_title('Negative Human Stories - Average Sentiment Probability for Each Sliding Window')
ax.grid(True)
ax.set_ylim(0, 1.1)

plt.show()


In [None]:
# Per-window mean 'Sentiment Probability' for the positive and negative stories.
average_probability_positive = stories_with_sentiment_label_1.groupby('Sliding Window')['Sentiment Probability'].mean()
average_probability_negative = stories_with_sentiment_label_0.groupby('Sliding Window')['Sentiment Probability'].mean()

# One figure with two panels side by side: positive (left), negative (right).
fig, (ax_positive, ax_negative) = plt.subplots(1, 2, figsize=(15, 6))
_panels = (
    (ax_positive, average_probability_positive, 'blue',
     'Positive Human Stories - Average Sentiment Probability for Each Sliding Window'),
    (ax_negative, average_probability_negative, 'red',
     'Negative Human Stories - Average Sentiment Probability for Each Sliding Window'),
)
for _ax, _series, _colour, _title in _panels:
    _ax.plot(_series.index, _series.values, marker='o', linestyle='-', color=_colour)
    _ax.set_xlabel('Sliding Window')
    _ax.set_ylabel('Average Sentiment Probability')
    _ax.set_title(_title)
    _ax.grid(True)
    _ax.set_ylim(0, 1.1)

fig.tight_layout()

plt.show()


In [None]:
# Collect, per window position, the list of 'Sentiment Probability' values.
data_positive = stories_with_sentiment_label_1.groupby('Sliding Window')['Sentiment Probability'].apply(list)
data_negative = stories_with_sentiment_label_0.groupby('Sliding Window')['Sentiment Probability'].apply(list)

# x positions of the violins: the window numbers, taken from the index of the
# per-window means as 1-dimensional arrays.
positions_positive = average_probability_positive.index.to_numpy()
positions_negative = average_probability_negative.index.to_numpy()

# Two stacked panels (positive on top, negative below): the distribution per
# window as a violin, with the per-window mean drawn as a line on top.
fig, (ax_positive, ax_negative) = plt.subplots(2, 1, figsize=(10, 6))
_panels = (
    (ax_positive, data_positive, positions_positive, average_probability_positive, 'blue',
     'Positive Human Stories - Sentiment Probability Distribution for Each Sliding Window'),
    (ax_negative, data_negative, positions_negative, average_probability_negative, 'red',
     'Negative Human Stories - Sentiment Probability Distribution for Each Sliding Window'),
)
for _ax, _data, _positions, _means, _colour, _title in _panels:
    _ax.violinplot(_data, positions=_positions, showmedians=True, vert=True, widths=0.7, points=100)
    _ax.plot(_means.index, _means.values, marker='o', linestyle='-', color=_colour, label='Average')
    _ax.set_xlabel('Sliding Window')
    _ax.set_ylabel('Sentiment Probability')
    _ax.set_title(_title)
    _ax.grid(True)
    _ax.set_ylim(0, 1.1)
    _ax.legend()
fig.tight_layout()
plt.show()
