In [None]:
# GPU llama-cpp-python
# Install llama-cpp-python 0.1.78 and numpy 1.23.4 in a single pip command.
# CMAKE_ARGS / FORCE_CMAKE are environment variables for that one command only;
# the -DLLAMA_CUBLAS=on flag is presumably what enables the GPU (cuBLAS) build.
!CMAKE_ARGS="-DLLAMA_CUBLAS=on" FORCE_CMAKE=1 pip install llama-cpp-python==0.1.78 numpy==1.23.4 --force-reinstall --upgrade --no-cache-dir --verbose
# Provides hf_hub_download, which is used to fetch the model file.
!pip install huggingface_hub
# NOTE(review): the two commands below request exactly the versions already
# pinned by the first install command, so they look redundant — confirm and
# consider removing them.
!pip install llama-cpp-python==0.1.78
!pip install numpy==1.23.4

In [None]:
from huggingface_hub import hf_hub_download
from llama_cpp import Llama
import re
import pandas as pd

In [None]:
# Load the cleaned dataset. Later cells read its 'Content' column and drop its
# 'Rating', 'Additional Tags' and 'URL' columns, so all of them must be present.
df = pd.read_csv('/content/cleaned_data.csv')

In [None]:
# Hub repository id and file name of the model file; both are passed to
# hf_hub_download when the model is fetched.
model_name_or_path = "TheBloke/Llama-2-13B-chat-GGML"
model_basename = "llama-2-13b-chat.ggmlv3.q5_1.bin" # the model is in bin format

In [None]:
# Fetch the model file from the Hub; the returned value is handed to
# Llama(model_path=...) when the model is loaded.
model_path = hf_hub_download(repo_id=model_name_or_path, filename=model_basename)

In [None]:
# GPU
# Reset the name before loading. Presumably this lets a model left over from a
# previous run of this cell be released before a new one is created — confirm.
lcpp_llm = None
# n_ctx is not passed here, so the library's default context size is used;
# prompt length plus max_tokens in the generation cells must fit within it.
lcpp_llm = Llama(
    model_path=model_path,
    n_threads=2, # CPU cores
    n_batch=512, # Should be between 1 and n_ctx, consider the amount of VRAM in your GPU.
    n_gpu_layers=32 # Change this value based on your model and your GPU VRAM pool.
    )

In [None]:
# Columns that are not needed by any later step of the notebook.
columns_to_drop = ['Rating', 'Additional Tags']

# Rebind `df` instead of mutating it in place, and ignore columns that are
# already gone: the previous in-place drop raised KeyError as soon as this cell
# was executed a second time in the same kernel session.
# Trade-off: a misspelled column name is now silently ignored as well.
df = df.drop(columns=columns_to_drop, errors='ignore')

In [None]:
# Original story texts; the story-rewriting loop iterates over this Series.
column_data = df['Content']

In [None]:
# Ask the model for a condensed rewrite of every story in `column_data`.
# The sampling settings are identical for every call, so they are built once.
rewrite_sampling = dict(max_tokens=250, temperature=0.5, top_p=0.95,
                        repeat_penalty=1.2, top_k=150,
                        echo=True)

# NOTE(review): with echo=True the returned text presumably begins with the
# prompt itself (the context cell splits on "ASSISTANT:" for that reason), so
# each stored entry is prompt + completion — confirm before using it downstream.
modified_content = []
for raw_story in column_data:
    # Only the first 500 characters of each story are sent to the model.
    excerpt = str(raw_story)[:500]

    # The two-space indentation inside the literals is part of the prompt text
    # and must stay exactly as it is.
    user_message = f"""
  Generate the modified version of the given story:
  {excerpt}
  """
    full_prompt = f'''SYSTEM: You are a helpful and creative story generator assistant that reduces the given story into meaningful story in the word count range seventy to eighty.

  USER: {user_message}

  ASSISTANT:
  '''
    completion = lcpp_llm(prompt=full_prompt, **rewrite_sampling)
    modified_content.append(completion["choices"][0]["text"])


In [None]:
def process_content(text, max_words=75):
    """Trim `text` to whole sentences totalling at most `max_words` words.

    The text is split on '.' and sentences are kept, in order, while the
    running word count stays within `max_words`. Selection stops at the first
    sentence that would exceed the limit.

    Args:
        text: The story text. ``None`` and ``NaN`` (as produced by pandas for
            missing cells) are treated as an empty story; any other non-string
            value is converted with ``str()``.
        max_words: Maximum number of whitespace-separated words to keep.
            Defaults to 75, the limit this function originally hard-coded.

    Returns:
        The kept sentences joined back with '.'. When later sentences were cut
        off, the period that followed the last kept sentence is restored, so a
        truncated story still ends with '.'. Returns '' when the input is
        missing or when even the first sentence is longer than `max_words`.

    Note:
        Sentences are detected purely by '.', so abbreviations, decimals and
        ellipses are treated as sentence boundaries; '!' and '?' are not.
    """
    # Missing values: None, or float NaN (the only float that differs from itself).
    if text is None or (isinstance(text, float) and text != text):
        return ''
    text = str(text)

    selected_sentences = []
    current_word_count = 0
    truncated = False

    for sentence in text.split('.'):
        # split() without arguments already ignores surrounding whitespace.
        word_count = len(sentence.split())

        if current_word_count + word_count > max_words:
            # This sentence does not fit any more: stop and remember that the
            # text was cut so the closing period can be restored below.
            truncated = True
            break

        selected_sentences.append(sentence)
        current_word_count += word_count

    result_text = '.'.join(selected_sentences)

    # Each kept sentence was followed by '.' in the source text. join() only
    # puts periods *between* sentences, so the final one is lost on truncation.
    if truncated and selected_sentences:
        result_text += '.'

    return result_text

In [None]:
# Build the 'Modified' column by trimming every original 'Content' entry with
# process_content.
# NOTE(review): this reads df['Content'], not the `modified_content` list that
# the model produced in the rewriting loop — confirm that this is intended.
df['Modified'] = df['Content'].apply(process_content)

In [None]:
# The 'Modified' column must survive this point: the genre loop, the context
# loop and the final training-text cell all read df['Modified']. Dropping it
# here made every one of those lookups raise KeyError on a top-to-bottom run,
# so the drop is replaced by a check that the column is actually present.
assert 'Modified' in df.columns, "df['Modified'] is missing - run the cell that builds it first"

In [None]:
# Input for the genre-classification loop; requires df to have a 'Modified' column.
column_data = df['Modified']

In [None]:
# Classify every story in `column_data` into a one-word genre.
#
# Exactly one entry is appended per story, so `genres` always has the same
# length as `column_data` and can be assigned as a DataFrame column. The
# previous regex over the echoed prompt required a newline after the answer and
# could match zero times (nothing appended) or several times (extra entries),
# which left the list out of step with the rows.
genres = []
for i in column_data:
  sentence = str(i)
  prompt = f"""
  Write the genre in a single word:
  {sentence}
  """
  prompt_template=f'''SYSTEM: You are a helpful and creative story classifier assistant that gives a one word genre from the given story.

  USER: {prompt}

  ASSISTANT:
  '''
  # echo=False: the returned text is the completion only, so no parsing of the
  # prompt is needed to locate the answer.
  response=lcpp_llm(prompt=prompt_template, max_tokens=30, temperature=0.5, top_p=0.95,
                  repeat_penalty=1.2, top_k=150,
                  echo=False)
  genre = response["choices"][0]["text"]
  # Keep the first non-blank line of the answer; fall back to '' when the model
  # returned nothing usable so the row still gets an entry.
  answer_lines = [line.strip() for line in genre.splitlines() if line.strip()]
  genres.append(answer_lines[0] if answer_lines else '')

In [None]:
# Attach the generated genres; `genres` is expected to hold exactly one entry
# per row of df, in row order.
df['Genre'] = genres

In [None]:
# Re-select the 'Modified' column as input for the context-generation loop.
column_data = df['Modified']

In [None]:
# Generate a short (about five words) context description for every story.
# The sampling settings are the same for each call, so they are built once.
context_sampling = dict(max_tokens=30, temperature=1, top_p=0.95,
                        repeat_penalty=1.2, top_k=150,
                        echo=True)

context_list = []
for story in column_data:
    # The two-space indentation inside the literals is part of the prompt text
    # and must stay exactly as it is.
    prompt = f"""
  Write the context of the given short story in about 5 words, without saying the name of any characters and just giving a general outline of the situation ;
  {story}
  """
    prompt_template = f'''SYSTEM: You are a helpful and creative context generator assistant. You produce only the output without saying useless formalities.

  USER: {prompt}

  ASSISTANT:
  '''
    response = lcpp_llm(prompt=prompt_template, **context_sampling)

    # The echoed text contains the prompt, so the answer is whatever follows
    # the first "ASSISTANT:" marker.
    context = response["choices"][0]["text"]
    print(context)
    assistant_response = context.split("ASSISTANT:")[1].strip()
    context_list.append(assistant_response)

In [None]:
# Attach the generated contexts; `context_list` is expected to hold exactly one
# entry per row of df, in row order.
df['Context'] = context_list

In [None]:
def remove_non_alphanumeric(text):
    """Return `text` with every character removed that is not an ASCII letter,
    a digit or whitespace.

    `text` must be a string; whitespace (including newlines) is left untouched.
    """
    disallowed_chars = re.compile(r'[^a-zA-Z0-9\s]')
    return disallowed_chars.sub('', text)

In [None]:
# Strip punctuation and other symbols from every generated context string,
# keeping only ASCII letters, digits and whitespace.
df['Context'] = df['Context'].apply(remove_non_alphanumeric)

In [None]:
# Save the frame with the generated 'Modified', 'Genre' and 'Context' columns.
# NOTE(review): unlike the train.csv export, index=False is not passed here, so
# the row index is presumably written as an extra column — confirm intended.
df.to_csv("/content/output.csv")

In [None]:
# Build one instruction-style training string per row: the "Human" turn states
# the row's genre and context, the "Assistant" turn is the trimmed story.
formatted_rows = [
    f"### Human: Create a short story of about seventy-five words on the genre: {row['Genre']} and context: {row['Context']} . ### Assistant: {row['Modified']}"
    for _, row in df.iterrows()
]

In [None]:
# Add the training strings as 'Text', remove the helper columns that were only
# needed to build them, and write the result without the row index.
df = (
    df
    .assign(Text=formatted_rows)
    .drop(columns=['URL', 'Genre', 'Modified', 'Context'])
)
df.to_csv('/content/train.csv', index=False)