In [1]:
# Required imports
from huggingface_hub import login
from kaggle_secrets import UserSecretsClient
import time 

from transformers import AutoTokenizer
import transformers
import torch

import numpy as np


# Logging to hugging face 
user_secrets = UserSecretsClient()
secret_value_0 = user_secrets.get_secret("hf_token")

login(token=secret_value_0)

# Loading LLAMA
model = "meta-llama/Llama-2-7b-chat-hf"

tokenizer = AutoTokenizer.from_pretrained(model)
pipeline = transformers.pipeline(
    "text-generation",
    model=model,
    torch_dtype=torch.float16,
    device_map="auto",
)

The token has not been saved to the git credentials helper. Pass `add_to_git_credential=True` in this function directly or `--add-to-git-credential` if using via `huggingface-cli` if you want to set the git credential as well.
Token is valid (permission: fineGrained).
Your token has been saved to /root/.cache/huggingface/token
Login successful


tokenizer_config.json:   0%|          | 0.00/1.62k [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/500k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.84M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/414 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/614 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/26.8k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/2 [00:00<?, ?it/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/9.98G [00:00<?, ?B/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/3.50G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/188 [00:00<?, ?B/s]

### In order to safeguard our discriminatory model from learning titles of good movies instead of what makes a positive review we first generate the list of 50 movies. We will then ask LLAMA to generate positive reviews of those movies

In [2]:
start_time = time.time()

# Generating movie titles
sequences = pipeline(
    # write a list of 50 real movie titles. Include both good and mediocre movies.
    "write a list of 50 real movie titles. Include both good and mediocre movies. Include different genres of movies (don't forget horror).",
    do_sample=True,
    top_k=10,
    num_return_sequences=1,
    eos_token_id=tokenizer.eos_token_id,
    max_length=6000,
)

# Inspecting the results
for seq in sequences:
    print(seq['generated_text'])
    print("=================================================")
print(time.time()-start_time)
print("=================================================")
movie_list = [".".join(x.split(".")[1:]) for x in ("1."+seq['generated_text'].split("\n1.")[1]).split("\n") ][:-2]

Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


write a list of 50 real movie titles. Include both good and mediocre movies. Include different genres of movies (don't forget horror).

Here are 50 real movie titles:

1. The Shawshank Redemption (drama)
2. The Godfather (crime/drama)
3. The Dark Knight (superhero/action)
4. 12 Angry Men (drama)
5. The Silence of the Lambs (horror/thriller)
6. Forrest Gump (drama/romance)
7. The Matrix (sci-fi/action)
8. Pulp Fiction (crime/drama)
9. The Lord of the Rings: The Fellowship of the Ring (fantasy)
10. The Good, the Bad and the Ugly (western)
11. Jaws (horror/thriller)
12. Rear Window (thriller/romance)
13. The Terminator (sci-fi/action)
14. The Princess Bride (fantasy/romance)
15. The Sound of Music (musical/drama)
16. Star Wars: Episode IV - A New Hope (sci-fi/action)
17. The Good Son (thriller/drama)
18. The Green Mile (fantasy/drama)
19. The Lion King (animated/musical)
20. The Color Purple (drama/romance)
21. The Usual Suspects (crime/thriller)
22. Fargo (crime/drama)
23. The Big Lebows

## Positive movie review generation

In [3]:
def get_time(time):
    """ Method converts number of seconds into time in format ___ h __ m __.__ s
    Args:
        time (float): Number of seconds
    Returns:
        str: Time in format ___ h __ m __.__ s
    """
    result = ""
    if time//3600 > 0:
        result += str(int(time//3600)) + " h  "
        time %= 3600
    if time//60 > 0:
        result += str(int(time//60)) + " m  "
        time %= 60
    if time//1 > 0:
        result += str(np.round(time,2)) + " s                      "
    return result

In [4]:

review_num = 1000
reviews = [0 for _ in range(review_num)]

start_time = time.time()
for i in range(review_num):
    
    # Construction of a query + selection of movie to review
    movie = movie_list[np.random.randint(0,len(movie_list))]
    # You are a movie critic. Write positive review of {movie} (maximise review length (at least 200 tokens) and minimise other text, notes, etc) !!!answer in a given format!!!:"<title> \n Rating: <rating score out of 10> \n Review:<your movie review>"<End_token>
    query = f'You are a movie critic such as Roger Ebert. Write positive review of {movie} (maximise review length (at least 200 tokens) and minimise other text, notes, etc) !!!answer in a given format!!!:"<title> \n Rating: <rating score out of 10> \n Review:<your movie review>"<End_token>'
    
    # Movie review generation
    sequences = pipeline(
        query,
        do_sample=True,
        top_k=10,
        num_return_sequences=1,
        eos_token_id=tokenizer.eos_token_id,
        max_length=6000,
    )

    # Scrapping the review from LLAMA's response
    for seq in sequences:
        review = seq['generated_text']
        if "<End_token>" in seq['generated_text']:
            review = review.split("<End_token>")[1].split("Note:")[0]
            if 'Review:' in review:
                review = review.split("Review:")[1]
        reviews[i] = review

    # Printing estimated remaining time
    rem_time = (time.time()-start_time)/(i+1)*(review_num-i)
    print(get_time(rem_time), end = "\r")

6 h  26 m  10.53 s                      

You seem to be using the pipelines sequentially on GPU. In order to maximize efficiency please use a dataset


24.93 s                                 

In [5]:
import pandas as pd
df = pd.DataFrame(data={"text":reviews})
df.to_csv("llama_generated2.csv")

In [6]:
df.text = df.text.apply(lambda x:x.replace("\n"," "))
df.to_csv("llama_generated_final2.csv")

In [7]:
df

Unnamed: 0,text
0,"The Help, directed by Tate Taylor, is a poign..."
1,"The Expendables is a non-stop, adrenaline-fue..."
2,The Usual Suspects is a gripping crime thrill...
3,"The Hunger Games: Catching Fire, the second ..."
4,The Big Sick is a heartwarming romantic come...
...,...
995,"Matt Damon shines in this gripping, action-pa..."
996,Jaws is a masterpiece of suspense and horror ...
997,"From its opening shot of a lonesome, amphib..."
998,A groundbreaking masterpiece that redefined ...
