In [None]:
%%capture
!pip install -U sentence-transformers
!pip install repeng
!pip install -q -U bitsandbytes
!pip install -q -U git+https://github.com/huggingface/transformers.git
!pip install -q -U git+https://github.com/huggingface/peft.git
!pip install -q -U git+https://github.com/huggingface/accelerate.git


IMPORT LIBRARIES

In [None]:
%timeit
import gc
import gzip
import json
import os
import random
import re
import shutil
import warnings

import accelerate
import bitsandbytes
import numpy
import pandas as pd
import requests
import torch
from google.colab import drive
from repeng import ControlVector, ControlModel, DatasetEntry
from sentence_transformers import SentenceTransformer, util
from transformers import AutoModelForCausalLM, AutoTokenizer
from transformers import BitsAndBytesConfig

# Data loading
1.   Downloads, Extracts, and Preprocesses IMDb Movie Data
2.   Fetches compressed movie datasets from IMDb's repositories.
3.   Unzips the downloaded files.
4.  Loads the data into Pandas dataframes.
5.  Combines the dataframes so that each title carries its rating, language and director name.











In [None]:
%timeit
# Function to download and extract the files
def download_and_extract(url, file_name, timeout=60):
    """Download a gzip archive and unpack it next to the downloaded file.

    The response body is streamed to ``file_name``, decompressed into a file
    named ``file_name`` without its trailing ``.gz``, and the archive is then
    deleted.

    Args:
        url: Address of the ``.gz`` file to fetch.
        file_name: Local path for the downloaded archive; must end in ``.gz``.
        timeout: Value passed to ``requests.get`` as its ``timeout`` argument.

    Raises:
        ValueError: If ``file_name`` does not end in ``.gz``.
        requests.HTTPError: If the server answers with an error status.
    """
    if not file_name.endswith('.gz'):
        # The extracted name is derived by stripping the suffix, so anything
        # else would silently produce a mangled output path.
        raise ValueError(f"file_name must end with '.gz', got {file_name!r}")
    extracted_name = file_name[:-3]

    with requests.get(url, stream=True, timeout=timeout) as response:
        # Fail loudly here instead of leaving later steps to trip over a missing file.
        response.raise_for_status()
        with open(file_name, 'wb') as archive:
            # Copy in chunks so the whole archive is never held in memory at once.
            shutil.copyfileobj(response.raw, archive)

    # Extract the .gz file
    with gzip.open(file_name, 'rb') as f_in, open(extracted_name, 'wb') as f_out:
        shutil.copyfileobj(f_in, f_out)
    os.remove(file_name)  # Remove the .gz file after extraction

# URLs for download
url = 'https://datasets.imdbws.com/title.basics.tsv.gz'
url_2 = 'https://datasets.imdbws.com/title.ratings.tsv.gz'
url_3 = 'https://datasets.imdbws.com/title.crew.tsv.gz'
url_4 = 'https://datasets.imdbws.com/name.basics.tsv.gz'
url_5 = 'https://datasets.imdbws.com/title.akas.tsv.gz'

# Tunable filters for the final movie table
TITLE_TYPES = ["short", "movie", "tvMovie", "tvSeries"]
MIN_VOTES = 200        # keeps obscure, barely-rated titles out of the recommendations
TOP_N_MOVIES = 50000   # the whole dataset is too large to embed; experiment with this number

# Download and extract all data; each archive is stored under the last path component of its URL
for dataset_url in (url, url_2, url_3, url_4, url_5):
    download_and_extract(dataset_url, dataset_url.rsplit('/', 1)[-1])

# All data goes to dataframes. Only the columns used below are read, and
# title.basics is read as text so no column ends up with mixed int/str values.
movies_basics = pd.read_csv(
    'title.basics.tsv', delimiter='\t', dtype=str,
    usecols=['tconst', 'titleType', 'primaryTitle', 'isAdult', 'startYear', 'genres'],
)
movies_ratings = pd.read_csv('title.ratings.tsv', delimiter='\t')
movies_crew = pd.read_csv('title.crew.tsv', delimiter='\t', usecols=['tconst', 'directors'])
names_of_crew = pd.read_csv('name.basics.tsv', delimiter='\t', usecols=['nconst', 'primaryName'])
titles_basics = pd.read_csv('title.akas.tsv', delimiter='\t', usecols=['titleId', 'language'])
titles_basics = titles_basics.rename(columns={'titleId': 'tconst'})

# movies_crew filtering: `directors` is a comma-separated list of person ids, so
# joining on the raw string would only ever match single-director titles.
# Use the first listed director as the title's director.
movies_crew = (
    movies_crew
    .assign(nconst=movies_crew['directors'].str.split(',').str[0])
    .drop(columns=['directors'])
    .merge(names_of_crew, on='nconst', how='left')
)

# title type and vote-count filtering, done before the merges so the large
# joins only touch rows that can survive to the final table
movies_basics = movies_basics[movies_basics['titleType'].isin(TITLE_TYPES)]
movies_ratings = movies_ratings[movies_ratings['numVotes'] >= MIN_VOTES]

# Merging all dataframes into one movies_basics
movies_basics = (
    movies_basics
    .merge(movies_ratings, on='tconst', how='inner')
    .merge(titles_basics, on='tconst', how='left')
    .merge(movies_crew, on='tconst', how='left')
)

# Columns kept for the final table (the others had a high amount of missing values)
columns_to_keep = [
    'primaryTitle', 'genres', 'titleType', 'isAdult', 'startYear',
    'averageRating', 'numVotes', 'primaryName',
    'language'
]

movies_basics = (
    movies_basics[['tconst'] + columns_to_keep]
    # IMDb marks missing values with the literal string \N; match it exactly
    # so a value that merely contains "\N" is not wiped out.
    .replace('\\N', numpy.nan)
    .dropna()
    # title.akas has one row per title and language, so every title is repeated
    # after the merge; keep a single row per title.
    .drop_duplicates(subset='tconst')
    .drop(columns=['tconst'])
    # Sorting and selecting the top titles; numVotes breaks rating ties so the
    # selection is reproducible.
    .sort_values(by=['averageRating', 'numVotes'], ascending=False)
    .head(TOP_N_MOVIES)
    .reset_index(drop=True)
)




  movies_basics = pd.read_csv('title.basics.tsv', delimiter='\t')


# Converting information about movies to sentences
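Each movie row is turned into one plain-English sentence (title, type, year, genres, director and rating) so that the sentence-embedding model can compare it with the user's query.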

In [None]:
%timeit
def create_description(row):
    """Render one movie row as a short English description.

    Args:
        row: Mapping (for example a DataFrame row) with the keys
            'primaryTitle', 'genres', 'primaryName', 'titleType',
            'startYear', 'isAdult', 'averageRating' and 'numVotes'.

    Returns:
        A two-sentence string: the first names the title, type, year, genres
        and director; the second gives the rating and vote count, prefixed by
        a mature-audience remark when the row is flagged as adult.
    """
    title = row['primaryTitle']
    genres = row['genres']
    director = row['primaryName']
    film_type = row['titleType']
    release_year = row['startYear']
    average_rating = row['averageRating']

    # The flag may arrive as a number or as text depending on how the column
    # was parsed, so compare on a normalised string.
    is_adult = str(row['isAdult']).strip().lower() in {'1', '1.0', 'true'}

    # Vote counts can be floats after a merge; show them as whole numbers.
    try:
        num_votes = int(float(row['numVotes']))
    except (TypeError, ValueError, OverflowError):
        num_votes = row['numVotes']

    # Start with the title and basic info
    description = f"The {film_type} '{title}', released in {release_year}, is a {genres} feature directed by {director}. "

    # Give the second sentence a subject, adding the adult rating info if applicable
    if is_adult:
        description += "It is intended for mature audiences and "
    else:
        description += "It "

    # Finish with the IMDb rating info
    description += f"has an average IMDb rating of {average_rating}, based on {num_votes} votes."

    return description

# Describe every movie row, store the text in a new `description` column,
# and keep the descriptions as a plain Python list for the embedding step.
movies_basics = movies_basics.assign(
    description=movies_basics.apply(create_description, axis=1)
)
sentences = list(movies_basics['description'])


# Sentence model test

In [None]:
%timeit
# SETTING UP Sentence model
model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')

# @title Click `Show code` in the code cell. { display-mode: "form" }
query = "What movie could u suggest me if I like Star Wars?" # @param {type:"string"}
threshold = 0.4 # @param {type:"slider", min:0, max:1, step:0.1}

embedding_query= model.encode(query, convert_to_tensor=True)

scores = []
for sentence in sentences:
    embedding_sentence = model.encode(sentence, convert_to_tensor=True)
    scores.append(util.pytorch_cos_sim(embedding_query, embedding_sentence))
flat_tensor = torch.cat(scores, dim=0).flatten()
index_of_max = torch.argmax(flat_tensor)

best_score = scores[index_of_max].item()
best_context = sentences[index_of_max]

if best_score > threshold:
    output_sentence = f"Score: {best_score}\nText: {best_context}"
else:
    output_sentence = "No suitable context found"

print(output_sentence)
print(type(output_sentence))
print(type("output sentence"))
del(model)

movie_name_match = re.search(r"'(.*?)'", output_sentence)
movie_name = movie_name_match.group(1) if movie_name_match else ""

# Extract director's name
director_name_match = re.search(r"directed by (.*?)\.", output_sentence)
director_name = director_name_match.group(1) if director_name_match else ""

# Create a new variable with the extracted information
movie_info = f"{movie_name} directed by {director_name}"
movie_info_question = "Write me a reccomendaation for " + movie_info + "?"
# Print the new variable
print(movie_info_question)


Score: 0.5965897440910339
Text: The movie 'Star Wars: Episode VI - Return of the Jedi', released in 1983, is a Action,Adventure,Fantasy feature directed by Richard Marquand. has an average IMDb rating of 8.3, based on 1116347.0 votes.
<class 'str'>
<class 'str'>
Write me a reccomendaation for Star Wars: Episode VI - Return of the Jedi directed by Richard Marquand?


In [None]:
# Sanity check: show the first five query-vs-description similarity scores.
print(scores[:5])

[tensor([[0.2189]], device='cuda:0'), tensor([[0.1826]], device='cuda:0'), tensor([[0.2494]], device='cuda:0'), tensor([[0.1692]], device='cuda:0'), tensor([[0.1937]], device='cuda:0')]


# Deleting unnecessary variables

# Model 2 set up

In [None]:
model_name = "mistralai/Mistral-7B-Instruct-v0.1"

# The suffix files are read from Google Drive; mount it first so a fresh
# runtime fails here (or prompts for access) before the long model download.
DATA_DIR = "/content/drive/MyDrive/mixtral/data"
if not os.path.isdir("/content/drive/MyDrive"):
    drive.mount("/content/drive")

tokenizer = AutoTokenizer.from_pretrained(model_name)
tokenizer.pad_token_id = 0

# Passing `load_in_4bit=True` directly is deprecated; the quantization settings
# now go through a BitsAndBytesConfig object.
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    quantization_config=BitsAndBytesConfig(load_in_4bit=True),
)
# Wrap the model so a control vector can be applied on layers -5 .. -17
model = ControlModel(model, list(range(-5, -18, -1)))

user_tag, asst_tag = "[INST]", "[/INST]"


def _truncated_prefixes(texts, drop_last=0):
    """Return the token-level prefixes of every text, shortest first.

    For a text with n tokens the prefixes of length 1 .. n - 1 - drop_last are
    produced, each converted back to a string with the module-level tokenizer.
    """
    prefixes = []
    for text in texts:
        tokens = tokenizer.tokenize(text)
        for end in range(1, len(tokens) - drop_last):
            prefixes.append(tokenizer.convert_tokens_to_string(tokens[:end]))
    return prefixes


with open(f"{DATA_DIR}/all_truncated_outputs.json", encoding="utf-8") as f:
    output_suffixes = json.load(f)
truncated_output_suffixes = _truncated_prefixes(output_suffixes)
# Same prefixes, built from the first 512 entries only
truncated_output_suffixes_512 = _truncated_prefixes(output_suffixes[:512])

with open(f"{DATA_DIR}/true_facts.json", encoding="utf-8") as f:
    fact_suffixes = json.load(f)
# For the facts, the five longest prefixes of each entry are left out
truncated_fact_suffixes = _truncated_prefixes(fact_suffixes, drop_last=5)

def make_dataset(
    template: str,
    positive_personas: list[str],
    negative_personas: list[str],
    suffix_list: list[str]
) -> list[DatasetEntry]:
    """Build contrastive prompt pairs for every suffix and persona pair.

    Positive and negative personas are paired by position (extra entries of
    the longer list are ignored). For each suffix and each pair, `template` is
    filled once with each persona, and both prompts are wrapped in the
    module-level user/assistant tags followed by the suffix.

    Returns:
        One DatasetEntry per (suffix, persona pair), ordered by suffix first.
    """
    persona_pairs = list(zip(positive_personas, negative_personas))
    return [
        DatasetEntry(
            positive=f"{user_tag} {template.format(persona=good)} {asst_tag} {suffix}",
            negative=f"{user_tag} {template.format(persona=bad)} {asst_tag} {suffix}",
        )
        for suffix in suffix_list
        for good, bad in persona_pairs
    ]


def generate_with_vector(
    input: str,
    vector: ControlVector,
    coeffs: tuple[float, float],
    max_new_tokens: int = 128,
    repetition_penalty: float = 1.5,
    show_baseline: bool = True,
):
    """Print and return generations with the control vector applied both ways.

    Args:
        input: Prompt text. It is wrapped in the user/assistant tags unless it
            already contains the user tag.
        vector: Control vector passed to ``model.set_control``.
        coeffs: ``(positive_coeff, negative_coeff)``; the first must be > 0
            and the second < 0.
        max_new_tokens: Generation length limit.
        repetition_penalty: Repetition penalty passed to ``model.generate``.
        show_baseline: When true, also generate once after ``model.reset()``.

    Returns:
        Dict mapping "baseline" (only if ``show_baseline``), "positive" and
        "negative" to the decoded, stripped generations. The same texts are
        printed under the same banners as before.
    """
    positive_coeff, negative_coeff = coeffs
    assert positive_coeff > 0
    assert negative_coeff < 0

    if user_tag not in input:
        input = f"{user_tag} {input.strip()} {asst_tag}"
    input_ids = tokenizer(input, return_tensors="pt").to(model.device)
    settings = {
        "pad_token_id": tokenizer.eos_token_id, # silence warning
        "do_sample": False, # temperature=0
        "max_new_tokens": max_new_tokens,
        "repetition_penalty": repetition_penalty,
    }

    def _generate() -> str:
        # Decode the full sequence (prompt included), as the printed output always has.
        return tokenizer.decode(model.generate(**input_ids, **settings).squeeze()).strip()

    generations = {}
    try:
        if show_baseline:
            print("==baseline ---------------------------------------------------")
            model.reset()
            generations["baseline"] = _generate()
            print(generations["baseline"])

        print("\n++control ---------------------------------------------------")
        model.set_control(vector, positive_coeff)
        generations["positive"] = _generate()
        print(generations["positive"])

        print("\n--control ---------------------------------------------------")
        model.set_control(vector, negative_coeff)
        generations["negative"] = _generate()
        print(generations["negative"])
    finally:
        # Always call reset, even if generation is interrupted, so a set
        # control does not carry over into later cells.
        model.reset()

    return generations


The `load_in_4bit` and `load_in_8bit` arguments are deprecated and will be removed in the future versions. Please, pass a `BitsAndBytesConfig` object in `quantization_config` argument instead.
`low_cpu_mem_usage` was None, now set to True since model is quantized.


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [None]:
# Contrastive personas for the critic vector. Each persona is a complete noun
# phrase so the filled-in template reads as one grammatical sentence, e.g.
# "Act as if you're an enthusiastic movie critic who ..., recommending movies
# to your audience."
movie_critic = make_dataset(
    "Act as if you're {persona}, recommending movies to your audience.",
    ["an enthusiastic movie critic who loves cinematography and highlights the strengths and merits of movies"],
    ["a highly critical movie critic who focuses on the flaws and shortcomings of movies and finds problems in everything"],
    truncated_output_suffixes,
)



In [None]:
# Reset the ControlModel first (the generation helper does the same before its
# baseline run) — presumably so no previously set control is active during training.
model.reset()
# Train the control vector from the positive/negative prompt pairs in `movie_critic`.
# The recorded run of this cell took roughly ten minutes.
movies_critic_vector = ControlVector.train(model, tokenizer, movie_critic)


100%|██████████| 78/78 [09:26<00:00,  7.26s/it]
100%|██████████| 31/31 [00:27<00:00,  1.15it/s]


In [None]:
# Generate recommendations for the matched movie with the critic vector applied.
# The coefficient tuple is unpacked by generate_with_vector as
# (positive_coeff, negative_coeff): the first must be > 0, the second < 0.
answers = generate_with_vector(
    movie_info_question,
    movies_critic_vector,
    (1.5, -2.2),
    max_new_tokens=256,
    repetition_penalty=1.3,
)

==baseline ---------------------------------------------------
<s> [INST] Write me a reccomendaation for Star Wars: Episode VI - Return of the Jedi directed by Richard Marquand? [/INST] I highly recommend "Star Wars: Episode VI - Return of the Jedi" directed by Richard Marquand. This iconic film is the final installment in George Lucas' original trilogy and brings together all the elements that have made this franchise so beloved over the years. The storyline, characters, special effects, music, and action sequences are all top-notch, making it an unforgettable cinematic experience. Whether you're a longtime fan or new to the series, this movie will not disappoint!</s>

++control ---------------------------------------------------
<s> [INST] Write me a reccomendaation for Star Wars: Episode VI - Return of the Jedi directed by Richard Marquand? [/INST] If you're an avid fan of the Star Wars franchise and are looking for another thrilling adventure to add to your collection, then look no

In [None]:
model_id = "mistralai/Mistral-7B-Instruct-v0.1"

# Drop any model loaded by an earlier cell before loading again, so two copies
# of the 7B model are never resident on the GPU at the same time.
model = None
gc.collect()
if torch.cuda.is_available():
    torch.cuda.empty_cache()

tokenizer = AutoTokenizer.from_pretrained(model_id)

# Passing `load_in_4bit=True` directly is deprecated; use a BitsAndBytesConfig.
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    quantization_config=BitsAndBytesConfig(load_in_4bit=True),
)

# The tokenizer adds the beginning-of-sequence token itself, so the prompt must
# not spell out "<s>" again (that would feed the model a doubled BOS token).
prompt = f"[INST] {movie_info_question} [/INST]"
inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
outputs = model.generate(
    **inputs,
    max_new_tokens=1000,
    pad_token_id=tokenizer.eos_token_id,  # silences the open-end generation warning
)
result = tokenizer.decode(outputs[0], skip_special_tokens=True)

# Show only the newly generated tokens: slice by prompt length in tokens rather
# than by character count of the prompt string.
prompt_length = inputs["input_ids"].shape[1]
generated_text = tokenizer.decode(outputs[0][prompt_length:], skip_special_tokens=True)
print("\n" + generated_text.strip())

The `load_in_4bit` and `load_in_8bit` arguments are deprecated and will be removed in the future versions. Please, pass a `BitsAndBytesConfig` object in `quantization_config` argument instead.
`low_cpu_mem_usage` was None, now set to True since model is quantized.


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.



1. Watch the previous two films in the original trilogy, Star Wars: Episode IV - A New Hope and Star Wars: Episode V - The Empire Strikes Back, to fully understand the context and characters of the story.
2. Pay attention to the themes of redemption, sacrifice, and the power of the Force.
3. Watch the film multiple times to catch all the subtle details and references to previous films.
4. Listen to the iconic score by John Williams and appreciate the use of music to enhance the emotional impact of the film.
5. Take note of the impressive special effects and visuals, which were groundbreaking for their time.
6. Enjoy the epic lightsaber battles, the emotional reunions, and the satisfying conclusion to the story arc.
7. Consider the film's impact on popular culture and its influence on science fiction and fantasy genres.
8. Reflect on the lessons learned from the film, such as the importance of friendship, the dangers of power, and the power of good to triumph over evil.


In [None]:
#del(model,model_name,tokenizer,url,url_2,url_3,url_4,url_5,titles_basics,names_of_crew,movies_ratings,movies_crew,movies_basics,scores,sentences)