In [9]:
# @title Click `Show code` in the code cell. { display-mode: "form" }
movie_info_question = "I want to find a movie with less than 120 minutes made in last 4 years, sci-fi movie" # @param {type:"string"}

In [2]:
%%capture
!pip install -U sentence-transformers
!pip install repeng
!pip install -q -U bitsandbytes
!pip install -q -U git+https://github.com/huggingface/transformers.git
!pip install -q -U git+https://github.com/huggingface/peft.git
!pip install -q -U git+https://github.com/huggingface/accelerate.git


IMPORT LIBRARIES

In [7]:
%timeit
import pandas as pd
import requests
import gzip
import shutil
import os
from google.colab import drive
import torch
import warnings
import random
import json
import torch
import re
import numpy
import bitsandbytes
import accelerate
from transformers import AutoModelForCausalLM, AutoTokenizer
from sentence_transformers import SentenceTransformer, util
from repeng import ControlVector, ControlModel, DatasetEntry
from sklearn.neighbors import NearestNeighbors

# Data loading
1.   Downloads, Extracts, and Preprocesses IMDb Movie Data
2.   Fetches compressed movie datasets from IMDb's repositories.
3.   Unzips the downloaded files.
4.  Loads the data into Pandas dataframes.
5.  Combines the three dataframes to include movie ratings and movie crew.











In [4]:
%timeit
# Function to download and extract the files
def download_and_extract(url, file_name):
    response = requests.get(url, stream=True)
    if response.status_code == 200:
        with open(file_name, 'wb') as file:
            file.write(response.raw.read())
        # Extract the .gz file
        with gzip.open(file_name, 'rb') as f_in:
            with open(file_name[:-3], 'wb') as f_out:
                shutil.copyfileobj(f_in, f_out)
        os.remove(file_name)  # Remove the .gz file after extraction

# URLs for download
url = 'https://datasets.imdbws.com/title.basics.tsv.gz'
url_2 = 'https://datasets.imdbws.com/title.ratings.tsv.gz'
url_3 = 'https://datasets.imdbws.com/title.crew.tsv.gz'
url_4 = 'https://datasets.imdbws.com/name.basics.tsv.gz'
url_5 = 'https://datasets.imdbws.com/title.akas.tsv.gz'

# Download and extract all data
download_and_extract(url, 'title.basics.tsv.gz')
download_and_extract(url_2, 'title.ratings.tsv.gz')
download_and_extract(url_3, 'title.crew.tsv.gz')
download_and_extract(url_4, 'name.basics.tsv.gz')
download_and_extract(url_5, 'title.akas.tsv.gz')

# All data goes to dataframes
movies_basics = pd.read_csv('title.basics.tsv', delimiter='\t')
movies_ratings = pd.read_csv('title.ratings.tsv', delimiter='\t')
movies_crew = pd.read_csv('title.crew.tsv', delimiter='\t')
names_of_crew = pd.read_csv('name.basics.tsv', delimiter='\t')
titles_basics = pd.read_csv('title.akas.tsv', delimiter='\t')

# Dropping some columns that I dont need from titles_basics and renaming
titles_basics = titles_basics.drop(columns=['ordering', 'region', 'isOriginalTitle'])
titles_basics = titles_basics.rename(columns={'titleId': 'tconst'})

#movies_crew filtering
names_of_crew = names_of_crew[['nconst', 'primaryName']]
movies_crew = movies_crew[['tconst', 'directors']]
movies_crew = movies_crew.rename(columns={'directors': 'nconst'})

# title type filtering


#Merging all dataframes into one movies_basics
movies_crew = movies_crew.merge(names_of_crew, on='nconst', how='left')
movies_basics = movies_basics.merge(titles_basics, on='tconst', how='left')
movies_basics = movies_basics.merge(movies_ratings, on='tconst', how='left')
movies_basics = movies_basics.merge(movies_crew, on='tconst', how='left')

# Removing rows where I analyzed high ammount of missing values
columns_to_keep = [
    'primaryTitle', 'genres', 'titleType', 'isAdult', 'startYear',
    'averageRating', 'numVotes', 'primaryName',
    'language'
]
# Drop columns that are not in the list of columns to keep + numVotes above 200 so I dont have any Indian telenovels reccomended
movies_basics = movies_basics[columns_to_keep]
movies_basics = movies_basics[movies_basics['numVotes'] >= 200]

# Dropping missing values + different types
movies_basics = movies_basics[movies_basics['titleType'].isin(["short", "movie", "tvMovie", "tvSeries"])]
movies_basics.replace(r'\\N', numpy.nan, regex=True, inplace=True)
movies_basics.dropna(inplace=True)


# Sorting and selecting top 50k movies, since whole dataset might be around 1mil rows and its too high, will experiment with this number
movies_basics = movies_basics.sort_values(by='averageRating', ascending=False)
movies_basics = movies_basics.head(50000)




  movies_basics = pd.read_csv('title.basics.tsv', delimiter='\t')


# Converting information about movies to sentences
Here I convert information about movies to simple sentences

In [5]:
%timeit
def create_description(row):
    title = row['primaryTitle']
    genres = row['genres']
    director = row['primaryName']
    film_type = row['titleType']
    release_year = row['startYear']
    is_adult = row['isAdult']
    average_rating = row['averageRating']
    num_votes = row['numVotes']

    # Start with the title and basic info
    description = f"The {film_type} '{title}', released in {release_year}, is a {genres} feature directed by {director}. "

    # Add the adult rating info if applicable
    if is_adult == 'True':
        description += "It is intended for mature audiences and "

    # Finish with the IMDb rating info
    description += f"has an average IMDb rating of {average_rating}, based on {num_votes} votes."

    return description

# Apply the function to the movies DataFrame
movies_basics['description'] = movies_basics.apply(create_description, axis=1)
sentences = movies_basics['description'].tolist()


# Sentence model test

In [18]:
model_id = "mistralai/Mistral-7B-Instruct-v0.1"
tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForCausalLM.from_pretrained(model_id, load_in_4bit=True)

embedding_model = SentenceTransformer('all-MiniLM-L6-v2')

index = NearestNeighbors(metric='cosine', algorithm='brute')
metadata = []

def search_context(query, top_k=5):
  query_embedding = embedding_model.encode([query], convert_to_tensor=True)
  query_embedding_cpu = query_embedding.cpu().numpy() # Move tensor to CPU and convert to NumPy array
  distances, indices = index.kneighbors(query_embedding_cpu, n_neighbors=top_k)
  return [metadata[idx] for idx in indices[0]]

def save_context(sentences):
  global metadata
  embeddings = embedding_model.encode(sentences, convert_to_tensor=True)
  embeddings_cpu = embeddings.cpu().numpy() # Move tensor to CPU and convert to NumPy array
  index.fit(embeddings_cpu)
  metadata.extend(sentences)

save_context(sentences)

relevant_context = search_context(movie_info_question)
prompt = f"<s> [INST] {movie_info_question} [/INST] [CONTEXT] {' '.join(relevant_context)} [/CONTEXT] "

inputs = tokenizer(prompt, return_tensors="pt").to(0)
outputs = model.generate(**inputs, max_new_tokens=256)
result = tokenizer.decode(outputs[0], skip_special_tokens=True)
print("\n" + result[len(prompt)-3:])

The `load_in_4bit` and `load_in_8bit` arguments are deprecated and will be removed in the future versions. Please, pass a `BitsAndBytesConfig` object in `quantization_config` argument instead.
`low_cpu_mem_usage` was None, now set to True since model is quantized.


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.



1. "Temporal" (2022) - Drama, Sci-Fi, Short feature directed by Akash Sunethkumara. IMDb rating: 8.7/10 (based on 233 votes)
2. "105 Minutes" (2024) - Horror, Thriller feature directed by Raju Dussa. IMDb rating: 8.3/10 (based on 2167 votes)


# Model 2 set up

In [19]:
tokenizer.pad_token_id = 0
model = ControlModel(model, list(range(-5, -18, -1)))

user_tag, asst_tag = "[INST]", "[/INST]"

with open("/content/drive/MyDrive/mixtral/data/all_truncated_outputs.json") as f:
    output_suffixes = json.load(f)
truncated_output_suffixes = [
    tokenizer.convert_tokens_to_string(tokens[:i])
    for tokens in (tokenizer.tokenize(s) for s in output_suffixes)
    for i in range(1, len(tokens))
]
truncated_output_suffixes_512 = [
    tokenizer.convert_tokens_to_string(tokens[:i])
    for tokens in (tokenizer.tokenize(s) for s in output_suffixes[:512])
    for i in range(1, len(tokens))
]

with open("/content/drive/MyDrive/mixtral/data/true_facts.json") as f:
    fact_suffixes = json.load(f)
truncated_fact_suffixes = [
    tokenizer.convert_tokens_to_string(tokens[:i])
    for tokens in (tokenizer.tokenize(s) for s in fact_suffixes)
    for i in range(1, len(tokens) - 5)
]

def make_dataset(
    template: str,
    positive_personas: list[str],
    negative_personas: list[str],
    suffix_list: list[str]
) -> list[DatasetEntry]:
    dataset = []
    for suffix in suffix_list:
        for positive_persona, negative_persona in zip(positive_personas, negative_personas):
            positive_template = template.format(persona=positive_persona)
            negative_template = template.format(persona=negative_persona)
            dataset.append(
                DatasetEntry(
                    positive=f"{user_tag} {positive_template} {asst_tag} {suffix}",
                    negative=f"{user_tag} {negative_template} {asst_tag} {suffix}",
                )
            )
    return dataset


def generate_with_vector(
    input: str,
    vector: ControlVector,
    coeffs: tuple[float, float],
    max_new_tokens: int = 128,
    repetition_penalty: float = 1.5,
    show_baseline: bool = True,
):
    positive_coeff, negative_coeff = coeffs
    assert positive_coeff > 0
    assert negative_coeff < 0

    if user_tag not in input:
        input = f"{user_tag} {input.strip()} {asst_tag}"
    input_ids = tokenizer(input, return_tensors="pt").to(model.device)
    settings = {
        "pad_token_id": tokenizer.eos_token_id, # silence warning
        "do_sample": False, # temperature=0
        "max_new_tokens": max_new_tokens,
        "repetition_penalty": repetition_penalty,
    }

    if show_baseline:
        print("==baseline ---------------------------------------------------")
        model.reset()
        print(tokenizer.decode(model.generate(**input_ids, **settings).squeeze()).strip())

    print("\n++control ---------------------------------------------------")
    model.set_control(vector, positive_coeff)
    print(tokenizer.decode(model.generate(**input_ids, **settings).squeeze()).strip())

    print("\n--control ---------------------------------------------------")
    model.set_control(vector, negative_coeff)
    print(tokenizer.decode(model.generate(**input_ids, **settings).squeeze()).strip())
    model.reset()


In [20]:
movie_critic = make_dataset(
    "Act as if you're a {persona} movie critic recommending movies to your audience.",
    ["enthusiastic movie crtitic, who loves cinematography and highlighting the strengths and merits of movies"],
    ["highly critical, focusing on the flaws and shortcomings of movies, searching problems on everything"],
    truncated_output_suffixes,
)



In [21]:
model.reset()
movies_critic_vector = ControlVector.train(model, tokenizer, movie_critic)


100%|██████████| 78/78 [09:31<00:00,  7.33s/it]
100%|██████████| 31/31 [00:23<00:00,  1.29it/s]


In [22]:
answers = generate_with_vector(
    movie_info_question,
    movies_critic_vector,
    (1.5, -2.2),
    max_new_tokens=1000,
    repetition_penalty=1.3,
)

==baseline ---------------------------------------------------
<s> [INST] I want to find a movie with less than 120 minutes made in last 4 years, sci-fi movie [/INST] Sure! Here are some options for you:

* "Arrival" (2016) - directed by Denis Villeneuve and starring Amy Adams. This science fiction film is about an expert linguist who helps the US government communicate with extraterrestrial beings that have arrived on Earth. It has a runtime of approximately 93 minutes.
* "Ex Machina" (2015) - directed by Alex Garland and starring Alicia Vikander, Domhnall Gleeson, and Oscar Isaac. This psychological thriller set in a near future where artificial intelligence becomes sentient explores themes such as consciousness and humanity's relationship with technology. The film runs at around 87 minutes.
* "Her" (2013) - directed by Spike Jonze and starring Joaquin Phoenix and Scarlett Johansson. Set in a dystopian future where people can interact with their own personal operating system, this ro

In [33]:
# @title Click `Show code` in the code cell. { display-mode: "form" }
movie_info_question = "Write me 5 movies I should watch if I love Indian telenovels" # @param {type:"string"}
relevant_context = search_context(movie_info_question)
prompt = f"<s> [INST] {movie_info_question} [/INST] [CONTEXT] {' '.join(relevant_context)} [/CONTEXT] "

inputs = tokenizer(prompt, return_tensors="pt").to(0)
outputs = model.generate(**inputs, max_new_tokens=1000)
result = tokenizer.decode(outputs[0], skip_special_tokens=True)
print("\n" + result[len(prompt)-3:])


print("----------------- Control vector -----------------")
answers = generate_with_vector(
    movie_info_question,
    movies_critic_vector,
    (1.5, -2.2),
    max_new_tokens=1000,
    repetition_penalty=1.3,
)

Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.



1. "Dilwale Dulhania Le Jayenge" (1995) - This romantic drama film directed by Aditya Chopra is a classic Indian telenovel that tells the story of a young man who travels to London to find his love.
2. "Kuch Kuch Hota Hai" (1998) - This romantic drama film directed by Karan Johar is another popular Indian telenovel that explores the themes of love, friendship, and destiny.
3. "Mohabbatein" (2000) - This romantic drama film directed by Aditya Chopra is a story of two young people who fall in love in the 1940s, but their families disapprove of their relationship.
4. "Dil Se" (1998) - This romantic drama film directed by Mani Ratnam is a story of a young man who falls in love with a woman from a different caste, and the challenges they face in their relationship.
5. "Titanic" (1997) - This epic romantic film directed by James Cameron is a story of a wealthy woman and a poor man who fall in love on the doomed ship, and the sacrifices they make for each other.
----------------- Control vec