# **Installation and Imports**

In [None]:
!pip install sentence_transformers
!pip install requests
!pip install chromadb

In [None]:
import pandas as pd
import chromadb
from chromadb.utils import embedding_functions
from sentence_transformers import SentenceTransformer
from huggingface_hub import InferenceClient
import numpy as np
import os
import requests
from google.colab import files
import re
from xml.etree import ElementTree as ET
import html
import requests

# **File Upload and Reading**

In [None]:
# Prompt for a file upload through the Colab `files` helper and keep the
# return value in `uploaded`.
# NOTE(review): `uploaded` is not read again in the visible cells — confirm it is needed.
uploaded = files.upload()

Saving 110_97723_133067_million_dollar_baby.xml to 110_97723_133067_million_dollar_baby (1).xml


In [None]:
# Location of the subtitle XML that the following cell reads.
# NOTE(review): the recorded upload log shows the file being saved with a " (1)"
# suffix, so this path may point at an earlier copy — confirm it is the intended file.
file_path = '/content/110_97723_133067_million_dollar_baby.xml'

In [None]:
# Load the whole subtitle XML into memory as one string.
# The encoding is pinned so decoding does not depend on the machine's locale default.
# NOTE(review): assumes the XML is UTF-8 — confirm against the file's XML declaration.
with open(file_path, 'r', encoding='utf-8') as file:
    text = file.read()

# **Text Preprocessing**

In [None]:
def preprocess_subtitles(text):
    """Extract sentence ids and cleaned sentence text from subtitle XML.

    Every ``<s id="N">...</s>`` element in ``text`` yields one entry. The
    element body is HTML-unescaped, ``<w>`` wrappers are replaced by their
    content, any remaining tags are dropped and whitespace is collapsed to
    single spaces.

    Args:
        text: Raw XML content as a string.

    Returns:
        A tuple ``(ids, texts)`` of two parallel lists: the sentence ids (as
        strings, in document order) and the cleaned sentence texts.
    """
    def _clean(raw_body):
        # Decode entities first so encoded markup (e.g. &lt;i&gt;) is stripped as a tag below.
        cleaned = html.unescape(raw_body)
        # Unwrap word tokens, then drop every other tag.
        cleaned = re.sub(r'<w id=".*?">(.*?)<\/w>', r'\1', cleaned)
        cleaned = re.sub(r'<.*?>', '', cleaned)
        # Remove known leftover italic/bracket fragments.
        for fragment in ('<i>[', ']</i>', '</ i>'):
            cleaned = cleaned.replace(fragment, '')
        # Collapse all whitespace runs (including newlines) into single spaces.
        return ' '.join(cleaned.split())

    sentences = re.findall(r'<s id="(\d+)">(.*?)<\/s>', text, re.DOTALL)
    ids = [sentence_id for sentence_id, _ in sentences]
    texts = [_clean(body) for _, body in sentences]
    return ids, texts

In [None]:
# Parse the raw XML, then spot-check the second sentence followed by its id.
ids_list, preprocessed_subtitles = preprocess_subtitles(text)
print(preprocessed_subtitles[1], ids_list[1], sep="\n")

I' ve only ever met one man i wouldn' t want to fight when i met him he was already the best cut man in the business started training and managing in the 60' s , never lost his gift .
2


# **Text Embedding and Retrieval System**

In [None]:
# Create the Chroma client that will hold the subtitle index.
chroma_client = chromadb.Client()
# get_or_create_collection keeps this cell re-runnable: create_collection raises
# when the collection already exists in the running kernel.
collection = chroma_client.get_or_create_collection(name="subtitles_million")

In [None]:
# Despite its name, this list holds the cleaned subtitle texts (coerced to str);
# it is passed to the collection as the documents, while ids come from ids_list.
sentence_ids = [str(subtitle) for subtitle in preprocessed_subtitles]

In [None]:
# Index every cleaned subtitle line under its sentence id.
collection.add(ids=ids_list, documents=sentence_ids)

In [None]:
# Cache of loaded embedding models keyed by model name, so the weights are
# loaded once per kernel instead of on every call.
_EMBEDDING_MODEL_CACHE = {}


def text_embedding(text, model=None) -> np.ndarray:
    """Embed ``text`` with the all-MiniLM-L6-v2 sentence-transformer.

    Args:
        text: The string (or whatever ``model.encode`` accepts) to embed.
        model: Optional object exposing ``encode(text)``. When omitted, the
            ``sentence-transformers/all-MiniLM-L6-v2`` model is loaded on first
            use and reused on later calls.

    Returns:
        Whatever ``model.encode(text)`` returns; callers in this notebook call
        ``.tolist()`` on it, i.e. they expect a NumPy array.
    """
    if model is None:
        model_name = 'sentence-transformers/all-MiniLM-L6-v2'
        if model_name not in _EMBEDDING_MODEL_CACHE:
            _EMBEDDING_MODEL_CACHE[model_name] = SentenceTransformer(model_name)
        model = _EMBEDDING_MODEL_CACHE[model_name]
    return model.encode(text)

In [None]:
def generate_context(query, n_results=400):
    """Retrieve the subtitle lines most similar to ``query`` as one string.

    Args:
        query: Natural-language question used for the similarity search.
        n_results: Maximum number of subtitle lines to retrieve. Defaults to
            400, the value previously hard-coded here.

    Returns:
        The retrieved documents joined with newlines, or an empty string when
        the collection holds no documents.
    """
    # Never ask for more neighbours than the collection holds.
    available = collection.count()
    if available == 0:
        return ""

    vector = text_embedding(query).tolist()

    results = collection.query(
        query_embeddings=vector,
        n_results=min(n_results, available),
        include=["documents"]
    )

    # results['documents'] holds one list per query embedding; only one query was sent.
    return "\n".join(str(item) for item in results['documents'][0])

In [None]:
# The question to answer, and the subtitle lines retrieved for it.
query = "What should I know about this movie before recommending it to my 10 years old kid"
context = generate_context(query)

In [None]:
# Display the retrieved context string for inspection.
context

"Wants to know what your reading .\nYou watch it ?\nLittle girl tends to be coming along .\nWhat' s this ?\nYou already taught me everything I need to know .\nIf i' m too old for this then i got nothing . that enough truth to see to you ?\nSally' s trying to set a fight with julia ?\nIt' s beautiful man , it' s like poetry .\nHeard about Willie , it' s cold , it' s dead cold , plus it wouldn' t be so bad if you weren' t so damn old .\nDangerous showed up a couple years back , it comes to everything to LA with errilville , ... new boyfriend , apparently errilville got lost , and ended up back in Texas .\nWhat is it ?\nNothing .\nYou wont never regret it .\nNo take the bag , haven' t seen it in 20yrs anyway .\nYeah I watched it .\nGracey broke down crying when she saw it .\nI' m trying to read here .\nWell i' d be tempted but , i couldn' t say for sure , might find it' s way to the track .\nSugar ray would do that . Girls got sugar .\nWell maggie fitzgerald , what' s up ?\nWhat ya readin

In [None]:
# User message: the retrieved subtitle context followed by the question.
user_prompt = (
    "\n"
    "Based on the context:\n"
    f"{context}\n"
    "Answer the below query:\n"
    f"{query}\n"
)

In [None]:
# Display the assembled user prompt for inspection.
user_prompt

"\nBased on the context:\nWants to know what your reading .\nYou watch it ?\nLittle girl tends to be coming along .\nWhat' s this ?\nYou already taught me everything I need to know .\nIf i' m too old for this then i got nothing . that enough truth to see to you ?\nSally' s trying to set a fight with julia ?\nIt' s beautiful man , it' s like poetry .\nHeard about Willie , it' s cold , it' s dead cold , plus it wouldn' t be so bad if you weren' t so damn old .\nDangerous showed up a couple years back , it comes to everything to LA with errilville , ... new boyfriend , apparently errilville got lost , and ended up back in Texas .\nWhat is it ?\nNothing .\nYou wont never regret it .\nNo take the bag , haven' t seen it in 20yrs anyway .\nYeah I watched it .\nGracey broke down crying when she saw it .\nI' m trying to read here .\nWell i' d be tempted but , i couldn' t say for sure , might find it' s way to the track .\nSugar ray would do that . Girls got sugar .\nWell maggie fitzgerald , wha

In [None]:
# System message for the chat model: asks for an assessment of violence, scariness,
# positive role models and overall messages, with example lines from the subtitles.
system_prompt = "Before deciding whether a movie is appropriate for young audiences, such as children around 10 years old, or for family viewing, I need to assess its content meticulously. This evaluation requires a thorough analysis of the provided subtitles to identify specific content aspects. Please assist by addressing the following points based on the subtitles: 1. Violence: Identify instances of violence within the movie. Describe the context and nature of these scenes, and provide excerpts from the subtitles that illustrate violent content. 2. Scariness: Point out scenes that could be considered scary or intense for younger viewers. Provide relevant lines from the subtitles to help gauge the level of scariness. 3. Positive Role Models: Analyze characters or scenarios depicted in the subtitles that could serve as positive examples for young viewers. Highlight how these elements may offer inspirational or educational value. 4. Overall Messages: Summarize the core messages or themes conveyed through the movie, with an emphasis on positive and constructive themes such as friendship, courage, and ethical decision-making. Additionally, if the movie contains any language, scenes, or themes that might be deemed inappropriate or questionable by parents, please note these with specific examples from the subtitles provide atleast 3 examples of subtitles per genre. The goal is to compile a comprehensive overview to assist in determining the suitability of the movie for a young audience or family viewing."


In [None]:
# Display the system prompt for inspection.
system_prompt

'Before deciding whether a movie is appropriate for young audiences, such as children around 10 years old, or for family viewing, I need to assess its content meticulously. This evaluation requires a thorough analysis of the provided subtitles to identify specific content aspects. Please assist by addressing the following points based on the subtitles: 1. Violence: Identify instances of violence within the movie. Describe the context and nature of these scenes, and provide excerpts from the subtitles that illustrate violent content. 2. Scariness: Point out scenes that could be considered scary or intense for younger viewers. Provide relevant lines from the subtitles to help gauge the level of scariness. 3. Positive Role Models: Analyze characters or scenarios depicted in the subtitles that could serve as positive examples for young viewers. Highlight how these elements may offer inspirational or educational value. 4. Overall Messages: Summarize the core messages or themes conveyed thro

# **Generative Model**

In [None]:
# Read the key from the environment so a real secret never has to be typed into
# (and saved with) the notebook; the original placeholder remains the fallback.
api_key = os.environ.get("OPENAI_API_KEY", "put your api key")


In [None]:
# HTTP headers for the chat-completions call; the key is sent as a bearer token.
headers = {
    "Content-Type": "application/json",
    "Authorization": f"Bearer {api_key}"
}

# Initial context setup for detailed responses
messages = [
    {"role": "system", "content":system_prompt},
    {"role": "user", "content": user_prompt}
]

# Example of a follow-up for more details
follow_up = {"role": "user", "content": "Could you go into more detail about the sentences choosen"}

# Including the follow-up in the messages (sent in the same request, directly
# after the first user message).
messages.append(follow_up)


data = {
    "model": "gpt-3.5-turbo",
    "messages": messages,
}


# A timeout stops the cell hanging forever on a stalled connection, and
# raise_for_status surfaces HTTP errors (bad key, rate limit, oversized prompt)
# here instead of as a confusing KeyError when the payload is read later.
response = requests.post(
    "https://api.openai.com/v1/chat/completions",
    headers=headers,
    json=data,
    timeout=120,
)
response.raise_for_status()
response_data = response.json()


In [None]:
# Print the first completion, one blank-line-separated section per line group.
# .get() is used because an API error payload has no "choices" key, which
# previously raised KeyError here.
choices = response_data.get("choices")
if choices:
    content = choices[0]["message"]["content"]
    # Splitting the content by sections for better readability
    sections = content.split("\n\n")
    for section in sections:
        print(section.strip())
elif "error" in response_data:
    # Show the error returned by the API rather than a generic message.
    print(f"API error: {response_data['error']}")
else:
    print("No content available.")

Certainly! Let's delve into the subtitles provided to identify instances that may impact younger audiences:
1. **Violence:**
   - "*You gotta move your feet, that's one of the best things I'm gonna be able to teach ya.*"
   - "*Then I might as well just keep calling you it.*"
   - "*You through a punch you get the hell out of this gym.*"
These excerpts contain references to physical actions that suggest potential violence or aggression, which may not be suitable for young viewers, especially if portrayed in a realistic or intense manner.
2. **Scariness:**
   - "*She keeps hitting it like that she gonna break her wrists.*"
   - "*Didn't take Maggie long to hit a strive.*"
   - "*He's hurting me, making my heart watching him punch the air like it's gonna punch back.*"
These lines could be perceived as intense or potentially scary, especially in the context of physical harm or injury, which might be distressing for younger viewers.
3. **Positive Role Models:**
   - "*People say the most i

In [None]:
# Display the full message list that was sent to the model.
messages

[{'role': 'system',
  'content': 'Before deciding whether a movie is appropriate for young audiences, such as children around 10 years old, or for family viewing, I need to assess its content meticulously. This evaluation requires a thorough analysis of the provided subtitles to identify specific content aspects. Please assist by addressing the following points based on the subtitles: 1. Violence: Identify instances of violence within the movie. Describe the context and nature of these scenes, and provide excerpts from the subtitles that illustrate violent content. 2. Scariness: Point out scenes that could be considered scary or intense for younger viewers. Provide relevant lines from the subtitles to help gauge the level of scariness. 3. Positive Role Models: Analyze characters or scenarios depicted in the subtitles that could serve as positive examples for young viewers. Highlight how these elements may offer inspirational or educational value. 4. Overall Messages: Summarize the core