In [None]:
from google.colab import drive
drive.mount('/content/drive')


Mounted at /content/drive


In [40]:
# Read the token from the file
token_path = '/content/drive/MyDrive/github_token.txt'
with open(token_path, 'r') as token_file:
    github_token = token_file.read().strip()


# Define the GitHub repository URL with the token for authentication
repo_url = f"https://{github_token}@github.com/nsharma4/llm.git"


# Update the remote URL to include the token
!git -C /content/drive/MyDrive/llm/ remote set-url origin {repo_url}




In [41]:

!pip install -r requirements.txt


/content/drive/MyDrive/llm


In [None]:
!git -C /content/drive/MyDrive/llm/ add .




Reinitialized existing Git repository in /content/drive/.shortcut-targets-by-id/18XarxB4-UrU23qBHLzKLjjeavqd8T6Js/llm/.git/


In [None]:
import os
import pandas as pd
import langchain
import nltk
from kaggle.api.kaggle_api_extended import KaggleApi
from langchain.vectorstores import Chroma
from langchain.embeddings import SentenceTransformerEmbeddings
from sentence_transformers import SentenceTransformer
from transformers import AutoModelForCausalLM, AutoTokenizer
from langchain.docstore.document import Document
from nltk.corpus import wordnet
import torch

# Ensure NLTK resources are downloaded
nltk.download('wordnet')

# Authenticate with the Kaggle API
api = KaggleApi()



  from tqdm.autonotebook import tqdm, trange
[nltk_data] Downloading package wordnet to /root/nltk_data...


In [None]:
# Download the "All The News" dataset from Kaggle
api.dataset_download_files('asad1m9a9h6mood/news-articles', path='data/', unzip=True)

# Load and process the dataset
file_path = 'data/Articles.csv'  # Adjust the path based on dataset name
df = pd.read_csv(file_path, encoding='ISO-8859-1')


Dataset URL: https://www.kaggle.com/datasets/asad1m9a9h6mood/news-articles


In [None]:
# Convert documents into the required format for Chroma
document_list = [
    Document(
        page_content=row['Article'],
        metadata={
            'date': row['Date'],
            'heading': row['Heading'],
            'news_type': row['NewsType']
        }
    )
    for _, row in df.iterrows()
]

# NOTE Truncate document list because it takes too long to process
max_length = 1000
if len(document_list) > max_length:
    document_list = document_list[:max_length]


In [None]:
# Initialize the SentenceTransformer and Chroma vectorstore
# Embed documents using SentenceTransformer

embedding_function = SentenceTransformerEmbeddings(model_name = 'all-MiniLM-L6-v2')

# Create a vector store using Chroma
vector_store = Chroma.from_documents(document_list, embedding_function)


In [None]:
def expand_query(query):
    expanded_terms = []
    for word in query.split():
        synsets = wordnet.synsets(word)
        if synsets:
            expanded_terms.extend([lemma.name() for lemma in synsets[0].lemmas()])
    return " ".join(set(query.split() + expanded_terms))

def dynamic_retrieval(query, docs, vector_store, k=2):
    expanded_query = expand_query(query)
    new_docs = vector_store.similarity_search(expanded_query, k=k)
    retrieved_docs = {doc.metadata['heading'] for doc in docs}
    unique_new_docs = [doc for doc in new_docs if doc.metadata['heading'] not in retrieved_docs]
    return unique_new_docs


In [None]:
# Set the correct model path
model_path = "/content/drive/MyDrive/llm/Mistral-7B-v0.1"

# Load the model and tokenizer
model = AutoModelForCausalLM.from_pretrained(model_path, device_map="auto", torch_dtype=torch.bfloat16)
tokenizer = AutoTokenizer.from_pretrained(model_path)

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]



In [None]:
# Expand the initial query
query = "Tell me about what the news in Karachi is based on what you know"

# Initial document retrieval using the query
current_docs = vector_store.similarity_search(query, k=2)
context = " ".join([doc.page_content for doc in current_docs])

# Caching tokenized context to avoid re-tokenization
cached_inputs = tokenizer(context, return_tensors="pt")

In [None]:
max_iterations = 3
for i in range(max_iterations):
    # Generate text
    with torch.no_grad():
        outputs = model.generate(
            cached_inputs['input_ids'],
            max_new_tokens=40,
            do_sample=True,
            top_k=50,
            top_p=0.95,
            temperature=0.7,
            length_penalty=1.5
        )

    generated_text = tokenizer.decode(outputs[0], skip_special_tokens=True)
    print(f"Iteration {i+1} generated text:\n{generated_text}\n")

    # Check if more information is needed
    if len(generated_text.split()) < 20 or "need more information" in generated_text.lower():
        new_docs = dynamic_retrieval(generated_text, current_docs, vector_store, k=2)
        if new_docs:
            current_docs += new_docs
            new_context = " ".join([doc.page_content for doc in new_docs])
            context += " " + new_context
            cached_inputs = tokenizer(context, return_tensors="pt")
        else:
            print("No new relevant information found. Ending generation.")
            break
    else:
        print("Sufficient information generated. Ending generation.")
        break

# # Cell 10: Clean up (optional)
# del model
# torch.cuda.empty_cache()

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
The attention mask is not set and cannot be inferred from input because pad token is same as eos token. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.


Iteration 1 generated text:
strong>KARACHI/ISLAMABAD: Pakistani stocks are soaring, improved security is fuelling economic growth and the South Asian nation will be upgraded to "emerging market" status by index provider MSCI.</strongBut for Ali Saigol, co-founder of a private equity firm, convincing foreign investors to overlook the country's violent past and bet on its economy remains a hard sell. He said pitches often encountered the same problem: Pakistan's image.After a turbulent decade in which Islamist militants staged frequent gun and bomb attacks on cities, some investors still refuse to visit Pakistan and insist on meeting Saigol in Dubai."Most people have never been to Pakistan. It's a huge problem," said Saigol, sipping a cappuccino in an upscale cafe in Islamabad, Pakistan's leafy, affluent capital."When people visit Pakistan, they see it's actually opposite of what they expect."Saigol's frustration goes to the heart of Pakistan's struggle to attract foreign investment and 