## Setup

In [None]:
pip install -r requirements.txt

In [10]:
from torch import cuda, bfloat16
import transformers

In [None]:
!pip install -qU transformers accelerate einops xformers bitsandbytes sentence_transformers

In [5]:
import langchain
import os
import pickle
import time
import openai
import pandas as pd

In [6]:
from langchain import OpenAI
from langchain.chains import RetrievalQAWithSourcesChain
from langchain.evaluation.qa import QAEvalChain
from langchain.chains.qa_with_sources.loading import load_qa_with_sources_chain
from langchain.text_splitter import RecursiveCharacterTextSplitter, CharacterTextSplitter
from langchain.document_loaders import UnstructuredURLLoader
from langchain.embeddings import OpenAIEmbeddings
from langchain.vectorstores import FAISS, Chroma
import requests
from bs4 import BeautifulSoup
import torch
import numpy as np
from google.colab import userdata

In [7]:
# Copy the OpenAI key from Colab's user secrets into the process environment.
# Presumably this is how the OpenAI()/OpenAIEmbeddings() objects below, which are
# built without an explicit key, obtain their credentials - TODO confirm.
os.environ['OPENAI_API_KEY'] = userdata.get('OPENAI_API_KEY')

In [8]:
# Completion model shared by the retrieval chains and by the QAEvalChain grader further down.
llm = OpenAI(temperature=0.9, max_tokens=500, model = 'text-davinci-003')

In [9]:
# Configure the `openai` module, which ask_openai() calls directly, with the same Colab secret.
openai.api_key = userdata.get('OPENAI_API_KEY')

def ask_openai(question, model="text-davinci-003", temperature=0.9, max_tokens=500):
    """Send `question` as a prompt to `openai.Completion.create` and return the reply text.

    Args:
        question: Prompt text sent as-is; no retrieved context is added.
        model: Passed to the API as the `engine` argument.
        temperature: Sampling temperature forwarded to the API.
        max_tokens: Completion length limit forwarded to the API.

    Returns:
        The text of the first returned choice, stripped of surrounding whitespace.
    """
    request_args = {
        "engine": model,
        "prompt": question,
        "temperature": temperature,
        "max_tokens": max_tokens,
    }
    completion = openai.Completion.create(**request_args)
    first_choice = completion.choices[0]
    return first_choice.text.strip()

## Load Data

In [12]:
def get_links_from_website(key_word, pages_num):
  """Collect blog-post links from Analytics Vidhya search results for a keyword.

  Search result pages 1..pages_num are fetched one by one. A link is kept when
  it points under 'analyticsvidhya.com/blog/' and contains the keyword with
  spaces replaced by dashes (e.g. 'random forest' -> 'random-forest').

  Args:
    key_word: Search phrase; spaces become '+' in the query string and '-' in
      the slug filter. The slug match is case-sensitive.
    pages_num: Number of result pages to scan; 0 or less scans nothing.

  Returns:
    List of unique matching URLs in first-seen order. Pages that do not answer
    with HTTP 200 are skipped.

  Raises:
    requests.exceptions.RequestException: on connection errors or timeouts.
  """
  links = []
  seen = set()
  key_word_plus = key_word.replace(' ', '+')
  key_word_dash = key_word.replace(' ', '-')
  for page_index in range(1, pages_num + 1):
    url = f"https://www.analyticsvidhya.com/page/{page_index}/?s={key_word_plus}"

    # Bound the wait so a stalled connection cannot hang the notebook indefinitely.
    response = requests.get(url, timeout=30)
    if response.status_code != 200:
      continue

    soup = BeautifulSoup(response.text, 'html.parser')
    for tag in soup.find_all('a'):
      href = tag.get('href')
      if not href or href in seen:
        continue
      if 'analyticsvidhya.com/blog/' in href and key_word_dash in href:
        # The same article can be linked from several anchors; keeping one copy
        # avoids downloading and embedding identical documents more than once.
        seen.add(href)
        links.append(href)
  return links

In [13]:
def get_url_data(urls):
  """Load the pages behind `urls` with UnstructuredURLLoader and return what `load()` yields."""
  return UnstructuredURLLoader(urls=urls).load()

## Split Data to create chunks

In [14]:
# Two alternative chunkers configured with the same chunk_size / chunk_overlap.
# Default separators for RecursiveCharacterTextSplitter: ["\n\n", "\n", " ", ""]
r_splitter = RecursiveCharacterTextSplitter(
    chunk_size=1000,
    chunk_overlap=200
)
# Splits on single spaces only. The pipeline cells visible in this notebook use r_splitter.
c_splitter = CharacterTextSplitter(
    chunk_size=1000,
    chunk_overlap=200,
    separator = ' '
)
def split_data(splitter, url_data):
  """Chunk the loaded documents with the given splitter.

  Args:
    splitter: Object exposing `split_documents` (e.g. r_splitter or c_splitter).
    url_data: Documents to be chunked.

  Returns:
    Whatever `splitter.split_documents(url_data)` returns.
  """
  chunks = splitter.split_documents(url_data)
  return chunks

## Create embeddings for these chunks and store them in a FAISS index

In [15]:
# Embedding client handed to get_embeddings(); built with no arguments, so library defaults apply.
open_ai_embeddings = OpenAIEmbeddings()

def get_embeddings(embeddings, url_docs):
  """Build a FAISS vector store from document chunks.

  Despite the name, the return value is the object produced by
  `FAISS.from_documents`, not a list of embedding vectors.

  Args:
    embeddings: Embedding client used to vectorise the chunks.
    url_docs: Document chunks to index.
  """
  vector_store = FAISS.from_documents(url_docs, embeddings)
  return vector_store

## Retrieve similar embeddings for a given question and call the LLM to retrieve the final answer

In [16]:
class FaissRetriever:
    """Look up documents by nearest-neighbour search over an index.

    `faiss_index` must expose `search(x, k) -> (distances, indices)`, and
    `documents[i]` must be the document stored at index position `i`.

    NOTE(review): the notebook constructs this class with the object returned
    by `FAISS.from_documents`; confirm that object offers the raw
    `search(x, k)` signature used here before relying on `search()`.
    """

    def __init__(self, faiss_index, documents):
        self.faiss_index = faiss_index
        self.documents = documents

    def search(self, query_embedding, k=10):
        """Return up to `k` documents closest to `query_embedding`.

        Args:
            query_embedding: Precomputed 1-D query vector (list or numpy array).
            k: Maximum number of neighbours requested from the index.

        Returns:
            Matching documents in the order the index reports them. Fewer than
            `k` are returned when the index holds fewer than `k` vectors.
        """
        # faiss works on float32 matrices of shape (n_queries, dim).
        query = np.asarray([query_embedding], dtype=np.float32)
        distances, indices = self.faiss_index.search(query, k)
        # faiss pads missing neighbours with -1; without this filter,
        # documents[-1] would silently return the last document.
        return [self.documents[i] for i in indices[0] if i >= 0]

In [18]:
# End-to-end pipeline for the 'random forest' topic, using the first search result page only:
# scrape links -> load pages -> chunk -> embed into a FAISS store -> build the QA chain.
rf_blog_links = get_links_from_website('random forest', 1)
rf_url_data = get_url_data(rf_blog_links)
rf_url_docs = split_data(r_splitter, rf_url_data)
rf_vector_index = get_embeddings(open_ai_embeddings, rf_url_docs)
# NOTE(review): rf_faiss_retriever is not referenced by any later cell visible in this notebook.
rf_faiss_retriever = FaissRetriever(rf_vector_index , rf_url_docs)
rf_chain = RetrievalQAWithSourcesChain.from_llm(llm=llm, retriever=rf_vector_index.as_retriever())

## Load question dataset

In [18]:
# from google.colab import auth
# auth.authenticate_user()

# from google.auth import default
# import gspread
# import pandas as pd

# creds, _ = default()
# gc = gspread.authorize(creds)


In [19]:
# question_sheet_url = 'https://docs.google.com/spreadsheets/d/1UiDsDcRTrWp9McZTTt-BcS3bhex4h-lRmjXX8ooB6RY/edit#gid=0'
# question_workbook = gc.open_by_url(question_sheet_url)

In [20]:
# question_sheet = question_workbook.worksheet('ML Questions')  # Replace with your sheet name
# question_data = question_sheet.get_all_records()
# question_df = pd.DataFrame(question_data)

In [22]:
# Question bank for evaluation; evaluate_with_openai() reads its 'Topic' and 'Question' columns.
question_df = pd.read_csv('ML_interview_questions.csv')

In [21]:
# Built exactly like rf_chain above; `chain` is the name the query cells below call.
chain = RetrievalQAWithSourcesChain.from_llm(llm=llm, retriever=rf_vector_index.as_retriever())

In [30]:
query = "what is Random Forest?"

# Enable LangChain's global debug flag so the chain run is traced in the cell output.
langchain.debug=True

# Display only the answer text of the chain result.
chain({"question": query}, return_only_outputs=True)['answer']

[32;1m[1;3m[chain/start][0m [1m[1:chain:RetrievalQAWithSourcesChain] Entering Chain run with input:
[0m{
  "question": "what is Random Forest?"
}
[32;1m[1;3m[chain/start][0m [1m[1:chain:RetrievalQAWithSourcesChain > 3:chain:MapReduceDocumentsChain] Entering Chain run with input:
[0m[inputs]
[32;1m[1;3m[chain/start][0m [1m[1:chain:RetrievalQAWithSourcesChain > 3:chain:MapReduceDocumentsChain > 4:chain:LLMChain] Entering Chain run with input:
[0m{
  "input_list": [
    {
      "context": "Random Forest is a supervised machine-learning algorithm made up of decision trees.\n\nRandom Forest is used for both classification and regression problems.\n\nFrequently Asked Questions\n\nA. Random Forest is a supervised learning algorithm that works on the concept of bagging. In bagging, a group of models is trained on different subsets of the dataset, and the final output is generated by collating the outputs of all the different models. In the case of random forest, the base model i

' Random Forest is a supervised machine-learning algorithm made up of decision trees used for both classification and regression problems, that is robust to overfitting and able to handle complex relationships between the features and the target variable.\n'

In [85]:
# Comparison question; the full result dict (answer and sources) is displayed.
query = "what is the difference between Random Forest and Decision Tree?"


chain({"question": query}, return_only_outputs=True)

{'answer': ' The main difference between Random Forest and Decision Tree is that Random Forest is a bagging method that uses a subset of the original dataset to make predictions and this property of Random Forest helps to overcome Overfitting, while Decision trees suffer from the problem of Overfitting if it’s allowed to grow without any control.\n',
 'sources': ''}

In [None]:
# NOTE(review): same query and call as the preceding cell, and this cell has no execution count;
# it looks like a leftover duplicate - confirm and remove.
query = "what is the difference between Random Forest and Decision Tree?"


chain({"question": query}, return_only_outputs=True)

In [32]:
query = "What are the advantages of Random Forest?"
# Switch LangChain's debug tracing back off before running this query.
langchain.debug = False
chain({"question": query}, return_only_outputs=True)

{'answer': ' The advantages of the Random Forest algorithm are flexibility, better results without hyperparameter tuning, low bias and low variance, and ability to handle unbalanced data and missing values. The disadvantages of Random Forest are not good at generalizing cases with completely new data and biased towards categorical variables with multiple levels.\n',
 'sources': ''}

In [45]:
def evaluate_with_openai(topic, chain):
  """Answer every question of a topic with `chain` and grade the answers.

  For each question in `question_df` whose 'Topic' equals `topic`, the
  retrieval chain's answer is collected as the prediction and the answer from
  `ask_openai` (the model queried without retrieved context) is used as the
  reference. A `QAEvalChain` built on `llm` then grades predictions against
  references, and the grades are printed.

  Args:
    topic: Value matched exactly against the 'Topic' column of `question_df`.
    chain: Callable taking `{"question": ...}` and returning a dict with an
      'answer' key (e.g. a RetrievalQAWithSourcesChain).

  Returns:
    Dict of equal-length lists under the keys 'questions', 'predictions' and
    'real answers'. The grades are printed, not returned.
  """
  q_ls = question_df[question_df['Topic'] == topic]['Question'].to_list()
  q_df = {'questions': q_ls, 'predictions': [], 'real answers': []}
  for q in q_ls:
    q_df['predictions'].append(chain({"question": q}, return_only_outputs=True)['answer'])
    q_df['real answers'].append(ask_openai(q))

  # These key names are what the eval_chain.evaluate() call below receives:
  # 'query' = question, 'answer' = reference, 'result' = prediction.
  examples = [{'query': q, 'answer': a} for q, a in zip(q_df['questions'], q_df['real answers'])]
  predictions = [{'query': q, 'answer': a, 'result': p} for q, a, p in zip(q_df['questions'], q_df['real answers'], q_df['predictions'])]

  eval_chain = QAEvalChain.from_llm(llm)
  graded_outputs = eval_chain.evaluate(examples, predictions)
  # Print before returning; the original print sat after `return` and never ran.
  print(graded_outputs)
  return q_df

In [46]:
def __main__(topic, pages_num):
  """Run the full scrape -> index -> evaluate pipeline for one topic.

  `topic` is used twice: as the search keyword for collecting blog links and
  as the exact 'Topic' value used to select evaluation questions.

  Args:
    topic: Topic / search phrase, e.g. 'random forest'.
    pages_num: Number of search result pages to scrape for links.

  Returns:
    The dict returned by `evaluate_with_openai` for this topic.
  """
  # Honour pages_num; it was previously ignored in favour of a hard-coded 1.
  blog_links = get_links_from_website(topic, pages_num)
  url_data = get_url_data(blog_links)
  url_docs = split_data(r_splitter, url_data)
  vector_index = get_embeddings(open_ai_embeddings, url_docs)
  chain = RetrievalQAWithSourcesChain.from_llm(llm=llm, retriever=vector_index.as_retriever())
  return evaluate_with_openai(topic, chain)