# Imports

In [5]:
from PyPDF2 import PdfReader
from tqdm.notebook import tqdm
import os
from os import listdir
from os.path import isfile, join
import numpy as np

from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
import statistics

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import pairwise_distances

import torch
from transformers import pipeline
from huggingface_hub import login

import traceback





In [6]:
import pickle
def save_obj(obj, name):
    pickle.dump(obj,open(name + '.pkl', 'wb'), protocol=4)
    
def load_obj(name):
    with open(name + '.pkl', 'rb') as f:
        return pickle.load(f)

# Get the Data

In [7]:
# Load data
texts = load_obj('texts')
st_vecs = load_obj('st_vecs')
tfidf_vecs = load_obj('tfidf_vecs')

# Load models
model_emb = SentenceTransformer('xlm-r-100langs-bert-base-nli-stsb-mean-tokens')
vectorizer = load_obj('vectorizer')

# Relevant docs search

In [8]:
def find_docs(query, top_n=10):
    X = model_emb.encode(query.lower())
    X2 = vectorizer.transform([query.lower()]).toarray()[0]

    dists = pairwise_distances(X.reshape(1, -1), np.array(st_vecs), metric='cosine')[0]
    dists2 = pairwise_distances(X2.reshape(1, -1), tfidf_vecs, metric='cosine')[0]

    combined_dists = dists + dists2 #+ pr1 + pr2

    idx = combined_dists.argsort()[:top_n]

    return np.array(texts)[idx], combined_dists[idx]

query = "When did Julius Caesar die?"
docs, dist = find_docs(query, top_n=10)
len(docs)
dist

array([1.21359174, 1.26023433, 1.26162367, 1.27048143, 1.28099011,
       1.29144901, 1.29791813, 1.30313364, 1.3865881 , 1.43633656])

In [5]:
docs

array(['Chapter 5 Julius Caesar The Roman Army The Romans were great conquerors. They had large, well-trained armies. Their navy ruled the seas. After the Romans defeated an enemy, the captured land became part of Rome. These lands were called provinces . The Roman Senate sent a governor to each province. The governor made sure the province paid taxes to Rome. The Romans usually let the conquered people keep their laws and customs. Sometimes they even made the conquered people citizens of Rome. 28The Big Question Why did some Romans think Julius Caesar was a hero? Vocabulary province, n. an area or region; when an area was conquered by Rome, it became a province under Roman control governor, n. the leader of the government in a province tax, n. money that people pay to the government29 The Romans conquered lands that once were part of ancient Greece. They brought many Greek statues and paintings back to Rome. They also brought Greek stories and plays and copied Greek building styles.30

# Summarize the docs

In [9]:
from nltk.tokenize import sent_tokenize, word_tokenize
import nltk
from nltk.corpus import stopwords

stops = set(stopwords.words('english'))

def custom_summ(text, query, top_n=0.2):
    q = word_tokenize(query.lower())
    sig_words = [w for w in q if w not in stops]
    #print(sig_words)
    
    vectorizer = TfidfVectorizer(stop_words='english', min_df=0.04)
    sentences = sent_tokenize(text)
    # sentences[2] = sentences[2] + ' '.join(sig_words)

    X = vectorizer.fit_transform(sentences).toarray()
    scores = X.sum(axis=1).flatten()

    uprankers = 1 + np.array([sum([1 for w in sig_words if w in s.lower()]) for s in sentences])
    #print(uprankers)

    scores = scores * uprankers
    #print(scores)

    if top_n < 1:
        top_n = max(1, int(len(sentences) * top_n))

    #print(top_n,len(sentences) )

    idx = scores.argsort()[-top_n:]
    idx.sort()
    #print(idx)

    return ' '.join(list(np.array(sentences)[idx]))



In [8]:
#summarize the docs
summaries = [custom_summ(t, query, top_n=0.2) for t in tqdm(docs)]
summaries

  0%|          | 0/10 [00:00<?, ?it/s]

['Chapter 5 Julius Caesar The Roman Army The Romans were great conquerors. 28The Big Question Why did some Romans think Julius Caesar was a hero? His name was Julius Caesar (/jool*yus/see*zur/). Julius Caesar Julius Caesar was born in 100 BCE. Caesar served in the Roman army in Asia. Caesar borrowed money from his wealthy friends. His games made Caesar popular with the people of Rome. Caesar became friends with powerful leaders in the Senate and in the army. One of these friends was Even as a young man, Julius Caesar had achieved many things. Caesar also had enemies in the Senate, especially among the wealthy landowners. Caesar led a large Roman army into Gaul. Crossing the Rubicon Caesar’s old friend Pompey became jealous of Caesar. Pompey joined Caesar’s enemies in the Senate. The Senate ordered Caesar to give up his army and return to Rome. He led his army to the Rubicon (/roo*bih*kahn/) River, which was the border between Gaul Julius Caesar was honored for this bravery. Vocabulary 

In [9]:
# get the combined summary
grand_summary = custom_summ(' '.join(summaries), query, top_n=0.2)
grand_summary

"Chapter 5 Julius Caesar The Roman Army The Romans were great conquerors. 28The Big Question Why did some Romans think Julius Caesar was a hero? His name was Julius Caesar (/jool*yus/see*zur/). Julius Caesar Julius Caesar was born in 100 BCE. One of these friends was Even as a young man, Julius Caesar had achieved many things. He led his army to the Rubicon (/roo*bih*kahn/) River, which was the border between Gaul Julius Caesar was honored for this bravery. Who is Julius Caesar? Julius Caesar was a great general and an important leader in ancient Rome. Julius Caesar spoke publicly to the people about these problems, and promised to solve them if he could. As Julius Caesar became more popular with the people, he also became more powerful. They were afraid Julius Caesar might take over the government by force, and rule Rome as a king. Julius Caesar did want to take over the government. Julius Caesar ignored this law. Pompey was a general (and also Caesar’s son-\xad‐in-\xad‐law) who had r

# Make a Wiki article on the query

In [11]:
question = 'When did the Romans seize Sicily?'
docs, dist = find_docs(question, top_n=10)
summaries = [custom_summ(t, question, top_n=0.2) for t in tqdm(docs)]
grand_summary = custom_summ(' '.join(summaries), question, top_n=0.2)

  0%|          | 0/10 [00:00<?, ?it/s]

In [30]:
grand_summary

'The fourth world power was Carthage, a city state situated on the northern coast of Africa, opposite the western end of the island of Sicily, which had created for itself an empire that controlled the western half of the Mediterranean. In the third century the Carthaginian empire included the northern coast of Africa from the Gulf of Syrtis westwards beyond the Straits of Gibraltar, the southern and eastern coasts of Spain as far north as Cape Nao, Corsica, Sardinia, and Sicily, with the exception of Messana in the extreme northeast and the Kingdom of Syracuse in the southeastern part82 A History of Rome to 565 A. D. of the island. This policy of commercial exclusiveness had caused Carthage to oppose Greek colonial expansion in Spain, Sardinia and Sicily, and had led to treaties which placed definite limits upon the trading ventures of the Romans and their allies, and of the Greeks from Massalia and her colonies in France and northern Spain. The first war between Rome and Carthage aro

## Huggingface

In [31]:
import torch

# Check if CUDA is available
if torch.cuda.is_available():
    # Get the number of GPUs available
    n_gpus = torch.cuda.device_count()
    print(f"Number of GPUs available: {n_gpus}")

    # Print details for each GPU
    for i in range(n_gpus):
        print(f"GPU {i}: {torch.cuda.get_device_name(i)}")
else:
    print("No GPU available")

Number of GPUs available: 1
GPU 0: NVIDIA GeForce RTX 3050 Ti Laptop GPU


In [11]:
import torch
from transformers import pipeline
from huggingface_hub import login
#access_token = "YOUR_ACCESS_TOKEN" #FIX uncomment and put your access token here
login(token = access_token)

model_id = "meta-llama/Llama-3.2-3B-Instruct"
pipe = pipeline(
    "text-generation",
    model=model_id,
    torch_dtype=torch.float16,
    device=0,
    #device_map="auto",
)

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Device set to use cuda:0


In [None]:
def smoothen_summary(query, final_summary):

    messages = [
        {"role": "system", "content": """
        You are an expert in producing Wiki-articles based on given content.
    
    
    """
         },
        {"role": "user", "content": f"Question: {query}\n\nContext to use to generate a Wiki-article divided into paragraphs and generated lists:\n\n{final_summary}. If the context does not contain the answer to the question write 'The suggested context does not contain the answer to the question', and try to answer on your own, giving the references to the sources you used. But do not make up anything, use just factual and trustworthy data."},
    ]
    outputs = pipe(
        messages,
        temperature = 0.01,
        max_new_tokens=2048,
    )
    return outputs[0]["generated_text"][-1]['content']

smoothen_summary(query, grand_summary)

In [16]:
answer = smoothen_summary(question, grand_summary)

  0%|          | 0/10 [00:00<?, ?it/s]

Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


In [18]:
print(answer)

**The Roman Conquest of Sicily**

The Roman conquest of Sicily was a pivotal event in the history of the Roman Republic, marking the beginning of Rome's expansion into the Mediterranean and the eventual rise of the Roman Empire. The conquest of Sicily was a result of the long-standing rivalry between Rome and Carthage, a powerful city-state in North Africa.

**Background**

In the 3rd century BC, Carthage controlled the western half of the Mediterranean, including Sicily, Sardinia, and Corsica. The Carthaginian empire had created a vast network of trade routes and colonies, which had led to conflicts with Rome. The Romans, seeking to expand their territories and challenge Carthage's dominance, began to send armies to Sicily.

**The First Roman Invasion (260-251 BC)**

In 260 BC, the Roman consul Gaius Duilius led a fleet of 300 rowers and 120 fighting men to Sicily. They won a decisive battle off Mylae, which gave them control of the island. The Romans then occupied Corsica and attacke

## Llama-API

In [10]:
from llamaapi import LlamaAPI

In [32]:
#llama = LlamaAPI("YOUR_API_KEY") #FIX  uncomment this line and put your API key


def smoothen_summary(query, final_text):
    try:    
        api_request_json = {
             "messages" : [
        {"role": "system", "content": "You are an expert in producing Wiki-articles based on given content."
         },
        {"role": "user", "content": f"Question: {query}\n\nContext to use to generate a Wiki-article divided into paragraphs and generated lists:\n\n{final_text}. If the context does not contain the answer to the question write 'The suggested context does not contain the answer to the question', and try to answer on your own, giving the references to the sources you used. But do not make up anything, use just factual and trustworthy data."},
    ],
            "model": "llama3.1-70b",  # Выбор модели, которую хотите использовать
            "max_tokens": 2048,
            "temperature": 0.1
        }
        
        api_request_json["max_tokens"] = min(2048, len(final_text))  # Adjust token count dynamically

        
        # Выполнение запроса
        response = llama.run(api_request_json)
        
        answer = response.json()["choices"][0]["message"]["content"]
        output = answer.replace('\\n\\n', '\n').replace('\\n', '\n')
    except Exception:
        output = 'К сожалению, выполнить сглаживание текста не удалось по техническим причинам. Прилагаем необработанный текст:\n\n' + final_text
        print(traceback.format_exc())
        
    return output

# Wrapping of the AI tutor elements

In [9]:
question = 'When did the Romans seize Sicily?'


In [10]:
def ask_tutor(question, top_n=10):
    docs, dist = find_docs(question, top_n=top_n)
    summaries = [custom_summ(t, question, top_n=0.2) for t in tqdm(docs)]
    grand_summary = custom_summ(' '.join(summaries), question, top_n=0.2)
    answer = smoothen_summary(question, grand_summary)
    return answer

In [13]:
answer = ask_tutor(question, top_n=10)
print(answer)

  0%|          | 0/10 [00:00<?, ?it/s]

The Romans Seizure of Sicily

Sicily, an island in the central Mediterranean, played a significant role in the ancient world. The island was colonized by various civilizations, including the Greeks and Carthaginians. The Romans eventually seized control of Sicily from the Carthaginians.

Background
----------

In the third century BC, Carthage controlled a vast empire that included much of North Africa, Spain, Corsica, Sardinia, and Sicily. However, their policy of commercial exclusiveness led to conflicts with Greek colonial expansion in Spain, Sardinia, and Sicily.

First Punic War
----------------

The First Punic War (264-241 BC) was fought between Rome and Carthage over control of Sicily. The war began when Rome sent an army to attack Syracuse and later formed an alliance with Hiero II of Syracuse against Carthage.

Key Events
------------

* 260 BC: Roman consul Gaius Duilius won a decisive naval battle off Mylae on the north coast of Sicily.
* 256 BC: Roman invasion of Africa un

## Caching

In [12]:
# Check if the file exists
if os.path.exists('qa_cache.pkl'):
    #print("The file exists.")
    qa_cache = load_obj('qa_cache')
else:
    #print("The file does not exist.")
    qa_cache = {}


def ask_tutor(question, top_n=10):
    if question in qa_cache.keys():
        return qa_cache[question]
    else:
        docs, dist = find_docs(question, top_n=top_n)
        summaries = [custom_summ(t, question, top_n=0.2) for t in tqdm(docs)]
        grand_summary = custom_summ(' '.join(summaries), question, top_n=0.2)
        answer = smoothen_summary(question, grand_summary)
        qa_cache[question] = answer
        save_obj(qa_cache,'qa_cache')
    return answer

In [13]:
question = 'When the King Alaric invaded Rome?'

answer = ask_tutor(question, top_n=10)
print(answer)

Invasion of Rome by Alaric

The invasion of Rome by Alaric, the king of the Visigoths, is a pivotal event in Roman history. According to historical records, Alaric invaded Italy in 401 AD and again in 408 AD.

Background
----------

Alaric was a skilled military leader who had previously served as a foederatus (ally) of the Roman Empire. However, after the death of Emperor Theodosius I in 395 AD, Alaric became disillusioned with the Roman government and began to ravage Thrace and Macedonia with his band of Visigoths.

Invasion of Italy
-----------------

In 401 AD, Alaric invaded Italy but was forced to withdraw by Stilicho, the Roman general. However, after Stilicho's death in 408 AD, Alaric returned to Italy and marched on Rome.

Siege of Rome
--------------

Alaric laid siege to Rome on August 24, 410 AD. The city was poorly defended, and after three days of siege warfare, the walls were breached. The Goths poured into the city and pillaged it for three days.

Timeline:

* August 24

# Test the AI Tutor

## Get the testing data

In [1]:
import wikipedia

In [2]:
en_wiki_titles = wikipedia.search('Roman empire', results=10, suggestion=False)
en_wiki_titles

['Roman Empire',
 'Fall of the Western Roman Empire',
 'Holy Roman Empire',
 'Western Roman Empire',
 'Byzantine Empire',
 'History of the Roman Empire',
 'Roman emperor',
 'Ancient Rome',
 'Languages of the Roman Empire',
 'The History of the Decline and Fall of the Roman Empire']

In [3]:
def get_page(title):
    try:
        p = wikipedia.page(title, auto_suggest=False, redirect=True, preload=False)
        return p
    except wikipedia.DisambiguationError as e:
        s = e.options[0] #random.choice(e.options)
        p = wikipedia.page(s, auto_suggest=False, redirect=True, preload=False)
        return p

In [26]:
title = [] #[t for t in tqdm(en_wiki_titles)]
content = [] # [get_page(t).content for t in tqdm(en_wiki_titles)]
summary = [] #[get_page(t).summary for t in tqdm(en_wiki_titles)]
# links = [get_page(t).links for t in tqdm(en_wiki_titles)]


for t in tqdm(en_wiki_titles):
    page = get_page(t)
    title.append(t)
    content.append(page.content)
    summary.append(page.summary)

  0%|          | 0/10 [00:00<?, ?it/s]

In [27]:
import pandas as pd
ds = pd.DataFrame()
ds['title'] = title
ds['content'] = content
ds['summary'] = summary

In [28]:
ds

Unnamed: 0,title,content,summary
0,Roman Empire,The Roman Empire ruled the Mediterranean and m...,The Roman Empire ruled the Mediterranean and m...
1,Fall of the Western Roman Empire,"The fall of the Western Roman Empire, also cal...","The fall of the Western Roman Empire, also cal..."
2,Holy Roman Empire,"The Holy Roman Empire, headed by the Holy Roma...","The Holy Roman Empire, headed by the Holy Roma..."
3,Western Roman Empire,"In modern historiography, the Western Roman Em...","In modern historiography, the Western Roman Em..."
4,Byzantine Empire,"The Byzantine Empire, also referred to as the ...","The Byzantine Empire, also referred to as the ..."
5,History of the Roman Empire,The history of the Roman Empire covers the his...,The history of the Roman Empire covers the his...
6,Roman emperor,The Roman emperor was the ruler and monarchica...,The Roman emperor was the ruler and monarchica...
7,Ancient Rome,"In modern historiography, ancient Rome is the ...","In modern historiography, ancient Rome is the ..."
8,Languages of the Roman Empire,Latin and Greek were the dominant languages of...,Latin and Greek were the dominant languages of...
9,The History of the Decline and Fall of the Rom...,The History of the Decline and Fall of the Rom...,The History of the Decline and Fall of the Rom...


In [18]:
w_arts = [ask_tutor(t, top_n=10) for t in tqdm(title)]

  0%|          | 0/10 [00:00<?, ?it/s]

  0%|          | 0/10 [00:00<?, ?it/s]

  0%|          | 0/10 [00:00<?, ?it/s]

  0%|          | 0/10 [00:00<?, ?it/s]

  0%|          | 0/10 [00:00<?, ?it/s]

  0%|          | 0/10 [00:00<?, ?it/s]

  0%|          | 0/10 [00:00<?, ?it/s]

  0%|          | 0/10 [00:00<?, ?it/s]

  0%|          | 0/10 [00:00<?, ?it/s]

  0%|          | 0/10 [00:00<?, ?it/s]

  0%|          | 0/10 [00:00<?, ?it/s]

In [19]:
w_arts

 '**Fall of the Western Roman Empire**\n\nThe fall of the Western Roman Empire in 476 CE marked the end of the period of classical antiquity and ushered in a new era in world history. Three civilizations emerged as successors to the Romans in the Mediterranean world: the Byzantine Empire, and the civilizations of Islam and Western Europe.\n\n**Background**\n\nThe Roman Empire was founded in 27 BCE by Caesar Augustus, after the defeat of the Roman Republic. The empire was centered on the city of Rome and lasted until its division into Eastern and Western regions in 395 CE. The Western Roman Empire steadily deteriorated as it divided into smaller separate kingdoms.\n\n**Causes of Decline**\n\nSeveral factors contributed to the decline and fall of the Western Roman Empire:\n\n1. **Gothic Wars**: A series of conflicts with Goth invaders from 376-382 CE weakened the empire.\n2. **Battle of Adrianople**: The defeat of Emperor Valens at Adrianople (378 CE) marked a turning point in the declin

In [20]:
#ROUGE
from rouge_score import rouge_scorer

def calculate_rouge(reference_summary, generated_summary):
    """
    Calculates ROUGE scores between a reference summary and a generated summary.

    Args:
        reference_summary (str): The ground-truth summary.
        generated_summary (str): The summary to evaluate.

    Returns:
        dict: A dictionary containing ROUGE-1, ROUGE-2, and ROUGE-L scores (precision, recall, and F1).
    """
    # Initialize ROUGE scorer
    scorer = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'], use_stemmer=True)
    scores = scorer.score(reference_summary, generated_summary)

    # Extract and organize scores
    result = {
        "ROUGE-1": {
            "precision": scores['rouge1'].precision,
            "recall": scores['rouge1'].recall,
            "f1": scores['rouge1'].fmeasure,
        },
        "ROUGE-2": {
            "precision": scores['rouge2'].precision,
            "recall": scores['rouge2'].recall,
            "f1": scores['rouge2'].fmeasure,
        },
        "ROUGE-L": {
            "precision": scores['rougeL'].precision,
            "recall": scores['rougeL'].recall,
            "f1": scores['rougeL'].fmeasure,
        }
    }
    return result

reference = "The quick brown fox jumps over the lazy dog."
generated = "The quick brown fox leaps over the lazy dog."

print(f"Reference: {reference}")
print(f"Generated: {generated}")

rouge_scores = calculate_rouge(reference, generated)

print("ROUGE Scores:")
for metric, values in rouge_scores.items():
    print(f"{metric}:")
    print(f"  Precision: {values['precision']:.4f}")
    print(f"  Recall:    {values['recall']:.4f}")
    print(f"  F1 Score:  {values['f1']:.4f}")

Reference: The quick brown fox jumps over the lazy dog.
Generated: The quick brown fox leaps over the lazy dog.
ROUGE Scores:
ROUGE-1:
  Precision: 0.8889
  Recall:    0.8889
  F1 Score:  0.8889
ROUGE-2:
  Precision: 0.7500
  Recall:    0.7500
  F1 Score:  0.7500
ROUGE-L:
  Precision: 0.8889
  Recall:    0.8889
  F1 Score:  0.8889


In [22]:
rouge_scores["ROUGE-1"]['f1']

0.8888888888888888

In [25]:
wiki_r1_res = [calculate_rouge(r, g)["ROUGE-1"] for r, g in zip(tqdm(content), w_arts)] 

precision = [n['precision'] for n in wiki_r1_res]
recall = [n['recall'] for n in wiki_r1_res]
f1 = [n['f1'] for n in wiki_r1_res]

print('precision', sum(precision)/len(precision))
print('recall', sum(recall)/len(recall))
print('f1', sum(f1)/len(f1))

  0%|          | 0/10 [00:00<?, ?it/s]

precision 0.8504399196542943
recall 0.03758825845554805
f1 0.0694645849675799


In [29]:
wiki_r1_res = [calculate_rouge(r, g)["ROUGE-1"] for r, g in zip(tqdm(summary), w_arts)] 

precision = [n['precision'] for n in wiki_r1_res]
recall = [n['recall'] for n in wiki_r1_res]
f1 = [n['f1'] for n in wiki_r1_res]

print('precision', sum(precision)/len(precision))
print('recall', sum(recall)/len(recall))
print('f1', sum(f1)/len(f1))

  0%|          | 0/10 [00:00<?, ?it/s]

precision 0.4540566567973957
recall 0.39159160132635396
f1 0.3847138246866934


In [31]:
print(sum([len(s) for s in summary])/10)
print(sum([len(s) for s in w_arts])/10)

3088.2
2587.5
