In [None]:
!pip install python-dotenv

Collecting python-dotenv
  Downloading python_dotenv-1.0.1-py3-none-any.whl.metadata (23 kB)
Downloading python_dotenv-1.0.1-py3-none-any.whl (19 kB)
Installing collected packages: python-dotenv
Successfully installed python-dotenv-1.0.1


In [None]:
!pip install py-llm-core
!pip install openai==1.45.0

Collecting py-llm-core
  Downloading py_llm_core-3.4.7-py3-none-any.whl.metadata (13 kB)
Collecting openai>=1.37 (from py-llm-core)
  Downloading openai-1.47.0-py3-none-any.whl.metadata (24 kB)
Collecting tiktoken (from py-llm-core)
  Downloading tiktoken-0.7.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (6.6 kB)
Collecting dirtyjson (from py-llm-core)
  Downloading dirtyjson-1.0.8-py3-none-any.whl.metadata (11 kB)
Collecting llama-cpp-python>=0.2.84 (from py-llm-core)
  Downloading llama_cpp_python-0.2.90.tar.gz (63.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m63.8/63.8 MB[0m [31m12.5 MB/s[0m eta [36m0:00:00[0m
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Installing backend dependencies ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
Collecting python-decouple (from py-llm-core)
  Downloading python_decouple-3.8-py3-none-any.w

In [None]:
# Import necessary libraries and modules
import pandas as pd
import unicodedata
from llm_core.splitters import TokenSplitter
from typing import List
from dataclasses import dataclass
from llm_core.assistants import OpenAIAssistant
import os
from google.colab import userdata
import zipfile
import openai
from openai import OpenAI

In [None]:
# Copy the Colab secret into the environment variable the OpenAI tooling reads
os.environ['OPENAI_API_KEY'] = userdata.get('AIProject')

# Module-level OpenAI client built from the key that was just exported
client = OpenAI(api_key=os.environ["OPENAI_API_KEY"])

In [None]:
def cleanup_unicode(text):
    """
    Normalize a text to Unicode normalization form KC (NFKC).

    NFKC (Compatibility Composition) replaces compatibility characters with
    their standard equivalents and then composes base characters with their
    combining marks. The whole string is normalized in one call: composing
    needs to see neighbouring code points, so normalizing one character at a
    time would leave decomposed sequences (a letter followed by a combining
    mark) uncomposed.

    Parameters:
    ----------
    text : str
        The input text that contains Unicode characters to be normalized.

    Returns:
    -------
    str
        The NFKC-normalized text. An empty string is returned unchanged.
    """
    return unicodedata.normalize("NFKC", text)


In [None]:
def t_splitter(text, chunk_size=6_000):
    """
    Return the first token-limited chunk of a text.

    A `TokenSplitter` is built with the requested chunk size and no overlap
    between chunks; only the first chunk it produces is returned, the rest of
    the text is discarded.

    Parameters:
    ----------
    text : str
        The input text that needs to be cut down to a single chunk.
    chunk_size : int, optional
        Upper bound passed to `TokenSplitter` as `chunk_size`
        (default 6,000).

    Returns:
    -------
    str
        The first chunk of the text, or an empty string when the splitter
        yields no chunk at all.
    """
    splitter = TokenSplitter(chunk_size=chunk_size, chunk_overlap=0)
    # A default keeps next() from raising StopIteration when nothing is
    # yielded (presumably the case for empty input - confirm against the
    # TokenSplitter implementation).
    return next(splitter.chunkify(text), "")


In [None]:
# One step of the densification chain: the summary written at that step and
# the entities that were identified as missing before writing it.
@dataclass
class DenseSummary:
    denser_summary: str  # summary text produced at this step
    missing_entities: List[str]  # entities reported as missing from the previous summary


# Chain-of-Density request/response definition.
# `system_prompt` and `prompt` carry no type annotation, so they are plain
# class attributes rather than dataclass fields; `summaries` is the only field.
@dataclass
class DenserSummaryCollection:
  system_prompt = """
  You are an expert in writing rich and dense summaries in broad domains.
  """

  # `{article}` is filled in by the keyword argument given to
  # `assistant.process` in `summarize`. The JSON key named in the final
  # instruction is `summaries`, matching the field declared below (the
  # attribute callers read), instead of a key the class does not define.
  prompt = """
  Article:

  {article}

  ----

  You will generate increasingly concise, entity-dense summaries of the above
  Article.

  Repeat the following 2 steps 5 times.

  - Step 1: Identify 1-3 informative Entities from the Article
  which are missing from the previously generated summary and are the most
  relevant.

  - Step 2: Write a new, denser summary of identical length which covers
  every entity and detail from the previous summary plus the missing entities

  A Missing Entity is:

  - Relevant: to the main story
  - Specific: descriptive yet concise (5 words or fewer)
  - Novel: not in the previous summary
  - Faithful: present in the Article
  - Anywhere: located anywhere in the Article

  Guidelines:
  - The first summary should be long (4-5 sentences, approx. 80 words) yet
  highly non-specific, containing little information beyond the entities
  marked as missing.

  - Use overly verbose language and fillers (e.g. "this article discusses") to
  reach approx. 80 words.

  - Make every word count: re-write the previous summary to improve flow and
  make space for additional entities.

  - Make space with fusion, compression, and removal of uninformative phrases
  like "the article discusses"

  - The summaries should become highly dense and concise yet self-contained,
  e.g., easily understood without the Article.

  - Missing entities can appear anywhere in the new summary.

  - Never drop entities from the previous summary. If space cannot be made,
  add fewer new entities.

  > Remember to use the exact same number of words for each summary.
  Answer in JSON.

  > The JSON in `summaries` should be a list (length 5) of
  dictionaries whose keys are "missing_entities" and "denser_summary".

  """

  # One DenseSummary per densification step requested by the prompt
  summaries: List[DenseSummary]


  @classmethod
  def summarize(cls, article, model='gpt-4'):
      """
      Run the Chain-of-Density prompt on one article.

      Parameters:
      ----------
      article : str
          Text substituted for `{article}` in `prompt`.
      model : str, optional
          Model name handed to `OpenAIAssistant` (default 'gpt-4').

      Returns:
      -------
      Whatever `assistant.process` returns; callers in this notebook read
      its `summaries` attribute.
      """
      with OpenAIAssistant(cls, model=model) as assistant:
          return assistant.process(article=article)



def generate_summary(input_text, max_tokens=500):
    """
    Ask the chat model for a brief summary of a text.

    Parameters:
    ----------
    input_text : str
        Text to summarize; it is embedded in the user message.
    max_tokens : int, optional
        Completion token limit sent with the request. The default is 500,
        the value that was previously sent on every call regardless of this
        argument, so calls relying on the default behave as before while an
        explicit value is now honoured.

    Returns:
    -------
    str
        Content of the first choice of the chat completion.
    """
    messages = [
        {"role": "system", "content": "You are Summary AI."},
        {"role": "user", "content": f"Summarize this briefly:\n\n{input_text}"},
    ]

    chat_completion = client.chat.completions.create(
        messages=messages,
        model="gpt-4",
        max_tokens=max_tokens,
        top_p=0.9,
    )
    return chat_completion.choices[0].message.content

In [None]:
# Archive that holds the source articles
zip_file_path = '/content/Data.zip'

# Read every .txt member in archive order, decode it as UTF-8 and normalise
# its Unicode before collecting it
with zipfile.ZipFile(zip_file_path, 'r') as archive:
    texts = [
        cleanup_unicode(archive.read(member).decode('utf-8'))
        for member in archive.infolist()
        if member.filename.endswith('.txt')
    ]

# One row per article, in the order the files were read
data = pd.DataFrame(texts, columns=['Text'])

data

Unnamed: 0,Text
0,Компас — найпростіший і найдавніший навігаційн...
1,Винахід колеса є найбільшим досягненням в обла...
2,Cоціальні мережі стали невід’ємною частиною су...
3,Синдром відмінника у дорослих часто проявляєть...
4,"Сьогодні кожна людина знає, що таке «Формула 1..."
5,На початку ХІХ століття сім'я Беннет живе в св...
6,Події відбуваються наприкінці вісімнадцятого т...


In [None]:
# Store the first token-limited chunk of every article next to the full text
data['Text_Splitted'] = data['Text'].map(t_splitter)
data

Unnamed: 0,Text,Text_Splitted
0,Компас — найпростіший і найдавніший навігаційн...,Компас — найпростіший і найдавніший навігаційн...
1,Винахід колеса є найбільшим досягненням в обла...,Винахід колеса є найбільшим досягненням в обла...
2,Cоціальні мережі стали невід’ємною частиною су...,Cоціальні мережі стали невід’ємною частиною су...
3,Синдром відмінника у дорослих часто проявляєть...,Синдром відмінника у дорослих часто проявляєть...
4,"Сьогодні кожна людина знає, що таке «Формула 1...","Сьогодні кожна людина знає, що таке «Формула 1..."
5,На початку ХІХ століття сім'я Беннет живе в св...,На початку ХІХ століття сім'я Беннет живе в св...
6,Події відбуваються наприкінці вісімнадцятого т...,Події відбуваються наприкінці вісімнадцятого т...


In [None]:
# Baseline summaries: one plain summarisation request per truncated article
data['Classical_Summary'] = data['Text_Splitted'].map(generate_summary)

In [None]:
# Generate CoD summaries
# Feed the token-limited text, i.e. the same input the classical summaries
# are generated from, so an over-long article cannot overflow the request.
text_data = data['Text_Splitted'].tolist()
result_step_3 = []
result_step_4 = []
result_step_5 = []


for article in text_data:
    # Inputs of 4 characters or fewer are labelled invalid; check that before
    # spending an API call on them.
    summaries = []
    if len(article) > 4:
        summaries = DenserSummaryCollection.summarize(article).summaries

    # Steps 3-5 are read by index, so at least five summaries must have come
    # back; a shorter response is recorded as invalid instead of raising
    # IndexError.
    if len(summaries) > 4:
        result_step_3.append(summaries[2].denser_summary)
        result_step_4.append(summaries[3].denser_summary)
        result_step_5.append(summaries[4].denser_summary)
    else:
        result_step_3.append('Invalid input')
        result_step_4.append('Invalid input')
        result_step_5.append('Invalid input')

data['CoD_Summary_Step3'] = result_step_3
data['CoD_Summary_Step4'] = result_step_4
data['CoD_Summary_Step5'] = result_step_5

data

{'completion_tokens': 461, 'prompt_tokens': 2522, 'total_tokens': 2983, 'completion_tokens_details': {'reasoning_tokens': 0}}
{'completion_tokens': 513, 'prompt_tokens': 2342, 'total_tokens': 2855, 'completion_tokens_details': {'reasoning_tokens': 0}}
{'completion_tokens': 560, 'prompt_tokens': 1909, 'total_tokens': 2469, 'completion_tokens_details': {'reasoning_tokens': 0}}
{'completion_tokens': 648, 'prompt_tokens': 6011, 'total_tokens': 6659, 'completion_tokens_details': {'reasoning_tokens': 0}}
{'completion_tokens': 568, 'prompt_tokens': 1772, 'total_tokens': 2340, 'completion_tokens_details': {'reasoning_tokens': 0}}
{'completion_tokens': 595, 'prompt_tokens': 3152, 'total_tokens': 3747, 'completion_tokens_details': {'reasoning_tokens': 0}}
{'completion_tokens': 632, 'prompt_tokens': 2927, 'total_tokens': 3559, 'completion_tokens_details': {'reasoning_tokens': 0}}


Unnamed: 0,Text,Text_Splitted,Classical_Summary,CoD_Summary_Step3,CoD_Summary_Step4,CoD_Summary_Step5
0,Компас — найпростіший і найдавніший навігаційн...,Компас — найпростіший і найдавніший навігаційн...,The compass is the simplest and oldest navigat...,"The compass, originally a piece of magnetic or...",From its simple beginnings as a piece of magne...,"The compass, versatile and continuously evolvi..."
1,Винахід колеса є найбільшим досягненням в обла...,Винахід колеса є найбільшим досягненням в обла...,The invention of the wheel is considered a sig...,"The wheel, a groundbreaking Bronze Age inventi...","The wheel, a revolutionary Bronze Age inventio...","The wheel, a transformative Bronze Age inventi..."
2,Cоціальні мережі стали невід’ємною частиною су...,Cоціальні мережі стали невід’ємною частиною су...,Social media plays a significant role in moder...,"Social media, a key component of contemporary ...","Social media, ubiquitous in today's life, bols...","Social media, pervasive in modern existence, f..."
3,Синдром відмінника у дорослих часто проявляєть...,Синдром відмінника у дорослих часто проявляєть...,The 'Perfectionist Syndrome' in adults often m...,"Perfectionism syndrome, a complex psychologica...","Perfectionism syndrome, stemming from childhoo...","Perfectionism syndrome, distinct from perfecti..."
4,"Сьогодні кожна людина знає, що таке «Формула 1...","Сьогодні кожна людина знає, що таке «Формула 1...",Formula 1 is a popular annual car racing champ...,"Formula 1, a multi-stage racing event, sees dr...","Formula 1, a multi-stage event, pits drivers a...","Formula 1, a multi-stage contest, features dri..."
5,На початку ХІХ століття сім'я Беннет живе в св...,На початку ХІХ століття сім'я Беннет живе в св...,"In the early 19th century, the Bennet family l...","In Longbourn, England, the Bennet family's eco...",The Bennet family's economic stability in Long...,"In Longbourn, England, the Bennet family's eco..."
6,Події відбуваються наприкінці вісімнадцятого т...,Події відбуваються наприкінці вісімнадцятого т...,The story takes place in late 18th and early 1...,"In 'Wuthering Heights', Heathcliff, adopted by...","In 'Wuthering Heights', Heathcliff, adopted by...","In 'Wuthering Heights', Heathcliff's complex r..."


In [None]:
# Save the texts and all generated summaries; index=False leaves the row index out of the file
data.to_csv('result_16.09.csv', index=False)

In [None]:
import torch
import torch.nn as nn
from transformers import AutoModel
from huggingface_hub import PyTorchModelHubMixin
from transformers import AutoTokenizer

In [None]:
# Backbone checkpoint used when the model is built without an explicit name
BASE_MODEL = "Peltarion/xlm-roberta-longformer-base-4096"


class ReadabilityModel(nn.Module, PyTorchModelHubMixin):
    """Pretrained transformer backbone followed by dropout and a one-output linear layer."""

    def __init__(self, model_name=BASE_MODEL):
        super().__init__()
        self.model = AutoModel.from_pretrained(model_name)
        self.drop = nn.Dropout(p=0.2)
        self.fc = nn.Linear(768, 1)

    def forward(self, ids, mask):
        """Run the backbone on `ids`/`mask` and map element 1 of its output to one value per input."""
        backbone_out = self.model(
            input_ids=ids,
            attention_mask=mask,
            output_hidden_states=False,
        )
        # Element 1 of the backbone output feeds the head (presumably the
        # pooled representation - confirm for the backbone in use)
        pooled = self.drop(backbone_out[1])
        return self.fc(pooled)

# Tokenizer published with the readability checkpoint
tokenizer = AutoTokenizer.from_pretrained("trokhymovych/TRank_readability")

# Readability model loaded from the same repository
model = ReadabilityModel.from_pretrained("trokhymovych/TRank_readability")

# Inference only: evaluation mode turns the nn.Dropout layer into a no-op
model.eval()

# Score one text with the module-level `tokenizer` and `model`
def get_readability_score(text):
    """
    Compute the readability score of a single text.

    The text is tokenized with special tokens, truncated and padded to
    exactly 512 positions, and passed through `model` with gradient
    tracking disabled.

    Parameters:
    ----------
    text : str
        Text to score.

    Returns:
    -------
    float
        The single value produced by the model, as a Python number.
    """
    encoded = tokenizer.encode_plus(
        text,
        add_special_tokens=True,
        max_length=512,
        truncation=True,
        padding='max_length',
        return_tensors='pt'
    )

    # No gradients are needed for scoring
    with torch.no_grad():
        prediction = model(encoded['input_ids'], encoded['attention_mask'])

    return prediction.item()

# Score every summary variant and store the result in its own column
for score_column, summary_column in (
    ('Readability_Score_Classical', 'Classical_Summary'),
    ('Readability_Score_CoD_3', 'CoD_Summary_Step3'),
    ('Readability_Score_CoD_4', 'CoD_Summary_Step4'),
    ('Readability_Score_CoD_5', 'CoD_Summary_Step5'),
):
    data[score_column] = data[summary_column].apply(get_readability_score)


Some weights of XLMRobertaModel were not initialized from the model checkpoint at Peltarion/xlm-roberta-longformer-base-4096 and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
