In [None]:
# Install into the running kernel's environment (%pip rather than !pip) and
# pin the version this notebook was last executed with, for reproducibility.
%pip install python-dotenv==1.0.1

Collecting python-dotenv
  Downloading python_dotenv-1.0.1-py3-none-any.whl.metadata (23 kB)
Downloading python_dotenv-1.0.1-py3-none-any.whl (19 kB)
Installing collected packages: python-dotenv
Successfully installed python-dotenv-1.0.1


In [None]:
# Install into the running kernel's environment (%pip rather than !pip).
# py-llm-core is pinned to the version this notebook was last executed with;
# openai keeps its existing pin.
%pip install py-llm-core==3.5.0
%pip install openai==1.45.0

Collecting py-llm-core
  Downloading py_llm_core-3.5.0-py3-none-any.whl.metadata (18 kB)
Collecting tiktoken (from py-llm-core)
  Downloading tiktoken-0.8.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (6.6 kB)
Collecting dirtyjson (from py-llm-core)
  Downloading dirtyjson-1.0.8-py3-none-any.whl.metadata (11 kB)
Collecting llama-cpp-python>=0.2.84 (from py-llm-core)
  Downloading llama_cpp_python-0.3.2.tar.gz (65.0 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m65.0/65.0 MB[0m [31m11.7 MB/s[0m eta [36m0:00:00[0m
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Installing backend dependencies ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
Collecting python-decouple (from py-llm-core)
  Downloading python_decouple-3.8-py3-none-any.whl.metadata (14 kB)
Collecting mistralai (from py-llm-core)
  Downloading mistralai-1.2.2-py3-none-any.whl

In [None]:
# Install into the running kernel's environment (%pip rather than !pip) and
# pin the version this notebook was last executed with, for reproducibility.
%pip install textstat==0.7.4

Collecting textstat
  Downloading textstat-0.7.4-py3-none-any.whl.metadata (14 kB)
Collecting pyphen (from textstat)
  Downloading pyphen-0.17.0-py3-none-any.whl.metadata (3.2 kB)
Downloading textstat-0.7.4-py3-none-any.whl (105 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m105.1/105.1 kB[0m [31m3.7 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading pyphen-0.17.0-py3-none-any.whl (2.1 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.1/2.1 MB[0m [31m41.6 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: pyphen, textstat
Successfully installed pyphen-0.17.0 textstat-0.7.4


In [None]:
# Import necessary libraries and modules
import pandas as pd
import unicodedata
from llm_core.splitters import TokenSplitter
from typing import List
from dataclasses import dataclass
from llm_core.assistants import OpenAIAssistant
import os
from google.colab import userdata
import zipfile
import openai
from openai import OpenAI

import textstat

In [None]:
# Copy the OpenAI key stored under the Colab secret 'AIProject' into the
# process environment, then build the shared client from that variable.
os.environ['OPENAI_API_KEY'] = userdata.get('AIProject')

client = OpenAI(api_key=os.environ.get("OPENAI_API_KEY"))

In [None]:
def cleanup_unicode(text):
    """
    Normalize a string to Unicode Normalization Form KC (NFKC).

    NFKC replaces compatibility characters (ligatures, full-width letters,
    etc.) with their standard equivalents and then composes base characters
    with the combining marks that follow them.

    Parameters:
    ----------
    text : str
        The input text that contains Unicode characters to be normalized.

    Returns:
    -------
    str
        The NFKC-normalized text.
    """
    # Normalize the whole string in one call. Normalizing character by
    # character can never compose a base letter with a following combining
    # mark (e.g. "e" + U+0301 -> "é"), because each piece is seen in isolation.
    return unicodedata.normalize("NFKC", text)


In [None]:
def t_splitter(text, chunk_size=6_000):
    """
    Return the first token-bounded chunk of a text.

    The text is handed to a `TokenSplitter` configured with `chunk_size`
    tokens per chunk and no overlap; only the first chunk it yields is kept.

    Parameters:
    ----------
    text : str
        The input text that needs to be truncated to its first chunk.
    chunk_size : int, optional
        Maximum number of tokens per chunk (default 6,000).

    Returns:
    -------
    str
        The first chunk of text, or an empty string when the splitter
        produces no chunk at all (presumably the case for empty input).
    """
    splitter = TokenSplitter(chunk_size=chunk_size, chunk_overlap=0)

    # Supplying a default keeps a chunk-less result from surfacing as a bare
    # StopIteration, which would abort a whole `Series.apply` run.
    return next(splitter.chunkify(text), "")


In [None]:
# One step of the Chain-of-Density procedure: the rewritten summary together
# with the entities that were newly added in that step.
@dataclass
class DenseSummary:
    denser_summary: str
    missing_entities: List[str]


# Container for the sequence of increasingly dense summaries.
# `system_prompt` and `prompt` are plain class attributes (not dataclass
# fields); `summaries` is the only field. The prompts are presumably read by
# OpenAIAssistant when it is handed this class -- confirm against py-llm-core.
@dataclass
class DenserSummaryCollection:
  system_prompt = """
  You are an expert in writing rich and dense summaries in broad domains.
  """

  # The JSON key named at the end of the prompt matches the `summaries` field
  # below, so the instructions agree with the dataclass schema.
  prompt = """
  Article:

  {article}

  ----

  You will generate increasingly concise, entity-dense summaries of the above
  Article.

  Repeat the following 2 steps 5 times.

  - Step 1: Identify 1-3 informative Entities from the Article
  which are missing from the previously generated summary and are the most
  relevant.

  - Step 2: Write a new, denser summary of identical length which covers
  every entity and detail from the previous summary plus the missing entities

  A Missing Entity is:

  - Relevant: to the main story
  - Specific: descriptive yet concise (5 words or fewer)
  - Novel: not in the previous summary
  - Faithful: present in the Article
  - Anywhere: located anywhere in the Article

  Guidelines:
  - The first summary should be long (4-5 sentences, approx. 80 words) yet
  highly non-specific, containing little information beyond the entities
  marked as missing.

  - Use overly verbose language and fillers (e.g. "this article discusses") to
  reach approx. 80 words.

  - Make every word count: re-write the previous summary to improve flow and
  make space for additional entities.

  - Make space with fusion, compression, and removal of uninformative phrases
  like "the article discusses"

  - The summaries should become highly dense and concise yet self-contained,
  e.g., easily understood without the Article.

  - Missing entities can appear anywhere in the new summary.

  - Never drop entities from the previous summary. If space cannot be made,
  add fewer new entities.

  > Remember to use the exact same number of words for each summary.
  Answer in JSON.

  > The JSON in `summaries` should be a list (length 5) of
  dictionaries whose keys are "missing_entities" and "denser_summary".

  """

  summaries: List[DenseSummary]


  @classmethod
  def summarize(cls, article, model='gpt-4'):
      """
      Run the Chain-of-Density prompt on one article.

      Parameters:
      ----------
      article : str
          Text substituted for `{article}` in `prompt`.
      model : str, optional
          Model name forwarded to `OpenAIAssistant` (default 'gpt-4').

      Returns:
      -------
      Whatever `assistant.process` returns; callers in this notebook read
      `.summaries[i].denser_summary` from it, so it is presumably an
      instance of this class.
      """
      with OpenAIAssistant(cls, model=model) as assistant:
          return assistant.process(article=article)



def generate_summary(input_text, max_tokens=500):
    """
    Produce a brief "classical" (single-shot) summary with the chat API.

    Parameters:
    ----------
    input_text : str
        Text to summarize; it is embedded verbatim in the user message.
    max_tokens : int, optional
        Upper bound on generated tokens. The default is 500 because that is
        the limit the request has always used: the previous signature
        advertised 50 but the argument was ignored, so keeping 500 leaves
        existing callers unchanged while explicit values are now honoured.

    Returns:
    -------
    str
        Content of the first choice's message.
    """
    system = [{"role": "system", "content": "You are Summary AI."}]
    user = [{"role": "user", "content": f"Summarize this briefly:\n\n{input_text}"}]

    # `client` is the module-level OpenAI client created earlier in the notebook.
    chat_completion = client.chat.completions.create(
        messages=system + user,
        model="gpt-4",
        max_tokens=max_tokens,
        top_p=0.9,
    )
    return chat_completion.choices[0].message.content

In [None]:
# Legacy loader, kept for reference only: it read every .txt file from a ZIP
# archive and passed each one through cleanup_unicode.
#
# zip_file_path = '/content/AI Project Texts.zip'

# texts = []

# with zipfile.ZipFile(zip_file_path, 'r') as zip_ref:
#     for file_info in zip_ref.infolist():
#         if file_info.filename.endswith('.txt'):
#             with zip_ref.open(file_info) as file:
#                 text = file.read().decode('utf-8')
#                 cleaned_text = cleanup_unicode(text)
#                 texts.append(cleaned_text)

# data = pd.DataFrame(texts, columns=['Text'])

# Current loader: first 50 rows of the CSV (malformed lines skipped), reduced
# to the single article column, renamed from 'text' to 'Text'.
data = (
    pd.read_csv('/content/articles.csv', nrows=50, on_bad_lines='skip')
    .rename(columns={'text': 'Text'})
    [['Text']]
)

In [None]:
# Truncate every article to its first chunk (see t_splitter) and keep the
# result next to the original text.
first_chunks = [t_splitter(article) for article in data['Text']]
data['Text_Splitted'] = first_chunks
data

Unnamed: 0,Text,Text_Splitted
0,Photo by Josh Riemer on Unsplash\n\nMerry Chri...,Photo by Josh Riemer on Unsplash\n\nMerry Chri...
1,Your Brain On Coronavirus\n\nA guide to the cu...,Your Brain On Coronavirus\n\nA guide to the cu...
2,Mind Your Nose\n\nHow smell training can chang...,Mind Your Nose\n\nHow smell training can chang...
3,Passionate about the synergy between science a...,Passionate about the synergy between science a...
4,"You’ve heard of him, haven’t you? Phineas Gage...","You’ve heard of him, haven’t you? Phineas Gage..."
5,"Mentally, Young Adults Are Suffering Most From...","Mentally, Young Adults Are Suffering Most From..."
6,How to Turn Your Popular Blog Series Into a Be...,How to Turn Your Popular Blog Series Into a Be...
7,Dr Faisal Dar — Pioneer of Liver Transplantati...,Dr Faisal Dar — Pioneer of Liver Transplantati...
8,Sunlight — The Natural Supplement For Our Ment...,Sunlight — The Natural Supplement For Our Ment...
9,Occam’s dice\n\nDistrusting biological metapho...,Occam’s dice\n\nDistrusting biological metapho...


In [None]:
# Baseline: one single-shot summary per truncated article.
data['Classical_Summary'] = [
    generate_summary(chunk) for chunk in data['Text_Splitted']
]

In [None]:
# Generate Chain-of-Density (CoD) summaries.
# Use the truncated text so CoD sees exactly the input the classical baseline
# received and the prompt stays within the bound enforced by t_splitter.
text_data = data['Text_Splitted'].tolist()
result_step_3 = []
result_step_4 = []
result_step_5 = []

for c, article in enumerate(text_data, start=1):
    summaries = []
    # Only spend an API call on inputs that are real text of more than 4
    # characters; anything else is recorded as invalid below.
    if isinstance(article, str) and len(article) > 4:
        summaries = DenserSummaryCollection.summarize(article).summaries

    # Steps 3-5 live at indices 2-4, so at least five summaries are needed.
    # Guarding on the returned list (rather than on the article length)
    # prevents an IndexError when fewer steps come back.
    if len(summaries) > 4:
        result_step_3.append(summaries[2].denser_summary)
        result_step_4.append(summaries[3].denser_summary)
        result_step_5.append(summaries[4].denser_summary)
    else:
        result_step_3.append('Invalid input')
        result_step_4.append('Invalid input')
        result_step_5.append('Invalid input')

    # Progress indicator: 1-based position of the article just processed.
    print(c)

data['CoD_Summary_Step3'] = result_step_3
data['CoD_Summary_Step4'] = result_step_4
data['CoD_Summary_Step5'] = result_step_5

data

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50


Unnamed: 0,Text,Text_Splitted,Classical_Summary,CoD_Summary_Step3,CoD_Summary_Step4,CoD_Summary_Step5
0,Photo by Josh Riemer on Unsplash\n\nMerry Chri...,Photo by Josh Riemer on Unsplash\n\nMerry Chri...,This is a Christmas message expressing appreci...,A team's holiday greeting appreciates readers'...,A holiday greeting thanks readers for supporti...,A holiday greeting thanks readers for supporti...
1,Your Brain On Coronavirus\n\nA guide to the cu...,Your Brain On Coronavirus\n\nA guide to the cu...,The coronavirus pandemic has complex and broad...,Coronavirus enters cells via the ACE2 receptor...,"Coronavirus, entering cells via the ACE2 recep...","Coronavirus, exploiting the ACE2 receptor, ind..."
2,Mind Your Nose\n\nHow smell training can chang...,Mind Your Nose\n\nHow smell training can chang...,"The olfactory system, responsible for the sens...",Smell training stimulates the olfactory system...,"Smell training enhances the olfactory system, ...","Smell training fortifies the olfactory system,..."
3,Passionate about the synergy between science a...,Passionate about the synergy between science a...,The individual is passionate about the collabo...,A fervent advocate for the amalgamation of sci...,An ardent proponent of science and technology'...,An impassioned champion of the union of scienc...
4,"You’ve heard of him, haven’t you? Phineas Gage...","You’ve heard of him, haven’t you? Phineas Gage...","Phineas Gage, a 25-year-old railroad worker, s...","Phineas Gage, a 25-year-old Vermont railroad w...","Phineas Gage, 25, a Vermont railroad worker, s...","Phineas Gage, 25, a Vermont railroad worker, s..."
5,"Mentally, Young Adults Are Suffering Most From...","Mentally, Young Adults Are Suffering Most From...",Young adults have suffered the most mental hea...,Holman et al.'s study reveals COVID-19's sever...,COVID-19's mental toll on young adults is seve...,Holman et al. reveal COVID-19's severe mental ...
6,How to Turn Your Popular Blog Series Into a Be...,How to Turn Your Popular Blog Series Into a Be...,The article discusses the process of convertin...,Successfully converting a blog into a book req...,"To convert a blog into a bestselling book, one...",Converting a blog into a bestselling book invo...
7,Dr Faisal Dar — Pioneer of Liver Transplantati...,Dr Faisal Dar — Pioneer of Liver Transplantati...,"Dr. Faisal Dar, a pioneer of liver transplanta...","Dr. Faisal Dar, who initiated the liver transp...","Dr. Faisal Dar, from Allama Iqbal Medical Coll...","Dr. Faisal Dar, an alumnus of Allama Iqbal Med..."
8,Sunlight — The Natural Supplement For Our Ment...,Sunlight — The Natural Supplement For Our Ment...,Exposure to sunlight plays a crucial role in m...,Sunlight enhances mental health by boosting se...,"Sunlight bolsters mental health via serotonin,...",Sunlight enhances mental health through seroto...
9,Occam’s dice\n\nDistrusting biological metapho...,Occam’s dice\n\nDistrusting biological metapho...,The article discusses the principle of Occam's...,The text scrutinizes the use of Occam's razor ...,The article critically examines Occam's razor'...,The piece critically assesses Occam's razor's ...


In [None]:
import torch
import torch.nn as nn
from transformers import AutoModel
from huggingface_hub import PyTorchModelHubMixin
from transformers import AutoTokenizer

In [None]:
# Backbone checkpoint wrapped by the readability regressor.
BASE_MODEL = "Peltarion/xlm-roberta-longformer-base-4096"


class ReadabilityModel(nn.Module, PyTorchModelHubMixin):
    """Transformer backbone followed by dropout and a one-unit linear head.

    Attribute names (`model`, `drop`, `fc`) are kept as-is; they are
    presumably the keys the published checkpoint is loaded into.
    """

    def __init__(self, model_name=BASE_MODEL):
        super().__init__()
        self.model = AutoModel.from_pretrained(model_name)
        self.drop = nn.Dropout(p=0.2)
        self.fc = nn.Linear(768, 1)

    def forward(self, ids, mask):
        """Map token ids and an attention mask to the output of the linear head."""
        backbone_out = self.model(
            input_ids=ids,
            attention_mask=mask,
            output_hidden_states=False,
        )
        # Element 1 of the backbone output feeds the head (presumably the
        # pooled representation -- TODO confirm).
        pooled = self.drop(backbone_out[1])
        return self.fc(pooled)

# Fetch the published readability model and switch it straight to evaluation
# mode; nn.Module.eval() returns the module itself, so the call can be chained.
model = ReadabilityModel.from_pretrained("trokhymovych/TRank_readability").eval()

# Tokenizer published under the same Hub repository id.
tokenizer = AutoTokenizer.from_pretrained("trokhymovych/TRank_readability")

# Scoring helper built on the module-level `tokenizer` and `model`.
def get_readability_score(text):
    """
    Score one text with the readability model.

    The text is tokenized with special tokens, truncated and padded to
    exactly 512 tokens, and passed through `model` with gradient tracking
    disabled.

    Parameters:
    ----------
    text : str
        Text to score.

    Returns:
    -------
    The Python scalar obtained from `.item()` on the model output.
    """
    encoded = tokenizer.encode_plus(
        text,
        add_special_tokens=True,
        max_length=512,
        truncation=True,
        padding='max_length',
        return_tensors='pt',
    )

    with torch.no_grad():
        prediction = model(encoded['input_ids'], encoded['attention_mask'])
        return prediction.item()

# Score each summary variant with the readability model; every variant gets
# its own 'Readability_Score_trokhymovych_<variant>' column.
for summary_col, variant in (
    ('Classical_Summary', 'Classical'),
    ('CoD_Summary_Step3', 'CoD_3'),
    ('CoD_Summary_Step4', 'CoD_4'),
    ('CoD_Summary_Step5', 'CoD_5'),
):
    data[f'Readability_Score_trokhymovych_{variant}'] = data[summary_col].apply(get_readability_score)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/773 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/1.12G [00:00<?, ?B/s]

Some weights of XLMRobertaModel were not initialized from the model checkpoint at Peltarion/xlm-roberta-longformer-base-4096 and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


pytorch_model.bin:   0%|          | 0.00/1.12G [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/670 [00:00<?, ?B/s]

sentencepiece.bpe.model:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/17.1M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/167 [00:00<?, ?B/s]

In [None]:
# Score each summary variant with textstat.flesch_reading_ease; every variant
# gets its own 'Readability_Score_flesch_reading_ease_<variant>' column.
for summary_col, variant in (
    ('Classical_Summary', 'Classical'),
    ('CoD_Summary_Step3', 'CoD_3'),
    ('CoD_Summary_Step4', 'CoD_4'),
    ('CoD_Summary_Step5', 'CoD_5'),
):
    data[f'Readability_Score_flesch_reading_ease_{variant}'] = data[summary_col].apply(textstat.flesch_reading_ease)

In [None]:
# Score each summary variant with textstat.mcalpine_eflaw; every variant gets
# its own 'Readability_Score_mcalpine_eflaw_<variant>' column.
for summary_col, variant in (
    ('Classical_Summary', 'Classical'),
    ('CoD_Summary_Step3', 'CoD_3'),
    ('CoD_Summary_Step4', 'CoD_4'),
    ('CoD_Summary_Step5', 'CoD_5'),
):
    data[f'Readability_Score_mcalpine_eflaw_{variant}'] = data[summary_col].apply(textstat.mcalpine_eflaw)

In [None]:
# Score each summary variant with textstat.dale_chall_readability_score; every
# variant gets its own 'Readability_Score_dale_chall_readability_score_<variant>'
# column.
for summary_col, variant in (
    ('Classical_Summary', 'Classical'),
    ('CoD_Summary_Step3', 'CoD_3'),
    ('CoD_Summary_Step4', 'CoD_4'),
    ('CoD_Summary_Step5', 'CoD_5'),
):
    data[f'Readability_Score_dale_chall_readability_score_{variant}'] = data[summary_col].apply(textstat.dale_chall_readability_score)

In [None]:
# Persist the texts, summaries and readability scores; the DataFrame index is
# not written to the file.
data.to_csv(path_or_buf='result_18.11.csv', index=False)