## Text Suammarization

In [1]:
import pandas as pd
from nltk.tokenize import sent_tokenize
from lexrank import STOPWORDS, LexRank

In [3]:
content = pd.read_csv('data/flu_data_topic_cleaned.csv')
content

Unnamed: 0,Year,Topic,Content
0,2006-2007,When and where did the 2006-07 flu season start?,The first report of regional flu activity came...
1,2006-2007,How severe was the 2006-2007 flu season?,The 2006-07 flu season was generally mild comp...
2,2006-2007,What determines the severity of a flu season?,"The overall health impact (e.g., infections, h..."
3,2006-2007,Where did the most flu activity occur in the U...,Influenza viruses were identified in all state...
4,2006-2007,When did the 2006-2007 flu season peak?,"During the 2006-2007 season, flu activity in t..."
...,...,...,...
136,2023-2024,Updates to the Advisory Committee on Immunizat...,A couple of things are different for the 2023-...
137,2023-2024,Updates to U.S. Flu Surveillance Methods for t...,"Starting with the 2023-2024 influenza season, ..."
138,2023-2024,B/Yamagata and Flu Vaccines Summary,Quadrivalent flu vaccines protect against four...
139,2023-2024,Coinfection: Getting More than One Respiratory...,It is possible to get sick with more than one ...


### Lex Rank

In [4]:
documents = content['Content'].values

len(documents)


141

In [5]:
all_tokenized_sentences = [sent_tokenize(str(document)) for document in documents]

In [6]:
len(all_tokenized_sentences)

141

In [7]:
for sentences in all_tokenized_sentences:

    lxr = LexRank(sentences)

    scores_cont = lxr.rank_sentences(
        sentences,
        threshold=None,
        fast_power_method=True,
    )

    print(scores_cont)

[1. 1.]
[1.02248742 1.10929753 1.00336564 1.05049071 0.78764082 1.02671788]
[1. 1.]
[1. 1.]
[1.08774137 1.10064971 0.81160892]
[0.91212052 0.93318623 1.02106161 0.99427119 1.12449406 1.1252169
 1.16039342 0.72925608]
[1.09011562 1.11969352 0.86646817 0.92372269]
[1.03644534 0.84351606 0.96020766 1.11921817 1.07632299 0.99786103
 0.96642874]
[0.9756008  0.97200254 1.05239666]
[0.95015923 0.69496309 0.99397656 0.92689799 0.85390525 1.04811204
 1.10591665 1.01025247 1.19507125 1.10503415 1.11571131]
[1. 1.]
[1. 1.]
[1. 1.]
[1. 1.]
[0.98011586 0.9872536  1.03263054]
[1. 1.]
[0.9843219  1.16762751 1.07018641 1.01740531 0.76045888]
[0.57075795 1.09616619 0.93942042 1.00573697 0.68416    1.21503122
 1.19577343 1.36574514 1.04644866 0.66414467 1.36911091 1.29483442
 0.61871564 0.69063099 1.2433234 ]
[0.96782825 1.11825252 1.16138387 1.0237552  0.72878017]
[0.74991273 1.1867324  0.94976209 1.11359278]
[0.988503   0.95824556 1.23689327 1.01300907 0.70216101 1.10118809]
[1.08754767 1.03819044 1.1

In [8]:
for sentences in all_tokenized_sentences:

    lxr = LexRank(sentences)

    summary = lxr.get_summary(sentences, summary_size=2, threshold=0.05)
    
    print(summary)

['Regional flu activity is defined as increased flu-like activity or flu outbreaks in at least two (but fewer than half) of the regions in a state with recent laboratory evidence of flu in those regions.', 'The first report of regional flu activity came from the southeastern United States during the first week of November.']
['Flu activity increased during late December, peaked in mid-February, and decreased through the end of the flu season on May 19.', 'Nationally, low levels of flu activity were reported during October through mid-December.']
['The severity of a flu season can be judged according to a variety of criteria, such as the following:\\nThe geographic extent of influenza in the U.S. and within each state;\\nThe proportion of influenza laboratory tests that are positive;\\nThe proportion of visits to physicians for influenza-like illness;\\nThe proportion of all deaths that are caused by pneumonia and flu;\\nThe number of influenza-associated deaths among children; and\\nTh

In [9]:
summaries = []

for sentences in all_tokenized_sentences:
    lxr = LexRank(sentences)
    summary_cont = lxr.get_summary(sentences, summary_size=2, threshold=0.05)
    
    summaries.append(summary_cont[0] if summary_cont else None)

content['Summary'] = summaries

In [10]:
content

Unnamed: 0,Year,Topic,Content,Summary
0,2006-2007,When and where did the 2006-07 flu season start?,The first report of regional flu activity came...,Regional flu activity is defined as increased ...
1,2006-2007,How severe was the 2006-2007 flu season?,The 2006-07 flu season was generally mild comp...,"Flu activity increased during late December, p..."
2,2006-2007,What determines the severity of a flu season?,"The overall health impact (e.g., infections, h...",The severity of a flu season can be judged acc...
3,2006-2007,Where did the most flu activity occur in the U...,Influenza viruses were identified in all state...,"From October 1, 2006 to May 19, 2007, widespre..."
4,2006-2007,When did the 2006-2007 flu season peak?,"During the 2006-2007 season, flu activity in t...",Although the timing of peak activity varies fr...
...,...,...,...,...
136,2023-2024,Updates to the Advisory Committee on Immunizat...,A couple of things are different for the 2023-...,A couple of things are different for the 2023-...
137,2023-2024,Updates to U.S. Flu Surveillance Methods for t...,"Starting with the 2023-2024 influenza season, ...",Flu vaccination is often available at no or lo...
138,2023-2024,B/Yamagata and Flu Vaccines Summary,Quadrivalent flu vaccines protect against four...,CDC is not involved in regulatory decision-mak...
139,2023-2024,Coinfection: Getting More than One Respiratory...,It is possible to get sick with more than one ...,It is also possible to be sick with multiple f...


### Save to CSV file

In [None]:
content.to_csv('data/flu_data_summary.csv', index=False)

In [11]:
for sentences in all_tokenized_sentences:

    lxr = LexRank(sentences)

    summary_cont = lxr.get_summary(sentences, threshold=None)
    
    print(summary_cont)

['Regional flu activity is defined as increased flu-like activity or flu outbreaks in at least two (but fewer than half) of the regions in a state with recent laboratory evidence of flu in those regions.']
['For example, the proportion of all deaths associated with influenza illness was lower this season than the previous three flu seasons.']
['The severity of a flu season can be judged according to a variety of criteria, such as the following:\\nThe geographic extent of influenza in the U.S. and within each state;\\nThe proportion of influenza laboratory tests that are positive;\\nThe proportion of visits to physicians for influenza-like illness;\\nThe proportion of all deaths that are caused by pneumonia and flu;\\nThe number of influenza-associated deaths among children; and\\nThe influenza-associated hospitalization rate among children.\\nA flu season�s severity is determined by comparing these measures with previous seasons.']
['From October 1, 2006 to May 19, 2007, widespread** f

### Text Rank

In [19]:
!pip install summa

Collecting summa
  Downloading summa-1.2.0.tar.gz (54 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m54.9/54.9 kB[0m [31m1.1 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
[?25h  Preparing metadata (setup.py) ... [?25ldone
Building wheels for collected packages: summa
  Building wheel for summa (setup.py) ... [?25ldone
[?25h  Created wheel for summa: filename=summa-1.2.0-py3-none-any.whl size=54389 sha256=ac8efb6a486344a20db1e23703a954d32a9e8ef5d17ff8e53cdab3a1d1ad341f
  Stored in directory: /Users/tongfah/Library/Caches/pip/wheels/10/2d/7a/abce87c4ea233f8dcca0d99b740ac0257eced1f99a124a0e1f
Successfully built summa
Installing collected packages: summa
Successfully installed summa-1.2.0


In [12]:
text = content['Content'].values[0]

In [13]:
from summa.summarizer import summarize
summarizer = summarize(text, ratio=0.05)

In [14]:
print(text)

The first report of regional flu activity came from the southeastern United States during the first week of November. Regional flu activity is defined as increased flu-like activity or flu outbreaks in at least two (but fewer than half) of the regions in a state with recent laboratory evidence of flu in those regions.


In [15]:
print(summarizer)


