## Text Suammarization

In [1]:
import pandas as pd
from nltk.tokenize import sent_tokenize
from lexrank import STOPWORDS, LexRank

In [24]:
content = pd.read_csv('data/flu_data_topic_cleaned.csv')
content

Unnamed: 0,Year,Topic,Content
0,2006-2007,When and where did the 2006-07 flu season start?,The first report of regional flu activity came...
1,2006-2007,How severe was the 2006-2007 flu season?,The 2006-07 flu season was generally mild comp...
2,2006-2007,What determines the severity of a flu season?,"The overall health impact (e.g., infections, h..."
3,2006-2007,Where did the most flu activity occur in the U...,Influenza viruses were identified in all state...
4,2006-2007,When did the 2006-2007 flu season peak?,"During the 2006-2007 season, flu activity in t..."
...,...,...,...
136,2023-2024,Updates to the Advisory Committee on Immunizat...,A couple of things are different for the 2023-...
137,2023-2024,Updates to U.S. Flu Surveillance Methods for t...,"Starting with the 2023-2024 influenza season, ..."
138,2023-2024,B/Yamagata and Flu Vaccines Summary,Quadrivalent flu vaccines protect against four...
139,2023-2024,Coinfection: Getting More than One Respiratory...,It is possible to get sick with more than one ...


### Lex Rank

In [25]:
documents = content['Content'].values

len(documents)


141

In [26]:
all_tokenized_sentences = [sent_tokenize(str(document)) for document in documents]

In [27]:
len(all_tokenized_sentences)

141

In [28]:
for sentences in all_tokenized_sentences:

    lxr = LexRank(sentences)

    scores_cont = lxr.rank_sentences(
        sentences,
        threshold=None,
        fast_power_method=True,
    )

print(scores_cont)

[0.93128686 0.91516357 0.8858935  1.15790892 1.10974716]


In [29]:
for sentences in all_tokenized_sentences:

    lxr = LexRank(sentences)

    summary = lxr.get_summary(sentences, summary_size=2, threshold=0.05)
    
    print(summary)

['Regional flu activity is defined as increased flu-like activity or flu outbreaks in at least two (but fewer than half) of the regions in a state with recent laboratory evidence of flu in those regions.', 'The first report of regional flu activity came from the southeastern United States during the first week of November.']
['Flu activity increased during late December, peaked in mid-February, and decreased through the end of the flu season on May 19.', 'Nationally, low levels of flu activity were reported during October through mid-December.']
['The severity of a flu season can be judged according to a variety of criteria, such as the following:\\nThe geographic extent of influenza in the U.S. and within each state;\\nThe proportion of influenza laboratory tests that are positive;\\nThe proportion of visits to physicians for influenza-like illness;\\nThe proportion of all deaths that are caused by pneumonia and flu;\\nThe number of influenza-associated deaths among children; and\\nTh

In [30]:
summaries = []

for sentences in all_tokenized_sentences:
    lxr = LexRank(sentences)
    summary_cont = lxr.get_summary(sentences, summary_size=2, threshold=0.05)
    
    summaries.append(summary_cont[0] if summary_cont else None)

content['LexRank'] = summaries

In [31]:
content

Unnamed: 0,Year,Topic,Content,LexRank
0,2006-2007,When and where did the 2006-07 flu season start?,The first report of regional flu activity came...,Regional flu activity is defined as increased ...
1,2006-2007,How severe was the 2006-2007 flu season?,The 2006-07 flu season was generally mild comp...,"Flu activity increased during late December, p..."
2,2006-2007,What determines the severity of a flu season?,"The overall health impact (e.g., infections, h...",The severity of a flu season can be judged acc...
3,2006-2007,Where did the most flu activity occur in the U...,Influenza viruses were identified in all state...,"From October 1, 2006 to May 19, 2007, widespre..."
4,2006-2007,When did the 2006-2007 flu season peak?,"During the 2006-2007 season, flu activity in t...",Although the timing of peak activity varies fr...
...,...,...,...,...
136,2023-2024,Updates to the Advisory Committee on Immunizat...,A couple of things are different for the 2023-...,A couple of things are different for the 2023-...
137,2023-2024,Updates to U.S. Flu Surveillance Methods for t...,"Starting with the 2023-2024 influenza season, ...",Flu vaccination is often available at no or lo...
138,2023-2024,B/Yamagata and Flu Vaccines Summary,Quadrivalent flu vaccines protect against four...,CDC is not involved in regulatory decision-mak...
139,2023-2024,Coinfection: Getting More than One Respiratory...,It is possible to get sick with more than one ...,It is also possible to be sick with multiple f...


### Save to CSV file

In [32]:
content.to_csv('data/flu_data_summary_compare.csv', index=False)

In [11]:
for sentences in all_tokenized_sentences:

    lxr = LexRank(sentences)

    summary_cont = lxr.get_summary(sentences, threshold=None)
    
    print(summary_cont)

['Regional flu activity is defined as increased flu-like activity or flu outbreaks in at least two (but fewer than half) of the regions in a state with recent laboratory evidence of flu in those regions.']
['For example, the proportion of all deaths associated with influenza illness was lower this season than the previous three flu seasons.']
['The severity of a flu season can be judged according to a variety of criteria, such as the following:\\nThe geographic extent of influenza in the U.S. and within each state;\\nThe proportion of influenza laboratory tests that are positive;\\nThe proportion of visits to physicians for influenza-like illness;\\nThe proportion of all deaths that are caused by pneumonia and flu;\\nThe number of influenza-associated deaths among children; and\\nThe influenza-associated hospitalization rate among children.\\nA flu season�s severity is determined by comparing these measures with previous seasons.']
['From October 1, 2006 to May 19, 2007, widespread** f

### Text Rank

In [57]:
!pip install summa



In [85]:
text = content['Content'].values[1]
print(text)

The 2006-07 flu season was generally mild compared to recent flu seasons. For example, the proportion of all deaths associated with influenza illness was lower this season than the previous three flu seasons. Hospitalization rates among children were also lower than the previous three flu seasons. However, more pediatric deaths related to influenza were reported during the 2006-07 season than the previous two seasons. Nationally, low levels of flu activity were reported during October through mid-December. Flu activity increased during late December, peaked in mid-February, and decreased through the end of the flu season on May 19.


In [86]:
from summa.summarizer import summarize
summarize(text)

'For example, the proportion of all deaths associated with influenza illness was lower this season than the previous three flu seasons.'

In [4]:
def summarize(text):
    from summa.summarizer import summarize
    return summarize(text)

In [81]:
content['TextRank'] = content['Content'].apply(summarize)

In [82]:
content

Unnamed: 0,Year,Topic,Content,LexRank,TextRank
0,2006-2007,When and where did the 2006-07 flu season start?,The first report of regional flu activity came...,Regional flu activity is defined as increased ...,
1,2006-2007,How severe was the 2006-2007 flu season?,The 2006-07 flu season was generally mild comp...,"Flu activity increased during late December, p...","For example, the proportion of all deaths asso..."
2,2006-2007,What determines the severity of a flu season?,"The overall health impact (e.g., infections, h...",The severity of a flu season can be judged acc...,
3,2006-2007,Where did the most flu activity occur in the U...,Influenza viruses were identified in all state...,"From October 1, 2006 to May 19, 2007, widespre...",
4,2006-2007,When did the 2006-2007 flu season peak?,"During the 2006-2007 season, flu activity in t...",Although the timing of peak activity varies fr...,
...,...,...,...,...,...
136,2023-2024,Updates to the Advisory Committee on Immunizat...,A couple of things are different for the 2023-...,A couple of things are different for the 2023-...,
137,2023-2024,Updates to U.S. Flu Surveillance Methods for t...,"Starting with the 2023-2024 influenza season, ...",Flu vaccination is often available at no or lo...,Flu vaccination is often available at no or lo...
138,2023-2024,B/Yamagata and Flu Vaccines Summary,Quadrivalent flu vaccines protect against four...,CDC is not involved in regulatory decision-mak...,Quadrivalent flu vaccines protect against four...
139,2023-2024,Coinfection: Getting More than One Respiratory...,It is possible to get sick with more than one ...,It is also possible to be sick with multiple f...,


In [87]:
# Count the non-empty summaries
count_non_empty_summaries = content['TextRank'].apply(len).gt(0).sum()
print(f"Number of non-empty summaries: {count_non_empty_summaries}")

Number of non-empty summaries: 70


In [88]:
content.to_csv('data/flu_data_summary_compare.csv', index=False)