<a href="https://colab.research.google.com/github/meetAmarAtGithub/Research-Papers/blob/main/Text_Summary/Text_Summary.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import nltk
nltk.download('punkt')
nltk.download('vader_lexicon')
from nltk.tokenize import sent_tokenize
from nltk.sentiment import SentimentIntensityAnalyzer
import networkx as nx
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity



def generate_summary(text, n):
    # Step 1: Tokenize the input text into sentences
    sentences = sent_tokenize(text)

    # Step 2: Construct graphs from the sentences
    graph = build_graph(sentences)

    # Step 3: Score the sentences and merge sentiments
    scores = score_sentences(sentences)
    sentiments = merge_sentiments(scores)

    # Step 4: Rank the sentences for summarization
    ranked_sentences = rank_sentences(graph, sentiments)

    # Generate the summary
    summary = ranked_sentences[:n]

    return ' '.join(summary)




def build_graph(sentences):
    graph = nx.Graph()
    graph.add_nodes_from(sentences)

    # Step 2: Construct graphs from the sentences
    for i in range(len(sentences)):
        for j in range(i + 1, len(sentences)):
            similarity_score = calculate_similarity(sentences[i], sentences[j])

            if similarity_score > 0:
                graph.add_edge(sentences[i], sentences[j], weight=similarity_score)

    return graph

def calculate_similarity(sentence1, sentence2):
    vectorizer = CountVectorizer().fit_transform([sentence1, sentence2])
    vectors = vectorizer.toarray()
    similarity = cosine_similarity(vectors)

    return similarity[0][1]


def score_sentences(sentences):
    sid = SentimentIntensityAnalyzer()
    scores = []

    for sentence in sentences:
        sentiment = sid.polarity_scores(sentence)
        scores.append(sentiment)

    return scores


def merge_sentiments(scores):
    merged_sentiment = {'neg': 0, 'neu': 0, 'pos': 0, 'compound': 0}

    for sentiment in scores:
        merged_sentiment['neg'] += sentiment['neg']
        merged_sentiment['neu'] += sentiment['neu']
        merged_sentiment['pos'] += sentiment['pos']
        merged_sentiment['compound'] += sentiment['compound']

    return merged_sentiment


def rank_sentences(graph, sentiments):
    ranked_sentences = nx.pagerank(graph, alpha=0.85, personalization=sentiments)

    # Sort the sentences based on their PageRank scores
    ranked_sentences = sorted(ranked_sentences, key=ranked_sentences.get, reverse=True)

    return ranked_sentences


# Example usage
text = "This is the first sentence. This is the second sentence. This is the third sentence."
#sentences = sent_tokenize(text)

#graph = build_graph(sentences)
#print(graph.edges(data=True))

summary = generate_summary(text, n=2)
print(summary)

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package vader_lexicon to /root/nltk_data...


ZeroDivisionError: ignored

In [1]:
import nltk
nltk.download('punkt')
nltk.download('vader_lexicon')
from nltk.tokenize import sent_tokenize
from nltk.sentiment import SentimentIntensityAnalyzer
import networkx as nx
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity

def build_graph(sentences):
    graph = nx.Graph()
    graph.add_nodes_from(range(len(sentences)))

    for i in range(len(sentences)):
        for j in range(i + 1, len(sentences)):
            similarity_score = calculate_similarity(sentences[i], sentences[j])

            if similarity_score > 0:
                graph.add_edge(i, j, weight=similarity_score)

    return graph

def calculate_similarity(sentence1, sentence2):
    vectorizer = CountVectorizer().fit_transform([sentence1, sentence2])
    vectors = vectorizer.toarray()
    similarity = cosine_similarity(vectors)

    return similarity[0][1]

def generate_summary(text, num_sentences):
    sentences = sent_tokenize(text)

    if num_sentences >= len(sentences):
        return text

    graph = build_graph(sentences)
    scores = nx.pagerank(graph)

    ranked_sentences = sorted(((scores[i], i) for i in graph.nodes()), reverse=True)
    summary_sentences = [sentences[idx] for _, idx in ranked_sentences[:num_sentences]]

    summary = " ".join(summary_sentences)
    return summary

# Example usage
text = "Cerussite is a mineral consisting of lead carbonate (PbCO3), and is an important ore of lead. The name is from the Latin cerussa, white lead. Cerussa nativa was mentioned by Conrad Gessner in 1565, and in 1832 François Sulpice Beudant applied the name céruse to the mineral, while the present form, cerussite, is due to Wilhelm Karl Ritter von Haidinger in 1845. Miners' names for cerussite in early use were lead-spar and white-lead-ore. In a hydrate form known as white lead, the mineral is a key ingredient in lead paints and has also been used in cosmetics, but both uses are now discontinued in many places as a result of lead poisoning. These cerussite crystals, measuring approximately 4.0 cm × 3.0 cm × 2.0 cm (1.57 in × 1.18 in × 0.79 in), were found in a mine in Madan-e Nakhlak, Iran."
summary = generate_summary(text, num_sentences=1)
print(summary)


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package vader_lexicon to /root/nltk_data...


In a hydrate form known as white lead, the mineral is a key ingredient in lead paints and has also been used in cosmetics, but both uses are now discontinued in many places as a result of lead poisoning.


# Evaluate Model using "Opinosis" dataset

In [2]:
from google.colab import drive
drive.mount('/content/gdrive')

Mounted at /content/gdrive


In [3]:
%cd "/content/gdrive/My Drive/Colab Notebooks/Reva/Project Text Summary"

/content/gdrive/My Drive/Colab Notebooks/Reva/Project Text Summary


In [4]:
!pip install rouge

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting rouge
  Downloading rouge-1.0.1-py3-none-any.whl (13 kB)
Installing collected packages: rouge
Successfully installed rouge-1.0.1


In [17]:
import os
from rouge import Rouge

def load_opinosis_data(data_dir):
    texts = []
    summaries = []

    # Iterate over the files in the topics directory
    topics_dir = os.path.join(data_dir, 'topics')
    print(topics_dir)
    for file_name in os.listdir(topics_dir):
        file_path = os.path.join(topics_dir, file_name)
        with open(file_path, 'r') as file:
            # Read the topic-based sentences
            topic_sentences = file.readlines()
            text = ' '.join(topic_sentences).strip()
            print(text)

            # Get the corresponding topic name
            temp = os.path.splitext(file_name)[0]
            topic_name = os.path.splitext(temp)[0]
            print(topic_name)

            # Load the corresponding human-composed summaries
            summaries_dir = os.path.join(data_dir, 'summaries-gold', topic_name)
            for summary_file in os.listdir(summaries_dir):
                summary_file_path = os.path.join(summaries_dir, summary_file)
                with open(summary_file_path, 'r', encoding='ISO-8859-1') as summary_file:
                    # Read the human-composed summary
                    summary = summary_file.readline().strip()

                    texts.append(text)
                    summaries.append(summary)

    return texts, summaries

def calculate_rouge_scores(reference_summaries, generated_summaries):
    rouge = Rouge()
    scores = rouge.get_scores(generated_summaries, reference_summaries, avg=True)
    return scores

def calculate_metrics(rouge_scores):
    precision = rouge_scores['rouge-1']['p']
    recall = rouge_scores['rouge-1']['r']
    f1_score = rouge_scores['rouge-1']['f']
    return precision, recall, f1_score

# Set the path to the Opinosis dataset directory
opinosis_data_dir = '/content/gdrive/My Drive/Colab Notebooks/Reva/Project Text Summary/OpinosisDataset1.0_0'

# Load Opinosis benchmark data
reference_summaries, generated_summaries = load_opinosis_data(opinosis_data_dir)

# Calculate ROUGE scores
rouge_scores = calculate_rouge_scores(reference_summaries, generated_summaries)

# Calculate precision, recall, and F1-score
precision, recall, f1_score = calculate_metrics(rouge_scores)

# Print results
print(f"Precision: {precision}")
print(f"Recall: {recall}")
print(f"F1-Score: {f1_score}")


/content/gdrive/My Drive/Colab Notebooks/Reva/Project Text Summary/OpinosisDataset1.0_0/topics
short battery life  I moved up from an 8gb .
 I love this ipod except for the battery life .
  long battery  scratch resistant
  Battery drains even if I don't use it .
  I only wonder why the battery seems to drain when I'm not using it, even after sliding the top control button to off when shutting down .
  great in the car, light, portable, good quality, long battery,scratch resistant .
 5G lies a more mature iPod, many steps wiser and more able than its one, year, old  The iPod gains many incremental improvements, including a brighter screen and better video battery life, but probably the most appealing aspect is the tantalizing price points of $249 for the 30GB version and $349 for the huge 80GB version   .
 5GB and the better battery life rated for up to 6 .
  battery, not dummie proof  I love the color of this iPod .
  Battery life isn't amazing, Definitely need to get a case for it  I

UnicodeDecodeError: ignored

In [18]:
!pip install chardet

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [19]:
import os
from rouge import Rouge
import chardet

def load_opinosis_data(data_dir):
    texts = []
    summaries = []

    # Iterate over the files in the topics directory
    topics_dir = os.path.join(data_dir, 'topics')
    for file_name in os.listdir(topics_dir):
        file_path = os.path.join(topics_dir, file_name)
        with open(file_path, 'rb') as file:
            # Detect the file's encoding
            raw_data = file.read()
            encoding = chardet.detect(raw_data)['encoding']

        with open(file_path, 'r', encoding=encoding) as file:
            # Read the topic-based sentences
            topic_sentences = file.readlines()
            text = ' '.join(topic_sentences).strip()

            # Get the corresponding topic name
            temp = os.path.splitext(file_name)[0]
            topic_name = os.path.splitext(temp)[0]

            # Load the corresponding human-composed summaries
            summaries_dir = os.path.join(data_dir, 'summaries-gold', topic_name)
            for summary_file in os.listdir(summaries_dir):
                summary_file_path = os.path.join(summaries_dir, summary_file)
                with open(summary_file_path, 'r', encoding=encoding) as summary_file:
                    # Read the human-composed summary
                    summary = summary_file.readline().strip()

                    texts.append(text)
                    summaries.append(summary)

    return texts, summaries


def calculate_rouge_scores(reference_summaries, generated_summaries):
    rouge = Rouge()
    scores = rouge.get_scores(generated_summaries, reference_summaries, avg=True)
    return scores

def calculate_metrics(rouge_scores):
    precision = rouge_scores['rouge-1']['p']
    recall = rouge_scores['rouge-1']['r']
    f1_score = rouge_scores['rouge-1']['f']
    return precision, recall, f1_score

# Set the path to the Opinosis dataset directory
opinosis_data_dir = '/content/gdrive/My Drive/Colab Notebooks/Reva/Project Text Summary/OpinosisDataset1.0_0'

# Load Opinosis benchmark data
reference_summaries, generated_summaries = load_opinosis_data(opinosis_data_dir)

# Calculate ROUGE scores
rouge_scores = calculate_rouge_scores(reference_summaries, generated_summaries)

# Calculate precision, recall, and F1-score
precision, recall, f1_score = calculate_metrics(rouge_scores)

# Print results
print(f"Precision: {precision}")
print(f"Recall: {recall}")
print(f"F1-Score: {f1_score}")


Precision: 0.8751275677526471
Recall: 0.01088230351487475
F1-Score: 0.021428472283800692
