<a href="https://colab.research.google.com/github/nijoluca/MLProjects/blob/main/TextSummarizationProject.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.cluster.util import cosine_distance
import numpy as np

# Fetch the NLTK data used below: tokenizer models for sent_tokenize /
# word_tokenize and the English stop-word list.
nltk.download('punkt')
# Recent NLTK releases load the tokenizer from 'punkt_tab' instead of the
# pickled 'punkt' models; fetching both keeps old and new versions working.
nltk.download('punkt_tab')
nltk.download('stopwords')

def read_text(file_path):
    """Return the entire contents of the text file at ``file_path``.

    The file is decoded as UTF-8 explicitly so the result does not depend on
    the platform's default locale encoding.

    Args:
        file_path: Path of the file to read.

    Returns:
        The file contents as a single string.

    Raises:
        OSError: If the file cannot be opened.
        UnicodeDecodeError: If the file is not valid UTF-8.
    """
    with open(file_path, 'r', encoding='utf-8') as file:
        return file.read()

def preprocess_text(text):
    """Split ``text`` into sentences and reduce each one to its content words.

    Each sentence is word-tokenized; tokens that are not purely alphanumeric
    (punctuation, for instance) or that are English stop words are dropped,
    and the survivors are lowercased.

    Args:
        text: The raw text to process.

    Returns:
        A list with one entry per sentence, each entry being the list of
        lowercased words kept from that sentence. An entry may be empty when
        every token of its sentence was filtered out.
    """
    raw_sentences = sent_tokenize(text)
    english_stop_words = set(stopwords.words('english'))

    def content_words(sentence):
        # Keep alphanumeric tokens that are not stop words, lowercased.
        kept = []
        for token in word_tokenize(sentence):
            lowered = token.lower()
            if token.isalnum() and lowered not in english_stop_words:
                kept.append(lowered)
        return kept

    return [content_words(sentence) for sentence in raw_sentences]

def sentence_similarity(sentence1, sentence2):
    """Return the cosine similarity of two sentences given as word lists.

    Each sentence becomes a word-count vector over the combined vocabulary of
    both sentences; the result is the cosine of the angle between the two
    vectors (equivalently, one minus their cosine distance).

    Args:
        sentence1: Sequence of words of the first sentence.
        sentence2: Sequence of words of the second sentence.

    Returns:
        A float in [0, 1]: 1.0 for identical word counts, 0.0 when the
        sentences share no word. If either sentence is empty the result is
        0.0, because an empty sentence has a zero vector and its cosine would
        otherwise be the undefined 0/0 (NaN).
    """
    if not sentence1 or not sentence2:
        return 0.0

    # Give every distinct word one vector slot up front; a dict lookup per
    # word avoids rescanning a vocabulary list for each occurrence.
    slot_of = {
        word: slot
        for slot, word in enumerate(set(sentence1) | set(sentence2))
    }

    vector1 = np.zeros(len(slot_of))
    vector2 = np.zeros(len(slot_of))
    for word in sentence1:
        vector1[slot_of[word]] += 1
    for word in sentence2:
        vector2[slot_of[word]] += 1

    # Both norms are non-zero here because both sentences are non-empty.
    norms = np.linalg.norm(vector1) * np.linalg.norm(vector2)
    return float(np.dot(vector1, vector2) / norms)

def build_similarity_matrix(sentences):
    """Build the pairwise similarity matrix for a list of sentences.

    Args:
        sentences: Sequence of sentences, each in the form accepted by
            ``sentence_similarity``.

    Returns:
        A square NumPy array with one row and column per sentence. Entry
        ``[i][j]`` holds ``sentence_similarity(sentences[i], sentences[j])``
        for ``i != j``; the diagonal is left at zero so a sentence gets no
        credit for matching itself.
    """
    count = len(sentences)
    similarities = np.zeros((count, count))

    for row, left in enumerate(sentences):
        for col, right in enumerate(sentences):
            if row == col:
                continue
            similarities[row][col] = sentence_similarity(left, right)

    return similarities

def generate_summary(text, num_sentences=5):
    """Generate an extractive summary of ``text``.

    Every sentence is scored by the sum of its similarities to all other
    sentences; the highest-scoring sentences are returned in their original
    order, joined by single spaces.

    Args:
        text: The text to summarize.
        num_sentences: Maximum number of sentences to include. If the text
            has fewer sentences, all of them are returned.

    Returns:
        The summary string, built from the original (unprocessed) sentences.
        An empty string is returned when ``text`` is blank or
        ``num_sentences`` is not positive.
    """
    # A non-positive count would make the ``[-num_sentences:]`` slice below
    # select the wrong sentences (``[-0:]`` is the whole array).
    if num_sentences <= 0 or not text.strip():
        return ''

    # preprocess_text splits with sent_tokenize too, so index i refers to the
    # same sentence in both lists. The word lists are used only for scoring;
    # the summary itself is assembled from the original sentences.
    original_sentences = sent_tokenize(text)
    word_lists = preprocess_text(text)

    # A sentence with no words left after preprocessing can produce NaN
    # similarities; treat those as 0 so they cannot distort the ranking.
    similarity_matrix = np.nan_to_num(build_similarity_matrix(word_lists))

    # Score each sentence by its total similarity to the others.
    sentence_scores = similarity_matrix.sum(axis=1)

    # Take the indices of the best-scoring sentences, then restore document
    # order so the summary reads naturally.
    top_sentence_indices = np.argsort(sentence_scores)[-num_sentences:]
    top_sentence_indices.sort()

    return ' '.join(original_sentences[i] for i in top_sentence_indices)

# Example usage: summarize a text file and show the result after the original.
# 'your_text_file.txt' is a placeholder; point it at a real file before running.
file_path = 'your_text_file.txt'
text = read_text(file_path)
summary = generate_summary(text)

# One print call; the newline separator reproduces the four-line layout.
print("Original Text:", text, "\nSummary:", summary, sep="\n")


