In [1]:
import numpy as np
import pandas as pd
import nltk
from nltk.tokenize import sent_tokenize
# nltk.download('punkt') # one time execution
import re

## Load & Read DataSet

In [2]:
# Alternative loader kept for reference: it reads the CSV from an `uploaded`
# mapping, which is not defined anywhere in the visible cells.
# import io
# df = pd.read_csv(io.StringIO(uploaded['tennis_articles_v4.csv'].decode("utf-8")))

# Load the articles from a CSV file located in the current working directory.
df= pd.read_csv('tennis_articles_v4.csv')

In [3]:
# Preview the first rows to check which columns were loaded.
df.head()

Unnamed: 0,article_id,article_text,source
0,1,Maria Sharapova has basically no friends as te...,https://www.tennisworldusa.org/tennis/news/Mar...
1,2,"BASEL, Switzerland (AP), Roger Federer advance...",http://www.tennis.com/pro-game/2018/10/copil-s...
2,3,Roger Federer has revealed that organisers of ...,https://scroll.in/field/899938/tennis-roger-fe...
3,4,Kei Nishikori will try to end his long losing ...,http://www.tennis.com/pro-game/2018/10/nishiko...
4,5,"Federer, 37, first broke through on tour over ...",https://www.express.co.uk/sport/tennis/1036101...


## Split Text into Sentences


In [4]:
# Tokenize every article into its sentences; the result holds one list of
# sentences per article (it is flattened in the next cell).
sentences = [sent_tokenize(article) for article in df['article_text']]

In [6]:
# Collapse the per-article sentence lists into one flat list of sentences.
# NOTE: this rebinds `sentences`, so the cell must run exactly once after the
# tokenization cell above.
flat_sentences = []
for article_sentences in sentences:
  flat_sentences.extend(article_sentences)
sentences = flat_sentences

## Text Preprocessing

In [7]:
# Replace every character that is not an ASCII letter (punctuation, digits,
# special characters) with a space.
# regex=True is required: since pandas 2.0 `Series.str.replace` treats the
# pattern as a literal string by default, which would leave the text untouched.
clean_sentences = pd.Series(sentences).str.replace("[^a-zA-Z]", " ", regex=True)

# Lower-case each sentence; this also turns the Series into a plain list.
clean_sentences = [s.lower() for s in clean_sentences]

In [8]:
# nltk.download('stopwords')# one time execution

In [9]:
from nltk.corpus import stopwords
# English stopword list from NLTK; the 'stopwords' corpus has to be downloaded
# once beforehand (see the commented-out nltk.download call above).
stop_words = stopwords.words('english')

In [10]:
def remove_stopwords(sen):
  """Return the tokens of ``sen`` that are not stopwords, joined by spaces.

  ``sen`` is an iterable of word tokens. Tokens found in the module-level
  ``stop_words`` collection are dropped; the remaining tokens keep their
  original order. An input with no surviving tokens yields an empty string.
  """
  kept_tokens = []
  for token in sen:
    if token in stop_words:
      continue
    kept_tokens.append(token)
  return " ".join(kept_tokens)

In [11]:
# Split each cleaned sentence on whitespace and drop its stopwords.
without_stopwords = []
for sentence in clean_sentences:
  without_stopwords.append(remove_stopwords(sentence.split()))
clean_sentences = without_stopwords

In [16]:
# import requests

# # URL of the GloVe embeddings zip file
# url = "http://nlp.stanford.edu/data/glove.6B.zip"

# # Send a GET request to the URL
# response = requests.get(url)

# # Save the content of the response to a file
# with open("glove.6B.zip", "wb") as f:
#     f.write(response.content)


## Vector Representation of Sentences


In [21]:
# Load the pre-trained GloVe vectors into a {word: vector} dictionary.
# Each line of the file is a token followed by its coefficients, separated by
# whitespace; the coefficients are stored as a float32 array.
word_embeddings = {}
# `with` guarantees the file handle is closed even if parsing a line raises.
with open('glove.6B/glove.6B.100d.txt', encoding='utf-8') as f:
    for line in f:
        values = line.split()
        word = values[0]
        coefs = np.asarray(values[1:], dtype='float32')
        word_embeddings[word] = coefs

In [22]:
# Represent each cleaned sentence by the (approximate) mean of its word
# vectors. Words missing from `word_embeddings` contribute a zero vector, and
# the 0.001 added to the word count keeps the divisor strictly positive.
sentence_vectors = []
for sentence in clean_sentences:
  if not sentence:
    # nothing left after cleaning: fall back to an all-zero vector
    sentence_vec = np.zeros((100,))
  else:
    words = sentence.split()
    word_vecs = [word_embeddings.get(w, np.zeros((100,))) for w in words]
    sentence_vec = sum(word_vecs) / (len(words) + 0.001)
  sentence_vectors.append(sentence_vec)

In [23]:
# One vector is appended per entry of clean_sentences, so this is the number
# of sentences that were vectorized.
len(sentence_vectors)

12718

## Similarity Matrix Preparation


The next step is to find similarities among the sentences. We will use cosine similarity to find similarity between a pair of sentences. Let's create an empty similarity matrix for this task and populate it with cosine similarities of the sentences.

In [24]:
# Allocate an all-zero square matrix with one row and one column per sentence.
n_sentences = len(sentences)
sim_mat = np.zeros((n_sentences, n_sentences))

In [25]:
from sklearn.metrics.pairwise import cosine_similarity

In [26]:
# Fill sim_mat with the cosine similarity of every pair of sentence vectors.
# Calling cosine_similarity once per (i, j) pair needs len(sentences)**2
# Python-level calls; computing whole row blocks at a time gives the same
# values (up to floating-point rounding, since the batch is computed in
# float64) in a small fraction of the time.
sentence_matrix = np.asarray(sentence_vectors, dtype=float).reshape(-1, 100)
ROW_CHUNK = 512  # rows per batch; bounds the size of each temporary block
for start in range(0, len(sentences), ROW_CHUNK):
  stop = min(start + ROW_CHUNK, len(sentences))
  block = cosine_similarity(sentence_matrix[start:stop], sentence_matrix)
  # a sentence is never compared with itself: keep the diagonal at zero
  block[np.arange(stop - start), np.arange(start, stop)] = 0.0
  sim_mat[start:stop] = block

## Applying TextRank Algorithm


In [None]:
import networkx as nx

# Build a graph from the similarity matrix and run PageRank on it.
# `scores` is looked up by sentence index (scores[i]) when the sentences are
# ranked in the next cell.
nx_graph = nx.from_numpy_array(sim_mat)
scores = nx.pagerank(nx_graph)

In [None]:
# Pair every sentence with its PageRank score, then order the pairs from the
# highest score to the lowest (ties fall back to comparing the sentence text).
ranked_sentences = [(scores[idx], sentence) for idx, sentence in enumerate(sentences)]
ranked_sentences.sort(reverse=True)

## Summary Extraction

In [26]:
# Specify number of sentences to form the summary
sn = 10

# Generate summary: print the text of the top-ranked sentences.
# Slicing (instead of indexing positions 0..sn-1) avoids an IndexError when
# fewer than `sn` sentences are available.
for _score, sentence in ranked_sentences[:sn]:
  print(sentence)

When I'm on the courts or when I'm on the court playing, I'm a competitor and I want to beat every single person whether they're in the locker room or across the net.So I'm not the one to strike up a conversation about the weather and know that in the next few minutes I have to go and try to win a tennis match.
Major players feel that a big event in late November combined with one in January before the Australian Open will mean too much tennis and too little rest.
Speaking at the Swiss Indoors tournament where he will play in Sundays final against Romanian qualifier Marius Copil, the world number three said that given the impossibly short time frame to make a decision, he opted out of any commitment.
"I felt like the best weeks that I had to get to know players when I was playing were the Fed Cup weeks or the Olympic weeks, not necessarily during the tournaments.
Currently in ninth place, Nishikori with a win could move to within 125 points of the cut for the eight-man event in London 

## text summary by user input

In [None]:
import nltk
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.corpus import stopwords
from nltk.cluster.util import cosine_distance
import numpy as np
import networkx as nx
import string

# Function to read and preprocess the text
def read_text(text):
    """Split ``text`` into sentences and return one cleaned string per sentence.

    Each sentence is lower-cased and tokenized with ``word_tokenize``; tokens
    that are English stopwords or equal to a single ``string.punctuation``
    character are discarded, and the survivors are re-joined with single
    spaces. The returned list follows the sentence order of ``text``.
    """
    ignored_tokens = set(stopwords.words('english') + list(string.punctuation))

    cleaned_text = []
    for raw_sentence in sent_tokenize(text):
        tokens = word_tokenize(raw_sentence.lower())
        kept = [token for token in tokens if token not in ignored_tokens]
        cleaned_text.append(' '.join(kept))
    return cleaned_text

# Function to calculate similarity matrix
def build_similarity_matrix(cleaned_text):
    """Return a square matrix of pairwise sentence similarities.

    Entry ``[i][j]`` is ``sentence_similarity(cleaned_text[i], cleaned_text[j])``
    for ``i != j``; the diagonal is left at zero. The matrix has one row and
    one column per element of ``cleaned_text``.
    """
    count = len(cleaned_text)
    similarity_matrix = np.zeros((count, count))
    for row, first in enumerate(cleaned_text):
        for col, second in enumerate(cleaned_text):
            if row == col:
                # a sentence is not scored against itself
                continue
            similarity_matrix[row][col] = sentence_similarity(first, second)
    return similarity_matrix

# Function to calculate similarity between sentences
def sentence_similarity(sent1, sent2):
    """Return a word-overlap similarity between two whitespace-separated sentences.

    The score is the number of distinct words the sentences share, divided by
    ``log(|words1|) + log(|words2|) + 1`` where ``|words|`` counts distinct
    words.

    Returns 0.0 when either sentence has no words. Without this guard
    ``np.log(0)`` would emit a divide-by-zero RuntimeWarning and produce a
    ``-inf`` denominator.
    """
    vector1 = set(sent1.split())
    vector2 = set(sent2.split())
    if not vector1 or not vector2:
        # an empty sentence shares nothing with any other sentence
        return 0.0
    intersection = vector1.intersection(vector2)
    return len(intersection) / (np.log(len(vector1)) + np.log(len(vector2)) + 1)


# Function to generate summary using TextRank algorithm
def generate_summary(text, num_sentences=3):
    """Return an extractive summary of ``text`` built from its top-ranked sentences.

    Sentences are ranked by running PageRank on the graph built from the
    matrix returned by ``build_similarity_matrix``. The summary is made of the
    ORIGINAL sentences (not their lower-cased, stopword-stripped forms, which
    are only used for scoring), joined with single spaces in descending score
    order.

    Args:
        text: The raw text to summarise.
        num_sentences: Maximum number of sentences to include; fewer are
            returned when ``text`` contains fewer sentences.

    Returns:
        The summary string, or an empty string when ``text`` has no sentences.
    """
    # read_text splits with the same sent_tokenize call, so position i here
    # corresponds to cleaned_text[i].
    original_sentences = sent_tokenize(text)
    cleaned_text = read_text(text)
    if not cleaned_text:
        # nothing to rank
        return ''

    similarity_matrix = build_similarity_matrix(cleaned_text)

    # Create a graph representation
    graph = nx.from_numpy_array(similarity_matrix)
    scores = nx.pagerank(graph)

    # Sort sentence positions by score, best first (ties keep document order)
    ranked_indices = sorted(range(len(cleaned_text)), key=lambda idx: scores[idx], reverse=True)

    # Get top sentences in their original, readable form
    top_sentences = [original_sentences[idx] for idx in ranked_indices[:num_sentences]]

    return ' '.join(top_sentences)

# Example usage
text = """This project focuses on automating the task of text summarization using machine 
learning, specifically employing the TextRank algorithm. Text summarization involves 
condensing lengthy pieces of text while retaining essential information, and automation 
using machine learning is gaining traction in various applications, including text 
classification, question answering, legal text synthesis, news synthesis, and headline 
generation. """
# Summarise the sample text with the default num_sentences=3 and show the result.
summary = generate_summary(text)
print(summary)
