In [3]:
# Importing the necessary Libraries:

import pandas as pd
import numpy as np
import re
import io
import nltk
from nltk.tokenize import sent_tokenize 
nltk.download('punkt') 

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Pankaj\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [4]:
# Reading the csv file:

df=pd.read_csv('C:/Users/Pankaj/Desktop/GitHub Projects/Text Summarization/sports_news.csv')

In [6]:
# Exploring the sample data: 

df.head(5)

Unnamed: 0,article_id,article_text,source
0,1,Nadal has not played tennis since he was force...,https://www.express.co.uk/sport/tennis/1037119...
1,2,Federer won the Swiss Indoors last week by bea...,https://www.express.co.uk/sport/tennis/1038186...
2,3,Kei Nishikori will try to end his long losing ...,http://www.tennis.com/pro-game/2018/10/nishiko...
3,4,"BASEL, Switzerland (AP), Roger Federer advance...",http://www.tennis.com/pro-game/2018/10/copil-s...
4,5,Roger Federer has revealed that organisers of ...,https://scroll.in/field/899938/tennis-roger-fe...


In [7]:
# Splitting the text from the bigger articles, into smaller sentences:

sentences = []
for s in df['article_text']:
  sentences.append(sent_tokenize(s))  

In [8]:
# Flattening the list:

sentences = [y for x in sentences for y in x]

In [9]:
# Removing special characters, numbers and punctuations:

clean_sentences = pd.Series(sentences).str.replace("[^a-zA-Z]", " ")

In [10]:
# Converting the alphabets in lowercase:

clean_sentences = [s.lower() for s in clean_sentences]

In [11]:
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Pankaj\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [12]:
from nltk.corpus import stopwords
stop_words = stopwords.words('english')

In [13]:
# Creating the function for removing stopwords:

def remove_stopwords(sen):
  sen_new = " ".join([i for i in sen if i not in stop_words])
  return sen_new

In [14]:
# Removing the stopwords from the sentences:

clean_sentences = [remove_stopwords(r.split()) for r in clean_sentences]

In [15]:
# Downloading the pretrained GloVe word embeddings:

import wget
file_url = 'http://nlp.stanford.edu/data/glove.6B.zip'
file_name = wget.download(file_url)
file_name

100% [......................................................................] 862182613 / 862182613

'glove.6B (1).zip'

In [16]:
from zipfile import ZipFile 
  
# Specify the zip file name: 

file_name = "glove.6B.zip"
  
# Open the zip file in READ mode: 

with ZipFile(file_name, 'r') as zip:
    
    # Print all the contents of the zip file:
    
    zip.printdir() 
  
    # Extract all the files:
    
    print('Extracting all the files now...') 
    zip.extractall() 
    print('Done!') 


File Name                                             Modified             Size
glove.6B.50d.txt                               2014-08-04 13:15:00    171350079
glove.6B.100d.txt                              2014-08-04 13:14:34    347116733
glove.6B.200d.txt                              2014-08-04 13:14:44    693432828
glove.6B.300d.txt                              2014-08-27 12:19:16   1037962819
Extracting all the files now...
Done!


In [17]:
# Extracting the word vectors:

word_embeddings = {}
f = open('glove.6B.100d.txt', encoding='utf-8')
for line in f:
    values = line.split()
    word = values[0]
    coefs = np.asarray(values[1:], dtype='float32')
    word_embeddings[word] = coefs
f.close()

In [18]:
sentence_vectors = []
for i in clean_sentences:
  if len(i) != 0:
    v = sum([word_embeddings.get(w, np.zeros((100,))) for w in i.split()])/(len(i.split())+0.001)
  else:
    v = np.zeros((100,))
  sentence_vectors.append(v)

In [19]:
len(sentence_vectors)

102

In [20]:
# Similarity matrix:

sim_mat = np.zeros([len(sentences), len(sentences)])

In [21]:
from sklearn.metrics.pairwise import cosine_similarity

In [22]:
for i in range(len(sentences)):
  for j in range(len(sentences)):
    if i != j:
      sim_mat[i][j] = cosine_similarity(sentence_vectors[i].reshape(1,100), sentence_vectors[j].reshape(1,100))[0,0]

In [23]:
import networkx as nx

nx_graph = nx.from_numpy_array(sim_mat)
scores = nx.pagerank(nx_graph)

In [24]:
ranked_sentences = sorted(((scores[i],s) for i,s in enumerate(sentences)), reverse=True)

In [28]:
# Specifying the number of sentences to form the summary:

sn = 20

# Generating the summary report of the articles:

for i in range(sn):
  print(ranked_sentences[i][1])

Speaking at the Swiss Indoors tournament where he will play in Sundays final against Romanian qualifier Marius Copil, the world number three said that given the impossibly short time frame to make a decision, he opted out of any commitment.
Currently in ninth place, Nishikori with a win could move to within 125 points of the cut for the eight-man event in London next month.
Major players feel that a big event in late November combined with one in January before the Australian Open will mean too much tennis and too little rest.
He used his first break point to close out the first set before going up 3-0 in the second and wrapping up the win on his first match point.
The Spaniard broke Anderson twice in the second but didn't get another chance on the South African's serve in the final set.
Davenport enjoyed most of her success in the late 1990s and her third and final major tournament win came at the 2000 Australian Open.
"I felt like the best weeks that I had to get to know players when