### Text summarization with TextRank

In [70]:
import numpy as np
import pandas as pd
import nltk
nltk.download('punkt') 
import re

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\maidu\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [71]:
df = pd.read_csv('../../datasets/tennis/tennis_articles.csv')
df.info()
print(df.shape)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8 entries, 0 to 7
Data columns (total 4 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   article_id     8 non-null      int64 
 1   article_title  8 non-null      object
 2   article_text   8 non-null      object
 3   source         8 non-null      object
dtypes: int64(1), object(3)
memory usage: 384.0+ bytes
(8, 4)


In [72]:
df.head()

Unnamed: 0,article_id,article_title,article_text,source
0,1,"I do not have friends in�tennis, says Maria Sh...",Maria Sharapova has basically no friends as te...,https://www.tennisworldusa.org/tennis/news/Mar...
1,2,Federer defeats Medvedev to advance to 14th Sw...,"BASEL, Switzerland (AP) � Roger Federer advanc...",http://www.tennis.com/pro-game/2018/10/copil-s...
2,3,Tennis: Roger Federer ignored deadline set by ...,Roger Federer has revealed that organisers of ...,https://scroll.in/field/899938/tennis-roger-fe...
3,4,Nishikori to face off against Anderson in Vien...,Kei Nishikori will try to end his long losing ...,http://www.tennis.com/pro-game/2018/10/nishiko...
4,5,Roger Federer has made this huge change to ten...,"Federer, 37, first broke through on tour over ...",https://www.express.co.uk/sport/tennis/1036101...


In [73]:
from nltk.tokenize import sent_tokenize
sentences = []
for s in df['article_text']:
  sentences.append(sent_tokenize(s))

sentences = [y for x in sentences for y in x] # list of sentences grabbed from df['article_text']


In [74]:
sentences[:4]

['Maria Sharapova has basically no friends as tennis players on the WTA Tour.',
 "The Russian player has no problems in openly speaking about it and in a recent interview she said: 'I don't really hide any feelings too much.",
 'I think everyone knows this is my job here.',
 "When I'm on the courts or when I'm on the court playing, I'm a competitor and I want to beat every single person whether they're in the locker room or across the net."]

In [75]:
# remove punctuations, numbers and special characters
cleaned_sentences = pd.Series(sentences).str.replace("[^a-zA-Z]", " ")

# turning everything to lowercase
cleaned_sentences = [s.lower() for s in clean_sentences]

  cleaned_sentences = pd.Series(sentences).str.replace("[^a-zA-Z]", " ")


In [76]:
cleaned_sentences[:4]

['maria sharapova has basically no friends as tennis players on the wta tour ',
 'the russian player has no problems in openly speaking about it and in a recent interview she said   i don t really hide any feelings too much ',
 'i think everyone knows this is my job here ',
 'when i m on the courts or when i m on the court playing  i m a competitor and i want to beat every single person whether they re in the locker room or across the net ']

In [77]:
from nltk.corpus import stopwords
nltk.download('stopwords')
cleaned_sentences = [word for word in cleaned_sentences if word not in stopwords.words('english')]

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\maidu\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [78]:
cleaned_sentences[:4]

['maria sharapova has basically no friends as tennis players on the wta tour ',
 'the russian player has no problems in openly speaking about it and in a recent interview she said   i don t really hide any feelings too much ',
 'i think everyone knows this is my job here ',
 'when i m on the courts or when i m on the court playing  i m a competitor and i want to beat every single person whether they re in the locker room or across the net ']

#### GloVe Word Embeddings

In [79]:
# Extract word vectors
word_embeddings = {}
f = open('../glove.6B.100d.txt', encoding='utf-8')
for line in f:
    values = line.split()
    word = values[0]
    coefs = np.asarray(values[1:], dtype='float32')
    word_embeddings[word] = coefs
f.close()

In [80]:
word_embeddings['apple']

array([-0.5985   , -0.46321  ,  0.13001  , -0.019576 ,  0.4603   ,
       -0.3018   ,  0.8977   , -0.65634  ,  0.66858  , -0.49164  ,
        0.037557 , -0.050889 ,  0.6451   , -0.53882  , -0.3765   ,
       -0.04312  ,  0.51384  ,  0.17783  ,  0.28596  ,  0.92063  ,
       -0.49349  , -0.48583  ,  0.61321  ,  0.78211  ,  0.19254  ,
        0.91228  , -0.055596 , -0.12512  , -0.65688  ,  0.068557 ,
        0.55629  ,  1.611    , -0.0073642, -0.48879  ,  0.45493  ,
        0.96105  , -0.063369 ,  0.17432  ,  0.9814   , -1.3125   ,
       -0.15801  , -0.54301  , -0.13888  , -0.26146  , -0.3691   ,
        0.26844  , -0.24375  , -0.19484  ,  0.62583  , -0.7377   ,
        0.38351  , -0.75004  , -0.39053  ,  0.091498 , -0.36591  ,
       -1.4715   , -0.45228  ,  0.2256   ,  1.1412   , -0.38526  ,
       -0.06716  ,  0.57288  , -0.39191  ,  0.31302  , -0.29235  ,
       -0.96157  ,  0.15154  , -0.21659  ,  0.25103  ,  0.096967 ,
        0.2843   ,  1.4296   , -0.50565  , -0.51374  , -0.4721

In [81]:
sentence_vectors = []
for i in cleaned_sentences:
  if len(i) != 0:
    v = sum([word_embeddings.get(w, np.zeros((100,))) for w in i.split()])/(len(i.split())+0.001)
  else:
    v = np.zeros((100,))
  sentence_vectors.append(v)

In [82]:
sentence_vectors[0]

array([-0.00708815,  0.09256166,  0.63329273, -0.02311481,  0.01546605,
        0.1853495 ,  0.09308354,  0.42582178, -0.24139106, -0.08799385,
        0.24989085,  0.05680139,  0.1477317 ,  0.19214791, -0.11052089,
       -0.13337198,  0.20583408,  0.11695171, -0.28442192,  0.33139458,
        0.12599568,  0.1967534 ,  0.15575756,  0.5180779 ,  0.30478284,
       -0.12746389,  0.04639504, -0.7776317 ,  0.37259623,  0.02550103,
       -0.28453514,  0.3427335 ,  0.09285065,  0.07136673,  0.24223588,
        0.153613  , -0.29634136,  0.39717883, -0.23219974, -0.1128767 ,
       -0.32462594, -0.1468936 ,  0.35839075, -0.3026954 ,  0.16459417,
       -0.15502267, -0.01870048, -0.2684455 ,  0.2881759 , -0.50023335,
       -0.15034528, -0.2343842 ,  0.16917214,  0.63483965, -0.13028543,
       -2.1654103 , -0.06345838,  0.08818175,  1.0153687 ,  0.8081116 ,
       -0.23266864,  0.57686883, -0.20425446,  0.23006882,  0.33329207,
       -0.12515107,  0.155755  ,  0.3699625 ,  0.16224106, -0.14

#### Similarity Matrix 

In [83]:
from sklearn.metrics.pairwise import cosine_similarity

# measuring similarity between a pair of sentence
similarity_matrix  = np.zeros([len(sentences), len(sentences)])

for i in range(len(sentences)):
  for j in range(len(sentences)):
    if i != j:
      similarity_matrix[i][j] = cosine_similarity(sentence_vectors[i].reshape(1,100), sentence_vectors[j].reshape(1,100))[0,0]

#### Converting to graph

In [84]:
import networkx as nx
nx_graph = nx.from_numpy_array(similarity_matrix)
scores = nx.pagerank(nx_graph)

In [85]:
ranked_sentences = sorted(((scores[i],s) for i,s in enumerate(sentences)), reverse=True)
for i in range(10):
  print(ranked_sentences[i][1])

�I was on a nice trajectorythen,� Reid recalled.�If I hadn�t got sick, I think I could have started pushing towards the second week at the slams and then who knows.� Duringa comeback attempt some five years later, Reid added Bernard Tomic and 2018 US Open Federer slayer John Millman to his list of career scalps.
So I'm not the one to strike up a conversation about the weather and know that in the next few minutes I have to go and try to win a tennis match.
�Now they won�t even give me tickets (to the Open).� A Davis Cup pick and on the cusp of the world�s 100 by 18, Reid had few other complaints - just a lingering sense of what might have been.
Speaking at the Swiss Indoors tournament where he will play in Sunday�s final against Romanian qualifier Marius Copil, the world number three said that given the impossibly short time frame to make a decision, he opted out of any commitment.
�I just felt like it really kind of changed where people were a little bit, definitely in the '90s, a lot

In [87]:
df['article_text'][7]

'I PLAYED golf last week with Todd Reid. He picked me up at 5.30am, half an hour early because he couldn�t sleep. Or hadn�t slept, to be specific. Not because he�d been out on a bender or anything � those days were in the past. The former Wimbledon junior champion was full of hope, excited about getting his life back together after a troubled few years and a touch-and-go battle with pancreatitis. �I�m pleased with that,� he said after grinding out an eight-over-par front nine at the not-so-royal Northbridge club in Sydney and smashing down an egg- and-bacon roll at the halfway house. To most players of his rare sporting gifts, such a modest return would be unacceptable. To Reid the 15-marker, just being up and at �em was enough; a few bogeys and one well-made par � broomstick putter and all � vindication for his recent decision to renew his membership at nearby Bankstown. Exhausted after spending half his round deep in the bushes searching for my ball, as well as those of two other gol