### Text summarization with TextRank

In [1]:
import numpy as np
import pandas as pd
import nltk
nltk.download('punkt') 
import re

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\maidu\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [2]:
df = pd.read_csv('../../datasets/classifying-20-newsgroups/train.csv')
df.info()
print(df.shape)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 11314 entries, 0 to 11313
Data columns (total 3 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   id       11314 non-null  int64 
 1   message  11314 non-null  object
 2   topic    11314 non-null  int64 
dtypes: int64(2), object(1)
memory usage: 265.3+ KB
(11314, 3)


In [3]:
df.head()

Unnamed: 0,id,message,topic
0,0,From: lerxst@wam.umd.edu (where's my thing)\r\...,7
1,1,From: guykuo@carson.u.washington.edu (Guy Kuo)...,4
2,2,From: twillis@ec.ecn.purdue.edu (Thomas E Will...,4
3,3,From: jgreen@amber (Joe Green)\r\nSubject: Re:...,1
4,4,From: jcm@head-cfa.harvard.edu (Jonathan McDow...,14


In [5]:
df_space = df[df['topic']==14]

In [6]:
df_space.head()

Unnamed: 0,id,message,topic
4,4,From: jcm@head-cfa.harvard.edu (Jonathan McDow...,14
13,13,From: dbm0000@tm0006.lerc.nasa.gov (David B. M...,14
49,49,From: 18084TM@msu.edu (Tom)\r\nSubject: Golden...,14
59,59,From: leech@cs.unc.edu (Jon Leech)\r\nSubject:...,14
119,119,From: jbreed@doink.b23b.ingr.com (James B. Ree...,14


In [7]:
from nltk.tokenize import sent_tokenize
sentences = []
for s in df_space['message']:
  sentences.append(sent_tokenize(s))

sentences = [y for x in sentences for y in x] # list of sentences grabbed from df['article_text']


In [8]:
sentences[:4]

 'Verify no unexpected\r\n>>>errors.',
 '...".',
 'I am wondering what an "expected error" might\r\n>>>be.']

In [11]:
# remove punctuations, numbers and special characters
cleaned_sentences = pd.Series(sentences).str.replace("[^a-zA-Z]", " ")

# turning everything to lowercase
cleaned_sentences = [s.lower() for s in cleaned_sentences]

  cleaned_sentences = pd.Series(sentences).str.replace("[^a-zA-Z]", " ")


In [13]:
cleaned_sentences[1000]

' cosmos       satellite launched for defense ministry  moscow itar tass world service in russian       gmt    march       translated in fbis sov         p     by itar tass correspondent veronika romanenkova       moscow     march    the cosmos      satellite was launched at      moscow   time today from the baykonur by a  tsiklon m  carrier rocket '

#### Removing stopwords

In [16]:
from nltk.corpus import stopwords
nltk.download('stopwords')
cleaned_sentences = [word for word in cleaned_sentences if word not in stopwords.words('english')]

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\maidu\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [17]:
cleaned_sentences[:4]

 'verify no unexpected     errors ',
 '     ',
 'i am wondering what an  expected error  might     be ']

#### GloVe Word Embeddings

In [18]:
# Extract word vectors
word_embeddings = {}
f = open('../glove.6B.100d.txt', encoding='utf-8')
for line in f:
    values = line.split()
    word = values[0]
    coefs = np.asarray(values[1:], dtype='float32')
    word_embeddings[word] = coefs
f.close()

In [19]:
word_embeddings['apple']

array([-0.5985   , -0.46321  ,  0.13001  , -0.019576 ,  0.4603   ,
       -0.3018   ,  0.8977   , -0.65634  ,  0.66858  , -0.49164  ,
        0.037557 , -0.050889 ,  0.6451   , -0.53882  , -0.3765   ,
       -0.04312  ,  0.51384  ,  0.17783  ,  0.28596  ,  0.92063  ,
       -0.49349  , -0.48583  ,  0.61321  ,  0.78211  ,  0.19254  ,
        0.91228  , -0.055596 , -0.12512  , -0.65688  ,  0.068557 ,
        0.55629  ,  1.611    , -0.0073642, -0.48879  ,  0.45493  ,
        0.96105  , -0.063369 ,  0.17432  ,  0.9814   , -1.3125   ,
       -0.15801  , -0.54301  , -0.13888  , -0.26146  , -0.3691   ,
        0.26844  , -0.24375  , -0.19484  ,  0.62583  , -0.7377   ,
        0.38351  , -0.75004  , -0.39053  ,  0.091498 , -0.36591  ,
       -1.4715   , -0.45228  ,  0.2256   ,  1.1412   , -0.38526  ,
       -0.06716  ,  0.57288  , -0.39191  ,  0.31302  , -0.29235  ,
       -0.96157  ,  0.15154  , -0.21659  ,  0.25103  ,  0.096967 ,
        0.2843   ,  1.4296   , -0.50565  , -0.51374  , -0.4721

In [38]:
sentence_vectors = []
for i in cleaned_sentences:
  if len(i) != 0:
    v = sum([word_embeddings.get(w, np.zeros((100,))) for w in i.split()])/(len(i.split())+0.001)
  else:
    v = np.zeros((100,))
  sentence_vectors.append(v)

#### Similarity Matrix 

In [39]:
from sklearn.metrics.pairwise import cosine_similarity

# measuring similarity between a pair of sentence
similarity_matrix  = np.zeros([len(sentences), len(sentences)],dtype='float')

for i in range(len(sentences)):
  for j in range(len(sentences)):
    if i != j:
      similarity_matrix[i][j] = cosine_similarity(sentence_vectors[i].reshape(1,100), sentence_vectors[j].reshape(1,100))[0,0]

AttributeError: 'float' object has no attribute 'reshape'

#### Converting to graph

In [84]:
import networkx as nx
nx_graph = nx.from_numpy_array(similarity_matrix)
scores = nx.pagerank(nx_graph)

In [85]:
ranked_sentences = sorted(((scores[i],s) for i,s in enumerate(sentences)), reverse=True)
for i in range(10):
  print(ranked_sentences[i][1])

�I was on a nice trajectorythen,� Reid recalled.�If I hadn�t got sick, I think I could have started pushing towards the second week at the slams and then who knows.� Duringa comeback attempt some five years later, Reid added Bernard Tomic and 2018 US Open Federer slayer John Millman to his list of career scalps.
So I'm not the one to strike up a conversation about the weather and know that in the next few minutes I have to go and try to win a tennis match.
�Now they won�t even give me tickets (to the Open).� A Davis Cup pick and on the cusp of the world�s 100 by 18, Reid had few other complaints - just a lingering sense of what might have been.
Speaking at the Swiss Indoors tournament where he will play in Sunday�s final against Romanian qualifier Marius Copil, the world number three said that given the impossibly short time frame to make a decision, he opted out of any commitment.
�I just felt like it really kind of changed where people were a little bit, definitely in the '90s, a lot

In [87]:
df['article_text'][7]

'I PLAYED golf last week with Todd Reid. He picked me up at 5.30am, half an hour early because he couldn�t sleep. Or hadn�t slept, to be specific. Not because he�d been out on a bender or anything � those days were in the past. The former Wimbledon junior champion was full of hope, excited about getting his life back together after a troubled few years and a touch-and-go battle with pancreatitis. �I�m pleased with that,� he said after grinding out an eight-over-par front nine at the not-so-royal Northbridge club in Sydney and smashing down an egg- and-bacon roll at the halfway house. To most players of his rare sporting gifts, such a modest return would be unacceptable. To Reid the 15-marker, just being up and at �em was enough; a few bogeys and one well-made par � broomstick putter and all � vindication for his recent decision to renew his membership at nearby Bankstown. Exhausted after spending half his round deep in the bushes searching for my ball, as well as those of two other gol