In [1]:
import pandas as pd
import numpy as np
import spacy
import os
from collections import OrderedDict
from spacy.lang.en.stop_words import STOP_WORDS

## Read rss data

In [2]:
# Read Json files
path_to_json = 'rssData'
json_files = [pos_json for pos_json in os.listdir(path_to_json) if pos_json.endswith('.json')]
#print(json_files)

In [3]:
# Go through Json files 
base_dir = 'rssData'

#Get all files in the directory

data_list = []
for file in os.listdir(base_dir):

    #If file is a json, construct it's full path and open it, append all json data to list
    if 'json' in file:
        json_path = os.path.join(base_dir, file)
        json_data = pd.read_json(json_path, lines=True)
        data_list.append(json_data)

#print(data_list)

In [4]:
len(data_list)

198

In [5]:
data_list[0].text

0    
Name: text, dtype: object

In [6]:
data_list[0].text.values

array([''], dtype=object)

In [7]:
# Get text data and remove empty texts
all_text=[]
for i in range(len(data_list)):
    if (data_list[i].text.values!=''):
        text=list(data_list[i].text)
#         print(type(text))
        all_text.append(text)
        #print(data_list[i].text)
        #print(text)
#print(all_text)

In [8]:
len(all_text)

147

In [9]:
nlp = spacy.load('en_core_web_sm')

## Using TextRank4zh package to find most frequent words

https://towardsdatascience.com/textrank-for-keyword-extraction-by-python-c0bae21bcec0

In [23]:
class TextRank4Keyword():
    """Extract keywords from text"""
    
    def __init__(self):
        self.d = 0.85 # damping coefficient, usually is .85
        self.min_diff = 1e-5 # convergence threshold
        self.steps = 10 # iteration steps
        self.node_weight = None # save keywords and its weight

    
    def set_stopwords(self, stopwords):  
        """Set stop words"""
        for word in STOP_WORDS.union(set(stopwords)):
            lexeme = nlp.vocab[word]
            lexeme.is_stop = True
    
    def sentence_segment(self, doc, candidate_pos, lower):
        """Store those words only in cadidate_pos"""
        sentences = []
        for sent in doc.sents:
            selected_words = []
            for token in sent:
                # Store words only with cadidate POS tag
                if token.pos_ in candidate_pos and token.is_stop is False:
                    if lower is True:
                        selected_words.append(token.text.lower())
                    else:
                        selected_words.append(token.text)
            sentences.append(selected_words)
        return sentences
        
    def get_vocab(self, sentences):
        """Get all tokens"""
        vocab = OrderedDict()
        i = 0
        for sentence in sentences:
            for word in sentence:
                if word not in vocab:
                    vocab[word] = i
                    i += 1
        return vocab
    
    def get_token_pairs(self, window_size, sentences):
        """Build token_pairs from windows in sentences"""
        token_pairs = list()
        for sentence in sentences:
            for i, word in enumerate(sentence):
                for j in range(i+1, i+window_size):
                    if j >= len(sentence):
                        break
                    pair = (word, sentence[j])
                    if pair not in token_pairs:
                        token_pairs.append(pair)
        return token_pairs
        
    def symmetrize(self, a):
        return a + a.T - np.diag(a.diagonal())
    
    def get_matrix(self, vocab, token_pairs):
        """Get normalized matrix"""
        # Build matrix
        vocab_size = len(vocab)
        g = np.zeros((vocab_size, vocab_size), dtype='float')
        for word1, word2 in token_pairs:
            i, j = vocab[word1], vocab[word2]
            g[i][j] = 1
            
        # Get Symmeric matrix
        g = self.symmetrize(g)
        
        # Normalize matrix by column
        norm = np.sum(g, axis=0)
        g_norm = np.divide(g, norm, where=norm!=0) # this is ignore the 0 element in norm
        
        return g_norm

    
    def get_keywords(self, number=10):
        """Print top number keywords"""
        node_weight = OrderedDict(sorted(self.node_weight.items(), key=lambda t: t[1], reverse=True))
        for i, (key, value) in enumerate(node_weight.items()):
            print(key + ' - ' + str(value))
            if i > number:
                break
        
        
    def analyze(self, text, 
                candidate_pos=['NOUN', 'VERB'], 
                window_size=4, lower=False, stopwords=list()):
        """Main function to analyze text"""
        
        # Set stop words
        self.set_stopwords(stopwords)
        
        # Pare text by spaCy
        doc = nlp(text)
        
        # Filter sentences
        sentences = self.sentence_segment(doc, candidate_pos, lower) # list of list of words
        
        # Build vocabulary
        vocab = self.get_vocab(sentences)
        
        # Get token_pairs from windows
        token_pairs = self.get_token_pairs(window_size, sentences)
        
        # Get normalized matrix
        g = self.get_matrix(vocab, token_pairs)
        
        # Initionlization for weight(pagerank value)
        pr = np.array([1] * len(vocab))
        
        # Iteration
        previous_pr = 0
        for epoch in range(self.steps):
            pr = (1-self.d) + self.d * np.dot(g, pr)
            if abs(previous_pr - sum(pr))  < self.min_diff:
                break
            else:
                previous_pr = sum(pr)

        # Get weight for each node
        node_weight = dict()
        for word, index in vocab.items():
            node_weight[word] = pr[index]
        
        self.node_weight = node_weight

In [43]:
text = all_text[5][0]

In [44]:
text

'It’s been seven years since the online cheating site AshleyMadison.com was hacked and highly sensitive data about its users posted online. The leak led to the public shaming and extortion of many Ashley Madison users, and to at least two suicides. To date, little is publicly known about the perpetrators or the true motivation for the attack. But a recent review of Ashley Madison mentions across Russian cybercrime forums and far-right websites in the months leading up to the hack revealed some previously unreported details that may deserve further scrutiny.As first reported by KrebsOnSecurity on July 19, 2015, a group calling itself the “Impact Team” released data sampled from millions of users, as well as maps of internal company servers, employee network account information, company bank details and salary information.The Impact Team said it decided to publish the information because ALM “profits on the pain of others,” and in response to a paid “full delete” service Ashley Madison p

### List of  Words Based on Highest Rank

In [45]:
tr4w = TextRank4Keyword()
tr4w.analyze(text,window_size=4, lower=False)
tr4w.get_keywords(30)

company - 7.589295693480497
site - 6.154209377686638
hack - 5.732488321445461
data - 5.61719372763126
information - 5.102300081302323
users - 4.790585616359165
profiles - 4.733112114488079
posted - 3.900088384979066
found - 3.099169547345169
user - 3.0297810056041614
story - 3.0182559540082154
privacy - 2.8715554831538475
months - 2.754811006353687
emails - 2.705425465007989
people - 2.6722515338094017
search - 2.5433179154044705
time - 2.5118204489695115
CEO - 2.493741225821457
money - 2.479358098664795
thread - 2.469216254894016
men - 2.3995155143761235
members - 2.3950632529393574
employee - 2.381773394845253
websites - 2.3275803163059057
% - 2.3250277292855595
account - 2.320423141322904
bunch - 2.25546039168765
details - 2.2368378653360756
cybercrime - 2.229782066882456
adultery - 2.2083766980735646
leaked - 2.195036261247422
leading - 2.1612581723769546


### List of  Verbs and Nouns  Based on Highest Rank

In [46]:
tr4w = TextRank4Keyword()
tr4w.analyze(text, candidate_pos=['NOUN', 'VERB'],window_size=4, lower=False)
tr4w.get_keywords()

company - 7.589295693480497
site - 6.154209377686638
hack - 5.732488321445461
data - 5.61719372763126
information - 5.102300081302323
users - 4.790585616359165
profiles - 4.733112114488079
posted - 3.900088384979066
found - 3.099169547345169
user - 3.0297810056041614
story - 3.0182559540082154
privacy - 2.8715554831538475


### List of  Verbs  Based on Highest Rank

In [47]:
tr4w = TextRank4Keyword()
tr4w.analyze(text, candidate_pos=['VERB'],window_size=4, lower=False)
tr4w.get_keywords()

posted - 3.0390615349927845
leaked - 2.60465726010101
leading - 2.570586043470419
promised - 2.19403612012987
run - 2.124636574074074
offering - 2.019007490079365
removed - 1.8640373038419913
decided - 1.847482952612863
found - 1.8376690972222223
stolen - 1.7242098124098124
motivated - 1.714590277777778
use - 1.6860680555555554


### List of  Nouns  Based on Highest Rank

In [22]:
tr4w = TextRank4Keyword()
tr4w.analyze(text, candidate_pos=['NOUN'],window_size=4, lower=False)
tr4w.get_keywords()

people - 2.0375932291666663
problem - 1.3868119791666667
center - 1.1656909722222224
views - 1.1656909722222224
platform - 1.1017520833333334
confusion - 1.0
approach - 1.0
state - 1.0
discourse - 1.0
speech - 1.0
mob - 0.8823635416666668
stand - 0.8054798611111111


###  we can see the weight for each node(word) and the most important words can be used as keywords