In [1]:
import pandas as pd
import numpy as np
import spacy
import os
from collections import OrderedDict
from spacy.lang.en.stop_words import STOP_WORDS

  from .autonotebook import tqdm as notebook_tqdm


## Read rss data

In [2]:
# Read Json files
path_to_json = 'rssData'
json_files = [pos_json for pos_json in os.listdir(path_to_json) if pos_json.endswith('.json')]
#print(json_files)

In [3]:
# Go through Json files 
base_dir = 'rssData'

#Get all files in the directory

data_list = []
for file in os.listdir(base_dir):

    #If file is a json, construct it's full path and open it, append all json data to list
    if 'json' in file:
        json_path = os.path.join(base_dir, file)
        json_data = pd.read_json(json_path, lines=True)
        data_list.append(json_data)

#print(data_list)

In [4]:
len(data_list)

198

In [5]:
data_list[0].text

0    
Name: text, dtype: object

In [6]:
data_list[0].text.values

array([''], dtype=object)

In [7]:
# Get text data and remove empty texts
all_text=[]
for i in range(len(data_list)):
    if (data_list[i].text.values!=''):
        text=list(data_list[i].text)
#         print(type(text))
        all_text.append(text)
        #print(data_list[i].text)
        #print(text)
#print(all_text)

In [8]:
len(all_text)

147

In [9]:
all_text_df= pd.DataFrame(all_text, columns=['text'])
all_text_df.head()

Unnamed: 0,text
0,"The 911 service as it existed until July 28, 2..."
1,"DanielMiessler Created/Updated: July 25, 2022 ..."
2,The 911 service as it exists today.For the pas...
3,DanielMiesslerMy first thought on the whole di...
4,"DanielMiesslerWell, our congressional heroes f..."


In [12]:
nlp = spacy.load('en_core_web_sm')

In [13]:
# function to clean data
#lower case
#remove stop words
#lemmatization

def cleanData(doc):
    doc = doc.lower()
    doc = nlp(doc)
    tokens = [tokens.lower_ for tokens in doc]
    tokens = [tokens for tokens in doc if (tokens.is_stop == False)]
    tokens = [tokens for tokens in tokens if (tokens.is_punct == False)]
    final_token = [token.lemma_ for token in tokens]
    
    return " ".join(final_token)

In [14]:
all_text_df['clean'] = all_text_df.apply(lambda row:cleanData (row['text']),axis=1)

In [15]:
all_text_df.head()

Unnamed: 0,text,clean
0,"The 911 service as it existed until July 28, 2...",911 service exist july 28 2022.911[.]re proxy ...
1,"DanielMiessler Created/Updated: July 25, 2022 ...",danielmiessler create update july 25 2022 read...
2,The 911 service as it exists today.For the pas...,911 service exist today.for past seven year on...
3,DanielMiesslerMy first thought on the whole di...,danielmiesslermy think discussion sure musk ar...
4,"DanielMiesslerWell, our congressional heroes f...",danielmiesslerwell congressional hero finally ...


## Using TextRank4zh package to find most frequent words

https://towardsdatascience.com/textrank-for-keyword-extraction-by-python-c0bae21bcec0

In [16]:
class TextRank4Keyword():
    """Extract keywords from text"""
    
    def __init__(self):
        self.d = 0.85 # damping coefficient, usually is .85
        self.min_diff = 1e-5 # convergence threshold
        self.steps = 10 # iteration steps
        self.node_weight = None # save keywords and its weight

    
    def set_stopwords(self, stopwords):  
        """Set stop words"""
        for word in STOP_WORDS.union(set(stopwords)):
            lexeme = nlp.vocab[word]
            lexeme.is_stop = True
    
    def sentence_segment(self, doc, candidate_pos, lower):
        """Store those words only in cadidate_pos"""
        sentences = []
        for sent in doc.sents:
            selected_words = []
            for token in sent:
                # Store words only with cadidate POS tag
                if token.pos_ in candidate_pos and token.is_stop is False:
                    if lower is True:
                        selected_words.append(token.text.lower())
                    else:
                        selected_words.append(token.text)
            sentences.append(selected_words)
        return sentences
        
    def get_vocab(self, sentences):
        """Get all tokens"""
        vocab = OrderedDict()
        i = 0
        for sentence in sentences:
            for word in sentence:
                if word not in vocab:
                    vocab[word] = i
                    i += 1
        return vocab
    
    def get_token_pairs(self, window_size, sentences):
        """Build token_pairs from windows in sentences"""
        token_pairs = list()
        for sentence in sentences:
            for i, word in enumerate(sentence):
                for j in range(i+1, i+window_size):
                    if j >= len(sentence):
                        break
                    pair = (word, sentence[j])
                    if pair not in token_pairs:
                        token_pairs.append(pair)
        return token_pairs
        
    def symmetrize(self, a):
        return a + a.T - np.diag(a.diagonal())
    
    def get_matrix(self, vocab, token_pairs):
        """Get normalized matrix"""
        # Build matrix
        vocab_size = len(vocab)
        g = np.zeros((vocab_size, vocab_size), dtype='float')
        for word1, word2 in token_pairs:
            i, j = vocab[word1], vocab[word2]
            g[i][j] = 1
            
        # Get Symmeric matrix
        g = self.symmetrize(g)
        
        # Normalize matrix by column
        norm = np.sum(g, axis=0)
        g_norm = np.divide(g, norm, where=norm!=0) # this is ignore the 0 element in norm
        
        return g_norm

    
    def get_keywords(self, number=10):
        """Print top number keywords"""
        node_weight = OrderedDict(sorted(self.node_weight.items(), key=lambda t: t[1], reverse=True))
        for i, (key, value) in enumerate(node_weight.items()):
            print(key + ' - ' + str(value))
            if i > number:
                break
        
        
    def analyze(self, text, 
                candidate_pos=['NOUN', 'VERB','ADJ', 'PROPN'], 
                window_size=4, lower=False, stopwords=list()):
        """Main function to analyze text"""
        
        # Set stop words
        self.set_stopwords(stopwords)
        
        # Pare text by spaCy
        doc = nlp(text)
        
        # Filter sentences
        sentences = self.sentence_segment(doc, candidate_pos, lower) # list of list of words
        
        # Build vocabulary
        vocab = self.get_vocab(sentences)
        
        # Get token_pairs from windows
        token_pairs = self.get_token_pairs(window_size, sentences)
        
        # Get normalized matrix
        g = self.get_matrix(vocab, token_pairs)
        
        # Initionlization for weight(pagerank value)
        pr = np.array([1] * len(vocab))
        
        # Iteration
        previous_pr = 0
        for epoch in range(self.steps):
            pr = (1-self.d) + self.d * np.dot(g, pr)
            if abs(previous_pr - sum(pr))  < self.min_diff:
                break
            else:
                previous_pr = sum(pr)

        # Get weight for each node
        node_weight = dict()
        for word, index in vocab.items():
            node_weight[word] = pr[index]
        
        self.node_weight = node_weight

In [17]:
text = all_text_df['clean'][0]

In [18]:
text

'911 service exist july 28 2022.911[.]re proxy service 2015 sell access hundred thousand microsoft windows computer daily announce week shut wake data breach destroy key component business operation abrupt closure come day krebsonsecurity publish depth look 911 connection shady pay install affiliate program secretly bundle 911 proxy software title include free utility pirate software.911[.]re original residential proxy network allow rent residential ip address use relay internet communication provide anonymity advantage perceive residential user surf web.residential proxy service market people seek ability evade country specific blocking major movie medium streaming provider â\x80\x94 like 911 â\x80\x94 build network offer â\x80\x9cfree vpnâ\x80\x9d â\x80\x9cfree proxyâ\x80\x9d service power software turn userâ\x80\x99s pc traffic relay user scenario user use free vpn service unaware turn computer proxy let use internet address transact online.from websiteâ\x80\x99s perspective ip traf

# Kyword Extraction with TextRank4zh Package

### List of Highest Rank Words

In [19]:
tr4w = TextRank4Keyword()
tr4w.analyze(text,window_size=4, lower=False)
tr4w.get_keywords(10)

service - 12.568874369575667
proxy - 7.774293133718644
user - 4.582701853755747
july - 4.156244340410796
datum - 3.8030717986780562
customer - 3.5205355391649467
use - 3.418926608367013
new - 3.2606475059374556
computer - 3.2460013267892367
address - 3.238942695241574
know - 2.727081662734016
ip - 2.695437848149397


### List of  Highest Rank for Verbs and Nouns 

In [20]:
tr4w = TextRank4Keyword()
tr4w.analyze(text, candidate_pos=['NOUN', 'VERB'],window_size=4, lower=False)
tr4w.get_keywords(10)

service - 10.834133659781616
proxy - 6.59074414614043
user - 4.4146032062204705
customer - 3.4240055168016377
use - 3.220025705926591
address - 3.1488852473126223
know - 2.5654090116867048
ip - 2.4780868894906845
network - 2.4721863917954887
computer - 2.467033602628897
datum - 2.4632980722888553
post - 2.2517028185806245


### List of  Verbs  Based on Highest Rank

In [21]:
tr4w = TextRank4Keyword()
tr4w.analyze(text, candidate_pos=['VERB'],window_size=4, lower=False)
tr4w.get_keywords(10)

use - 2.5346879217859684
shut - 2.2684992543625353
come - 2.1427993742850515
continue - 1.9514885004559481
allow - 1.8398685206094059
operate - 1.627417509766468
add - 1.6120277002125958
pay - 1.51246369595393
include - 1.4798183123637552
bundle - 1.4081900028260708
appear - 1.3698190288299665
exist - 1.2694248386644218


### List of  Nouns  Based on Highest Rank

In [22]:
tr4w = TextRank4Keyword()
tr4w.analyze(text, candidate_pos=['NOUN'],window_size=4, lower=False)
tr4w.get_keywords(10)

service - 10.80839185013926
proxy - 6.277617883516081
user - 4.066233068380505
customer - 3.5126879755290377
address - 3.442055613108046
computer - 2.4577695350030835
network - 2.4076994187182663
datum - 2.3898178155921483
email - 2.3468984399696864
post - 2.254806751522199
use - 2.167327136755958
install - 2.1544345238068563


###  we can see the weight for each node(word) and the most important words can be used as keywords