In [1]:
import pandas as pd
import numpy as np

import os, json

In [2]:
import spacy
nlp = spacy.load('en_core_web_sm')

  from .autonotebook import tqdm as notebook_tqdm


## Read rss data

In [3]:
# Read Json files
path_to_json = 'rssData'
json_files = [pos_json for pos_json in os.listdir(path_to_json) if pos_json.endswith('.json')]
#print(json_files)

In [4]:
# Go through Json files 
base_dir = 'rssData'

#Get all files in the directory

data_list = []
for file in os.listdir(base_dir):

    #If file is a json, construct it's full path and open it, append all json data to list
    if 'json' in file:
        json_path = os.path.join(base_dir, file)
        json_data = pd.read_json(json_path, lines=True)
        data_list.append(json_data)

#print(data_list)

In [5]:
len(data_list)

198

In [6]:
data_list[0].text

0    
Name: text, dtype: object

In [7]:
data_list[0].text.values

array([''], dtype=object)

In [8]:
# Get text data and remove empty texts
all_text=[]
for i in range(len(data_list)):
    if (data_list[i].text.values!=''):
        text=list(data_list[i].text)
#         print(type(text))
        all_text.append(text)
        #print(data_list[i].text)
        #print(text)
#print(all_text)

In [9]:
len(all_text)

147

In [10]:
all_text_df= pd.DataFrame(all_text, columns=['text'])
all_text_df.head()

Unnamed: 0,text
0,"The 911 service as it existed until July 28, 2..."
1,"DanielMiessler Created/Updated: July 25, 2022 ..."
2,The 911 service as it exists today.For the pas...
3,DanielMiesslerMy first thought on the whole di...
4,"DanielMiesslerWell, our congressional heroes f..."


In [58]:
data_list[59].text.values

array(['The U.S. Securities and Exchange Commission (SEC) is taking steps to crack down on insufficient cyber risk reporting.Related: Making third-party risk audits actionableSeeking to minimize cybersecurity threat effects, the SEC has proposed several amendments requiring organizations to report on cyber risk in a â\x80\x9cfast, comparable, and decision-useful manner.â\x80\x9dWorryingly, threats are beginning to outpace organizations’ ability to effectively prevent and respond to them. Leaders are no longer as confident in their organizationâ\x80\x99s cyber resilience, and employees often lack awareness.The SEC, in essence, is compelling businesses, public companies and large investment firms to better prepare for inevitable cyber attacks. The new rules urge companies to build more robust cyber risk management programs.This should provide better visibility into the impact of cyber risk and demonstrate the adequacy of risk mitigation investments.Many organizations base their risk miti

In [12]:
all_text_df['text'][38]

'The U.S. Securities and Exchange Commission (SEC) is taking steps to crack down on insufficient cyber risk reporting.Related: Making third-party risk audits actionableSeeking to minimize cybersecurity threat effects, the SEC has proposed several amendments requiring organizations to report on cyber risk in a â\x80\x9cfast, comparable, and decision-useful manner.â\x80\x9dWorryingly, threats are beginning to outpace organizations’ ability to effectively prevent and respond to them. Leaders are no longer as confident in their organizationâ\x80\x99s cyber resilience, and employees often lack awareness.The SEC, in essence, is compelling businesses, public companies and large investment firms to better prepare for inevitable cyber attacks. The new rules urge companies to build more robust cyber risk management programs.This should provide better visibility into the impact of cyber risk and demonstrate the adequacy of risk mitigation investments.Many organizations base their risk mitigation 

In [13]:
# function to clean data
#lower case
#remove stop words
#lemmatization

def cleanData(doc):
    doc = doc.lower()
    doc = nlp(doc)
    tokens = [tokens.lower_ for tokens in doc]
    tokens = [tokens for tokens in doc if (tokens.is_stop == False)]
    tokens = [tokens for tokens in tokens if (tokens.is_punct == False)]
    final_token = [token.lemma_ for token in tokens]
    
    return " ".join(final_token)

In [14]:
all_text_df['clean'] = all_text_df.apply(lambda row:cleanData (row['text']),axis=1)

In [15]:
all_text_df.head()

Unnamed: 0,text,clean
0,"The 911 service as it existed until July 28, 2...",911 service exist july 28 2022.911[.]re proxy ...
1,"DanielMiessler Created/Updated: July 25, 2022 ...",danielmiessler create update july 25 2022 read...
2,The 911 service as it exists today.For the pas...,911 service exist today.for past seven year on...
3,DanielMiesslerMy first thought on the whole di...,danielmiesslermy think discussion sure musk ar...
4,"DanielMiesslerWell, our congressional heroes f...",danielmiesslerwell congressional hero finally ...


## Implement NLP techniques

https://kavita-ganesan.com/tfidftransformer-tfidfvectorizer-usage-differences/#.Y-Zq4XbMI2w

### Calculate IDF

In [59]:
import pandas as pd 
from sklearn.feature_extraction.text import TfidfTransformer 
from sklearn.feature_extraction.text import CountVectorizer

In [60]:
# CountVectorizer() 
cv=CountVectorizer() 
# this steps generates word counts for the words in all docs 
word_count_vector=cv.fit_transform(all_text_df['clean'])

In [61]:
word_count_vector.shape

(147, 8382)

In [62]:
# Compute idf weights
tfidf_transformer=TfidfTransformer(smooth_idf=True,use_idf=True) 
all_text_df_tfidf = tfidf_transformer.fit_transform(word_count_vector)
all_text_df_tfidf.shape

(147, 8382)

In [63]:
# print idf values 
df_idf = pd.DataFrame(tfidf_transformer.idf_, index=cv.get_feature_names_out(),columns=["idf_weights"]) 
# sort ascending 
df_idf.sort_values(by=['idf_weights'],ascending=False).head()

Unnamed: 0,idf_weights
earli,5.304065
resolved,5.304065
george,5.304065
geopolitical,5.304065
geometry,5.304065


<font color=green> The higher idf_weights means most unique word in any particular doc</font>

### Calculate TF-IDF

In [64]:
# count matrix 
count_vector=cv.transform(all_text_df['clean']) 
# tf-idf scores 
tf_idf_vector=tfidf_transformer.transform(count_vector)

In [65]:
feature_names = cv.get_feature_names_out() 
#get tfidf vector for first document 
first_document_vector=tf_idf_vector[38] 
#print the scores 
df = pd.DataFrame(first_document_vector.T.todense(), index=feature_names, columns=["tfidf"]) 
df=df.sort_values(by=["tfidf"],ascending=False)

In [66]:
df.head(10)

Unnamed: 0,tfidf
cyber,0.518997
risk,0.484299
organization,0.218721
sec,0.207151
fair,0.161841
amendment,0.144542
loss,0.134551
board,0.129469
cybersecurity,0.111593
assess,0.103575


<font color=green>the more unique a word is to our first document, the higher the score

In [67]:
def POS(clean):
    clean = nlp(clean)
    for token in clean:
        return(token.pos_) 

# Word Extraction Based on TF-IDF

###  doc #38

### Title : 'GUEST ESSAY: New SEC rules aim to help C-levels, board members quantify cyber risks'

In [57]:
# data_list[59].header_title.values

In [69]:
# Most frequent words
df = df.reset_index(names=['word', 'tfidf'])
df['pos'] = df.apply(lambda row:POS (row['word']),axis=1)

In [70]:
df1 = df[(df['word'].str.len()) > 3]

In [71]:
print('Most Frequent Words Based on TF-IDF:')
df1.head(10)

Most Frequent Words Based on TF-IDF:


Unnamed: 0,word,tfidf,pos
0,cyber,0.518997,NOUN
1,risk,0.484299,NOUN
2,organization,0.218721,NOUN
4,fair,0.161841,ADJ
5,amendment,0.144542,NOUN
6,loss,0.134551,NOUN
7,board,0.129469,NOUN
8,cybersecurity,0.111593,NOUN
9,assess,0.103575,NOUN
10,exposure,0.103575,NOUN


In [72]:
print('Most frequent words based on NOUN and VERB')
df1[(df1['pos'] == 'NOUN') | (df1['pos'] == 'VERB')].head(10)[['word','tfidf']]

Most frequent words based on NOUN and VERB


Unnamed: 0,word,tfidf
0,cyber,0.518997
1,risk,0.484299
2,organization,0.218721
5,amendment,0.144542
6,loss,0.134551
7,board,0.129469
8,cybersecurity,0.111593
9,assess,0.103575
10,exposure,0.103575
11,report,0.103303


In [73]:
print('Most frequent words Based on NOUN')
df1[df1['pos']=='NOUN'].head(10)[['word','tfidf']]

Most frequent words Based on NOUN


Unnamed: 0,word,tfidf
0,cyber,0.518997
1,risk,0.484299
2,organization,0.218721
5,amendment,0.144542
6,loss,0.134551
7,board,0.129469
8,cybersecurity,0.111593
9,assess,0.103575
10,exposure,0.103575
11,report,0.103303


In [74]:
print('Most frequent words Based on VERB')
df1[df1['pos']=='VERB'].head(10)[['word','tfidf']]

Most frequent words Based on VERB


Unnamed: 0,word,tfidf
12,propose,0.093904
20,reporting,0.06541
21,demonstrate,0.06541
23,disclose,0.063723
31,determine,0.060765
34,quantify,0.057817
35,require,0.057641
47,maintain,0.047816
49,analyze,0.04624
59,understand,0.033776


### Named Entity Recognition (NER)

In [216]:
#function to NER 
# Iterate over the predicted entities
def NER(clean):
    clean = nlp(clean)
    for ent in clean.ents:
        return(ent.label_)   

In [217]:
NER(df1['word'][2])

'NORP'

In [218]:
df1['ner_tag'] = df1.apply(lambda row:NER (row['word']),axis=1)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df1['ner_tag'] = df1.apply(lambda row:NER (row['word']),axis=1)


In [219]:
df1.head(10)

Unnamed: 0,word,tfidf,pos,ner_tag
0,ashley,0.361024,PROPN,
1,madison,0.345327,PROPN,PERSON
2,nazi,0.275439,ADV,NORP
4,biderman,0.172664,NOUN,
5,hack,0.147875,VERB,
6,profile,0.146008,NOUN,
7,team,0.122921,NOUN,
8,2015,0.119916,NUM,DATE
9,impact,0.116995,NOUN,
11,user,0.103592,NOUN,


In [220]:
df1[(df1['pos'] == 'NOUN') | (df1['pos'] == 'VERB')].head(10)[['word','tfidf','pos','ner_tag']]

Unnamed: 0,word,tfidf,pos,ner_tag
4,biderman,0.172664,NOUN,
5,hack,0.147875,VERB,
6,profile,0.146008,NOUN,
7,team,0.122921,NOUN,
9,impact,0.116995,NOUN,
11,user,0.103592,NOUN,
15,company,0.089976,NOUN,
16,leak,0.08747,NOUN,
17,promise,0.083925,VERB,
20,adultery,0.078483,NOUN,
