In [1]:
import pandas as pd
import numpy as np

import os, json

In [2]:
import spacy
nlp = spacy.load('en_core_web_sm')

  from .autonotebook import tqdm as notebook_tqdm


## Read rss data

In [3]:
# Read Json files
path_to_json = 'rssData'
json_files = [pos_json for pos_json in os.listdir(path_to_json) if pos_json.endswith('.json')]
#print(json_files)

In [4]:
# Go through Json files 
base_dir = 'rssData'

#Get all files in the directory

data_list = []
for file in os.listdir(base_dir):

    #If file is a json, construct it's full path and open it, append all json data to list
    if 'json' in file:
        json_path = os.path.join(base_dir, file)
        json_data = pd.read_json(json_path, lines=True)
        data_list.append(json_data)

#print(data_list)

In [5]:
len(data_list)

198

In [6]:
data_list[0].text

0    
Name: text, dtype: object

In [7]:
data_list[0].text.values

array([''], dtype=object)

In [9]:
# Get text data and remove empty texts
all_text=[]
for i in range(len(data_list)):
    if (data_list[i].text.values!=''):
        text=list(data_list[i].text)
#         print(type(text))
        all_text.append(text)
        #print(data_list[i].text)
        #print(text)
#print(all_text)

In [10]:
len(all_text)

147

In [11]:
all_text_df= pd.DataFrame(all_text, columns=['text'])
all_text_df.head()

Unnamed: 0,text
0,"The 911 service as it existed until July 28, 2..."
1,"DanielMiessler Created/Updated: July 25, 2022 ..."
2,The 911 service as it exists today.For the pas...
3,DanielMiesslerMy first thought on the whole di...
4,"DanielMiesslerWell, our congressional heroes f..."


In [54]:
data_list[40].text.values

array(['DanielMiesslerI’m a fan of Scott Galloway. I’ve been following him since a few years before the pandemic. He’s super smart, he’s entertaining,Â\xa0and best of all he’s self-aware and vulnerable while doing so.Â\xa0But lately I’ve been getting a bad vibe. It’s been building up actually, over several months, but his analysis of the whole Elon Musk and Twitter thing has bothered me a lot. He’s basically ranting two things: 1) there’s no way he could actually buy Twitter, and 2) there’s no way he could be trying to do something good in attempting to do so.I don’t know much about the financials of buying multi-billion-dollar companies, so I leave that kind of analysis to people like Scott. It appears that, just like with Tesla stock, he was wrong. But much worse in my account was him being wrong about motivation. If you watch what Elon has actually said about why he might want to buy Twitter, and you actually listen to his answers, one thing becomes clear. He cares about the quality

In [12]:
all_text_df['text'][24]

'DanielMiesslerI’m a fan of Scott Galloway. I’ve been following him since a few years before the pandemic. He’s super smart, he’s entertaining,Â\xa0and best of all he’s self-aware and vulnerable while doing so.Â\xa0But lately I’ve been getting a bad vibe. It’s been building up actually, over several months, but his analysis of the whole Elon Musk and Twitter thing has bothered me a lot. He’s basically ranting two things: 1) there’s no way he could actually buy Twitter, and 2) there’s no way he could be trying to do something good in attempting to do so.I don’t know much about the financials of buying multi-billion-dollar companies, so I leave that kind of analysis to people like Scott. It appears that, just like with Tesla stock, he was wrong. But much worse in my account was him being wrong about motivation. If you watch what Elon has actually said about why he might want to buy Twitter, and you actually listen to his answers, one thing becomes clear. He cares about the quality of con

In [13]:
# function to clean data
#lower case
#remove stop words
#lemmatization

def cleanData(doc):
    doc = doc.lower()
    doc = nlp(doc)
    tokens = [tokens.lower_ for tokens in doc]
    tokens = [tokens for tokens in doc if (tokens.is_stop == False)]
    tokens = [tokens for tokens in tokens if (tokens.is_punct == False)]
    final_token = [token.lemma_ for token in tokens]
    
    return " ".join(final_token)

In [14]:
all_text_df['clean'] = all_text_df.apply(lambda row:cleanData (row['text']),axis=1)

In [15]:
all_text_df.head()

Unnamed: 0,text,clean
0,"The 911 service as it existed until July 28, 2...",911 service exist july 28 2022.911[.]re proxy ...
1,"DanielMiessler Created/Updated: July 25, 2022 ...",danielmiessler create update july 25 2022 read...
2,The 911 service as it exists today.For the pas...,911 service exist today.for past seven year on...
3,DanielMiesslerMy first thought on the whole di...,danielmiesslermy think discussion sure musk ar...
4,"DanielMiesslerWell, our congressional heroes f...",danielmiesslerwell congressional hero finally ...


## Implement NLP techniques

https://kavita-ganesan.com/tfidftransformer-tfidfvectorizer-usage-differences/#.Y-Zq4XbMI2w

### Calculate IDF

In [16]:
import pandas as pd 
from sklearn.feature_extraction.text import TfidfTransformer 
from sklearn.feature_extraction.text import CountVectorizer

In [17]:
# CountVectorizer() 
cv=CountVectorizer() 
# this steps generates word counts for the words in all docs 
word_count_vector=cv.fit_transform(all_text_df['clean'])

In [18]:
word_count_vector.shape

(147, 8382)

In [19]:
# Compute idf weights
tfidf_transformer=TfidfTransformer(smooth_idf=True,use_idf=True) 
all_text_df_tfidf = tfidf_transformer.fit_transform(word_count_vector)
all_text_df_tfidf.shape

(147, 8382)

In [20]:
# print idf values 
df_idf = pd.DataFrame(tfidf_transformer.idf_, index=cv.get_feature_names_out(),columns=["idf_weights"]) 
# sort ascending 
df_idf.sort_values(by=['idf_weights'],ascending=False).head()

Unnamed: 0,idf_weights
earli,5.304065
resolved,5.304065
george,5.304065
geopolitical,5.304065
geometry,5.304065


<font color=green> The higher idf_weights means most unique word in any particular doc</font>

### Calculate TF-IDF

In [65]:
# count matrix 
count_vector=cv.transform(all_text_df['clean']) 
# tf-idf scores 
tf_idf_vector=tfidf_transformer.transform(count_vector)

In [66]:
feature_names = cv.get_feature_names_out() 
#get tfidf vector for first document 
first_document_vector=tf_idf_vector[40] 
#print the scores 
df = pd.DataFrame(first_document_vector.T.todense(), index=feature_names, columns=["tfidf"]) 
df=df.sort_values(by=["tfidf"],ascending=False)

In [67]:
df.head(10)

Unnamed: 0,tfidf
drive,0.374998
datum,0.371398
destroy,0.311673
hard,0.281039
wipe,0.231972
hdd,0.167449
destruction,0.154648
effective,0.135445
tool,0.124539
method,0.123684


<font color=green>the more unique a word is to our first document, the higher the score

In [68]:
def POS(clean):
    clean = nlp(clean)
    for token in clean:
        return(token.pos_) 

# Word Extraction Based on TF-IDF

###  doc #24

### Title : 'Disappointed in Scott Galloway'

In [69]:
# data_list[40].header_title.values

array(['Disappointed in Scott Galloway'], dtype=object)

In [70]:
# Most frequent words
df = df.reset_index(names=['word', 'tfidf'])
df['pos'] = df.apply(lambda row:POS (row['word']),axis=1)

In [71]:
df1 = df[(df['word'].str.len()) > 3]

In [72]:
print('Most Frequent Words Based on TF-IDF:')
df1.head(10)

Most Frequent Words Based on TF-IDF:


Unnamed: 0,word,tfidf,pos
0,drive,0.374998,VERB
1,datum,0.371398,PROPN
2,destroy,0.311673,VERB
3,hard,0.281039,ADV
4,wipe,0.231972,VERB
6,destruction,0.154648,NOUN
7,effective,0.135445,ADJ
8,tool,0.124539,NOUN
9,method,0.123684,NOUN
10,retrieve,0.109175,NOUN


In [73]:
print('Most frequent words based on NOUN and VERB')
df1[(df1['pos'] == 'NOUN') | (df1['pos'] == 'VERB')].head(10)[['word','tfidf']]

Most frequent words based on NOUN and VERB


Unnamed: 0,word,tfidf
0,drive,0.374998
2,destroy,0.311673
4,wipe,0.231972
6,destruction,0.154648
8,tool,0.124539
9,method,0.123684
10,retrieve,0.109175
11,reputation,0.109175
12,business,0.108141
13,security,0.107764


In [74]:
print('Most frequent words Based on NOUN')
df1[df1['pos']=='NOUN'].head(10)[['word','tfidf']]

Most frequent words Based on NOUN


Unnamed: 0,word,tfidf
6,destruction,0.154648
8,tool,0.124539
9,method,0.123684
10,retrieve,0.109175
11,reputation,0.109175
12,business,0.108141
13,security,0.107764
20,solutions,0.077324
29,data,0.0631
31,trust,0.06227


In [75]:
print('Most frequent words Based on VERB')
df1[df1['pos']=='VERB'].head(10)[['word','tfidf']]

Most frequent words Based on VERB


Unnamed: 0,word,tfidf
0,drive,0.374998
2,destroy,0.311673
4,wipe,0.231972
14,protect,0.086474
21,file,0.074915
25,upgrade,0.069261
26,ensure,0.068811
30,breach,0.0631
32,delete,0.061842
33,estimate,0.059983


### Named Entity Recognition (NER)

In [216]:
#function to NER 
# Iterate over the predicted entities
def NER(clean):
    clean = nlp(clean)
    for ent in clean.ents:
        return(ent.label_)   

In [217]:
NER(df1['word'][2])

'NORP'

In [218]:
df1['ner_tag'] = df1.apply(lambda row:NER (row['word']),axis=1)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df1['ner_tag'] = df1.apply(lambda row:NER (row['word']),axis=1)


In [219]:
df1.head(10)

Unnamed: 0,word,tfidf,pos,ner_tag
0,ashley,0.361024,PROPN,
1,madison,0.345327,PROPN,PERSON
2,nazi,0.275439,ADV,NORP
4,biderman,0.172664,NOUN,
5,hack,0.147875,VERB,
6,profile,0.146008,NOUN,
7,team,0.122921,NOUN,
8,2015,0.119916,NUM,DATE
9,impact,0.116995,NOUN,
11,user,0.103592,NOUN,


In [220]:
df1[(df1['pos'] == 'NOUN') | (df1['pos'] == 'VERB')].head(10)[['word','tfidf','pos','ner_tag']]

Unnamed: 0,word,tfidf,pos,ner_tag
4,biderman,0.172664,NOUN,
5,hack,0.147875,VERB,
6,profile,0.146008,NOUN,
7,team,0.122921,NOUN,
9,impact,0.116995,NOUN,
11,user,0.103592,NOUN,
15,company,0.089976,NOUN,
16,leak,0.08747,NOUN,
17,promise,0.083925,VERB,
20,adultery,0.078483,NOUN,
