In [1]:
import pandas as pd
import numpy as np

import os, json

In [12]:
import spacy
# Load the spaCy pipeline used below for stop-word/punctuation flags, lemmas and POS tags
nlp = spacy.load('en_core_web_sm')

## Read RSS data

In [13]:
# List the names of the JSON files present in the RSS data directory
path_to_json = 'rssData'
json_files = list(filter(lambda name: name.endswith('.json'), os.listdir(path_to_json)))

In [14]:
# Load every JSON-lines file of the RSS data directory into its own DataFrame
base_dir = 'rssData'

data_list = []
# NOTE(review): os.listdir returns entries in arbitrary order, so positions in
# data_list may differ between machines - confirm whether a fixed order is needed.
for file in os.listdir(base_dir):

    # Match on the extension only: a substring test ('json' in file) would also pick up
    # names that merely contain "json" and are not JSON files.
    if file.endswith('.json'):
        # Build the file's full path, parse it (one JSON record per line) and keep the frame
        json_path = os.path.join(base_dir, file)
        json_data = pd.read_json(json_path, lines=True)
        data_list.append(json_data)

In [15]:
# Number of DataFrames loaded (one per JSON file read)
len(data_list)

198

In [16]:
# Inspect the 'text' column of the first loaded file
data_list[0].text

0    
Name: text, dtype: object

In [17]:
# Same column as a raw NumPy array, which makes an empty string visible
data_list[0].text.values

array([''], dtype=object)

In [18]:
# Get text data and remove empty texts
all_text = []
for frame in data_list:
    # Test every text on its own: comparing the whole column with '' inside an `if`
    # raises ValueError as soon as a file holds more than one row.
    for text in frame['text']:
        # Skip missing values (NaN/None) as well as empty strings; neither can be cleaned.
        if isinstance(text, str) and text != '':
            # One single-element row per text keeps the list-of-rows shape that the
            # DataFrame construction below relies on.
            all_text.append([text])

In [19]:
# Number of texts kept after dropping the empty ones
len(all_text)

147

In [20]:
# One row per retained text, exposed under a single 'text' column
all_text_df = pd.DataFrame(data=all_text, columns=['text'])
all_text_df.head()

Unnamed: 0,text
0,"The 911 service as it existed until July 28, 2..."
1,"DanielMiessler Created/Updated: July 25, 2022 ..."
2,The 911 service as it exists today.For the pas...
3,DanielMiesslerMy first thought on the whole di...
4,"DanielMiesslerWell, our congressional heroes f..."


In [21]:
# function to clean data
#lower case
#remove stop words
#lemmatization

def cleanData(doc):
    """Normalise a raw text before vectorisation.

    The text is lower-cased, parsed with the module-level spaCy pipeline ``nlp``,
    stripped of stop words and punctuation tokens, and every remaining token is
    replaced by its lemma.

    Parameters
    ----------
    doc : str
        Raw document text.

    Returns
    -------
    str
        Lemmas of the kept tokens, joined by single spaces (empty string when
        no token survives the filters).
    """
    # NOTE(review): the text is lower-cased *before* parsing; this may influence the
    # lemmas the pipeline produces - confirm this is intended.
    parsed = nlp(doc.lower())
    # A single pass applies both filters; the former intermediate list of lower-cased
    # token strings was overwritten before being used, so it is not built any more.
    lemmas = [
        token.lemma_
        for token in parsed
        if not token.is_stop and not token.is_punct
    ]
    return " ".join(lemmas)

In [22]:
# Clean every text: applying cleanData to the 'text' column hands each string straight to
# the function instead of building a row object per document just to read one field.
all_text_df['clean'] = all_text_df['text'].apply(cleanData)

In [23]:
# Preview the raw text next to its cleaned counterpart
all_text_df.head()

Unnamed: 0,text,clean
0,"The 911 service as it existed until July 28, 2...",911 service exist july 28 2022.911[.]re proxy ...
1,"DanielMiessler Created/Updated: July 25, 2022 ...",danielmiessler create update july 25 2022 read...
2,The 911 service as it exists today.For the pas...,911 service exist today.for past seven year on...
3,DanielMiesslerMy first thought on the whole di...,danielmiesslermy think discussion sure musk ar...
4,"DanielMiesslerWell, our congressional heroes f...",danielmiesslerwell congressional hero finally ...


## Implement NLP techniques

Reference: [TfidfTransformer vs. TfidfVectorizer — usage and differences (Kavita Ganesan)](https://kavita-ganesan.com/tfidftransformer-tfidfvectorizer-usage-differences/#.Y-Zq4XbMI2w)

### Calculate IDF

In [24]:
import pandas as pd 
from sklearn.feature_extraction.text import TfidfTransformer 
from sklearn.feature_extraction.text import CountVectorizer

In [29]:
# Bag-of-words step: learn the vocabulary from the cleaned texts and count how often
# each term occurs in each document
cv = CountVectorizer()
word_count_vector = cv.fit_transform(all_text_df.loc[:, 'clean'])

In [30]:
# (number of documents, vocabulary size)
word_count_vector.shape

(147, 8382)

In [31]:
# Learn the IDF weights from the term counts and build the TF-IDF matrix in one step
tfidf_transformer = TfidfTransformer(
    use_idf=True,
    smooth_idf=True,
)
all_text_df_tfidf = tfidf_transformer.fit_transform(word_count_vector)
all_text_df_tfidf.shape

(147, 8382)

In [32]:
# Tabulate the learned IDF weight of every vocabulary term
df_idf = pd.DataFrame(
    {"idf_weights": tfidf_transformer.idf_},
    index=cv.get_feature_names_out(),
)
# Show the five largest weights (sorted in descending order)
df_idf.sort_values("idf_weights", ascending=False).head()

Unnamed: 0,idf_weights
earli,5.304065
resolved,5.304065
george,5.304065
geopolitical,5.304065
geometry,5.304065


<font color=green>A higher idf_weight means the word occurs in fewer documents of the corpus, i.e. it is more distinctive of the documents that contain it.</font>

### Calculate TF-IDF

In [35]:
# Count matrix of the cleaned texts, built with the vectorizer fitted earlier (`cv`);
# the name `count_vect` used here before is not defined in the cells above.
count_vector = cv.transform(all_text_df['clean'])
# Re-weight the counts with the fitted IDF values to obtain the TF-IDF scores
tf_idf_vector = tfidf_transformer.transform(count_vector)

In [55]:
feature_names = cv.get_feature_names_out()
# Row of the TF-IDF matrix to inspect (0-based position in tf_idf_vector)
doc_index = 5
# NOTE: despite its name, this is the vector of document `doc_index`, not of document 0
first_document_vector = tf_idf_vector[doc_index]
# One row per vocabulary term; toarray() yields a plain ndarray column instead of the
# np.matrix returned by todense()
df = pd.DataFrame(first_document_vector.T.toarray(), index=feature_names, columns=["tfidf"])
# Highest-scoring terms first
df = df.sort_values(by=["tfidf"], ascending=False)

In [56]:
# Terms ranked by their TF-IDF score in the selected document
df

Unnamed: 0,tfidf
ashley,0.361024
madison,0.345327
nazi,0.275439
alm,0.204057
biderman,0.172664
...,...
hackerone,0.000000
anytime,0.000000
hacked,0.000000
hack101,0.000000


<font color=green>The more distinctive a word is of the selected document (index 5 of `tf_idf_vector`), the higher its TF-IDF score.</font>

In [57]:
def POS(clean):
    """Return the part-of-speech tag of the first token of ``clean``.

    The string is parsed with the module-level spaCy pipeline ``nlp``; only the
    first resulting token is looked at, any further tokens are ignored.

    Parameters
    ----------
    clean : str
        Text to tag (the notebook passes single vocabulary terms).

    Returns
    -------
    str or None
        ``pos_`` of the first token, or ``None`` when parsing yields no token.
    """
    parsed = nlp(clean)
    first_token = next(iter(parsed), None)
    return None if first_token is None else first_token.pos_

In [58]:
# Move the term index into a regular 'word' column. The index has a single level, so
# `names` takes exactly one name; 'tfidf' is already a column and must not be listed.
df = df.reset_index(names='word')
# Tag every term with the part of speech of its first token
df['pos'] = df['word'].apply(POS)
df

Unnamed: 0,word,tfidf,pos
0,ashley,0.361024,PROPN
1,madison,0.345327,PROPN
2,nazi,0.275439,ADV
3,alm,0.204057,NOUN
4,biderman,0.172664,NOUN
...,...,...,...
8377,hackerone,0.000000,ADV
8378,anytime,0.000000,ADV
8379,hacked,0.000000,VERB
8380,hack101,0.000000,PUNCT


In [60]:
# Keep only the terms tagged as nouns
df.loc[df['pos'].eq('NOUN')]

Unnamed: 0,word,tfidf,pos
3,alm,0.204057,NOUN
4,biderman,0.172664,NOUN
6,profile,0.146008,NOUN
7,team,0.122921,NOUN
9,impact,0.116995,NOUN
...,...,...,...
8367,half,0.000000,NOUN
8372,hactivism,0.000000,NOUN
8374,hackforum,0.000000,NOUN
8375,hackersâ,0.000000,NOUN


In [61]:
# Keep only the terms tagged as verbs
df.loc[df['pos'].eq('VERB')]

Unnamed: 0,word,tfidf,pos
5,hack,0.147875,VERB
17,promise,0.083925,VERB
22,mention,0.073554,VERB
29,duckduckgo,0.062787,VERB
40,delete,0.057971,VERB
...,...,...,...
8354,habit,0.000000,VERB
8363,halt,0.000000,VERB
8371,had,0.000000,VERB
8373,hacking,0.000000,VERB
