In [1]:
import pandas as pd
import numpy as np

import os, json

In [2]:
import spacy
nlp = spacy.load('en_core_web_sm')

  from .autonotebook import tqdm as notebook_tqdm


## Read rss data

In [3]:
# Read Json files
path_to_json = 'rssData'
json_files = [pos_json for pos_json in os.listdir(path_to_json) if pos_json.endswith('.json')]
#print(json_files)

In [4]:
# Go through Json files 
base_dir = 'rssData'

#Get all files in the directory

data_list = []
for file in os.listdir(base_dir):

    #If file is a json, construct it's full path and open it, append all json data to list
    if 'json' in file:
        json_path = os.path.join(base_dir, file)
        json_data = pd.read_json(json_path, lines=True)
        data_list.append(json_data)

#print(data_list)

In [5]:
len(data_list)

198

In [6]:
data_list[0].text

0    
Name: text, dtype: object

In [7]:
data_list[0].text.values

array([''], dtype=object)

In [8]:
data_list[2]

Unnamed: 0,header_title,date,text,url
0,911 Proxy Service Implodes After Disclosing Br...,2022-07-29 19:34:45+00:00,"The 911 service as it existed until July 28, 2...",https://krebsonsecurity.com/2022/07/911-proxy-...


In [9]:
all_data=[]
for i in range(len(data_list)):
    if (data_list[i].text.values!=''):
        text=list(data_list[i].text)
        header = list(data_list[i].header_title)
#         print(type(text))
        all_data.append(text)
        all_data.append(header)

In [11]:
all_data[0]

['The 911 service as it existed until July 28, 2022.911[.]re, a proxy service that since 2015 has sold access to hundreds of thousands of Microsoft Windows computers daily, announced this week that it is shutting down in the wake of a data breach that destroyed key components of its business operations. The abrupt closure comes ten days after KrebsOnSecurity published an in-depth look at 911 and its connections to shady pay-per-install affiliate programs that secretly bundled 911’s proxy software with other titles, including “free” utilities and pirated software.911[.]re is was one of the original “residential proxy” networks, which allow someone to rent a residential IP address to use as a relay for his/her Internet communications, providing anonymity and the advantage of being perceived as a residential user surfing the web.Residential proxy services are often marketed to people seeking the ability to evade country-specific blocking by the major movie and media streaming providers. B

In [12]:
# Get text data and remove empty texts
all_text=[]
for i in range(len(data_list)):
    if (data_list[i].text.values!=''):
        text=list(data_list[i].text)
#         print(type(text))
        all_text.append(text)
        #print(data_list[i].text)
        #print(text)
#print(all_text)

In [14]:
all_text[0]

['The 911 service as it existed until July 28, 2022.911[.]re, a proxy service that since 2015 has sold access to hundreds of thousands of Microsoft Windows computers daily, announced this week that it is shutting down in the wake of a data breach that destroyed key components of its business operations. The abrupt closure comes ten days after KrebsOnSecurity published an in-depth look at 911 and its connections to shady pay-per-install affiliate programs that secretly bundled 911’s proxy software with other titles, including “free” utilities and pirated software.911[.]re is was one of the original “residential proxy” networks, which allow someone to rent a residential IP address to use as a relay for his/her Internet communications, providing anonymity and the advantage of being perceived as a residential user surfing the web.Residential proxy services are often marketed to people seeking the ability to evade country-specific blocking by the major movie and media streaming providers. B

In [165]:
len(all_text)

147

In [166]:
all_text_df= pd.DataFrame(all_text, columns=['text'])
all_text_df.head()

Unnamed: 0,text
0,"The 911 service as it existed until July 28, 2..."
1,"DanielMiessler Created/Updated: July 25, 2022 ..."
2,The 911 service as it exists today.For the pas...
3,DanielMiesslerMy first thought on the whole di...
4,"DanielMiesslerWell, our congressional heroes f..."


In [221]:
all_text_df['text'][0]

'The 911 service as it existed until July 28, 2022.911[.]re, a proxy service that since 2015 has sold access to hundreds of thousands of Microsoft Windows computers daily, announced this week that it is shutting down in the wake of a data breach that destroyed key components of its business operations. The abrupt closure comes ten days after KrebsOnSecurity published an in-depth look at 911 and its connections to shady pay-per-install affiliate programs that secretly bundled 911’s proxy software with other titles, including “free” utilities and pirated software.911[.]re is was one of the original “residential proxy” networks, which allow someone to rent a residential IP address to use as a relay for his/her Internet communications, providing anonymity and the advantage of being perceived as a residential user surfing the web.Residential proxy services are often marketed to people seeking the ability to evade country-specific blocking by the major movie and media streaming providers. Bu

In [167]:
# function to clean data
#lower case
#remove stop words
#lemmatization

def cleanData(doc):
    doc = doc.lower()
    doc = nlp(doc)
    tokens = [tokens.lower_ for tokens in doc]
    tokens = [tokens for tokens in doc if (tokens.is_stop == False)]
    tokens = [tokens for tokens in tokens if (tokens.is_punct == False)]
    final_token = [token.lemma_ for token in tokens]
    
    return " ".join(final_token)

In [168]:
all_text_df['clean'] = all_text_df.apply(lambda row:cleanData (row['text']),axis=1)

In [169]:
all_text_df.head()

Unnamed: 0,text,clean
0,"The 911 service as it existed until July 28, 2...",911 service exist july 28 2022.911[.]re proxy ...
1,"DanielMiessler Created/Updated: July 25, 2022 ...",danielmiessler create update july 25 2022 read...
2,The 911 service as it exists today.For the pas...,911 service exist today.for past seven year on...
3,DanielMiesslerMy first thought on the whole di...,danielmiesslermy think discussion sure musk ar...
4,"DanielMiesslerWell, our congressional heroes f...",danielmiesslerwell congressional hero finally ...


## Implement NLP techniques

https://kavita-ganesan.com/tfidftransformer-tfidfvectorizer-usage-differences/#.Y-Zq4XbMI2w

### Calculate IDF

In [170]:
import pandas as pd 
from sklearn.feature_extraction.text import TfidfTransformer 
from sklearn.feature_extraction.text import CountVectorizer

In [171]:
# CountVectorizer() 
cv=CountVectorizer() 
# this steps generates word counts for the words in all docs 
word_count_vector=cv.fit_transform(all_text_df['clean'])

In [172]:
word_count_vector.shape

(147, 8382)

In [173]:
# Compute idf weights
tfidf_transformer=TfidfTransformer(smooth_idf=True,use_idf=True) 
all_text_df_tfidf = tfidf_transformer.fit_transform(word_count_vector)
all_text_df_tfidf.shape

(147, 8382)

In [174]:
# print idf values 
df_idf = pd.DataFrame(tfidf_transformer.idf_, index=cv.get_feature_names_out(),columns=["idf_weights"]) 
# sort ascending 
df_idf.sort_values(by=['idf_weights'],ascending=False).head()

Unnamed: 0,idf_weights
earli,5.304065
resolved,5.304065
george,5.304065
geopolitical,5.304065
geometry,5.304065


<font color=green> The higher idf_weights means most unique word in any particular doc</font>

### Calculate TF-IDF

In [175]:
# count matrix 
count_vector=cv.transform(all_text_df['clean']) 
# tf-idf scores 
tf_idf_vector=tfidf_transformer.transform(count_vector)

In [208]:
feature_names = cv.get_feature_names_out() 
#get tfidf vector for first document 
first_document_vector=tf_idf_vector[5] 
#print the scores 
df = pd.DataFrame(first_document_vector.T.todense(), index=feature_names, columns=["tfidf"]) 
df=df.sort_values(by=["tfidf"],ascending=False)

In [209]:
df.head(10)

Unnamed: 0,tfidf
ashley,0.361024
madison,0.345327
nazi,0.275439
alm,0.204057
biderman,0.172664
hack,0.147875
profile,0.146008
team,0.122921
2015,0.119916
impact,0.116995


<font color=green>the more unique a word is to our first document, the higher the score

In [210]:
def POS(clean):
    clean = nlp(clean)
    for token in clean:
        return(token.pos_) 

# Word Extraction Based on TF-IDF

In [211]:
# Most frequent words
df = df.reset_index(names=['word', 'tfidf'])
df['pos'] = df.apply(lambda row:POS (row['word']),axis=1)

In [212]:
df1 = df[(df['word'].str.len()) > 3]
df1.head(10)

Unnamed: 0,word,tfidf,pos
0,ashley,0.361024,PROPN
1,madison,0.345327,PROPN
2,nazi,0.275439,ADV
4,biderman,0.172664,NOUN
5,hack,0.147875,VERB
6,profile,0.146008,NOUN
7,team,0.122921,NOUN
8,2015,0.119916,NUM
9,impact,0.116995,NOUN
11,user,0.103592,NOUN


In [213]:
# Most frequent words based on NOUN and VERB
df1[(df1['pos'] == 'NOUN') | (df1['pos'] == 'VERB')].head(10)[['word','tfidf']]

Unnamed: 0,word,tfidf
4,biderman,0.172664
5,hack,0.147875
6,profile,0.146008
7,team,0.122921
9,impact,0.116995
11,user,0.103592
15,company,0.089976
16,leak,0.08747
17,promise,0.083925
20,adultery,0.078483


In [214]:
### pos_tag choice : 
df1[df1['pos']=='NOUN'].head(10)[['word','tfidf']]

Unnamed: 0,word,tfidf
4,biderman,0.172664
6,profile,0.146008
7,team,0.122921
9,impact,0.116995
11,user,0.103592
15,company,0.089976
16,leak,0.08747
20,adultery,0.078483
21,female,0.074673
25,email,0.065374


In [215]:
### pos_tag choice
df1[df1['pos']=='VERB'].head(10)[['word','tfidf']]

Unnamed: 0,word,tfidf
5,hack,0.147875
17,promise,0.083925
22,mention,0.073554
29,duckduckgo,0.062787
40,delete,0.057971
41,lead,0.057555
45,believe,0.052429
49,imagine,0.048636
60,think,0.04505
61,remove,0.044949


### Named Entity Recognition (NER)

In [216]:
#function to NER 
# Iterate over the predicted entities
def NER(clean):
    clean = nlp(clean)
    for ent in clean.ents:
        return(ent.label_)   

In [217]:
NER(df1['word'][2])

'NORP'

In [218]:
df1['ner_tag'] = df1.apply(lambda row:NER (row['word']),axis=1)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df1['ner_tag'] = df1.apply(lambda row:NER (row['word']),axis=1)


In [219]:
df1.head(10)

Unnamed: 0,word,tfidf,pos,ner_tag
0,ashley,0.361024,PROPN,
1,madison,0.345327,PROPN,PERSON
2,nazi,0.275439,ADV,NORP
4,biderman,0.172664,NOUN,
5,hack,0.147875,VERB,
6,profile,0.146008,NOUN,
7,team,0.122921,NOUN,
8,2015,0.119916,NUM,DATE
9,impact,0.116995,NOUN,
11,user,0.103592,NOUN,


In [220]:
df1[(df1['pos'] == 'NOUN') | (df1['pos'] == 'VERB')].head(10)[['word','tfidf','pos','ner_tag']]

Unnamed: 0,word,tfidf,pos,ner_tag
4,biderman,0.172664,NOUN,
5,hack,0.147875,VERB,
6,profile,0.146008,NOUN,
7,team,0.122921,NOUN,
9,impact,0.116995,NOUN,
11,user,0.103592,NOUN,
15,company,0.089976,NOUN,
16,leak,0.08747,NOUN,
17,promise,0.083925,VERB,
20,adultery,0.078483,NOUN,
