In [1]:
import pandas as pd
import numpy as np

import os, json

In [2]:
import spacy
nlp = spacy.load('en_core_web_sm')

  from .autonotebook import tqdm as notebook_tqdm


## Read rssPMData 

In [3]:
path_to_json = 'rssPMData'
json_files = [pos_json for pos_json in os.listdir(path_to_json) if pos_json.endswith('.json')]
print(json_files)

[]


In [4]:
data_list = []
for root, dirs, files in os.walk('rssPMData'):
     for file in files:
        with open(os.path.join(root, file), "r") as auto:
            json_data = auto.read()
            #print(json_data)
            data_list.append(json_data)

In [5]:
len(data_list)

195

In [6]:
type(data_list)

list

In [7]:
data_list[0]

'{"header_title": "5 Major Software Architecture Patterns", "date": "Tue, 02 Jun 2020 08:09:26 +0000", "text": "Share This PostTable of ContentsIn the Global Software Architecture Summit we were discussing software architecture patterns a lot and I decided to write an article about it, highlighting leading software architects insights.\\u00a0\\u00a0\\u00a0\\u00a0Architectural patterns are ways of capturing proven good design structures, so that they can be reused. Software architects have been looking for ways to capture and reuse the architectural knowledge that have proven successful in the past.\\u00a0More specifically, an architectural pattern is a package of design decisions that is found repeatedly in practice, has well defined properties that can be reused and describes a class of architectures.\\u00a0Developing an architecture can be seen as a process of selecting, tailoring, and combining patterns. The software architect must decide how to instantiate a pattern, how to make it

In [8]:
from ast import literal_eval
tup = literal_eval(data_list[0])

In [9]:
tup['header_title']

'5 Major Software Architecture Patterns'

In [10]:
# tup['text']

In [11]:
header_title = []
text = []

for i in range(len(data_list)):
    tup = literal_eval(data_list[i])
    header_title.append(tup['header_title'])
    text.append(tup['text'])

In [12]:
len(header_title)

195

In [13]:
df = pd.DataFrame()
df['header_title'] = header_title
df['text'] = text

In [14]:
df.shape

(195, 2)

In [15]:
df.isnull().sum()

header_title    0
text            0
dtype: int64

## Cleaning

In [16]:
# function to clean data
#lower case
#remove stop words
#lemmatization

def cleanData(doc):
    doc = doc.lower()
    doc = nlp(doc)
    tokens = [tokens.lower_ for tokens in doc]
    tokens = [tokens for tokens in doc if (tokens.is_stop == False)]
    tokens = [tokens for tokens in tokens if (tokens.is_punct == False)]
    final_token = [token.lemma_ for token in tokens]
    
    return " ".join(final_token)

In [17]:
df['clean'] = df.apply(lambda row:cleanData (row['text']),axis=1)

In [18]:
df.head()

Unnamed: 0,header_title,text,clean
0,5 Major Software Architecture Patterns,Share This PostTable of ContentsIn the Global ...,share posttable contentsin global software arc...
1,Apiumhub as Partner of the Data Innovation Sum...,Share This PostTable of ContentsWe are excited...,share posttable contentswe excited announce se...
2,Apiumhub becomes a partner of the Global Summi...,Share This PostTable of ContentsGeekle’s onlin...,share posttable contentsgeekle online conferen...
3,DevSecOps: Future for DevOps?,Share This PostTable of ContentsDigital foster...,share posttable contentsdigital foster culture...
4,GSAS Workshop: Become a Software Design Company,Share This PostTable of ContentsOn October 3rd...,share posttable contentson october 3rd 4th 202...


## Implement NLP techniques

https://kavita-ganesan.com/tfidftransformer-tfidfvectorizer-usage-differences/#.Y-Zq4XbMI2w

### Calculate IDF

In [19]:
import pandas as pd 
from sklearn.feature_extraction.text import TfidfTransformer 
from sklearn.feature_extraction.text import CountVectorizer

In [20]:
# CountVectorizer() 
cv=CountVectorizer() 
# this steps generates word counts for the words in all docs 
word_count_vector=cv.fit_transform(df['clean'])

In [21]:
word_count_vector.shape

(195, 8046)

In [22]:
# Compute idf weights
tfidf_transformer=TfidfTransformer(smooth_idf=True,use_idf=True) 
all_text_df_tfidf = tfidf_transformer.fit_transform(word_count_vector)
all_text_df_tfidf.shape

(195, 8046)

In [23]:
# print idf values 
df_idf = pd.DataFrame(tfidf_transformer.idf_, index=cv.get_feature_names_out(),columns=["idf_weights"]) 
# sort ascending 
df_idf.sort_values(by=['idf_weights'],ascending=False).head()

Unnamed: 0,idf_weights
00,5.584967
millennial,5.584967
mileage,5.584967
mile,5.584967
midweek,5.584967


<font color=green> The higher idf_weights means most unique word in any particular doc</font>

### Calculate TF-IDF

In [24]:
# count matrix 
count_vector=cv.transform(df['clean']) 
# tf-idf scores 
tf_idf_vector=tfidf_transformer.transform(count_vector)

In [44]:
feature_names = cv.get_feature_names_out() 
#get tfidf vector for first document 
first_document_vector=tf_idf_vector[10] 
#print the scores 
df_TFIDF = pd.DataFrame(first_document_vector.T.todense(), index=feature_names, columns=["tfidf"]) 
df_TFIDF=df_TFIDF.sort_values(by=["tfidf"],ascending=False)

In [45]:
df_TFIDF.head(10)

Unnamed: 0,tfidf
leakage,0.369051
datum,0.311054
model,0.2472
information,0.161332
production,0.154063
artificial,0.151754
machine,0.149458
ai,0.138922
cv,0.122725
base,0.117231


<font color=green>the more unique a word is to our first document, the higher the score

In [46]:
def POS(clean):
    clean = nlp(clean)
    for token in clean:
        return(token.pos_) 

# Word Extraction Based on TF-IDF

###  doc #10

### Title : ''The Data Leakage Nightmare in AI''

In [47]:
# df['header_title'][10]

In [48]:
# Most frequent words
df_TFIDF = df_TFIDF.reset_index(names=['word', 'tfidf'])
df_TFIDF['pos'] = df_TFIDF.apply(lambda row:POS (row['word']),axis=1)

In [49]:
df_TFIDF_1 = df_TFIDF[(df_TFIDF['word'].str.len()) > 3]

In [50]:
print('Most Frequent Words Based on TF-IDF:')
df_TFIDF_1.head(10)

Most Frequent Words Based on TF-IDF:


Unnamed: 0,word,tfidf,pos
0,leakage,0.369051,NOUN
1,datum,0.311054,PROPN
2,model,0.2472,NOUN
3,information,0.161332,NOUN
4,production,0.154063,NOUN
5,artificial,0.151754,ADJ
6,machine,0.149458,NOUN
9,base,0.117231,NOUN
10,predict,0.116479,VERB
11,actor,0.113816,NOUN


In [51]:
print('Most frequent words based on NOUN and VERB')
df_TFIDF_1[(df_TFIDF_1['pos'] == 'NOUN') | (df_TFIDF_1['pos'] == 'VERB')].head(10)[['word','tfidf']]

Most frequent words based on NOUN and VERB


Unnamed: 0,word,tfidf
0,leakage,0.369051
2,model,0.2472
3,information,0.161332
4,production,0.154063
6,machine,0.149458
9,base,0.117231
10,predict,0.116479
11,actor,0.113816
12,application,0.113793
13,intelligence,0.113687


In [52]:
print('Most frequent words Based on NOUN')
df_TFIDF_1[df_TFIDF_1['pos']=='NOUN'].head(10)[['word','tfidf']]

Most frequent words Based on NOUN


Unnamed: 0,word,tfidf
0,leakage,0.369051
2,model,0.2472
3,information,0.161332
4,production,0.154063
6,machine,0.149458
9,base,0.117231
11,actor,0.113816
12,application,0.113793
13,intelligence,0.113687
16,institution,0.107494


In [53]:
print('Most frequent words Based on VERB')
df_TFIDF_1[df_TFIDF_1['pos']=='VERB'].head(10)[['word','tfidf']]

Most frequent words Based on VERB


Unnamed: 0,word,tfidf
10,predict,0.116479
14,occur,0.112041
15,training,0.108617
21,learning,0.096171
25,split,0.087359
28,underperform,0.081817
32,cite,0.081594
34,point,0.081219
36,train,0.07845
42,arise,0.070034


In [64]:
# data = [[1,2,3],[4,5,6],[7,8,9],[10,11,12],[13,14,15]]
data = [['ai','waterproofing','artificialintelligence'],['analytics','water','crypto'],['datascience','bathroom','pso2ngs'],
       ['machinelearning','chemical','machinelearning'],['artificialintelligence','roof','bigdata']]
Tweetr = pd.DataFrame(data, columns=[['Data','Leakage','AI']])
Tweetr

Unnamed: 0,Data,Leakage,AI
0,ai,waterproofing,artificialintelligence
1,analytics,water,crypto
2,datascience,bathroom,pso2ngs
3,machinelearning,chemical,machinelearning
4,artificialintelligence,roof,bigdata


In [68]:
data = [['bugbountytips',2],['bugbounty',4],['infosec',6],['ransomware',8],['cybersecurity',10]]
Tweetr = pd.DataFrame(data, columns=[['Data Leakage','Leakage Nightmare']])
Tweetr

Unnamed: 0,Data Leakage,Leakage Nightmare
0,bugbountytips,2
1,bugbounty,4
2,infosec,6
3,ransomware,8
4,cybersecurity,10


#### dose not show the results for Leakage Nightmare. Results for Nightmare AI seems very irrelevent

### Named Entity Recognition (NER)

In [199]:
#function to NER 
# Iterate over the predicted entities
def NER(clean):
    clean = nlp(clean)
    for ent in clean.ents:
        return(ent.label_)   

In [200]:
NER(df_TFIDF_1['word'][2])

In [202]:
df_TFIDF_1['ner_tag'] = df_TFIDF_1.apply(lambda row:NER (row['word']),axis=1)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_TFIDF_1['ner_tag'] = df_TFIDF_1.apply(lambda row:NER (row['word']),axis=1)


In [203]:
df_TFIDF_1.head(10)

Unnamed: 0,word,tfidf,pos,ner_tag
0,architecture,0.51662,NOUN,
1,pattern,0.463032,NOUN,
2,microkernel,0.221597,PROPN,
3,plug,0.18994,VERB,
4,layer,0.17615,NOUN,
5,application,0.161439,NOUN,
6,layered,0.158283,VERB,
7,microservice,0.146792,VERB,
8,architectural,0.146792,ADJ,
9,software,0.136826,NOUN,


In [189]:
df_TFIDF_1[(df_TFIDF_1['pos'] == 'NOUN') | (df1['pos'] == 'VERB')].head(10)[['word','tfidf','pos','ner_tag']]

NameError: name 'df1' is not defined