In [1]:
import pandas as pd
import numpy as np

import os, json

In [2]:
import spacy
nlp = spacy.load('en_core_web_sm')

  from .autonotebook import tqdm as notebook_tqdm


## Read rssPMData 

In [51]:
path_to_json = 'rssPMData'
json_files = [pos_json for pos_json in os.listdir(path_to_json) if pos_json.endswith('.json')]
print(json_files)

[]


In [106]:
data_list = []
for root, dirs, files in os.walk('rssPMData'):
     for file in files:
        with open(os.path.join(root, file), "r") as auto:
            json_data = auto.read()
            #print(json_data)
            data_list.append(json_data)

In [107]:
len(data_list)

195

In [108]:
type(data_list)

list

In [216]:
# data_list[0]

In [137]:
from ast import literal_eval
tup = literal_eval(data_list[0])

In [139]:
tup['header_title']

'5 Major Software Architecture Patterns'

In [215]:
tup['text'][:566]

'Agile development methodologies are leading the way for creating the highest grade of output in the shortest amount of time within a set budget. Countless enterprises and companies regardless of their field of interest, are implementing these methodologies to reap the benefits of Agile. To help manage and visualize their work in real time, these companies use an agile project management tool. From the phase of planning till deployment, these agile tools assist in all phases of the development of the product.There are a handful of popular agile tools available.'

In [165]:
header_title = []
text = []

for i in range(len(data_list)):
    tup = literal_eval(data_list[i])
    header_title.append(tup['header_title'])
    text.append(tup['text'])

In [166]:
len(header_title)

195

In [167]:
df = pd.DataFrame()
df['header_title'] = header_title
df['text'] = text

In [169]:
df.shape

(195, 2)

In [177]:
df.isnull().sum()

header_title    0
text            0
clean           0
dtype: int64

## Cleaning

In [170]:
# function to clean data
#lower case
#remove stop words
#lemmatization

def cleanData(doc):
    doc = doc.lower()
    doc = nlp(doc)
    tokens = [tokens.lower_ for tokens in doc]
    tokens = [tokens for tokens in doc if (tokens.is_stop == False)]
    tokens = [tokens for tokens in tokens if (tokens.is_punct == False)]
    final_token = [token.lemma_ for token in tokens]
    
    return " ".join(final_token)

In [171]:
df['clean'] = df.apply(lambda row:cleanData (row['text']),axis=1)

In [172]:
df.head()

Unnamed: 0,header_title,text,clean
0,5 Major Software Architecture Patterns,Share This PostTable of ContentsIn the Global ...,share posttable contentsin global software arc...
1,Apiumhub as Partner of the Data Innovation Sum...,Share This PostTable of ContentsWe are excited...,share posttable contentswe excited announce se...
2,Apiumhub becomes a partner of the Global Summi...,Share This PostTable of ContentsGeekle’s onlin...,share posttable contentsgeekle online conferen...
3,DevSecOps: Future for DevOps?,Share This PostTable of ContentsDigital foster...,share posttable contentsdigital foster culture...
4,GSAS Workshop: Become a Software Design Company,Share This PostTable of ContentsOn October 3rd...,share posttable contentson october 3rd 4th 202...


## Implement NLP techniques

https://kavita-ganesan.com/tfidftransformer-tfidfvectorizer-usage-differences/#.Y-Zq4XbMI2w

### Calculate IDF

In [173]:
import pandas as pd 
from sklearn.feature_extraction.text import TfidfTransformer 
from sklearn.feature_extraction.text import CountVectorizer

In [175]:
# CountVectorizer() 
cv=CountVectorizer() 
# this steps generates word counts for the words in all docs 
word_count_vector=cv.fit_transform(df['clean'])

In [176]:
word_count_vector.shape

(195, 8046)

In [178]:
# Compute idf weights
tfidf_transformer=TfidfTransformer(smooth_idf=True,use_idf=True) 
all_text_df_tfidf = tfidf_transformer.fit_transform(word_count_vector)
all_text_df_tfidf.shape

(195, 8046)

In [179]:
# print idf values 
df_idf = pd.DataFrame(tfidf_transformer.idf_, index=cv.get_feature_names_out(),columns=["idf_weights"]) 
# sort ascending 
df_idf.sort_values(by=['idf_weights'],ascending=False).head()

Unnamed: 0,idf_weights
00,5.584967
millennial,5.584967
mileage,5.584967
mile,5.584967
midweek,5.584967


<font color=green> The higher idf_weights means most unique word in any particular doc</font>

### Calculate TF-IDF

In [181]:
# count matrix 
count_vector=cv.transform(df['clean']) 
# tf-idf scores 
tf_idf_vector=tfidf_transformer.transform(count_vector)

In [182]:
feature_names = cv.get_feature_names_out() 
#get tfidf vector for first document 
first_document_vector=tf_idf_vector[0] 
#print the scores 
df_TFIDF = pd.DataFrame(first_document_vector.T.todense(), index=feature_names, columns=["tfidf"]) 
df_TFIDF=df_TFIDF.sort_values(by=["tfidf"],ascending=False)

In [184]:
df_TFIDF.head(10)

Unnamed: 0,tfidf
architecture,0.51662
pattern,0.463032
microkernel,0.221597
plug,0.18994
layer,0.17615
application,0.161439
layered,0.158283
microservice,0.146792
architectural,0.146792
software,0.136826


<font color=green>the more unique a word is to our first document, the higher the score

In [185]:
def POS(clean):
    clean = nlp(clean)
    for token in clean:
        return(token.pos_) 

# Word Extraction Based on TF-IDF

###  doc #0

### Title : '5 Major Software Architecture Patterns

In [193]:
# df['header_title'][0]

In [194]:
# Most frequent words
df_TFIDF = df_TFIDF.reset_index(names=['word', 'tfidf'])
df_TFIDF['pos'] = df_TFIDF.apply(lambda row:POS (row['word']),axis=1)

In [195]:
df_TFIDF_1 = df_TFIDF[(df_TFIDF['word'].str.len()) > 3]

In [196]:
print('Most Frequent Words Based on TF-IDF:')
df_TFIDF_1.head(10)

Most Frequent Words Based on TF-IDF:


Unnamed: 0,word,tfidf,pos
0,architecture,0.51662,NOUN
1,pattern,0.463032,NOUN
2,microkernel,0.221597,PROPN
3,plug,0.18994,VERB
4,layer,0.17615,NOUN
5,application,0.161439,NOUN
6,layered,0.158283,VERB
7,microservice,0.146792,VERB
8,architectural,0.146792,ADJ
9,software,0.136826,NOUN


In [204]:
print('Most frequent words based on NOUN and VERB')
df_TFIDF_1[(df_TFIDF_1['pos'] == 'NOUN') | (df_TFIDF_1['pos'] == 'VERB')].head(10)[['word','tfidf']]

Most frequent words based on NOUN and VERB


Unnamed: 0,word,tfidf
0,architecture,0.51662
1,pattern,0.463032
3,plug,0.18994
4,layer,0.17615
5,application,0.161439
6,layered,0.158283
7,microservice,0.146792
9,software,0.136826
10,component,0.123002
11,logic,0.105852


In [205]:
print('Most frequent words Based on NOUN')
df_TFIDF_1[df_TFIDF_1['pos']=='NOUN'].head(10)[['word','tfidf']]

Most frequent words Based on NOUN


Unnamed: 0,word,tfidf
0,architecture,0.51662
1,pattern,0.463032
4,layer,0.17615
5,application,0.161439
9,software,0.136826
10,component,0.123002
11,logic,0.105852
12,core,0.103134
13,system,0.098275
16,advantage,0.079705


In [206]:
print('Most frequent words Based on VERB')
df_TFIDF_1[df_TFIDF_1['pos']=='VERB'].head(10)[['word','tfidf']]

Most frequent words Based on VERB


Unnamed: 0,word,tfidf
3,plug,0.18994
6,layered,0.158283
7,microservice,0.146792
14,reuse,0.083183
15,architect,0.080943
17,processing,0.079389
20,design,0.065406
27,cite,0.063141
31,extended,0.052926
38,write,0.049893


## Tweeter Tags

### Unigram

In [251]:
#data = [[1,2,3],[4,5,6],[7,8,9],[10,11,12],[13,14,15]]
data11 = [['sistemasd3xd','design','vintage'],['technology','interiordesign','handmade'],['dev','travel','sale'],['python','arquitectura','art'],['mobile','home','wooden']]
Tweetr_11 = pd.DataFrame(data11, columns=[['Software','Architecture','Pattern']])

In [252]:
# hashtags on 2/22/23
Tweetr_11

Unnamed: 0,Software,Architecture,Pattern
0,sistemasd3xd,design,vintage
1,technology,interiordesign,handmade
2,dev,travel,sale
3,python,arquitectura,art
4,mobile,home,wooden


In [250]:
# data = [[1,2,3],[4,5,6],[7,8,9],[10,11,12],[13,14,15]]
data_6pm = [['technology','interiordesign','crochet'],['developer','home','vintage'],['programming','design','crochetwaistcoat'],['tech','artist','crochetpattern'],['javascript','construction','sale']]

Tweetr_6pm = pd.DataFrame(data_6pm, columns=[['Software','Architecture','Pattern']])
Tweetr_6pm

Unnamed: 0,Software,Architecture,Pattern
0,technology,interiordesign,crochet
1,developer,home,vintage
2,programming,design,crochetwaistcoat
3,tech,artist,crochetpattern
4,javascript,construction,sale


In [260]:
# One day later same time (11AM) on 2/23/23
data_11am = [['tech','interiordesign','foodart'],['technology','design','redbubble'],['infosec','home','art'],['it','france','design'],['automation','interior','geometric']]

Tweetr_11am = pd.DataFrame(data_11am, columns=[['Software','Architecture','Pattern']])
Tweetr_11am

Unnamed: 0,Software,Architecture,Pattern
0,tech,interiordesign,foodart
1,technology,design,redbubble
2,infosec,home,art
3,it,france,design
4,automation,interior,geometric


In [272]:
###Top 5 Related Hashtags
#incentivecompensation
#ycombinator
#commissions
#payments
#brightpay

### Bigram

In [269]:
# 2/28/2023, 2pm
# data = [[1,2],[3,4],[5,6],[7,8],[9,10]]
data = [['infogain','architecture'],['recognized','pattern'],['major','art'],['contender','design'],['engineering','interiordesign']]
Tweetr_2pm = pd.DataFrame(data, columns=[['Major Software','Architecture Pattern']])
Tweetr_2pm

Unnamed: 0,Major Software,Architecture Pattern
0,infogain,architecture
1,recognized,pattern
2,major,art
3,contender,design
4,engineering,interiordesign


In [271]:
# Top 5 Related Hashtags By LinkedIn
#reactjs
#reactjsbestpattern
#bestpractices
#javascripts
#hiring

### Named Entity Recognition (NER)

In [199]:
#function to NER 
# Iterate over the predicted entities
def NER(clean):
    clean = nlp(clean)
    for ent in clean.ents:
        return(ent.label_)   

In [200]:
NER(df_TFIDF_1['word'][2])

In [202]:
df_TFIDF_1['ner_tag'] = df_TFIDF_1.apply(lambda row:NER (row['word']),axis=1)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_TFIDF_1['ner_tag'] = df_TFIDF_1.apply(lambda row:NER (row['word']),axis=1)


In [203]:
df_TFIDF_1.head(10)

Unnamed: 0,word,tfidf,pos,ner_tag
0,architecture,0.51662,NOUN,
1,pattern,0.463032,NOUN,
2,microkernel,0.221597,PROPN,
3,plug,0.18994,VERB,
4,layer,0.17615,NOUN,
5,application,0.161439,NOUN,
6,layered,0.158283,VERB,
7,microservice,0.146792,VERB,
8,architectural,0.146792,ADJ,
9,software,0.136826,NOUN,


In [189]:
df_TFIDF_1[(df_TFIDF_1['pos'] == 'NOUN') | (df1['pos'] == 'VERB')].head(10)[['word','tfidf','pos','ner_tag']]

NameError: name 'df1' is not defined