In [2]:
import pandas as pd
import numpy as np

import os, json

In [2]:
import spacy
nlp = spacy.load('en_core_web_sm')

  from .autonotebook import tqdm as notebook_tqdm


## Read rssPMData 

In [3]:
path_to_json = 'rssPMData'
json_files = [pos_json for pos_json in os.listdir(path_to_json) if pos_json.endswith('.json')]
print(json_files)

[]


In [4]:
data_list = []
for root, dirs, files in os.walk('rssPMData'):
     for file in files:
        with open(os.path.join(root, file), "r") as auto:
            json_data = auto.read()
            #print(json_data)
            data_list.append(json_data)

In [5]:
len(data_list)

195

In [6]:
type(data_list)

list

In [7]:
data_list[0]

'{"header_title": "5 Major Software Architecture Patterns", "date": "Tue, 02 Jun 2020 08:09:26 +0000", "text": "Share This PostTable of ContentsIn the Global Software Architecture Summit we were discussing software architecture patterns a lot and I decided to write an article about it, highlighting leading software architects insights.\\u00a0\\u00a0\\u00a0\\u00a0Architectural patterns are ways of capturing proven good design structures, so that they can be reused. Software architects have been looking for ways to capture and reuse the architectural knowledge that have proven successful in the past.\\u00a0More specifically, an architectural pattern is a package of design decisions that is found repeatedly in practice, has well defined properties that can be reused and describes a class of architectures.\\u00a0Developing an architecture can be seen as a process of selecting, tailoring, and combining patterns. The software architect must decide how to instantiate a pattern, how to make it

In [8]:
from ast import literal_eval
tup = literal_eval(data_list[0])

In [9]:
tup['header_title']

'5 Major Software Architecture Patterns'

In [10]:
# tup['text']

In [11]:
header_title = []
text = []

for i in range(len(data_list)):
    tup = literal_eval(data_list[i])
    header_title.append(tup['header_title'])
    text.append(tup['text'])

In [12]:
len(header_title)

195

In [13]:
df = pd.DataFrame()
df['header_title'] = header_title
df['text'] = text

In [14]:
df.shape

(195, 2)

In [15]:
df.isnull().sum()

header_title    0
text            0
dtype: int64

## Cleaning

In [16]:
# function to clean data
#lower case
#remove stop words
#lemmatization

def cleanData(doc):
    doc = doc.lower()
    doc = nlp(doc)
    tokens = [tokens.lower_ for tokens in doc]
    tokens = [tokens for tokens in doc if (tokens.is_stop == False)]
    tokens = [tokens for tokens in tokens if (tokens.is_punct == False)]
    final_token = [token.lemma_ for token in tokens]
    
    return " ".join(final_token)

In [17]:
df['clean'] = df.apply(lambda row:cleanData (row['text']),axis=1)

In [18]:
df.head()

Unnamed: 0,header_title,text,clean
0,5 Major Software Architecture Patterns,Share This PostTable of ContentsIn the Global ...,share posttable contentsin global software arc...
1,Apiumhub as Partner of the Data Innovation Sum...,Share This PostTable of ContentsWe are excited...,share posttable contentswe excited announce se...
2,Apiumhub becomes a partner of the Global Summi...,Share This PostTable of ContentsGeekle’s onlin...,share posttable contentsgeekle online conferen...
3,DevSecOps: Future for DevOps?,Share This PostTable of ContentsDigital foster...,share posttable contentsdigital foster culture...
4,GSAS Workshop: Become a Software Design Company,Share This PostTable of ContentsOn October 3rd...,share posttable contentson october 3rd 4th 202...


## Implement NLP techniques

https://kavita-ganesan.com/tfidftransformer-tfidfvectorizer-usage-differences/#.Y-Zq4XbMI2w

### Calculate IDF

In [19]:
import pandas as pd 
from sklearn.feature_extraction.text import TfidfTransformer 
from sklearn.feature_extraction.text import CountVectorizer

In [20]:
# CountVectorizer() 
cv=CountVectorizer() 
# this steps generates word counts for the words in all docs 
word_count_vector=cv.fit_transform(df['clean'])

In [21]:
word_count_vector.shape

(195, 8046)

In [22]:
# Compute idf weights
tfidf_transformer=TfidfTransformer(smooth_idf=True,use_idf=True) 
all_text_df_tfidf = tfidf_transformer.fit_transform(word_count_vector)
all_text_df_tfidf.shape

(195, 8046)

In [23]:
# print idf values 
df_idf = pd.DataFrame(tfidf_transformer.idf_, index=cv.get_feature_names_out(),columns=["idf_weights"]) 
# sort ascending 
df_idf.sort_values(by=['idf_weights'],ascending=False).head()

Unnamed: 0,idf_weights
00,5.584967
millennial,5.584967
mileage,5.584967
mile,5.584967
midweek,5.584967


<font color=green> The higher idf_weights means most unique word in any particular doc</font>

### Calculate TF-IDF

In [24]:
# count matrix 
count_vector=cv.transform(df['clean']) 
# tf-idf scores 
tf_idf_vector=tfidf_transformer.transform(count_vector)

In [25]:
feature_names = cv.get_feature_names_out() 
#get tfidf vector for first document 
first_document_vector=tf_idf_vector[35] 
#print the scores 
df_TFIDF = pd.DataFrame(first_document_vector.T.todense(), index=feature_names, columns=["tfidf"]) 
df_TFIDF=df_TFIDF.sort_values(by=["tfidf"],ascending=False)

In [26]:
df_TFIDF.head(10)

Unnamed: 0,tfidf
cs,0.692541
customer,0.306312
productivity,0.260394
team,0.199045
op,0.16621
success,0.162014
csm,0.145582
nifty,0.110505
platform,0.103746
chevron_right,0.092087


<font color=green>the more unique a word is to our first document, the higher the score

In [27]:
def POS(clean):
    clean = nlp(clean)
    for token in clean:
        return(token.pos_) 

# Word Extraction Based on TF-IDF

###  doc #35

### Title : '7 Tips to Increase the Productivity of Customer Success Team'

In [29]:
# df['header_title'][35]

In [30]:
# Most frequent words
df_TFIDF = df_TFIDF.reset_index(names=['word', 'tfidf'])
df_TFIDF['pos'] = df_TFIDF.apply(lambda row:POS (row['word']),axis=1)

In [31]:
df_TFIDF_1 = df_TFIDF[(df_TFIDF['word'].str.len()) > 3]

In [32]:
print('Most Frequent Words Based on TF-IDF:')
df_TFIDF_1.head(10)

Most Frequent Words Based on TF-IDF:


Unnamed: 0,word,tfidf,pos
1,customer,0.306312,NOUN
2,productivity,0.260394,NOUN
3,team,0.199045,NOUN
5,success,0.162014,NOUN
7,nifty,0.110505,ADJ
8,platform,0.103746,NOUN
9,chevron_right,0.092087,PROPN
10,saas,0.083105,NOUN
11,workflow,0.081893,VERB
12,metric,0.075258,ADJ


In [33]:
print('Most frequent words based on NOUN and VERB')
df_TFIDF_1[(df_TFIDF_1['pos'] == 'NOUN') | (df_TFIDF_1['pos'] == 'VERB')].head(10)[['word','tfidf']]

Most frequent words based on NOUN and VERB


Unnamed: 0,word,tfidf
1,customer,0.306312
2,productivity,0.260394
3,team,0.199045
5,success,0.162014
8,platform,0.103746
10,saas,0.083105
11,workflow,0.081893
14,portfolio,0.063232
15,customers,0.060724
16,automate,0.060696


In [34]:
print('Most frequent words Based on NOUN')
df_TFIDF_1[df_TFIDF_1['pos']=='NOUN'].head(10)[['word','tfidf']]

Most frequent words Based on NOUN


Unnamed: 0,word,tfidf
1,customer,0.306312
2,productivity,0.260394
3,team,0.199045
5,success,0.162014
8,platform,0.103746
10,saas,0.083105
14,portfolio,0.063232
15,customers,0.060724
17,tool,0.058787
18,manual,0.05415


In [35]:
print('Most frequent words Based on VERB')
df_TFIDF_1[df_TFIDF_1['pos']=='VERB'].head(10)[['word','tfidf']]

Most frequent words Based on VERB


Unnamed: 0,word,tfidf
11,workflow,0.081893
16,automate,0.060696
23,care,0.048842
24,help,0.047733
25,boost,0.046763
31,read,0.042761
34,take,0.039165
39,result,0.035728
44,stay,0.033521
48,inspire,0.032077


In [48]:
#data = [[1,2,3],[4,5,6],[7,8,9],[10,11,12],[13,14,15]]
data = [['business','customerexperience','motivation'],['efficiency','business','business'],['leadership','customerservice','leadership'],['ai','customers','inspiration'],['innovation','sales','entrepreneur']]
Tweeter = pd.DataFrame(data, columns=[['Productivity','Customer','Success']])
Tweeter

Unnamed: 0,Productivity,Customer,Success
0,business,customerexperience,motivation
1,efficiency,business,business
2,leadership,customerservice,leadership
3,ai,customers,inspiration
4,innovation,sales,entrepreneur


In [11]:
# 2/28/2023, 2pm
data = [['increase','customer','success'],['less','success','team'],['boost','business','business'],['save','marketing','leadership'],['one','service','teamwork']]
Tweetr_2pm = pd.DataFrame(data, columns=[['Increase Productivity','Customer Success','Success Team']])
Tweetr_2pm

Unnamed: 0,Increase Productivity,Customer Success,Success Team
0,increase,customer,success
1,less,success,team
2,boost,business,business
3,save,marketing,leadership
4,one,service,teamwork


### Named Entity Recognition (NER)

In [36]:
#function to NER 
# Iterate over the predicted entities
def NER(clean):
    clean = nlp(clean)
    for ent in clean.ents:
        return(ent.label_)   

In [37]:
NER(df_TFIDF_1['word'][2])

In [38]:
df_TFIDF_1['ner_tag'] = df_TFIDF_1.apply(lambda row:NER (row['word']),axis=1)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_TFIDF_1['ner_tag'] = df_TFIDF_1.apply(lambda row:NER (row['word']),axis=1)


In [39]:
df_TFIDF_1.head(10)

Unnamed: 0,word,tfidf,pos,ner_tag
1,customer,0.306312,NOUN,
2,productivity,0.260394,NOUN,
3,team,0.199045,NOUN,
5,success,0.162014,NOUN,
7,nifty,0.110505,ADJ,
8,platform,0.103746,NOUN,
9,chevron_right,0.092087,PROPN,
10,saas,0.083105,NOUN,
11,workflow,0.081893,VERB,
12,metric,0.075258,ADJ,


In [41]:
df_TFIDF_1[(df_TFIDF_1['pos'] == 'NOUN') | (df_TFIDF_1['pos'] == 'VERB')].head(10)[['word','tfidf','pos','ner_tag']]

Unnamed: 0,word,tfidf,pos,ner_tag
1,customer,0.306312,NOUN,
2,productivity,0.260394,NOUN,
3,team,0.199045,NOUN,
5,success,0.162014,NOUN,
8,platform,0.103746,NOUN,
10,saas,0.083105,NOUN,
11,workflow,0.081893,VERB,
14,portfolio,0.063232,NOUN,
15,customers,0.060724,NOUN,
16,automate,0.060696,VERB,
