<h2>TextRank multiple with TFIDF</h2>

In [69]:
import pandas as pd
import os
data = pd.read_csv(os.path.join("dataprep1.csv"))
df = data

In [71]:
word = pd.read_csv(os.path.join("word.csv"))
dfword = word

<p>TextRank</p>

In [3]:
from collections import OrderedDict
import numpy as np
import spacy
from spacy.lang.en.stop_words import STOP_WORDS

nlp = spacy.load('en_core_web_sm')

class TextRank4Keyword():
    """Extract keywords from text"""
    
    def __init__(self):
        self.d = 0.85 # damping coefficient, usually is .85
        self.min_diff = 1e-5 # convergence threshold
        self.steps = 10 # iteration steps
        self.node_weight = None # save keywords and its weight

    
    def set_stopwords(self, stopwords):  
        """Set stop words"""
        for word in STOP_WORDS.union(set(stopwords)):
            lexeme = nlp.vocab[word]
            lexeme.is_stop = True
    
    def sentence_segment(self, doc, candidate_pos, lower):
        """Store those words only in cadidate_pos"""
        sentences = []
        for sent in doc.sents:
            selected_words = []
            for token in sent:
                # Store words only with cadidate POS tag
                if token.pos_ in candidate_pos and token.is_stop is False:
                    if lower is True:
                        selected_words.append(token.text.lower())
                    else:
                        selected_words.append(token.text)
            sentences.append(selected_words)
        return sentences
        
    def get_vocab(self, sentences):
        """Get all tokens"""
        vocab = OrderedDict()
        i = 0
        for sentence in sentences:
            for word in sentence:
                if word not in vocab:
                    vocab[word] = i
                    i += 1
        return vocab
    
    def get_token_pairs(self, window_size, sentences):
        """Build token_pairs from windows in sentences"""
        token_pairs = list()
        for sentence in sentences:
            for i, word in enumerate(sentence):
                for j in range(i+1, i+window_size):
                    if j >= len(sentence):
                        break
                    pair = (word, sentence[j])
                    if pair not in token_pairs:
                        token_pairs.append(pair)
        return token_pairs
        
    def symmetrize(self, a):
        return a + a.T - np.diag(a.diagonal())
    
    def get_matrix(self, vocab, token_pairs):
        """Get normalized matrix"""
        # Build matrix
        vocab_size = len(vocab)
        g = np.zeros((vocab_size, vocab_size), dtype='float')
        for word1, word2 in token_pairs:
            i, j = vocab[word1], vocab[word2]
            g[i][j] = 1
            
        # Get Symmeric matrix
        g = self.symmetrize(g)
        
        # Normalize matrix by column
        norm = np.sum(g, axis=0)
        g_norm = np.divide(g, norm, where=norm!=0) # this is ignore the 0 element in norm
        
        return g_norm

    
    def get_keywords(self, number=10):
        """Print top number keywords"""
        node_weight = OrderedDict(sorted(self.node_weight.items(), key=lambda t: t[1], reverse=True))
        for i, (key, value) in enumerate(node_weight.items()):
            print(key+","+str(value))
            if i > number:
                break
        
        
    def analyze(self, text, 
                candidate_pos=['NOUN', 'PROPN'], 
                window_size=4, lower=False, stopwords=list()):
        """Main function to analyze text"""
        
        # Set stop words
        self.set_stopwords(stopwords)
        
        # Pare text by spaCy
        doc = nlp(text)
        
        # Filter sentences
        sentences = self.sentence_segment(doc, candidate_pos, lower) # list of list of words
        
        # Build vocabulary
        vocab = self.get_vocab(sentences)
        
        # Get token_pairs from windows
        token_pairs = self.get_token_pairs(window_size, sentences)
        
        # Get normalized matrix
        g = self.get_matrix(vocab, token_pairs)
        
        # Initionlization for weight(pagerank value)
        pr = np.array([1] * len(vocab))
        
        # Iteration
        previous_pr = 0
        for epoch in range(self.steps):
            pr = (1-self.d) + self.d * np.dot(g, pr)
            if abs(previous_pr - sum(pr))  < self.min_diff:
                break
            else:
                previous_pr = sum(pr)

        # Get weight for each node
        node_weight = dict()
        for word, index in vocab.items():
            node_weight[word] = pr[index]
        
        self.node_weight = node_weight

<p>TF-IDF</p>

In [67]:
from operator import itemgetter
class tfIDF():
    def __init__(self,doc):
        self.doc = [x.split() for x in doc]
        self.wordSet = self.getWordset(self.doc)
        self.wordDict = self.getWorddict(self.doc)
        self.idf = self.computeIDF(self.wordDict)
        self.tfidf = []

    def getWordset(self,doc):
        wordSet = []
        for x in doc:
            wordSet = wordSet + x
        return set(wordSet)

    def getWorddict(self,doc):
        wordDict = []
        for i in range(0,len(doc)):
            wordDict.append(dict.fromkeys(self.wordSet, 0))
        for i in range(0,len(wordDict)):
            for word in doc[i]:
                wordDict[i][word]+=1
        return wordDict

    def computeIDF(self,docList):
        import math
        idfDict = {}
        N = len(docList)

        idfDict = dict.fromkeys(docList[0].keys(), 0)
        for doc in docList:
            for word, val in doc.items():
                if val > 0:
                    idfDict[word] += 1

        for word, val in idfDict.items():
            idfDict[word] = math.log10(N / float(val))

        return idfDict    
    
    def computeTFIDF(self, i, idfs):
        tfidf = {}
        for word in self.doc[i]:
            tfidf[word] = self.wordDict[i][word]*idfs[word]
        return tfidf
    
    def getTFIDF(self):
        idfs = self.idf
        tfidf_out = []
        for x in range(0,len(self.doc)):
            tfidf_out.append(self.computeTFIDF(x, idfs))
        self.tfidf = tfidf_out
        return list(tfidf_out)

    def getSCORE(self):
        score = []
        for u in range(0,len(self.tfidf)):
            score.append(0)
            for i in self.tfidf[u]:
                score[u] += self.tfidf[u][i]
        return score
    
    def getIDF(self):
        return self.idf

    def getWORDSET(self):
        return self.wordSet
    
    def getWORDDICT(self):
        return self.wordDict
    
    def search(self,query):
        query_tfidf = []

        num = 0
        tfidf_out = self.getTFIDF()
        for i in tfidf_out:
            query_tfidf.append(0.0)
            for u in i:
                if u in query:
                    query_tfidf[num] += i[u]
            num = num + 1
        
        hasil = [a*b for a,b in zip(query_tfidf,self.getSCORE())]
        for i in range(0,len(hasil)):
            hasil[i] = [i, hasil[i]]
        
        hasil_akhir = reversed(sorted(hasil, key=itemgetter(1)))
        hasil_akhir = [x for x in hasil_akhir]
        
        return list(hasil_akhir[:1])

In [49]:
tfidf = tfIDF(df['0'])

In [76]:
for i in range(len(word)):
    print(tfidf.search(dfword['word'][i])[:1])
    

[[0, 28.34513125990071]]
[[114, 120.04745488928397]]
[[122, 33.613053719493465]]
[[152, 47.87416146108058]]
[[15, 73.9936972825433]]
[[15, 73.9936972825433]]
[[46, 35.013580099481665]]
[[130, 431.4444381325112]]
[[15, 73.9936972825433]]
[[128, 156.88608469344754]]
[[3, 12.820580273374892]]
[[122, 33.613053719493465]]
[[128, 156.88608469344754]]
[[7, 11.915106654694512]]
[[130, 211.3450884010087]]
[[128, 156.88608469344754]]
[[7, 11.915106654694512]]
[[116, 160.51831704620395]]
[[128, 156.88608469344754]]
[[7, 11.915106654694512]]
[[130, 431.4444381325112]]
[[116, 160.51831704620395]]
[[110, 43.51038517047771]]
[[134, 39.624485562472174]]
[[7, 11.915106654694512]]
[[130, 431.4444381325112]]
[[15, 73.9936972825433]]
[[128, 156.88608469344754]]
[[7, 11.915106654694512]]
[[114, 120.04745488928397]]
[[128, 156.88608469344754]]
[[113, 193.59161620439]]
[[15, 73.9936972825433]]
[[128, 156.88608469344754]]
[[113, 193.59161620439]]
[[11, 17.112958455091665]]
[[128, 156.88608469344754]]
[[12, 15

In [89]:
tr4w = TextRank4Keyword()
req = 1
for i in range(len(data)):
    print("Requirement", req ,":")
    req=req+1
    tr4w.analyze(df['0'][i], candidate_pos = ['NOUN', 'PROPN'], window_size=4, lower=False)
    tr4w.get_keywords(1)
    print("\n")

Requirement 1 :
associate,1.281397222222222
deadline,1.281397222222222
job,1.0959555555555553


Requirement 2 :
cost,1.281397222222222
unit,1.281397222222222
cluster,1.0959555555555553


Requirement 3 :
monitor,1.0814583333333332
status,1.0814583333333332
submit,1.0814583333333332


Requirement 4 :
user,1.0
cancel,1.0
job,1.0


Requirement 5 :
user,1.0
check,1.0
credit,1.0


Requirement 6 :
user,1.0
check,1.0
usage,1.0


Requirement 7 :
user,1.0
check,1.0
status,1.0


Requirement 8 :
usage,1.1900458333333335
pattern,1.1900458333333335
history,1.0044979166666668


Requirement 9 :
check,1.0814583333333332
status,1.0814583333333332
submit,1.0814583333333332


Requirement 10 :
user,1.0
check,1.0
load,1.0


Requirement 11 :
user,1.0
alters,1.0
structure,1.0


Requirement 12 :
user,1.0
alters,1.0
policy,1.0


Requirement 13 :
user,1.0
photo,1.0
student,1.0


Requirement 14 :
process,1.3603645833333333
rup,1.0168229166666665
development,0.81140625


Requirement 15 :
waterfall,1.0
process,1.0
