In [35]:
from collections import OrderedDict
import numpy as np
import spacy
from spacy.lang.en.stop_words import STOP_WORDS
import heapq

nlp = spacy.load('en_core_web_sm')

class TextRank4Keyword():
    """Extract keywords from text with a TextRank-style ranking.

    Words that pass a part-of-speech / stop-word filter become graph nodes,
    words that co-occur inside a sliding window become edges, and a damped
    PageRank-like iteration assigns a weight to every node.

    Attributes are documented inline in ``__init__``.  The class relies on the
    module-level spaCy pipeline ``nlp`` and on ``STOP_WORDS``.
    """

    def __init__(self):
        self.d = 0.85  # damping coefficient, usually is .85
        self.min_diff = 1e-5  # convergence threshold
        self.steps = 10  # iteration steps
        self.node_weight = None  # save keywords and its weight

    def set_stopwords(self, stopwords):
        """Flag spaCy's built-in stop words plus ``stopwords`` as stop words.

        Args:
            stopwords: iterable of extra stop words (``None`` means none).

        Note:
            This mutates the vocabulary of the module-level ``nlp`` pipeline,
            so the flags stay set for every later call.
        """
        for word in STOP_WORDS.union(stopwords or ()):
            nlp.vocab[word].is_stop = True

    def sentence_segment(self, doc, candidate_pos, lower):
        """Return, per sentence of ``doc``, the words that are kept as candidates.

        Args:
            doc: parsed spaCy document (iterated through ``doc.sents``).
            candidate_pos: container of POS tags (``token.pos_``) to keep.
            lower: when truthy, the kept words are lower-cased.

        Returns:
            A list with one list of word strings per sentence; a sentence
            without candidates contributes an empty list.
        """
        sentences = []
        for sent in doc.sents:
            selected_words = [
                token.text.lower() if lower else token.text
                for token in sent
                # Keep only non-stop-word tokens with a candidate POS tag.
                if token.pos_ in candidate_pos and not token.is_stop
            ]
            sentences.append(selected_words)
        return sentences

    def get_vocab(self, sentences):
        """Map every distinct word to an index in order of first appearance.

        Args:
            sentences: list of lists of words.

        Returns:
            ``OrderedDict`` of ``word -> index`` with indices ``0..n-1``.
        """
        vocab = OrderedDict()
        for sentence in sentences:
            for word in sentence:
                if word not in vocab:
                    vocab[word] = len(vocab)
        return vocab

    def get_token_pairs(self, window_size, sentences):
        """Collect the distinct ordered word pairs that share a window.

        Each word is paired with the next ``window_size - 1`` words of the
        same sentence.

        Args:
            window_size: window length in words, counting the word itself.
            sentences: list of lists of words.

        Returns:
            List of unique ``(word, following_word)`` tuples in order of first
            occurrence.
        """
        token_pairs = []
        seen = set()  # O(1) duplicate check instead of scanning the list
        for sentence in sentences:
            length = len(sentence)
            for i, word in enumerate(sentence):
                for j in range(i + 1, min(i + window_size, length)):
                    pair = (word, sentence[j])
                    if pair not in seen:
                        seen.add(pair)
                        token_pairs.append(pair)
        return token_pairs

    def symmetrize(self, a):
        """Return ``a + a.T`` with the diagonal counted only once."""
        return a + a.T - np.diag(a.diagonal())

    def get_matrix(self, vocab, token_pairs):
        """Build the column-normalised adjacency matrix of the word graph.

        Args:
            vocab: mapping ``word -> index`` as returned by ``get_vocab``.
            token_pairs: iterable of ``(word1, word2)`` pairs from ``vocab``.

        Returns:
            ``(n, n)`` float array whose non-empty columns sum to 1.  Columns
            of words that take part in no pair are all zeros.
        """
        # Build matrix
        vocab_size = len(vocab)
        g = np.zeros((vocab_size, vocab_size), dtype='float')
        for word1, word2 in token_pairs:
            g[vocab[word1], vocab[word2]] = 1

        # Symmetrise; a pair that was seen in both orders ends up with weight 2.
        g = self.symmetrize(g)

        # Normalise by column.  `out` must be pre-filled: with `where=` numpy
        # leaves the skipped entries (zero-sum columns) untouched, which would
        # be uninitialised memory if no output array were supplied.
        norm = np.sum(g, axis=0)
        g_norm = np.divide(g, norm, out=np.zeros_like(g), where=norm != 0)

        return g_norm

    def get_keywords(self, number=10):
        """Return the ``number`` highest-weighted words, best first.

        Args:
            number: maximum amount of keywords to return.

        Returns:
            List of words; empty when ``analyze`` has not been run yet or
            produced no candidates.
        """
        if not self.node_weight:
            return []
        top = heapq.nlargest(number, self.node_weight.items(), key=lambda item: item[1])
        return [word for word, _ in top]

    def analyze(self, text,
                candidate_pos=('NOUN', 'PROPN'),
                window_size=4, lower=False, stopwords=()):
        """Rank the candidate words of ``text`` and store them in ``node_weight``.

        Args:
            text: raw text handed to the module-level spaCy pipeline ``nlp``.
            candidate_pos: POS tags of the tokens to keep as candidates.
            window_size: co-occurrence window length (see ``get_token_pairs``).
            lower: lower-case the candidate words when truthy.
            stopwords: extra stop words on top of spaCy's ``STOP_WORDS``.

        Returns:
            None.  The result is stored as ``self.node_weight``
            (``dict`` of ``word -> weight``); read it with ``get_keywords``.
        """
        # Set stop words
        self.set_stopwords(stopwords)

        # Parse text with spaCy
        doc = nlp(text)

        # Filter sentences
        sentences = self.sentence_segment(doc, candidate_pos, lower)  # list of list of words

        # Build vocabulary
        vocab = self.get_vocab(sentences)

        # Get token_pairs from windows
        token_pairs = self.get_token_pairs(window_size, sentences)

        # Get normalized matrix
        g = self.get_matrix(vocab, token_pairs)

        # Initialisation of the weights (pagerank values)
        pr = np.ones(len(vocab), dtype=float)

        # Iterate until the total weight stops changing or `steps` is reached.
        previous_total = 0.0
        for _ in range(self.steps):
            pr = (1 - self.d) + self.d * np.dot(g, pr)
            total = pr.sum()
            if abs(previous_total - total) < self.min_diff:
                break
            previous_total = total

        # Get weight for each node
        self.node_weight = {word: pr[index] for word, index in vocab.items()}

#### Test the TextRank Function 

In [36]:
# Sample paragraph used as a smoke test for the extractor.
text = '''
The Wandering Earth, described as China’s first big-budget science fiction thriller, quietly made it onto screens at AMC theaters in North America this weekend, and it shows a new side of Chinese filmmaking — one focused toward futuristic spectacles rather than China’s traditionally grand, massive historical epics. At the same time, The Wandering Earth feels like a throwback to a few familiar eras of American filmmaking. While the film’s cast, setting, and tone are all Chinese, longtime science fiction fans are going to see a lot on the screen that reminds them of other movies, for better or worse.
'''

# Rank nouns, proper nouns, adjectives and verbs with a 4-word window,
# keeping the original casing, then print the ten best-ranked words.
tr4w = TextRank4Keyword()
tr4w.analyze(
    text,
    candidate_pos=['NOUN', 'PROPN', 'ADJ', 'VERB'],
    window_size=4,
    lower=False,
)
ans = tr4w.get_keywords(10)
print(ans)

['China', 'Chinese', 'science', 'fiction', 'filmmaking', 'Earth', 'reminds', 'screen', 'grand', 'throwback']


In [37]:
from pymysql import *
import collections
import os
from getpass import getpass

# Open the MySQL connection used by the following cells.
# The password is deliberately not stored in the notebook: it is read from the
# MOVIE_DB_PASSWORD environment variable, or asked for interactively when that
# variable is unset or empty.
db = connect(
    host='localhost',
    port=3306,
    database='movie_all',
    user='root',
    password=os.environ.get('MOVIE_DB_PASSWORD') or getpass('MySQL password: '),
    charset='utf8',
)
cursor = db.cursor()

#### Connect to the database

In [38]:
# SQL query 4822 offset 724  4363
# NOTE(review): LIMIT/OFFSET without ORDER BY does not pin down which rows are
# returned — confirm that this window selects the intended reviews.
sql = 'select title, reviews from movie_reviews_keywords t limit 4363 offset 1183 ;'

# Group the fetched reviews by movie title: title -> list of review texts.
movie = collections.defaultdict(list)
try:
    # Run the query and read every record.
    cursor.execute(sql)
    results = cursor.fetchall()
    for title, reviews in results:
        movie[title].append(reviews)
except Exception as exc:
    # Only ordinary errors are reported here (KeyboardInterrupt/SystemExit
    # still propagate), and the cause is shown instead of being discarded.
    print("Error: unable to fecth data")
    print(repr(exc))

# Close the database connection (not done here: the cursor is reused for the final INSERT)

#### Merge the reviews of each title

In [39]:
# Merge all reviews of a title into one text: title -> merged review text.
# Texts longer than 1,000,000 characters are cut to 999,999 characters
# (presumably to stay within spaCy's default `nlp.max_length` — TODO confirm).
movies = collections.defaultdict(str)
for title, review_list in movie.items():
    # Skip NULL reviews (None), which str.join would reject.
    merged = ". ".join(review for review in review_list if review is not None)
    # Measure the text that is actually stored, i.e. including the ". "
    # separators, so an over-long text can no longer slip through.
    if len(merged) > 1000000:
        merged = merged[:999999]
    movies[title] = merged

#### Extract keywords from the reviews

In [None]:
from tqdm.auto import tqdm

# Words that are too generic for movie reviews to be useful keywords.
stopwords = ['films','movies','film','movie','story','stories']

# `d` and `i` are initialised here but not read anywhere in this cell.
d = collections.defaultdict(int)
i = 0

# Run TextRank over the merged reviews of every title and keep the ten
# best-ranked words as one comma-separated string: title -> "kw1,kw2,...".
tr4w = TextRank4Keyword()
keywords = collections.defaultdict(str)
for m, merged_reviews in tqdm(movies.items()):
    tr4w.analyze(
        merged_reviews,
        candidate_pos=['NOUN', 'PROPN', 'ADJ', 'VERB'],
        window_size=5,
        lower=False,
        stopwords=stopwords,
    )
    words = tr4w.get_keywords(10)
    keywords[m] = ",".join(words)

HBox(children=(FloatProgress(value=0.0, max=4363.0), HTML(value='')))

In [None]:
#sorted(d.items(), key=lambda x:x[1], reverse=True)

In [None]:
# Number of movie titles that have an entry in `keywords`.
len(keywords)

In [None]:
# Store one (title, comma-separated keywords) row per movie.
# The keyword string goes into the column that the table calls `reviews`.
sql = "INSERT INTO movie_keywords (title, reviews) VALUES (%s, %s)"
val = list(keywords.items())
try:
    cursor.executemany(sql, val)
    db.commit()
except Exception:
    # Undo the partial insert so a failed run leaves the table unchanged,
    # then let the error surface.
    db.rollback()
    raise
finally:
    # Release the cursor and the connection whether or not the insert worked.
    cursor.close()
    db.close()