In [1]:
# -*- coding: utf-8 -*-

from __future__ import print_function

import codecs
import re
from collections import defaultdict

import jieba
# Input files; all of them are opened as UTF-8 via codecs.open().
TEXT_PATH = './Dune series/merge.txt'  # text to analyse, split into paragraphs
DICT_PATH = './dictionary/character.txt'  # character names; also loaded as a jieba user dictionary
SYNONYMOUS_DICT_PATH = './dictionary/synonymous.txt'  # "alias canonical_name" pairs, one per line
# Output CSV files written by RelationshipView.save_node_and_edge().
SAVE_NODE_PATH = 'node.csv'
SAVE_EDGE_PATH = 'edge.csv'


class RelationshipView:
    """Build a character co-occurrence graph from a text and export it as CSV.

    The text is split into paragraphs, every paragraph is segmented with
    jieba, and each token found in the character dictionary is counted
    (after mapping aliases to their canonical name).  Two characters that
    appear in the same paragraph are linked by a weighted, directed edge.
    """

    # One line of the character dictionary: "<word>[ <freq>][ <tag>]",
    # e.g. "name 10 nr".  Group 1 is the word itself.
    _DICT_LINE_RE = re.compile(r'^(.+?)( [0-9]+)?( [a-z]+)?$')

    def __init__(self, text_path, dict_path, synonymous_dict_path):
        """Store the input paths and create the empty result containers.

        Args:
            text_path: path of the UTF-8 text to analyse.
            dict_path: path of the UTF-8 character dictionary.
            synonymous_dict_path: path of the UTF-8 alias dictionary.
        """
        self._text_path = text_path
        self._dict_path = dict_path
        self._synonymous_dict_path = synonymous_dict_path

        self._person_counter = defaultdict(int)  # name -> occurrences
        self._person_per_paragraph = []  # one list of names per paragraph
        self._relationships = {}  # name -> {other name -> edge weight}
        self._synonymous_dict = {}  # alias -> canonical name

    def generate(self):
        """Run the whole pipeline: count names, link them, write the CSVs."""
        self.count_person()
        self.calc_relationship()
        self.save_node_and_edge()

    @staticmethod
    def _read_lines(path):
        """Return the stripped, non-blank lines of the UTF-8 file at *path*."""
        with codecs.open(path, 'r', 'utf-8') as f:
            text = f.read()
        # codecs.open() does not translate newlines, so splitlines() is used
        # to accept both "\r\n" and "\n" files; a leading BOM is dropped so
        # it cannot end up glued to the first entry.
        lines = (line.strip() for line in text.lstrip(u'\ufeff').splitlines())
        return [line for line in lines if line]

    def synonymous_names(self):
        """Load the alias dictionary.

        Each usable line holds an alias followed by its canonical name,
        separated by whitespace.  Blank lines and lines with a single field
        are skipped instead of raising ``IndexError``.

        Returns:
            The ``alias -> canonical name`` dict (also kept on the instance).
        """
        for line in self._read_lines(self._synonymous_dict_path):
            fields = line.split()
            if len(fields) < 2:
                continue  # no canonical name on this line
            self._synonymous_dict[fields[0]] = fields[1]
        return self._synonymous_dict

    def get_clean_paragraphs(self):
        """Return the text split into paragraphs at blank lines.

        Line endings are normalised to ``"\\n"`` first, so the split works
        for files saved with either Windows or Unix line endings.
        """
        with codecs.open(self._text_path, 'r', 'utf-8') as f:
            text = f.read()
        return text.replace('\r\n', '\n').split('\n\n')

    def count_person(self):
        """Count character occurrences paragraph by paragraph.

        Fills ``_person_per_paragraph`` with the canonical names found in
        each paragraph, creates an empty edge dict for every new name and
        updates the occurrence counter.

        Returns:
            The ``name -> occurrences`` counter.
        """
        paragraphs = self.get_clean_paragraphs()
        synonymous = self.synonymous_names()
        print('start process node')

        # A set gives O(1) membership tests in the token loop below.
        names = set()
        for line in self._read_lines(self._dict_path):
            matched = self._DICT_LINE_RE.match(line)
            if matched:
                names.add(matched.group(1))

        # The user dictionary does not change between paragraphs: load it
        # once instead of re-reading the file for every paragraph.
        jieba.load_userdict(self._dict_path)

        for paragraph in paragraphs:
            persons = []
            self._person_per_paragraph.append(persons)
            for word in jieba.cut(paragraph):
                if word not in names:
                    continue
                # Fold aliases into their canonical name.
                word = synonymous.get(word) or word
                persons.append(word)
                self._relationships.setdefault(word, {})
                self._person_counter[word] += 1
        return self._person_counter

    def calc_relationship(self):
        """Accumulate edge weights between names sharing a paragraph.

        For every ordered pair of distinct names in a paragraph the weight
        grows by ``occurrences(name1) * occurrences(name2)``, which equals
        adding one per pair of occurrences but only loops over the distinct
        names of the paragraph.

        Returns:
            The ``name -> {other name -> weight}`` dict.
        """
        print("start to process edge")
        for persons in self._person_per_paragraph:
            occurrences = defaultdict(int)
            for name in persons:
                occurrences[name] += 1
            for name1, count1 in occurrences.items():
                edges = self._relationships.setdefault(name1, {})
                for name2, count2 in occurrences.items():
                    if name1 == name2:
                        continue
                    edges[name2] = edges.get(name2, 0) + count1 * count2
        return self._relationships

    def save_node_and_edge(self, min_weight=3):
        """Write the node and edge CSV files.

        The files are opened for writing (not appending) so that running
        the export again replaces the previous result instead of adding a
        second header and duplicate rows.

        Args:
            min_weight: only edges whose weight is strictly greater than
                this value are written.
        """
        with codecs.open(SAVE_NODE_PATH, "w", "utf-8") as f:
            f.write("Id,Label,Weight\r\n")
            for name, times in self._person_counter.items():
                f.write(name + "," + name + "," + str(times) + "\r\n")

        with codecs.open(SAVE_EDGE_PATH, "w", "utf-8") as f:
            f.write("Source,Target,Weight\r\n")
            for name, edges in self._relationships.items():
                for other, weight in edges.items():
                    if weight > min_weight:
                        f.write(name + "," + other + "," + str(weight) + "\r\n")
        print('save file successful!')


# Script entry point: build the graph from the configured input paths and
# write the node/edge CSV files.
if __name__ == '__main__':
    v = RelationshipView(TEXT_PATH, DICT_PATH, SYNONYMOUS_DICT_PATH)
    v.generate()

Building prefix dict from the default dictionary ...
Loading model from cache C:\Users\Shaoting\AppData\Local\Temp\jieba.cache


start process node


Loading model cost 0.579 seconds.
Prefix dict has been built successfully.


start to process edge
save file successful!
