## TF-IDF<br />
TF（词频）：该词在文章中出现的次数/该文章的总词数<br />
IDF（反文档频率）：Log(总文章数/包含该词的文章数+1)
> * step1：原始数据预处理
> * step2：产生IDF词表
> * step3：计算TF-IDF

In [31]:
import math
import os

file_path_dir = "./d0101/data"
raw_path = "./d0101/raw.data"
idf_path = './d0101/idf.data'
file_index = 1
word_idf_dict = {}
try:
    file_writer = open(raw_path, "w", encoding="utf-8")
    for file in os.scandir(file_path_dir):
        lines = []
        with open(file, "r", encoding="utf-8") as read:
            for line in read:
                lines.append(line.strip())
        content = '\t'.join([str(file_index), ' '.join(lines)]) + '\n'
        file_writer.writelines(content)
        file_index += 1
finally:
    if file_writer:
        file_writer.close()

In [32]:
try:
    file_words = []
    file_writer = open(idf_path, "w", encoding="utf-8")
    with open(raw_path, "r", encoding="utf-8") as read:
        for line in read:
            line = line.strip().split("\t")
            if len(line) != 2:
                continue
            words = line[1].strip().split(" ")
            words = set(words)
            for word in words:
                file_words.append((word, 1))
    file_words = sorted(file_words, key=lambda x: x[0])
    word_count = 0
    current_word = None
    for word in file_words:
        w, n = word
        if not current_word:
            current_word = w
        if not current_word.__eq__(w):
            idf = math.log(file_index / (word_count + 1))
            word_content = "\t".join([current_word, str(idf)]) + "\n"
            file_writer.write(word_content)
            current_word = w
            word_count = 0
            continue
        word_count += n
    idf = math.log(file_index / (word_count + 1.0))
    word_content = "\t".join([current_word, str(idf)]) + "\n"
    file_writer.write(word_content)
finally:
    if file_writer:
        file_writer.close()

In [33]:
with open(idf_path, "r", encoding="utf-8") as read:
    for line in read:
        line = line.strip().split("\t")
        if len(line) != 2:
            continue
        word, idf_score = line
        word_idf_dict[word] = idf_score


def get_tf_idf(target):
    word_tf_dict = {}
    for word in target.strip().split(" "):
        if word not in word_tf_dict:
            word_tf_dict[word] = 1
        else:
            word_tf_dict[word] += 1
    for k, v in word_tf_dict.items():
        if k not in word_idf_dict:
            continue
        tf_idf = v * float(word_idf_dict[k])
        yield k, tf_idf


for k, v in get_tf_idf("我们 带来 阿里巴巴 希望 差"):
    print(k, v)

我们 1.0828948727727
带来 1.4623844944776037
阿里巴巴 2.892353919100156
希望 1.9478923102593046
差 2.1992067385402105



## 相似度
cosine：A·B/|A||B|
jaccard：|A∩B|/|A∪B|
> * step1：cosine相似度
> * step2：jaccard相似度

In [34]:
source = "我们 带来 阿里巴巴"
target = "我们 带来 阿里巴巴"
def cosine(source, target):
    t1_dict = {}
    sum = 0.
    for k, v in get_tf_idf(source):
        sum += pow(v, 2)
    sum = math.sqrt(sum)
    for k, v in get_tf_idf(source):
        t1_dict[k] = float(v / sum)

    sum = 0.
    for k, v in get_tf_idf(target):
        sum += pow(v, 2)
    sum = math.sqrt(sum)

    final_score = 0.
    for k, v in get_tf_idf(target):
        if k not in t1_dict:
            continue
        s1 = t1_dict[k]
        s2 = float(v / sum)

        final_score += s1 * s2
    return final_score
print(cosine(source, target))

1.0


In [35]:
source = "我们 带来 阿里巴巴"
target = "我们 带来 阿里巴巴"
def jaccard(source, target):
    s1_set = set(source.strip().split(' '))
    s2_set = set(target.strip().split(' '))

    score = 0.
    s1_s2_join = s1_set & s2_set
    len1 = len(s1_s2_join)

    s1_s2_union = s1_set | s2_set
    len2 = len(s1_s2_union)

    return float(len1) / float(len2)

print(jaccard(source, target))

1.0
