https://www.hackerrank.com/challenges/nlp-similarity-scores/problem?isFullScreen=true

You are provided with four documents, numbered 1 to 4, each consisting of a single sentence of text. Determine the identifier of the document that is most similar to the first document, as measured by TF-IDF scores.

```txt
1. I'd like an apple.
2. An apple a day keeps the doctor away.
3. Never compare an apple to an orange.
4. I prefer scikit-learn to orange.
```

Output that document's integer identifier (which may be 2, 3, or 4), with no leading or trailing spaces.

You may either compute the answer manually and submit it in plain-text mode, or submit a program which computes the answer, in a language of your choice.

In [10]:
import math
from collections import Counter

# The four single-sentence documents from the problem statement. List index i
# holds document number i + 1, so documents[0] is the reference document that
# the others are compared against.
documents = [
    "I'd like an apple.",
    "An apple a day keeps the doctor away.",
    "Never compare an apple to an orange.",
    "I prefer scikit-learn to orange."
]

def preprocess(text):
    """Tokenize *text*: lowercase it, drop periods and commas, split on whitespace.

    Only '.' and ',' are removed; any other punctuation (apostrophes,
    hyphens, ...) stays attached to its token.
    """
    cleaned = text.lower()
    for mark in ('.', ','):
        cleaned = cleaned.replace(mark, '')
    return cleaned.split()

def compute_tf(word_dict, doc):
    """Return the term frequency of every word of a single document.

    word_dict maps each word to the number of times it occurs in the
    document; doc is the document's token list, whose length is used as the
    denominator. The result maps word -> count / total number of tokens.
    """
    total_tokens = len(doc)
    return {
        term: occurrences / float(total_tokens)
        for term, occurrences in word_dict.items()
    }

def compute_idf(doc_list):
    """Return the inverse document frequency of every word in the corpus.

    doc_list is a list of tokenized documents (each a list of words). For
    each word the result holds log(N / df), where N is the number of
    documents and df is the number of documents that contain the word.

    A word is counted at most once per document. Counting every occurrence
    instead would let a word repeated inside one document push the count
    above its true document frequency, lowering its IDF and even making it
    negative once the count exceeds N.

    Returns an empty dict for an empty corpus.
    """
    total_docs = len(doc_list)

    # Document frequency: number of documents in which each word appears.
    doc_freq = {}
    for doc in doc_list:
        # dict.fromkeys de-duplicates the tokens while keeping first-seen
        # order, so the resulting key order stays deterministic.
        for word in dict.fromkeys(doc):
            doc_freq[word] = doc_freq.get(word, 0) + 1

    return {
        word: math.log(total_docs / float(df))
        for word, df in doc_freq.items()
    }

def compute_tfidf(tf_doc, idf_dict):
    """Combine one document's term frequencies with the corpus IDF values.

    Each word's TF-IDF weight is its term frequency in the document
    multiplied by its IDF; a word missing from idf_dict is given an IDF of 0.
    """
    return {
        term: freq * idf_dict.get(term, 0)
        for term, freq in tf_doc.items()
    }

def cosine_similarity(vec1, vec2):
    """Return the cosine similarity of two sparse vectors given as dicts.

    Each vector maps a word to its weight. The similarity is the dot product
    divided by the product of the two Euclidean norms; 0.0 is returned when
    that product is zero (for example when either vector is empty).
    """
    # Dot product: only words present in both vectors contribute.
    shared_terms = set(vec1) & set(vec2)
    dot_product = sum(vec1[term] * vec2[term] for term in shared_terms)

    # Squared Euclidean norm of each vector.
    squared_norm1 = sum(weight ** 2 for weight in vec1.values())
    squared_norm2 = sum(weight ** 2 for weight in vec2.values())

    norm_product = math.sqrt(squared_norm1) * math.sqrt(squared_norm2)

    # Guard against division by zero.
    if not norm_product:
        return 0.0
    return float(dot_product) / norm_product

# Tokenize every document.
preprocessed_docs = list(map(preprocess, documents))

# Per-document word counts, e.g. {"i'd": 1, "like": 1, "an": 1, "apple": 1}.
word_counts = list(map(Counter, preprocessed_docs))

# Term frequencies of each document.
tf_docs = [
    compute_tf(word_counts[position], tokens)
    for position, tokens in enumerate(preprocessed_docs)
]

# Corpus-wide inverse document frequencies.
idf_dict = compute_idf(preprocessed_docs)

# TF-IDF vector of each document.
tfidf_docs = list(map(lambda tf_doc: compute_tfidf(tf_doc, idf_dict), tf_docs))

# Similarity of the first document to each of the remaining documents.
similarities = [
    cosine_similarity(tfidf_docs[0], other_vector)
    for other_vector in tfidf_docs[1:]
]

# similarities[0] belongs to document number 2, so number the entries from 2
# and keep the first one with the highest score.
most_similar_document_index = max(
    enumerate(similarities, start=2),
    key=lambda numbered: numbered[1],
)[0]

print(most_similar_document_index)   # expected output: 3


3
