In [1]:
# Install underthesea, the package that `word_tokenize` is imported from in a later cell.
# The version is not pinned; the run recorded in this notebook resolved underthesea 6.0.1.
!pip install underthesea

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting underthesea
  Downloading underthesea-6.0.1-py3-none-any.whl (11.0 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m11.0/11.0 MB[0m [31m58.8 MB/s[0m eta [36m0:00:00[0m
Collecting underthesea-core==0.0.5a2
  Downloading underthesea_core-0.0.5_alpha.2-cp38-cp38-manylinux2010_x86_64.whl (591 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m591.3/591.3 KB[0m [31m26.8 MB/s[0m eta [36m0:00:00[0m
Collecting python-crfsuite>=0.9.6
  Downloading python_crfsuite-0.9.8-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.0 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.0/1.0 MB[0m [31m48.0 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: underthesea-core, python-crfsuite, underthesea
Successfully installed python-crfsuite-0.9.8 underthesea-6.0.1 underthesea-core-0.0.5a2


In [2]:
!gdown https://drive.google.com/uc?id=12D06zcJzNi5BepCpTDsyE0MdwVCKmsy8&export=download

Downloading...
From: https://drive.google.com/uc?id=12D06zcJzNi5BepCpTDsyE0MdwVCKmsy8
To: /content/vietnamese-stopwords.txt
  0% 0.00/20.5k [00:00<?, ?B/s]100% 20.5k/20.5k [00:00<00:00, 17.5MB/s]


In [3]:
!gdown https://drive.google.com/uc?id=1ZAC3mLjoWprkvRTR61xmtBn9OseorpCh&export=download

Downloading...
From: https://drive.google.com/uc?id=1ZAC3mLjoWprkvRTR61xmtBn9OseorpCh
To: /content/newpaper.csv
  0% 0.00/1.29M [00:00<?, ?B/s]100% 1.29M/1.29M [00:00<00:00, 112MB/s]


In [4]:
import string
import pandas as pd
from underthesea import word_tokenize

In [5]:
# Read the downloaded CSV; only its 'Document' column is used as the corpus.
data = pd.read_csv('/content/newpaper.csv')

# Full-length slice of the 'Document' column: every row takes part in the comparison.
documents = data['Document'][:]

# The entry labelled 0 is the reference text the other documents are scored against.
base_document = documents[0]

In [6]:
def stopwords(path="/content/vietnamese-stopwords.txt"):
    """Load the stop-word list from a UTF-8 text file, one entry per line.

    Args:
        path: Location of the stop-word file. Defaults to the file this
            notebook downloads, so existing ``stopwords()`` calls are
            unaffected.

    Returns:
        list[str]: The file's lines in order, with newline characters
        removed. Other whitespace is preserved, so multi-word entries keep
        their inner spaces; a blank line yields an empty string.

    Raises:
        OSError: If the file cannot be opened.
    """
    with open(path, encoding='utf-8') as f:
        # Iterating the handle yields the same lines readlines() would.
        return [line.replace("\n", "") for line in f]

def preprocess(text, stop_words=None):
	"""Turn raw text into a list of filtered, lower-cased tokens.

	Steps:
	1. Lowercase the text.
	2. Tokenize with ``word_tokenize`` (no stemming or lemmatization is
	   applied, so the original word forms are preserved).
	3. Drop stop words.
	4. Drop tokens made up entirely of punctuation.
	5. Drop tokens that are a single character long.

	Args:
		text: The string to process.
		stop_words: Optional container of stop words to filter out. When
			omitted, the list is loaded with ``stopwords()`` on every call;
			callers processing many documents can load it once and pass it
			in to avoid re-reading the file each time.

	Returns:
		list[str]: The surviving tokens, in their original order.
	"""
	if stop_words is None:
		stop_words = set(stopwords())

	# A set of single characters. Testing `token in string.punctuation` would be
	# a substring check on one long string, which lets multi-character
	# punctuation such as "..." or "!!" slip through.
	punctuation = set(string.punctuation)

	words = []
	for token in word_tokenize(str.lower(text)):
		if len(token) <= 1:
			continue
		if token in stop_words:
			continue
		if all(ch in punctuation for ch in token):
			continue
		words.append(token)

	return words


In [8]:
def calculate_jaccard(word_tokens1, word_tokens2):
	"""Compute the Jaccard similarity between two token collections.

	The score is ``|A ∩ B| / |A ∪ B|`` where A and B are the sets of distinct
	tokens on each side; duplicates and ordering do not affect the result.

	Args:
		word_tokens1: Tokens of the first document.
		word_tokens2: Tokens of the second document.

	Returns:
		float: A value between 0.0 and 1.0. When both inputs are empty the
		ratio is undefined; 0.0 is returned by convention instead of raising
		ZeroDivisionError.
	"""
	vocab1 = set(word_tokens1)
	vocab2 = set(word_tokens2)

	union = vocab1 | vocab2
	if not union:
		# Neither side has any tokens, so there is nothing to compare.
		return 0.0

	# Set intersection avoids scanning the second token list once per token.
	intersection = vocab1 & vocab2
	return len(intersection) / len(union)

def process_jaccard_similarity():
	"""Rank every document by Jaccard similarity to the base document.

	Reads the notebook-level globals ``base_document`` and ``documents``.
	Each text is passed through ``preprocess`` and compared with
	``calculate_jaccard``.

	Returns:
		list[dict]: One ``{'index': i, 'score': s}`` entry per document,
		sorted by score in descending order (ties keep their original order).
		``index`` is the position of the document while iterating over
		``documents``.
	"""
	# Coerce to str like every other document below, so a non-string base
	# value is handled the same way as a non-string corpus entry.
	base_tokens = preprocess(str(base_document))

	# Tokenize each document and score it against the base in one pass.
	all_scores = [
		calculate_jaccard(base_tokens, preprocess(str(document)))
		for document in documents
	]

	pairs = [{'index': i, 'score': score} for i, score in enumerate(all_scores)]
	return sorted(pairs, key=lambda pair: pair['score'], reverse=True)

# Score all documents against the base document; as the cell's last expression,
# the complete ranked list is displayed as the cell output.
process_jaccard_similarity()

[{'index': 0, 'score': 1.0},
 {'index': 2, 'score': 0.327683615819209},
 {'index': 4, 'score': 0.1615720524017467},
 {'index': 5, 'score': 0.12448132780082988},
 {'index': 43, 'score': 0.11654135338345864},
 {'index': 75, 'score': 0.0963302752293578},
 {'index': 66, 'score': 0.09539473684210527},
 {'index': 216, 'score': 0.08791208791208792},
 {'index': 17, 'score': 0.07987220447284345},
 {'index': 35, 'score': 0.07936507936507936},
 {'index': 72, 'score': 0.07936507936507936},
 {'index': 45, 'score': 0.07929515418502203},
 {'index': 230, 'score': 0.07777777777777778},
 {'index': 298, 'score': 0.07213114754098361},
 {'index': 165, 'score': 0.07194244604316546},
 {'index': 283, 'score': 0.06987951807228916},
 {'index': 74, 'score': 0.06938775510204082},
 {'index': 217, 'score': 0.06907894736842106},
 {'index': 212, 'score': 0.06896551724137931},
 {'index': 143, 'score': 0.06859205776173286},
 {'index': 89, 'score': 0.06818181818181818},
 {'index': 71, 'score': 0.06692913385826772},
 {'i