In [1]:
# Install libraries
"""
Run these once, the first time you use this notebook (remove the leading "# "):

# %pip install nltk
# import nltk
# nltk.download("stopwords")
# nltk.download('punkt')
# nltk.download('universal_tagset')
# nltk.download('wordnet')
"""

import re
import os
from nltk.tokenize import word_tokenize
from nltk.tokenize import sent_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.util import ngrams
from sklearn.metrics import pairwise

In [2]:
# Load the training data
def load_folder(folder):
    """Read every ``.txt`` file directly inside ``folder``.

    Files are visited in sorted filename order and decoded as ``utf-8-sig``,
    so a leading byte-order mark, if present, is not part of the text.

    Args:
        folder: Path of the directory to scan (sub-directories are not walked).

    Returns:
        A list of strings, one per ``.txt`` file, in sorted filename order.
    """
    txt_names = [name for name in sorted(os.listdir(folder)) if name.endswith('.txt')]

    contents = []
    for name in txt_names:
        path = os.path.join(folder, name)
        with open(path, 'r', encoding='utf-8-sig') as handle:
            contents.append(handle.read())
    return contents

In [3]:
# Load the test data
def load_file(file_path):
    """Read one text file and wrap its contents in a single-element list.

    The file is decoded as ``utf-8-sig``, so a leading byte-order mark, if
    present, is not part of the returned text.

    Args:
        file_path: Path of the file to read.

    Returns:
        ``[text]`` -- a list holding the whole file content as one string.
    """
    with open(file_path, 'r', encoding='utf-8-sig') as handle:
        return [handle.read()]

In [4]:
# Tokenize the data - sentence level and return the sentence boundaries
def tokenize_data(data):
    """Split each document into sentences and index every sentence.

    Args:
        data: Iterable of raw document strings.

    Returns:
        A tuple ``(tokenized_data, text_enum)``:

        * ``tokenized_data`` -- one list of sentences per document, as
          returned by ``sent_tokenize``.
        * ``text_enum`` -- dict mapping a running sentence number (0-based,
          counted across all documents) to
          ``[document_index, sentence_index_within_document]``.
    """
    per_document = [sent_tokenize(document) for document in data]

    # One [document, sentence] pair per sentence, in reading order; the
    # position of a pair in this list is its global sentence number.
    positions = [
        [doc_idx, sent_idx]
        for doc_idx, doc_sentences in enumerate(per_document)
        for sent_idx in range(len(doc_sentences))
    ]

    return per_document, dict(enumerate(positions))

In [5]:
# Convert data to lowercase
def lower_case(data):
    """Lower-case every sentence of every document.

    Args:
        data: Iterable of documents, each an iterable of sentence strings.

    Returns:
        A new list of lists with the same nesting, each sentence lower-cased.
    """
    lowered = []
    for text in data:
        lowered_text = []
        for sentence in text:
            lowered_text.append(sentence.lower())
        lowered.append(lowered_text)
    return lowered

In [6]:
# Remove non-word characters
def remove_non_word(data):
    """Delete punctuation-like characters from every sentence.

    A character is removed when it is neither a regex word character
    (``\\w``: letters, digits, underscore) nor whitespace (``\\s``).

    Args:
        data: Iterable of documents, each an iterable of sentence strings.

    Returns:
        A new list of lists with the same nesting, each sentence cleaned.
    """
    not_word_or_space = re.compile(r'[^\w\s]')

    cleaned = []
    for text in data:
        cleaned.append([not_word_or_space.sub('', sentence) for sentence in text])
    return cleaned

In [7]:
# Remove stop words
def remove_stop_words(data):
    """Drop English stop words from every token list.

    The stop-word list is ``stopwords.words('english')``; matching is an
    exact, case-sensitive comparison against that list.

    Args:
        data: Iterable of token lists.

    Returns:
        A new list of token lists with the stop words filtered out; token
        order is preserved and emptied lists are kept as ``[]``.
    """
    english_stop_words = frozenset(stopwords.words('english'))

    filtered = []
    for text in data:
        kept = [word for word in text if word not in english_stop_words]
        filtered.append(kept)
    return filtered

In [8]:
# Tokenize the data - word level
def tokenize_words(data):
    """Word-tokenize every sentence and flatten away the document level.

    Args:
        data: Iterable of documents, each an iterable of sentence strings.

    Returns:
        A flat list with one ``word_tokenize`` result per sentence, in
        reading order (all sentences of the first document, then the second,
        and so on).
    """
    token_lists = []
    for text in data:
        for sentence in text:
            token_lists.append(word_tokenize(sentence))
    return token_lists

In [9]:
# Lemmatize the data
def lemmatize_data(data):
    """Lemmatize every token with a single ``WordNetLemmatizer`` instance.

    Args:
        data: Iterable of token lists.

    Returns:
        A new list of token lists, same nesting and order, where each token
        has been replaced by ``WordNetLemmatizer().lemmatize(token)``.
    """
    lemmatize = WordNetLemmatizer().lemmatize

    lemmatized = []
    for text in data:
        lemmatized.append([lemmatize(word) for word in text])
    return lemmatized

In [10]:
# Flatten the data
def flatten_data(data):
    """Concatenate the items of all inner collections into one flat list.

    Args:
        data: Iterable of iterables (e.g. a list of token lists).

    Returns:
        A single list containing every inner item, in original order.
        Only one level of nesting is removed.
    """
    flat = []
    for text in data:
        flat.extend(text)
    return flat

In [11]:
# Get n-grams
def get_ngrams(data, n, train_flag=False):
    """Build n-grams either for one token sequence or for each of many.

    Args:
        data: With ``train_flag`` false, a single sequence of tokens.  With
            ``train_flag`` true, an iterable of token sequences (one per
            sentence).
        n: Size of the n-grams, passed straight to ``ngrams``.
        train_flag: When true, n-grams are computed separately inside each
            inner sequence; when false, over ``data`` as a whole.

    Returns:
        ``list(ngrams(data, n))`` when ``train_flag`` is false; otherwise a
        list holding one such list per inner sequence, in input order.
    """
    # TODO: Implement case when n = 1

    if not train_flag:
        return list(ngrams(data, n))

    return [list(ngrams(text, n)) for text in data]

In [12]:
def one_hot_encoding(corpus, sentences):
    """Encode each sentence as a 0/1 presence vector over ``corpus``.

    Position ``k`` of a sentence's vector is 1 when ``corpus[k]`` occurs in
    that sentence and 0 otherwise.  ``corpus`` is used exactly as given:
    repeated entries are not merged, so every repeat gets its own position.

    Args:
        corpus: Sequence of terms (e.g. n-gram tuples) that defines the
            vector positions.  It is iterated once per sentence.
        sentences: Iterable of sentences, each a collection of terms.

    Returns:
        A list with one vector (a list of ints, one entry per corpus term)
        per sentence, in input order.
    """
    vectors = []
    for sentence in sentences:
        try:
            # A set makes each membership test O(1) instead of a linear scan
            # of the sentence for every corpus term.  Only lists/tuples are
            # converted: for other containers (e.g. str, where ``in`` means
            # "is a substring") the original ``in`` semantics must be kept.
            lookup = set(sentence) if isinstance(sentence, (list, tuple)) else sentence
            row = [1 if term in lookup else 0 for term in corpus]
        except TypeError:
            # Unhashable terms cannot go through a set; fall back to the
            # plain equality scan over the sentence itself.
            row = [1 if term in sentence else 0 for term in corpus]
        vectors.append(row)
    return vectors

In [13]:
# Train and test the model
def _preprocess_texts(texts):
    """Run the cleaning pipeline shared by the training and the test data.

    Steps, in order: sentence split, lower-casing, removal of non-word
    characters, word tokenization, stop-word removal, lemmatization.

    Returns:
        ``(sentence_tokens, sentence_index)`` -- one token list per sentence
        (flattened across documents) and the sentence-number ->
        ``[document, sentence]`` map produced by ``tokenize_data``.
    """
    sentences, sentence_index = tokenize_data(texts)
    sentences = remove_non_word(lower_case(sentences))

    sentence_tokens = tokenize_words(sentences)
    sentence_tokens = remove_stop_words(sentence_tokens)
    sentence_tokens = lemmatize_data(sentence_tokens)
    return sentence_tokens, sentence_index


def train_test_model(test, train_folder='./train'):
    """Build bigram presence vectors for the training and the test sentences.

    Both data sets go through the same preprocessing.  The vector positions
    are the bigrams of the whole training token stream (all training
    sentences concatenated); every training and test sentence is then
    encoded against those positions with ``one_hot_encoding``.

    Args:
        test: List of raw test document strings (e.g. from ``load_file``).
        train_folder: Directory whose ``.txt`` files form the training set.
            Defaults to ``'./train'``.

    Returns:
        A 6-tuple ``(train_vectors, test_vectors, train_index, test_index,
        train_bigrams, test_bigrams)`` where the ``*_vectors`` are the 0/1
        vectors per sentence, the ``*_index`` dicts map a sentence number to
        ``[document, sentence]``, and the ``*_bigrams`` are the per-sentence
        bigram lists.
    """
    # Same pipeline for both sides, so the tokens are directly comparable.
    train_tokens, sentences_train_enum = _preprocess_texts(load_folder(train_folder))
    test_tokens, sentences_test_enum = _preprocess_texts(test)

    # Vector positions: bigrams over the concatenated training tokens.
    corpus_bigrams = get_ngrams(flatten_data(train_tokens), 2)

    # Per-sentence bigrams (these are also returned for display).
    sentences_mod_train = get_ngrams(train_tokens, 2, True)
    sentences_mod_test = get_ngrams(test_tokens, 2, True)

    # Debug output: the sentence-number -> [document, sentence] maps.
    print(sentences_test_enum)
    print(sentences_train_enum)

    # One hot encoding
    one_hot_train = one_hot_encoding(corpus_bigrams, sentences_mod_train)
    one_hot_test = one_hot_encoding(corpus_bigrams, sentences_mod_test)

    return (
        one_hot_train,
        one_hot_test,
        sentences_train_enum,
        sentences_test_enum,
        sentences_mod_train,
        sentences_mod_test,
    )

In [17]:
# Document to compare against the training corpus.
file_path = './test_dummy/FID-01.txt'
test = load_file(file_path)

# Vectors, sentence-index maps and per-sentence bigrams for both data sets.
(
    mat_train,
    mat_test,
    train_enum,
    test_enum,
    train_sentences,
    test_sentences,
) = train_test_model(test)

{0: [0, 0], 1: [0, 1], 2: [0, 2], 3: [0, 3], 4: [0, 4], 5: [0, 5]}
{0: [0, 0], 1: [0, 1], 2: [0, 2], 3: [0, 3], 4: [0, 4], 5: [0, 5], 6: [1, 0], 7: [1, 1], 8: [1, 2], 9: [1, 3], 10: [1, 4], 11: [1, 5], 12: [1, 6], 13: [2, 0], 14: [2, 1], 15: [2, 2], 16: [2, 3], 17: [2, 4], 18: [2, 5], 19: [2, 6], 20: [2, 7], 21: [3, 0], 22: [3, 1], 23: [3, 2], 24: [3, 3], 25: [3, 4], 26: [3, 5], 27: [3, 6], 28: [3, 7], 29: [3, 8], 30: [3, 9], 31: [3, 10], 32: [3, 11], 33: [3, 12], 34: [3, 13], 35: [3, 14], 36: [4, 0], 37: [4, 1], 38: [4, 2], 39: [4, 3], 40: [4, 4], 41: [5, 0], 42: [5, 1], 43: [5, 2], 44: [5, 3], 45: [6, 0], 46: [6, 1], 47: [6, 2], 48: [6, 3], 49: [6, 4], 50: [6, 5], 51: [7, 0], 52: [7, 1], 53: [7, 2], 54: [8, 0], 55: [8, 1], 56: [8, 2], 57: [8, 3], 58: [8, 4], 59: [8, 5], 60: [8, 6], 61: [8, 7], 62: [9, 0], 63: [9, 1], 64: [9, 2], 65: [9, 3], 66: [9, 4], 67: [9, 5], 68: [9, 6], 69: [10, 0], 70: [10, 1], 71: [10, 2], 72: [10, 3], 73: [10, 4], 74: [10, 5], 75: [11, 0], 76: [11, 1], 77: [

In [18]:
# loop through the test data and calculate the cosine similarity
# A sentence pair is reported only when its cosine similarity is strictly
# greater than this value.
SIMILARITY_THRESHOLD = 0.5

count = 0

# sklearn validates its inputs and rejects empty ones, so the matrix is only
# computed when both sides have at least one sentence vector.
if len(mat_train) > 0 and len(mat_test) > 0:
    # One call for all pairs instead of one call per pair:
    # similarity_matrix[j][i] compares test sentence j with training sentence i.
    similarity_matrix = pairwise.cosine_similarity(mat_test, mat_train)

    # Training sentences in the outer loop keeps the report grouped by
    # training sentence.
    for i in range(len(mat_train)):
        for j in range(len(mat_test)):
            similarity = similarity_matrix[j][i]
            if similarity > SIMILARITY_THRESHOLD:
                count += 1
                test_doc, test_sent = test_enum[j]
                train_doc, train_sent = train_enum[i]
                # NOTE(review): the file labels are rebuilt from 0-based
                # positions (+1); they match the real file names only if the
                # files are named/numbered in the order they were loaded --
                # confirm against the data folders.
                print(f"Similarity detected:\nFID-{test_doc + 1}.txt sentence {test_sent + 1} vs org-{train_doc + 1}.txt sentence {train_sent + 1}: {round(similarity * 100, 2)}%")
                print(f"FID-{test_doc + 1}.txt: {test_sentences[j]}")
                print(f"org-{train_doc + 1}.txt: {train_sentences[i]}")
                print()

print(f"Total number of similar sentences: {count}")

Similarity detected:
FID-1.txt sentence 1 vs org-1.txt sentence 1: 100.0%
FID-1.txt: [('study', 'provided'), ('provided', 'content'), ('content', 'analysis'), ('analysis', 'study'), ('study', 'aiming'), ('aiming', 'disclose'), ('disclose', 'artificial'), ('artificial', 'intelligence'), ('intelligence', 'ai'), ('ai', 'applied'), ('applied', 'education'), ('education', 'sector'), ('sector', 'explore'), ('explore', 'potential'), ('potential', 'research'), ('research', 'trend'), ('trend', 'challenge'), ('challenge', 'ai'), ('ai', 'education')]
org-1.txt: [('study', 'provided'), ('provided', 'content'), ('content', 'analysis'), ('analysis', 'study'), ('study', 'aiming'), ('aiming', 'disclose'), ('disclose', 'artificial'), ('artificial', 'intelligence'), ('intelligence', 'ai'), ('ai', 'applied'), ('applied', 'education'), ('education', 'sector'), ('sector', 'explore'), ('explore', 'potential'), ('potential', 'research'), ('research', 'trend'), ('trend', 'challenge'), ('challenge', 'ai'), ('a