In [1959]:
# Install libraries
"""
Uncomment these the first time you run the code

# %pip install nltk
# nltk.download("stopwords")
# nltk.download('punkt')
# nltk.download('universal_tagset')
nltk.download('wordnet')
"""

import re
import os
from nltk.tokenize import word_tokenize
from nltk.tokenize import sent_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.util import ngrams
from sklearn.metrics import pairwise
import numpy as np

In [1960]:
# Load the training data
def load_folder(folder):
    data = []
    for filename in sorted(os.listdir(folder)):
        if filename.endswith('.txt'):
            with open(os.path.join(folder, filename), 'r', encoding='utf-8-sig') as f:
                data.append(f.read())
        break
    return data

In [1961]:
# Load the test data
def load_file(file_path):
    with open(file_path, 'r', encoding='utf-8-sig') as file:
        data = file.read()
    return [data]

In [1962]:
# Tokenize the data - sentence level
def tokenize_data(data):
    return [sent_tokenize(text) for text in data]

In [1963]:
# Convert data to lowercase
def lower_case(data):
    return [[sentence.lower() for sentence in text] for text in data]

In [1964]:
# Remove non-word characters
def remove_non_word(data):
    return [[re.sub(r'[^\w\s]', '', sentence) for sentence in text] for text in data]

In [1965]:
# Remove stop words
def remove_stop_words(data):
    stop_words = set(stopwords.words('english'))
    return [[word for word in text if word not in stop_words] for text in data]

In [1966]:
# Tokenize the data - word level
def tokenize_words(data):
    return [word_tokenize(sentence) for text in data for sentence in text]

In [1967]:
# Lemmatize the data
def lemmatize_data(data):
    lemmatizer = WordNetLemmatizer()
    return [[lemmatizer.lemmatize(word) for word in text] for text in data ]

In [1968]:
# Flatten the data
def flatten_data(data):
    return [word for text in data for word in text]

In [1969]:
# Get n-grams
def get_ngrams(data, n, train_flag=False):
    if train_flag:
        n_gram = []
        for text in data:  
            n_gram.append(list(ngrams(text, n)))
        return n_gram
    
    # TODO: Implement case when n = 1
    
    return list(ngrams(data, n))

In [1970]:
def one_hot_encoding(test, train):
    temp_vector = []
    one_hot_test = []

    for sentence in train:
        for word in test:
            if word in sentence:
                temp_vector.append(1)
            else:
                temp_vector.append(0)
        one_hot_test.append(temp_vector)
        temp_vector = []

    return one_hot_test

In [1971]:
# Train and test the model
def train_test_model(test):
    # Training data
    data = load_folder('./train')

    sentences_train = tokenize_data(data)
    sentences_train = lower_case(sentences_train)
    sentences_train = remove_non_word(sentences_train)

    sentences_mod_train = tokenize_words(sentences_train)
    sentences_mod_train = remove_stop_words(sentences_mod_train)
    sentences_mod_train = lemmatize_data(sentences_mod_train)

    # Testing data
    sentences_test = tokenize_data(test)
    sentences_test = lower_case(sentences_test)
    sentences_test = remove_non_word(sentences_test)

    sentences_mod_test = tokenize_words(sentences_test)
    sentences_mod_test = remove_stop_words(sentences_mod_test)
    sentences_mod_test = lemmatize_data(sentences_mod_test)

    # Data conversion
    sentences_mod_test = flatten_data(sentences_mod_test)
    sentences_mod_train = get_ngrams(sentences_mod_train, 2, True)
    sentences_mod_test = get_ngrams(sentences_mod_test, 2)

    # One hot encoding
    one_hot_test = one_hot_encoding(sentences_mod_test, sentences_mod_train)

    return one_hot_test
    

In [1972]:
file_path = './test_dummy/FID-01.txt'
test = load_file(file_path)

mat = train_test_model(test)

In [1973]:
for i in range(len(mat)):
    print(mat[i])

[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0]
[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
[0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1

In [1974]:
pairwise.cosine_similarity(mat)

array([[1.        , 0.        , 0.07744031, 0.22750788, 0.        ,
        0.12768848],
       [0.        , 1.        , 0.        , 0.        , 0.        ,
        0.        ],
       [0.07744031, 0.        , 1.        , 0.08104409, 0.        ,
        0.        ],
       [0.22750788, 0.        , 0.08104409, 1.        , 0.        ,
        0.13363062],
       [0.        , 0.        , 0.        , 0.        , 1.        ,
        0.        ],
       [0.12768848, 0.        , 0.        , 0.13363062, 0.        ,
        1.        ]])

In [1975]:
similarity_matrix = np.array(pairwise.cosine_similarity(mat))

In [1976]:
# Get the maximum similarity score from each row
max_similarity_scores = np.mean(similarity_matrix, axis=1)

# Calculate the average of the maximum similarity scores
average_max_similarity = np.max(max_similarity_scores)

print("Average of Maximum Similarity Scores:", round(average_max_similarity, 2))

Average of Maximum Similarity Scores: 0.24
