In [None]:
import pandas as pd
import os
import re
import numpy as np
from collections import defaultdict
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import plotly.express as px

import sys
sys.path.append('../scripts')

# Import directory paths from secret config file
from config import text_directory

In [2]:
# Preprocessing the text
def preprocess(text):
    r"""Normalize raw text before vectorization.

    The text is lowercased and every run of non-word characters (regex
    ``\W+``) is collapsed into a single space. Underscores count as word
    characters and are therefore kept. The result is a single string; no
    tokenization is performed here.

    Parameters
    ----------
    text : str
        Raw document or query text.

    Returns
    -------
    str
        The lowercased text with punctuation/whitespace runs replaced by
        single spaces.
    """
    lowered = text.lower()
    return re.sub(r'\W+', ' ', lowered)

# Function to extract class type (IST / SCM) followed by course number
def extract_course_label(filename):
    r"""Derive a short course label such as ``"IST 769"`` from a filename.

    Searches for three uppercase letters followed by three digits, optionally
    separated by punctuation, whitespace, or underscores, and formats the
    first match as ``"<LETTERS> <DIGITS>"``.

    Parameters
    ----------
    filename : str
        Name of the file to label.

    Returns
    -------
    str
        The course label, or ``filename`` unchanged when no course code is
        found.
    """
    # ``\W`` alone does not match "_" (it is a word character), so names like
    # "IST_769_syllabus.txt" would otherwise fall through unlabelled.
    match = re.search(r'([A-Z]{3})[\W_]*(\d{3})', filename)
    if match is None:
        return filename
    prefix, number = match.groups()
    return f"{prefix} {number}"

# Load documents and create a TF-IDF representation
def load_documents(directory):
    """Read and preprocess every text file in ``directory``.

    Parameters
    ----------
    directory : str or path-like
        Folder containing the UTF-8 encoded text files.

    Returns
    -------
    tuple[list[str], list[str]]
        ``(documents, filenames)`` as parallel lists: ``documents`` holds the
        preprocessed text of each file and ``filenames`` holds the matching
        course label produced by ``extract_course_label`` (which falls back
        to the raw filename when no course code is found).
    """
    documents = []
    filenames = []
    # os.listdir returns entries in arbitrary order; sorting keeps the row
    # order (and everything derived from it) reproducible across runs.
    for filename in sorted(os.listdir(directory)):
        filepath = os.path.join(directory, filename)
        # Skip sub-directories (e.g. ".ipynb_checkpoints") and hidden files
        # (e.g. ".DS_Store"): opening them as UTF-8 text would raise.
        if filename.startswith('.') or not os.path.isfile(filepath):
            continue
        with open(filepath, 'r', encoding='utf-8') as file:
            text = file.read()
        documents.append(preprocess(text))
        filenames.append(extract_course_label(filename))
    return documents, filenames

# Calculate TF-IDF and return vectorizer and document matrix
def calculate_tfidf(documents):
    """Fit a TF-IDF model on ``documents``.

    Parameters
    ----------
    documents : iterable of str
        The texts to vectorize.

    Returns
    -------
    tuple
        ``(vectorizer, tfidf_matrix)``: the fitted ``TfidfVectorizer``
        (configured with English stop words) and the document-term matrix
        it produced for ``documents``.
    """
    tfidf = TfidfVectorizer(stop_words='english')
    doc_term_matrix = tfidf.fit_transform(documents)
    return tfidf, doc_term_matrix

# Implement search function with cosine similarity
def search(query, vectorizer, tfidf_matrix, filenames):
    """Rank documents by cosine similarity to ``query``.

    Parameters
    ----------
    query : str
        Free-text search query; it is passed through ``preprocess`` first.
    vectorizer
        Fitted vectorizer used to project the query into TF-IDF space.
    tfidf_matrix
        Document matrix to compare the query against.
    filenames : sequence
        Labels aligned with the rows of ``tfidf_matrix``.

    Returns
    -------
    list[tuple]
        ``(label, similarity)`` pairs in descending similarity order,
        keeping only documents whose similarity is strictly positive.
    """
    cleaned_query = preprocess(query)
    query_vec = vectorizer.transform([cleaned_query])
    scores = cosine_similarity(query_vec, tfidf_matrix).flatten()

    # argsort is ascending, so walk it in reverse for best-first order
    hits = []
    for idx in np.argsort(scores)[::-1]:
        score = scores[idx]
        if score > 0:
            hits.append((filenames[idx], score))

    return hits

# Plot function with plotly
def plot_similarity_scores(search_results, query):
    """Build a bar chart of similarity scores per document.

    Parameters
    ----------
    search_results
        Data convertible to a DataFrame with ``'Document'`` and
        ``'Similarity'`` columns (for example the list of pairs returned by
        ``search``).
    query : str
        The query text; it is only used in the chart title.

    Returns
    -------
    The plotly figure, with one colored bar per document and the legend
    hidden.
    """
    scores_df = pd.DataFrame(search_results, columns=['Document', 'Similarity'])
    chart = px.bar(
        scores_df,
        x='Document',
        y='Similarity',
        title=f'Document Similarity Scores for "{query}"',
        color='Document',
        color_discrete_sequence=px.colors.qualitative.G10,
    )

    # The x-axis already names each document, so a legend would be redundant
    chart.update_layout(showlegend=False)
    return chart

# Load and preprocess every document found in the configured text directory
documents, filenames = load_documents(text_directory)
df = pd.DataFrame({'File Names': filenames,
                   'Documents': documents})

# DataFrame.info() prints its summary itself and returns None, so call it
# directly; wrapping it in print() would emit a stray "None" line.
df.info()
df.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9 entries, 0 to 8
Data columns (total 2 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   File Names  9 non-null      object
 1   Documents   9 non-null      object
dtypes: object(2)
memory usage: 272.0+ bytes
None


Unnamed: 0,File Names,Documents
0,IST 769,course syllabus ist769 advanced big data manag...
1,IST 687,1 ist687 applied data science school of inform...
2,IST 736,course syllabus ist 736 text mining 1 course l...
3,IST 707,course syllabus ist 407 707 data analytics 1 i...
4,IST 718,ist 718 big data analytics course information ...


In [3]:
# Calculate TF-IDF matrix from the documents
vectorizer, tfidf_matrix = calculate_tfidf(documents)

# Define a query term
query = "Jupyter"
print(f'Query: {query}\n')

# Search for the query in the TF-IDF matrix and get similarity scores
search_results = search(query, vectorizer, tfidf_matrix, filenames)
print(search_results)

# Create a DataFrame to display documents and their similarity scores
df_similarity = pd.DataFrame(search_results, columns=['Document', 'Similarity'])

# Output the basic information of the DataFrame.
# DataFrame.info() prints its summary itself and returns None, so call it
# directly; wrapping it in print() would emit a stray "None" line.
df_similarity.info()

# Show the first few rows of the DataFrame
df_similarity.head()

Query: Jupyter

[('IST 664', 0.06324613918486507), ('IST 652', 0.033232332165890534), ('IST 718', 0.016293171270657703), ('IST 772', 0.00979433152409285)]
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4 entries, 0 to 3
Data columns (total 2 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   Document    4 non-null      object 
 1   Similarity  4 non-null      float64
dtypes: float64(1), object(1)
memory usage: 192.0+ bytes
None


Unnamed: 0,Document,Similarity
0,IST 664,0.063246
1,IST 652,0.033232
2,IST 718,0.016293
3,IST 772,0.009794


In [4]:
# Render the per-document similarity scores for the current query as a
# Plotly bar chart; the bare `fig` on the last line displays it in the notebook.
fig = plot_similarity_scores(search_results=df_similarity, query=query)
fig