In [125]:
import pandas as pd
from nltk.tokenize import word_tokenize
from nltk.corpus import wordnet as wn
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.stem import SnowballStemmer
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np
from rake_nltk import Rake

# Load the table to score; the scoring loop reads its 'Description' column.
df = pd.read_csv("test.csv")

# Target keywords
target_keywords = ["race", "comics"]

# Initialize Rake for keyword extraction
r = Rake()

# Placeholder for the per-row score. NaN (instead of an empty string) keeps the
# column float64, so comparisons and sorting operate on numbers rather than on
# generic Python objects, and "not scored yet" stays distinguishable from 0.
df["Similarity Score"] = np.nan

In [126]:
# Function to find synonyms of a word using WordNet
def find_synonyms(word):
    """Collect the lowercased lemma names of every WordNet synset for ``word``.

    Args:
        word: Term passed to ``wn.synsets``.

    Returns:
        set: Distinct lowercased lemma names across all matching synsets;
        empty when ``wn.synsets`` yields nothing for ``word``.
    """
    return {
        lemma.name().lower()
        for synset in wn.synsets(word)
        for lemma in synset.lemmas()
    }

# Shared normalization tools: Snowball stemmer and the English stop-word list
stemmer = SnowballStemmer('english')
stop_words = set(stopwords.words('english'))

# Target keywords after normalization: lowercased, stop words removed,
# stemmed, and cut to the first 5 characters of the stem
preprocessed_target_keywords = [
    stemmer.stem(lowered)[:5]
    for lowered in map(str.lower, target_keywords)
    if lowered not in stop_words
]

def _normalize_tokens(words):
    """Lowercase, drop English stop words, stem, and keep the first 5 characters of each stem."""
    return [
        stemmer.stem(word.lower())[:5]
        for word in words
        if word.lower() not in stop_words
    ]


def _description_similarity(text):
    """Score one description against the preprocessed target keywords.

    The vocabulary is the set of normalized RAKE keywords extracted from
    ``text`` plus the normalized WordNet synonyms of those keywords. The text
    and the target keywords are each encoded as a 0/1 vector over that
    vocabulary and compared with cosine similarity.

    Args:
        text: Raw description. Anything that is not a non-blank string
            (for example NaN from a missing CSV cell) is scored 0.0.

    Returns:
        float: Cosine similarity between the two indicator vectors; 0.0 when
        the description is missing or yields no keywords.
    """
    # Missing or blank descriptions carry no keywords; score them 0 instead of
    # failing inside RAKE / the tokenizer.
    if not isinstance(text, str) or not text.strip():
        return 0.0

    # Extract keywords from the text using RAKE and normalize every word in them
    r.extract_keywords_from_text(text)
    rake_tokens = []
    for phrase in r.get_ranked_phrases():
        rake_tokens.extend(_normalize_tokens(word_tokenize(phrase)))

    # Expand the keyword set with synonyms (one WordNet lookup per distinct token).
    # NOTE(review): as in the previous version, synonyms are looked up for the
    # already stemmed, 5-character-truncated tokens; WordNet may not recognise
    # many of those forms. Consider looking up the untruncated words instead -
    # left unchanged here so existing scores stay comparable.
    expanded_keywords = set(rake_tokens)
    for token in set(rake_tokens):
        expanded_keywords.update(stemmer.stem(synonym)[:5] for synonym in find_synonyms(token))

    # Nothing to compare, and a zero-width vector is not valid input for
    # cosine_similarity.
    if not expanded_keywords:
        return 0.0

    # Sets give constant-time membership tests; sorting fixes the vector layout
    # so the result does not depend on set iteration order.
    text_tokens = set(_normalize_tokens(word_tokenize(text)))
    target_tokens = set(preprocessed_target_keywords)
    vocabulary = sorted(expanded_keywords)

    # Indicator vectors over the vocabulary, shaped (1, n) as cosine_similarity expects
    text_vector = np.array([token in text_tokens for token in vocabulary], dtype=int).reshape(1, -1)
    target_vector = np.array([token in target_tokens for token in vocabulary], dtype=int).reshape(1, -1)

    return float(cosine_similarity(text_vector, target_vector)[0][0])


# Assign by position so the cell works for any index (not only a fresh 0..n-1
# RangeIndex) and always produces a float64 column.
df["Similarity Score"] = [_description_similarity(text) for text in df["Description"]]

In [127]:
# Keep only rows whose score is not 0, ranked from most to least similar
df = (
    df[df['Similarity Score'] != 0]
    .sort_values('Similarity Score', ascending=False)
)
df

Unnamed: 0.1,Unnamed: 0,Year,Term,YearTerm,Subject,Number,Name,Description,Similarity Score
967,967,2024,Spring,2024-sp,RST,335,Leisure and Consumer Culture,Examination of contemporary patterns and meani...,0.25
966,966,2024,Spring,2024-sp,RST,335,Leisure and Consumer Culture,Examination of contemporary patterns and meani...,0.25
1,1,2024,Spring,2024-sp,AAS,200,U.S. Race and Empire,Invites students to examine histories and narr...,0.223607
45,45,2024,Spring,2024-sp,ANTH,270,Language in Culture,Examines the intersections of culture and lang...,0.185695
65,65,2024,Spring,2024-sp,ARTJ,301,Manga: The Art of Image and Word,"Introduction to manga (Japanese comics), its p...",0.182574
260,260,2024,Spring,2024-sp,CMN,102,Introduction to Communication,Provides students with an overview of the majo...,0.174078
368,368,2024,Spring,2024-sp,ENGL,120,Science Fiction,"Introduction to the study of science fiction, ...",0.174078
503,503,2024,Spring,2024-sp,HIST,285,US Gender History to 1877,This course surveys the history of gender form...,0.174078
785,785,2024,Spring,2024-sp,PORT,150,Writing Brazilians into the U.S.,"Explores ethnic and race relations, gendered a...",0.171499
392,392,2024,Spring,2024-sp,EPOL,310,Race and Cultural Diversity,Study of race and cultural diversity from Colo...,0.169031
