In [50]:
import pandas as pd
from nltk.tokenize import word_tokenize
from nltk.corpus import wordnet as wn
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.stem import SnowballStemmer
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np
from rake_nltk import Rake

# Load the data to be scored. Later cells read the 'Description' and 'Name'
# columns from this frame.
df = pd.read_csv("FinalDF.csv")

# Target keywords
target_keywords = ["race"]

# Initialize Rake for keyword extraction
r = Rake()

# Placeholder for the per-row score. NaN (instead of an empty string) keeps the
# column numeric, so a row that never gets scored cannot break the later
# numeric comparison and sort with a str-vs-float mix.
df["Similarity Score"] = np.nan

In [51]:
def find_synonyms(word):
    """Collect the lower-cased WordNet lemma names of every synset of ``word``.

    Args:
        word: The string handed to ``wn.synsets``.

    Returns:
        A set of lower-cased lemma names; empty when ``wn.synsets`` yields
        no synset for ``word``.
    """
    return {
        lemma.name().lower()
        for synset in wn.synsets(word)
        for lemma in synset.lemmas()
    }

# Preprocess the keywords and text
stop_words = set(stopwords.words('english'))
stemmer = SnowballStemmer('english')

# Every stem is cut to this many leading characters before comparison.
STEM_PREFIX_LEN = 5


def normalize_token(word):
    """Return the Snowball stem of lower-cased ``word``, cut to STEM_PREFIX_LEN characters."""
    return stemmer.stem(word.lower())[:STEM_PREFIX_LEN]


def content_words(text):
    """Tokenize ``text`` and return its lower-cased tokens that are not English stop words."""
    lowered = (token.lower() for token in word_tokenize(text))
    return [token for token in lowered if token not in stop_words]


def score_description(text, target_stems):
    """Cosine similarity between one description and the target keywords.

    The comparison vocabulary is the set of normalized RAKE keyword tokens of
    ``text`` plus the normalized WordNet synonyms of those tokens. Two binary
    vectors are built over that vocabulary (token occurs in the text / token
    is a target stem) and compared with cosine similarity.

    Args:
        text: The description to score.
        target_stems: Collection of already-normalized target keyword stems.

    Returns:
        The similarity as a float. 0.0 when ``text`` is not a string (for
        example the NaN pandas uses for a blank CSV cell) or when no keyword
        survives stop-word filtering.
    """
    if not isinstance(text, str):
        return 0.0

    # Extract keywords from the text using RAKE
    r.extract_keywords_from_text(text)
    keyword_words = [
        word
        for phrase in r.get_ranked_phrases()
        for word in content_words(phrase)
    ]

    expanded_keywords = {normalize_token(word) for word in keyword_words}

    # Expand with synonyms. WordNet must be queried with the surface word:
    # a stemmed and truncated token is usually not a dictionary entry, so
    # querying with it finds no synsets and the expansion would add nothing.
    for word in set(keyword_words):
        expanded_keywords.update(normalize_token(synonym) for synonym in find_synonyms(word))

    # cosine_similarity rejects arrays with zero features, so an empty
    # vocabulary is scored directly.
    if not expanded_keywords:
        return 0.0

    # Set membership keeps the vector construction linear in the vocabulary.
    text_stems = {normalize_token(word) for word in content_words(text)}
    target_stems = set(target_stems)
    vocabulary = sorted(expanded_keywords)

    # Binary row vectors of shape (1, len(vocabulary))
    text_vector = np.array([stem in text_stems for stem in vocabulary], dtype=int).reshape(1, -1)
    target_vector = np.array([stem in target_stems for stem in vocabulary], dtype=int).reshape(1, -1)

    return float(cosine_similarity(text_vector, target_vector)[0][0])


# Preprocessed target keywords
preprocessed_target_keywords = [normalize_token(word) for word in target_keywords if word.lower() not in stop_words]

# Score every row. Assigning a list fills the column positionally, so this does
# not depend on the frame having a 0..n-1 index, and the column ends up float.
df['Similarity Score'] = [
    score_description(text, preprocessed_target_keywords)
    for text in df['Description']
]

In [52]:
# Rank the scored rows: discard rows whose score equals 0, order the rest from
# highest to lowest score, keep the first row per 'Name', then keep only the
# single top row.
has_score = df['Similarity Score'] != 0
df = (
    df.loc[has_score]
    .sort_values('Similarity Score', ascending=False)
    .drop_duplicates('Name')
    .head(1)
)

In [53]:
# Columns removed before the result is displayed and serialized.
columns_to_drop = [
    'Similarity Score',
    'Unnamed: 0',
    'Year',
    'Term',
    'Subject',
    'Number',
    'Start Time',
    'End Time',
    'Days of Week',
]
df = df.drop(columns=columns_to_drop)
df

Unnamed: 0,Name,Description,Credit Hours,Degree Attributes,Type,Instructors,Class,ProfRating,GPA,Reddit_Links
353,Leisure and Consumer Culture,Examination of contemporary patterns and meani...,3 hours.,"Social & Beh Sci - Soc Sci, and Cultural Studi...",Online,"Brooks, C;Santos, C",RST 335,-1.0,3.35491,https://www.reddit.com/r/UIUC/search/?q=RST335...


In [55]:
# Serialize the current frame to a JSON string using DataFrame.to_json()
# defaults, then display that string as the cell output.
# NOTE(review): the variable name `json` would shadow the standard-library
# `json` module if that module were imported in this notebook — consider
# renaming once all readers of this variable are known.
json = df.to_json()
json

'{"Name":{"353":"Leisure and Consumer Culture"},"Description":{"353":"Examination of contemporary patterns and meanings of leisure in a consumer society. Understanding of the impact of consumption on expressions of identity, gender, social class, race and ethnicity."},"Credit Hours":{"353":"3 hours."},"Degree Attributes":{"353":"Social & Beh Sci - Soc Sci, and Cultural Studies - Western course."},"Type":{"353":"Online"},"Instructors":{"353":"Brooks, C;Santos, C"},"Class":{"353":"RST 335"},"ProfRating":{"353":-1.0},"GPA":{"353":3.3549100257},"Reddit_Links":{"353":"https:\\/\\/www.reddit.com\\/r\\/UIUC\\/search\\/?q=RST335&type=link&cId=b32ceb77-1b8c-40e5-8b34-d2b4fec8c4b6&iId=bfd40497-f1e5-4941-9509-aa6fccd80b9e"}}'