In [40]:
import pandas as pd
import nltk
import string
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import numpy as np

In [41]:
relevant_features = [
    'Term ',
    'CRN',
    'Section',
    'Course Title',
    'College',
    'Campus',
    'Department',
    'Schedule Type',
    'Course Topics',
    'Course Description',
    'Primary Faculty'
]

data = pd.read_csv('data/course_data.csv', usecols=relevant_features)
data = data.drop_duplicates()
data.head()

Unnamed: 0,Term,College,Campus,Department,CRN,Section,Course Title,Schedule Type,Course Topics,Course Description,Primary Faculty
0,202330: Spring 2024,College of Performing Arts,Mobility Course - Within US,Coll of Performing Arts,12473,A,Professional Internship,Internship/Externship,Entrepreneurism;Management & Business;Non-Prof...,This course is intended for students in the MA...,"Brown-Fried, Stephen"
1,202330: Spring 2024,College of Performing Arts,Mobility Course - Within US,Drama,12900,A,Internship,Internship/Externship,Performing Arts;Theater;Interdisciplinary Arts,TBA,"Brown-Fried, Stephen"
2,202330: Spring 2024,College of Performing Arts,Mobility Course - Within US,Jazz,9999,A,Internship,Internship/Externship,Music Business & Technology;Musical Analysis;C...,TBA,"Brown-Fried, Stephen"
3,202330: Spring 2024,College of Performing Arts,Mobility Course - Within US,Jazz,10488,B,Internship,Internship/Externship,Music Business & Technology;Musical Analysis;C...,TBA,"Brown-Fried, Stephen"
4,202330: Spring 2024,College of Performing Arts,Mobility Course - Within US,Mannes,10496,A,Internship: Prof. Performance,Internship/Externship,Performance;Music;Performing Arts,TBA,"Brown-Fried, Stephen"


In [42]:
data = data[data['Schedule Type'] != 'Internship/Externship']

## Preprocessing

In [43]:
# Download NLTK resources (run only once)
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package punkt to /Users/matipina/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/matipina/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/matipina/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [44]:
# Function to preprocess text
def preprocess_text(text, lemmatization = True):
    # Tokenization
    tokens = word_tokenize(text.lower())
    
    # Remove punctuation
    table = str.maketrans('', '', string.punctuation)
    tokens = [word.translate(table) for word in tokens]
    
    # Remove stop words
    stop_words = set(stopwords.words('english'))
    tokens = [word for word in tokens if word not in stop_words]
    
    # Lemmatization
    if lemmatization:
        lemmatizer = WordNetLemmatizer()
        tokens = [lemmatizer.lemmatize(word) for word in tokens]
    
    return tokens

In [45]:
# Apply preprocessing to each description
data['Tokenized_Description'] = data['Course Description'].apply(preprocess_text)

# Filter out rows with empty lists
data = data[data['Tokenized_Description'].apply(len) > 1]

print("Original Data:")
print(data[['Course Description']])

print("\nPreprocessed Data:")
print(data[['Tokenized_Description']])

Original Data:
                                     Course Description
12    This course examines the nature and qualities ...
13    Successful creative community development mani...
14    Where do personal aesthetics/ethics, practice,...
15    To graduate all MA AME students must complete ...
16    This class will focus on how the elements of a...
...                                                 ...
6565  From 1950-1960 the legendary experimental comp...
6566  This course examines the history of African Am...
6567  This transdisciplinary course will introduce s...
6568  How do we represent “missing” earlier women wh...
6569  This course will focus on how to design mental...

[4616 rows x 1 columns]

Preprocessed Data:
                                  Tokenized_Description
12    [course, examines, nature, quality, effective,...
13    [successful, creative, community, development,...
14    [personal, aestheticsethics, , practice, , pro...
15    [graduate, ame, student, must, complet

## Vectorization

In [46]:
import gensim.downloader as api
from sklearn.metrics.pairwise import cosine_similarity

# Load pre-trained Word2Vec model
w2v_model = api.load('word2vec-google-news-300')

# Function to convert token to vector
def token_to_vector(token):
    try:
        return w2v_model[token]
    except KeyError:
        return None  # Handle out-of-vocabulary words


# Function to calculate description vector
def description_vector(tokens):
    vectors = [token_to_vector(token) for token in tokens]
    vectors = [vec for vec in vectors if vec is not None]  # Remove None values (out-of-vocabulary tokens)
    if vectors:
        return sum(vectors) / len(vectors)  # Average of token vectors
    else:
        return None  # Handle empty token lists


In [47]:
# Calculate description vectors for each description
data['Description_Vector'] = data['Tokenized_Description'].apply(description_vector)
data = data.reset_index(drop=True)

# Calculate similarity matrix using cosine similarity
similarity_matrix = cosine_similarity(data['Description_Vector'].to_list())

In [48]:
threshold = 0.9

# Create empty lists to store the filtered course IDs and their corresponding similarity scores for each row
filtered_course_ids = []
similarity_scores = []

# Iterate over each row of the similarity matrix
for i, row in enumerate(similarity_matrix):
    # Find the indices and similarity scores of courses with similarity scores higher than 0.75
    similar_courses_indices = np.where(row > threshold)[0]
    similar_courses_scores = row[similar_courses_indices]
    # Exclude the current course itself (if similarity score with itself is above threshold)
    similar_courses_indices = similar_courses_indices[similar_courses_indices != i]
    similar_courses_scores = row[similar_courses_indices][similar_courses_indices != i]
    # Sort similar courses by similarity score in descending order
    sorted_indices = np.argsort(similar_courses_scores)[::-1]
    similar_courses_indices = similar_courses_indices[sorted_indices]
    similar_courses_scores = similar_courses_scores[sorted_indices]
    # Append the sorted course IDs and their corresponding similarity scores to the lists
    filtered_course_ids.append(similar_courses_indices.tolist())
    similarity_scores.append(similar_courses_scores.tolist())

# Add new columns to the DataFrame containing the sorted list of filtered course IDs and their corresponding similarity scores
data['Similar_Course_Ids'] = filtered_course_ids
data['Similarity_Scores'] = similarity_scores

In [49]:
data

Unnamed: 0,Term,College,Campus,Department,CRN,Section,Course Title,Schedule Type,Course Topics,Course Description,Primary Faculty,Tokenized_Description,Description_Vector,Similar_Course_Ids,Similarity_Scores
0,202330: Spring 2024,College of Performing Arts,New York City,Coll of Performing Arts,3368,A,Leadership and Team Building,Seminar,Entrepreneurism;Management & Business;Non-Prof...,This course examines the nature and qualities ...,"Watts, Rachel","[course, examines, nature, quality, effective,...","[-0.00096049206, 0.009735107, -0.02440822, 0.0...","[193, 3188, 3187, 3186, 3185, 3184]","[0.9077481031417847, 0.9025726318359375, 0.902..."
1,202330: Spring 2024,College of Performing Arts,New York City,Coll of Performing Arts,9481,A,Sustainable Creative Placemkng,Seminar,Arts and Social Engagement;Community Organizin...,Successful creative community development mani...,,"[successful, creative, community, development,...","[-0.019480672, 0.010184708, -0.02209003, 0.077...","[3585, 3044, 3184, 3185, 3186, 3187, 3188, 220...","[0.9510039687156677, 0.9199689030647278, 0.914..."
2,202330: Spring 2024,College of Performing Arts,New York City,Coll of Performing Arts,3367,A,Ent. in Production & Creation,Seminar,Entrepreneurism;Management & Business;Non-Prof...,"Where do personal aesthetics/ethics, practice,...","Koozer, Margaret","[personal, aestheticsethics, , practice, , pro...","[-0.003791222, 0.019383593, -0.009674307, 0.07...",[],[]
3,202330: Spring 2024,College of Performing Arts,New York City,Coll of Performing Arts,3930,A,Entrepreneurial Incubator,Seminar,Entrepreneurism;Management & Business;Non-Prof...,To graduate all MA AME students must complete ...,"Helguera, Pablo","[graduate, ame, student, must, complete, capst...","[-0.027630484, 0.03092409, 0.00903985, 0.07792...","[4334, 4333, 4332, 1542, 1833, 4237, 2158]","[0.9176570177078247, 0.9176570177078247, 0.917..."
4,202330: Spring 2024,College of Performing Arts,New York City,Coll of Performing Arts,5316,A,Arts Mgmt. & Governance,Seminar,,This class will focus on how the elements of a...,"Connolly, Aidan","[class, focus, element, art, management, , str...","[-0.029670566, 0.018237544, -8.691526e-05, 0.0...","[3658, 3657, 3656, 3514, 3513, 3512, 3511, 351...","[0.9111335277557373, 0.9111335277557373, 0.911..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4611,202330: Spring 2024,University Curriculum,New York City,University Curriculum,8983,A,John Cage and The New School,Seminar,Performance;Music,From 1950-1960 the legendary experimental comp...,"La Barbara, Joan","[19501960, legendary, experimental, composer, ...","[-0.05165985, 0.035454568, -0.012097135, 0.075...",[47],[1.0000003576278687]
4612,202330: Spring 2024,University Curriculum,New York City,University Curriculum,13213,A,Blackness Through Dress,Seminar,,This course examines the history of African Am...,,"[course, examines, history, african, american,...","[0.0028183346, 0.028966086, 0.046973456, 0.106...",[3869],[0.9921450614929199]
4613,202330: Spring 2024,University Curriculum,New York City,University Curriculum,14718,A,AI for Research and Practice,Studio,,This transdisciplinary course will introduce s...,"Bechtel, Mark","[transdisciplinary, course, introduce, student...","[0.012151998, 0.00034424354, -0.0013180437, 0....","[2999, 4372, 4343]","[0.9157572388648987, 0.9056552648544312, 0.904..."
4614,202330: Spring 2024,University Curriculum,New York City,University Curriculum,9730,A,Becoming Visible,Seminar,,How do we represent “missing” earlier women wh...,"Walker, Gina","[represent, “, missing, ”, earlier, woman, sys...","[0.012569316, 0.0132859675, 0.0038071848, 0.03...",[],[]


In [50]:
#data.to_excel('processed_data.xlsx', index=False)
data.to_pickle('processed_data.pkl')