In [77]:
import pandas as pd
import nltk
import string
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import numpy as np

In [78]:
relevant_features = [
    'Course Title',
    'College',
    'Campus',
    'Department',
    'Schedule Type',
    'Course Topics',
    'Course Description'
]

data = pd.read_csv('data/course_data.csv', usecols=relevant_features)
data.head()

Unnamed: 0,College,Campus,Department,Course Title,Schedule Type,Course Topics,Course Description
0,College of Performing Arts,Mobility Course - Within US,Coll of Performing Arts,Professional Internship,Internship/Externship,Entrepreneurism;Management & Business;Non-Prof...,This course is intended for students in the MA...
1,College of Performing Arts,Mobility Course - Within US,Drama,Internship,Internship/Externship,Performing Arts;Theater;Interdisciplinary Arts,TBA
2,College of Performing Arts,Mobility Course - Within US,Jazz,Internship,Internship/Externship,Music Business & Technology;Musical Analysis;C...,TBA
3,College of Performing Arts,Mobility Course - Within US,Jazz,Internship,Internship/Externship,Music Business & Technology;Musical Analysis;C...,TBA
4,College of Performing Arts,Mobility Course - Within US,Mannes,Internship: Prof. Performance,Internship/Externship,Performance;Music;Performing Arts,TBA


## Preprocessing

In [79]:
# Download NLTK resources (run only once)
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package punkt to /Users/matipina/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/matipina/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/matipina/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [80]:
# Function to preprocess text
def preprocess_text(text, lemmatization = True):
    # Tokenization
    tokens = word_tokenize(text.lower())
    
    # Remove punctuation
    table = str.maketrans('', '', string.punctuation)
    tokens = [word.translate(table) for word in tokens]
    
    # Remove stop words
    stop_words = set(stopwords.words('english'))
    tokens = [word for word in tokens if word not in stop_words]
    
    # Lemmatization
    if lemmatization:
        lemmatizer = WordNetLemmatizer()
        tokens = [lemmatizer.lemmatize(word) for word in tokens]
    
    return tokens

In [81]:
# Apply preprocessing to each description
data['Tokenized_Description'] = data['Course Description'].apply(preprocess_text)

# Filter out rows with empty lists
data = data[data['Tokenized_Description'].apply(len) > 1]

print("Original Data:")
print(data[['Course Description']])

print("\nPreprocessed Data:")
print(data[['Tokenized_Description']])

Original Data:
                                     Course Description
0     This course is intended for students in the MA...
12    This course examines the nature and qualities ...
13    Successful creative community development mani...
14    Where do personal aesthetics/ethics, practice,...
15    To graduate all MA AME students must complete ...
...                                                 ...
6565  From 1950-1960 the legendary experimental comp...
6566  This course examines the history of African Am...
6567  This transdisciplinary course will introduce s...
6568  How do we represent “missing” earlier women wh...
6569  This course will focus on how to design mental...

[4686 rows x 1 columns]

Preprocessed Data:
                                  Tokenized_Description
0     [course, intended, student, program, art, mana...
12    [course, examines, nature, quality, effective,...
13    [successful, creative, community, development,...
14    [personal, aestheticsethics, , practic

## Vectorization

In [82]:
import gensim.downloader as api
from sklearn.metrics.pairwise import cosine_similarity

# Load pre-trained Word2Vec model
w2v_model = api.load('word2vec-google-news-300')

# Function to convert token to vector
def token_to_vector(token):
    try:
        return w2v_model[token]
    except KeyError:
        return None  # Handle out-of-vocabulary words


# Function to calculate description vector
def description_vector(tokens):
    vectors = [token_to_vector(token) for token in tokens]
    vectors = [vec for vec in vectors if vec is not None]  # Remove None values (out-of-vocabulary tokens)
    if vectors:
        return sum(vectors) / len(vectors)  # Average of token vectors
    else:
        return None  # Handle empty token lists


In [83]:
# Calculate description vectors for each description
data['Description_Vector'] = data['Tokenized_Description'].apply(description_vector)
data = data.reset_index(drop=True)

# Calculate similarity matrix using cosine similarity
similarity_matrix = cosine_similarity(data['Description_Vector'].to_list())

In [93]:
threshold = 0.9

# Create empty lists to store the filtered course IDs and their corresponding similarity scores for each row
filtered_course_ids = []
similarity_scores = []

# Iterate over each row of the similarity matrix
for i, row in enumerate(similarity_matrix):
    # Find the indices and similarity scores of courses with similarity scores higher than 0.75
    similar_courses_indices = np.where(row > threshold)[0]
    similar_courses_scores = row[similar_courses_indices]
    # Exclude the current course itself (if similarity score with itself is above threshold)
    similar_courses_indices = similar_courses_indices[similar_courses_indices != i]
    similar_courses_scores = row[similar_courses_indices][similar_courses_indices != i]
    # Sort similar courses by similarity score in descending order
    sorted_indices = np.argsort(similar_courses_scores)[::-1]
    similar_courses_indices = similar_courses_indices[sorted_indices]
    similar_courses_scores = similar_courses_scores[sorted_indices]
    # Append the sorted course IDs and their corresponding similarity scores to the lists
    filtered_course_ids.append(similar_courses_indices.tolist())
    similarity_scores.append(similar_courses_scores.tolist())

# Add new columns to the DataFrame containing the sorted list of filtered course IDs and their corresponding similarity scores
data['Similar_Course_Ids'] = filtered_course_ids
data['Similarity_Scores'] = similarity_scores

In [94]:
data

Unnamed: 0,College,Campus,Department,Course Title,Schedule Type,Course Topics,Course Description,Tokenized_Description,Description_Vector,Similar_Course_Ids,Similarity_Scores
0,College of Performing Arts,Mobility Course - Within US,Coll of Performing Arts,Professional Internship,Internship/Externship,Entrepreneurism;Management & Business;Non-Prof...,This course is intended for students in the MA...,"[course, intended, student, program, art, mana...","[-0.065985106, 0.017382812, 0.0013290405, 0.02...","[24, 15, 9, 10, 11, 12, 13, 14, 16, 23, 17, 18...","[1.0000001192092896, 1.0000001192092896, 1.000..."
1,College of Performing Arts,New York City,Coll of Performing Arts,Leadership and Team Building,Seminar,Entrepreneurism;Management & Business;Non-Prof...,This course examines the nature and qualities ...,"[course, examines, nature, quality, effective,...","[-0.00096049206, 0.009735107, -0.02440822, 0.0...","[211, 3246, 3245, 3244, 3243, 3242]","[0.9077481031417847, 0.9025726318359375, 0.902..."
2,College of Performing Arts,New York City,Coll of Performing Arts,Sustainable Creative Placemkng,Seminar,Arts and Social Engagement;Community Organizin...,Successful creative community development mani...,"[successful, creative, community, development,...","[-0.019480672, 0.010184708, -0.02209003, 0.077...","[3655, 3081, 3242, 3243, 3244, 3245, 3246, 222...","[0.9510039687156677, 0.9199689030647278, 0.914..."
3,College of Performing Arts,New York City,Coll of Performing Arts,Ent. in Production & Creation,Seminar,Entrepreneurism;Management & Business;Non-Prof...,"Where do personal aesthetics/ethics, practice,...","[personal, aestheticsethics, , practice, , pro...","[-0.003791222, 0.019383593, -0.009674307, 0.07...",[],[]
4,College of Performing Arts,New York City,Coll of Performing Arts,Entrepreneurial Incubator,Seminar,Entrepreneurism;Management & Business;Non-Prof...,To graduate all MA AME students must complete ...,"[graduate, ame, student, must, complete, capst...","[-0.027630484, 0.03092409, 0.00903985, 0.07792...","[4404, 4403, 4402, 1561, 1852, 4307, 2183]","[0.9176570177078247, 0.9176570177078247, 0.917..."
...,...,...,...,...,...,...,...,...,...,...,...
4681,University Curriculum,New York City,University Curriculum,John Cage and The New School,Seminar,Performance;Music,From 1950-1960 the legendary experimental comp...,"[19501960, legendary, experimental, composer, ...","[-0.05165985, 0.035454568, -0.012097135, 0.075...",[65],[1.0000003576278687]
4682,University Curriculum,New York City,University Curriculum,Blackness Through Dress,Seminar,,This course examines the history of African Am...,"[course, examines, history, african, american,...","[0.0028183346, 0.028966086, 0.046973456, 0.106...",[3939],[0.9921450614929199]
4683,University Curriculum,New York City,University Curriculum,AI for Research and Practice,Studio,,This transdisciplinary course will introduce s...,"[transdisciplinary, course, introduce, student...","[0.012151998, 0.00034424354, -0.0013180437, 0....","[3036, 4442, 4413]","[0.9157572388648987, 0.9056552648544312, 0.904..."
4684,University Curriculum,New York City,University Curriculum,Becoming Visible,Seminar,,How do we represent “missing” earlier women wh...,"[represent, “, missing, ”, earlier, woman, sys...","[0.012569316, 0.0132859675, 0.0038071848, 0.03...",[],[]


In [107]:
data.to_excel('processed_data.xlsx', index=False)
data.to_pickle('processed_data.pkl')

In [105]:
a = '[0, 9, 10]'
a.split(',')

['[0', ' 9', ' 10]']

In [103]:
list(a)

['[', '0', ',', ' ', '9', ',', ' ', '1', '0', ']']

In [101]:
data

Unnamed: 0,College,Campus,Department,Course Title,Schedule Type,Course Topics,Course Description,Tokenized_Description,Description_Vector,Similar_Course_Ids,Similarity_Scores
0,College of Performing Arts,Mobility Course - Within US,Coll of Performing Arts,Professional Internship,Internship/Externship,Entrepreneurism;Management & Business;Non-Prof...,This course is intended for students in the MA...,"[course, intended, student, program, art, mana...","[-0.065985106, 0.017382812, 0.0013290405, 0.02...","[24, 15, 9, 10, 11, 12, 13, 14, 16, 23, 17, 18...","[1.0000001192092896, 1.0000001192092896, 1.000..."
1,College of Performing Arts,New York City,Coll of Performing Arts,Leadership and Team Building,Seminar,Entrepreneurism;Management & Business;Non-Prof...,This course examines the nature and qualities ...,"[course, examines, nature, quality, effective,...","[-0.00096049206, 0.009735107, -0.02440822, 0.0...","[211, 3246, 3245, 3244, 3243, 3242]","[0.9077481031417847, 0.9025726318359375, 0.902..."
2,College of Performing Arts,New York City,Coll of Performing Arts,Sustainable Creative Placemkng,Seminar,Arts and Social Engagement;Community Organizin...,Successful creative community development mani...,"[successful, creative, community, development,...","[-0.019480672, 0.010184708, -0.02209003, 0.077...","[3655, 3081, 3242, 3243, 3244, 3245, 3246, 222...","[0.9510039687156677, 0.9199689030647278, 0.914..."
3,College of Performing Arts,New York City,Coll of Performing Arts,Ent. in Production & Creation,Seminar,Entrepreneurism;Management & Business;Non-Prof...,"Where do personal aesthetics/ethics, practice,...","[personal, aestheticsethics, , practice, , pro...","[-0.003791222, 0.019383593, -0.009674307, 0.07...",[],[]
4,College of Performing Arts,New York City,Coll of Performing Arts,Entrepreneurial Incubator,Seminar,Entrepreneurism;Management & Business;Non-Prof...,To graduate all MA AME students must complete ...,"[graduate, ame, student, must, complete, capst...","[-0.027630484, 0.03092409, 0.00903985, 0.07792...","[4404, 4403, 4402, 1561, 1852, 4307, 2183]","[0.9176570177078247, 0.9176570177078247, 0.917..."
...,...,...,...,...,...,...,...,...,...,...,...
4681,University Curriculum,New York City,University Curriculum,John Cage and The New School,Seminar,Performance;Music,From 1950-1960 the legendary experimental comp...,"[19501960, legendary, experimental, composer, ...","[-0.05165985, 0.035454568, -0.012097135, 0.075...",[65],[1.0000003576278687]
4682,University Curriculum,New York City,University Curriculum,Blackness Through Dress,Seminar,,This course examines the history of African Am...,"[course, examines, history, african, american,...","[0.0028183346, 0.028966086, 0.046973456, 0.106...",[3939],[0.9921450614929199]
4683,University Curriculum,New York City,University Curriculum,AI for Research and Practice,Studio,,This transdisciplinary course will introduce s...,"[transdisciplinary, course, introduce, student...","[0.012151998, 0.00034424354, -0.0013180437, 0....","[3036, 4442, 4413]","[0.9157572388648987, 0.9056552648544312, 0.904..."
4684,University Curriculum,New York City,University Curriculum,Becoming Visible,Seminar,,How do we represent “missing” earlier women wh...,"[represent, “, missing, ”, earlier, woman, sys...","[0.012569316, 0.0132859675, 0.0038071848, 0.03...",[],[]


In [95]:
def show_similars(data, i, limit, display_info=False):
    """
    Display similar course IDs (and optionally names and descriptions) for a given course index.
    
    Args:
    - data: DataFrame containing the course data
    - i: Index of the specific course in the DataFrame
    - limit: Maximum number of elements to return
    - display_info: If True, display names and descriptions of similar courses
    
    Returns:
    - List of similar course IDs (and optionally names and descriptions) truncated to the specified limit
    """
    similar_ids = data.at[i, 'Similar_Course_Ids'][:limit]
    if display_info:
        similar_info = []
        for course_id in similar_ids:
            course_title = data.iloc[course_id]['Course Title']
            course_description = data.iloc[course_id]['Course Description']
            similar_info.append((course_id, course_title, course_description))
        return similar_info
    else:
        return similar_ids

In [98]:
data.sample(5)

Unnamed: 0,College,Campus,Department,Course Title,Schedule Type,Course Topics,Course Description,Tokenized_Description,Description_Vector,Similar_Course_Ids,Similarity_Scores
3277,Parsons School of Design,New York City,"School of Art, Media, and Tech",Professional Internship,Internship/Externship,,This course provides credit to students indepe...,"[course, provides, credit, student, independen...","[-0.06141493, -0.014863756, 0.032002766, 0.033...","[3279, 3278, 3276, 3275, 3274, 3273, 3236, 323...","[0.9999998211860657, 0.9999998211860657, 0.999..."
2647,Parsons School of Design,New York City,Sch. Art and Dsgn Hist and Th,Fashion Cultures: Rec,Discussion,,See course description for PGFS 5030. Open ...,"[see, course, description, pgfs, 5030, open, ,...","[-0.013144531, -0.008048096, -0.014897461, 0.0...","[2649, 2648]","[1.0, 1.0]"
1183,College of Performing Arts,New York City,Mannes,Writing Projects,Independent Study,"Music History, Theory & Criticism;Musical Nota...",Each year students must prepare two article-le...,"[year, student, must, prepare, two, articlelen...","[-0.04896947, 0.023116263, 0.012877214, 0.0732...",[],[]
2826,Parsons School of Design,New York City,Sch. Art and Dsgn Hist and Th,Intro to Visual Culture: Rec,Discussion,"Visual Culture;Art History, Theory & Criticism...",See course description for PLVS 2500. Pathway:...,"[see, course, description, plvs, 2500, pathway...","[0.01965671, -0.035620794, 0.053499576, 0.1222...","[2825, 2821, 2819, 2820, 2824, 2822, 2823, 271...","[1.000000238418579, 1.000000238418579, 1.00000..."
1827,New School for Social Research,New York City,Anthropology,Anthro Workshop/Colloq,Seminar,Anthropology,The Workshop/Colloquium is a zero-credit cours...,"[workshopcolloquium, zerocredit, course, run, ...","[-0.02427903, 0.011380629, 0.018183274, 0.0462...",[],[]


In [99]:
data.iloc[2826]['Course Description']

'See course description for PLVS 2500. Pathway: Visual Studies   Open to: University undergraduate students. Pre-requisites: first-year university writing course and at least one prior history or methods course in art, media, film, or visual culture.  Co-requisite(s): PLVS 2500 Lecture '

In [100]:
# Example usage:
# Display similar courses for the course at index 0, limited to 5 elements, and also display names and descriptions
similar_courses_info = show_similars(data, 2826, limit=5, display_info=True)
for course_info in similar_courses_info:
    print("Course ID:", course_info[0])
    print("Course Title:", course_info[1])
    print("Course Description:", course_info[2])
    print()


Course ID: 2825
Course Title: Intro to Visual Culture: Rec
Course Description: See course description for PLVS 2500. Pathway: Visual Studies   Open to: University undergraduate students. Pre-requisites: first-year university writing course and at least one prior history or methods course in art, media, film, or visual culture.  Co-requisite(s): PLVS 2500 Lecture 

Course ID: 2821
Course Title: Intro to Visual Culture: Rec
Course Description: See course description for PLVS 2500. Pathway: Visual Studies   Open to: University undergraduate students. Pre-requisites: first-year university writing course and at least one prior history or methods course in art, media, film, or visual culture.  Co-requisite(s): PLVS 2500 Lecture 

Course ID: 2819
Course Title: Intro to Visual Culture: Rec
Course Description: See course description for PLVS 2500. Pathway: Visual Studies   Open to: University undergraduate students. Pre-requisites: first-year university writing course and at least one prior his