In [1]:
import pandas as pd
import numpy as np

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from gensim.models import Word2Vec
from gensim.models import FastText
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity

from huggingface_hub import hf_hub_download
from gensim.models import KeyedVectors
import fasttext
import nltk
nltk.download("punkt")
nltk.download('punkt_tab')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\PC\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\PC\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


True

In [2]:
courses_df = pd.read_csv('take-home-2/courses.csv')

courses_df = courses_df.fillna('')
courses_df['text'] = courses_df[['preprocessed_title', 'preprocessed_description', 'preprocessed_department_name', 'preprocessed_topics']].values.tolist()

# topics and department names are separated by comma, we don't want that in the text
courses_df['text'] = [' '.join(t).replace(', ', '') for t in courses_df['text']]

# Build Vectors & Models

In [None]:
# BoW
def bow_embeddings(corpus):
    count_vectorizer = CountVectorizer()
    embeddings_bow = count_vectorizer.fit_transform(corpus)
    return embeddings_bow, count_vectorizer

# TF-IDF
def tfidf_embeddings(corpus):
    tfidf_vectorizer = TfidfVectorizer()
    embeddings_tfidf = tfidf_vectorizer.fit_transform(corpus)
    return embeddings_tfidf, tfidf_vectorizer

# Word2Vec
def word2vec_model(corpus):
    kv = KeyedVectors.load_word2vec_format("take-home-2/wiki-news-300d-1M.vec")
    
    # only retrieving necessary tokens, to save memory
    all_tokens = set()
    
    for text in corpus:
        tokens = nltk.word_tokenize(text)
        all_tokens.update(tokens)
        
    filtered_tokens = [token for token in all_tokens if token in kv]
    
    kv_subset = KeyedVectors(vector_size=kv.vector_size)
    kv_subset.add_vectors(filtered_tokens, [kv[token] for token in filtered_tokens])
    return kv_subset
    
# FastText
def fasttext_model(corpus):
    model = FastText(corpus, vector_size=100, window=5, min_count=1, workers=-1)
    
    return model.wv

    # Wanted to use this, but not enough RAM
    
    # model_path = hf_hub_download(repo_id="facebook/fasttext-en-vectors", filename="model.bin")
    # ft_model = fasttext.load_model(model_path)
    
    # vector_size = ft_model.get_dimension()
    # kv = KeyedVectors(vector_size=vector_size)
    
    # all_tokens = set()
    
    # for text in corpus:
    #     tokens = nltk.word_tokenize(text)
    #     all_tokens.update(tokens)

    # word_vectors = []
    # words_in_model = []
    # for token in all_tokens:
    #     try:
    #         word_vectors.append(ft_model.get_word_vector(token))
    #         words_in_model.append(token)
    #     except:
    #         pass

    # kv.add_vectors(words_in_model, word_vectors)
    # return kv

# Sentence Transformers
def sentence_transformers_embeddings(corpus):
    model = SentenceTransformer("all-MiniLM-L6-v2")
    return model.encode(corpus), model

In [22]:
# search courses similar to this given course

courses_df.loc[1797, ['title', 'description', 'department_name', 'topics']].to_dict()

{'title': 'Machine Learning',
 'description': '6.867 is an introductory course on machine learning which gives an overview of many concepts, techniques, and algorithms in machine learning, beginning with topics such as classification and linear regression and ending up with more recent topics such as boosting, support vector machines, hidden Markov models, and Bayesian networks. The course will give the student the basic ideas and intuition behind modern machine learning methods as well as a bit more formal understanding of how, why, and when they work. The underlying theme in the course is statistical inference as it provides the foundation for most of the methods covered.',
 'department_name': 'Electrical Engineering and Computer Science',
 'topics': 'Engineering, Science, Computer Science, Artificial Intelligence, Mathematics, Probability and Statistics, Cognitive Science, Algorithms and Data Structures'}

In [24]:
courses_df.loc[1797, ['text']].to_dict()

{'text': 'machine learning 6867 introductory course machine learning give overview many concept technique algorithm machine learning beginning topic classification linear regression ending recent topic boosting support vector machine hidden markov model bayesian network course give student basic idea intuition behind modern machine learning method well bit formal understanding work underlying theme course statistical inference provides foundation method covered electrical engineering computer science engineeringsciencecomputer scienceartificial intelligencemathematicsprobability statisticcognitive sciencealgorithm data structure'}

# BoW x Cosine

In [None]:
sentence = courses_df.loc[1797, 'text']

corpus_embeddings, vectorizer = bow_embeddings(courses_df['text'])
pd.DataFrame({"text": courses_df['text'], "bow_vectors": corpus_embeddings.toarray().tolist()}).to_csv('take-home-2/vectors/bow_vectors.csv', index=False)
# to reverse the cos similarity value, so that 0 is the most similar
dist = 1-cosine_similarity(vectorizer.transform([sentence]), corpus_embeddings).flatten()
bow_cos_dist = courses_df.iloc[np.argsort(dist), :]
bow_cos_dist.loc[:,['cos_dist']] = np.sort(dist)
bow_cos_dist.to_csv('take-home-2/similarities/bow_cos_dist.csv', index=False)
bow_cos_dist[:11]

Unnamed: 0,number,title,level,semester,year,description,topics,instructors,department_name,preprocessed_description,preprocessed_title,preprocessed_topics,preprocessed_department_name,text,cos_dist
1797,6.867,Machine Learning,Graduate,Fall,2006.0,6.867 is an introductory course on machine lea...,"Engineering, Science, Computer Science, Artifi...","Prof. Tommi Jaakkola, Ali Mohammad, Rohit Singh",Electrical Engineering and Computer Science,6867 introductory course machine learning give...,machine learning,"engineering, science, computer science, artifi...",electrical engineering computer science,machine learning 6867 introductory course mach...,6.661338e-16
1218,18.465,Topics in Statistics: Statistical Learning Theory,Graduate,Spring,2007.0,The main goal of this course is to study the g...,"Engineering, Science, Computer Science, Artifi...",Prof. Dmitry Panchenko,Mathematics,main goal course study generalization ability ...,topic statistic statistical learning theory,"engineering, science, computer science, artifi...",mathematics,topic statistic statistical learning theory ma...,0.5113727
715,6.036,Introduction to Machine Learning,Undergraduate,Fall,2020.0,"This course introduces principles, algorithms,...","Engineering, Computer Science, Artificial Inte...","Prof. Duane Boning, Prof. Isaac Chuang, Prof. ...",Electrical Engineering and Computer Science,course introduces principle algorithm applicat...,introduction machine learning,"engineering, computer science, artificial inte...",electrical engineering computer science,introduction machine learning course introduce...,0.5135772
1756,18.409,Algorithmic Aspects of Machine Learning,Graduate,Spring,2015.0,This course is organized around algorithmic is...,"Engineering, Computer Science, Mathematics, Ap...",Prof. Ankur Moitra,Mathematics,course organized around algorithmic issue aris...,algorithmic aspect machine learning,"engineering, computer science, mathematics, ap...",mathematics,algorithmic aspect machine learning course org...,0.5199206
1519,6.S897,Machine Learning for Healthcare,Graduate,Spring,2019.0,This course introduces students to machine lea...,"Engineering, Computer Science, Artificial Inte...","Prof. David Sontag, Prof. Peter Szolovits","Electrical Engineering and Computer Science, H...",course introduces student machine learning hea...,machine learning healthcare,"engineering, computer science, artificial inte...","electrical engineering computer science, healt...",machine learning healthcare course introduces ...,0.5409302
1697,18.065,"Matrix Methods in Data Analysis, Signal Proces...",Undergraduate,Spring,2018.0,Linear algebra concepts are key for understand...,"Engineering, Mathematics, Electrical Engineeri...",Prof. Gilbert Strang,Mathematics,linear algebra concept key understanding creat...,matrix method data analysis signal processing ...,"engineering, mathematics, electrical engineeri...",mathematics,matrix method data analysis signal processing ...,0.5546331
173,15.097,Prediction: Machine Learning and Statistics,Graduate,Spring,2012.0,Prediction is at the heart of almost every sci...,"Engineering, Computer Science, Artificial Inte...",Prof. Cynthia Rudin,Sloan School of Management,prediction heart almost every scientific disci...,prediction machine learning statistic,"engineering, computer science, artificial inte...",sloan school management,prediction machine learning statistic predicti...,0.5607724
2134,9.520-A,Networks for Learning: Regression and Classifi...,Graduate,Spring,2001.0,The course focuses on the problem of supervise...,"Science, Mathematics, Probability and Statisti...","Prof. Tomaso Poggio, Dr. Alessandro Verri",Brain and Cognitive Sciences,course focus problem supervised learning withi...,network learning regression classification,"science, mathematics, probability statistic, c...",brain cognitive science,network learning regression classification cou...,0.5937594
259,18.657,Mathematics of Machine Learning,Graduate,Fall,2015.0,"Broadly speaking, Machine Learning refers to t...","Engineering, Computer Science, Artificial Inte...",Prof. Philippe Rigollet,Mathematics,broadly speaking machine learning refers autom...,mathematics machine learning,"engineering, computer science, artificial inte...",mathematics,mathematics machine learning broadly speaking ...,0.60964
154,9.520,Statistical Learning Theory and Applications,Graduate,Spring,2006.0,This course is for upper-level graduate studen...,"Science, Mathematics, Probability and Statisti...",Prof. Tomaso Poggio,Brain and Cognitive Sciences,course upperlevel graduate student planning ca...,statistical learning theory application,"science, mathematics, probability statistic, b...",brain cognitive science,statistical learning theory application course...,0.6235877


# TF-IDF x Cosine

In [None]:
sentence = courses_df.loc[1797, 'text']

corpus_embeddings, vectorizer = tfidf_embeddings(courses_df['text'])
pd.DataFrame({"text": courses_df['text'], "tfidf_vectors": corpus_embeddings.toarray().tolist()}).to_csv('take-home-2/vectors/tfidf_vectors.csv', index=False)
# to reverse the cos similarity value, so that 0 is the most similar
dist = 1-cosine_similarity(vectorizer.transform([sentence]), corpus_embeddings).flatten()
tfidf_cos_dist = courses_df.iloc[np.argsort(dist), :]
tfidf_cos_dist.loc[:,['cos_dist']] = np.sort(dist)
tfidf_cos_dist.to_csv('take-home-2/similarities/tfidf_cos_dist.csv', index=False)
tfidf_cos_dist[:11]

Unnamed: 0,number,title,level,semester,year,description,topics,instructors,department_name,preprocessed_description,preprocessed_title,preprocessed_topics,preprocessed_department_name,text,cos_dist
1797,6.867,Machine Learning,Graduate,Fall,2006.0,6.867 is an introductory course on machine lea...,"Engineering, Science, Computer Science, Artifi...","Prof. Tommi Jaakkola, Ali Mohammad, Rohit Singh",Electrical Engineering and Computer Science,6867 introductory course machine learning give...,machine learning,"engineering, science, computer science, artifi...",electrical engineering computer science,machine learning 6867 introductory course mach...,0.0
1218,18.465,Topics in Statistics: Statistical Learning Theory,Graduate,Spring,2007.0,The main goal of this course is to study the g...,"Engineering, Science, Computer Science, Artifi...",Prof. Dmitry Panchenko,Mathematics,main goal course study generalization ability ...,topic statistic statistical learning theory,"engineering, science, computer science, artifi...",mathematics,topic statistic statistical learning theory ma...,0.525189
1756,18.409,Algorithmic Aspects of Machine Learning,Graduate,Spring,2015.0,This course is organized around algorithmic is...,"Engineering, Computer Science, Mathematics, Ap...",Prof. Ankur Moitra,Mathematics,course organized around algorithmic issue aris...,algorithmic aspect machine learning,"engineering, computer science, mathematics, ap...",mathematics,algorithmic aspect machine learning course org...,0.627604
607,9.520,Statistical Learning Theory and Applications,Graduate,Spring,2003.0,Focuses on the problem of supervised learning ...,"Engineering, Science, Computer Science, Artifi...","Dr. Sayan Mukherjee, Prof. Tomaso Poggio, Alex...",Brain and Cognitive Sciences,focus problem supervised learning perspective ...,statistical learning theory application,"engineering, science, computer science, artifi...",brain cognitive science,statistical learning theory application focus ...,0.63008
715,6.036,Introduction to Machine Learning,Undergraduate,Fall,2020.0,"This course introduces principles, algorithms,...","Engineering, Computer Science, Artificial Inte...","Prof. Duane Boning, Prof. Isaac Chuang, Prof. ...",Electrical Engineering and Computer Science,course introduces principle algorithm applicat...,introduction machine learning,"engineering, computer science, artificial inte...",electrical engineering computer science,introduction machine learning course introduce...,0.671841
173,15.097,Prediction: Machine Learning and Statistics,Graduate,Spring,2012.0,Prediction is at the heart of almost every sci...,"Engineering, Computer Science, Artificial Inte...",Prof. Cynthia Rudin,Sloan School of Management,prediction heart almost every scientific disci...,prediction machine learning statistic,"engineering, computer science, artificial inte...",sloan school management,prediction machine learning statistic predicti...,0.678863
2134,9.520-A,Networks for Learning: Regression and Classifi...,Graduate,Spring,2001.0,The course focuses on the problem of supervise...,"Science, Mathematics, Probability and Statisti...","Prof. Tomaso Poggio, Dr. Alessandro Verri",Brain and Cognitive Sciences,course focus problem supervised learning withi...,network learning regression classification,"science, mathematics, probability statistic, c...",brain cognitive science,network learning regression classification cou...,0.683056
1697,18.065,"Matrix Methods in Data Analysis, Signal Proces...",Undergraduate,Spring,2018.0,Linear algebra concepts are key for understand...,"Engineering, Mathematics, Electrical Engineeri...",Prof. Gilbert Strang,Mathematics,linear algebra concept key understanding creat...,matrix method data analysis signal processing ...,"engineering, mathematics, electrical engineeri...",mathematics,matrix method data analysis signal processing ...,0.690946
1519,6.S897,Machine Learning for Healthcare,Graduate,Spring,2019.0,This course introduces students to machine lea...,"Engineering, Computer Science, Artificial Inte...","Prof. David Sontag, Prof. Peter Szolovits","Electrical Engineering and Computer Science, H...",course introduces student machine learning hea...,machine learning healthcare,"engineering, computer science, artificial inte...","electrical engineering computer science, healt...",machine learning healthcare course introduces ...,0.704394
154,9.520,Statistical Learning Theory and Applications,Graduate,Spring,2006.0,This course is for upper-level graduate studen...,"Science, Mathematics, Probability and Statisti...",Prof. Tomaso Poggio,Brain and Cognitive Sciences,course upperlevel graduate student planning ca...,statistical learning theory application,"science, mathematics, probability statistic, b...",brain cognitive science,statistical learning theory application course...,0.727064


# Word2Vec x WMD

In [None]:
sentence = courses_df.loc[1797, 'text']

kv = word2vec_model(courses_df['text'])
pd.DataFrame({"word": kv.index_to_key, "vectors": [kv[word] for word in kv.index_to_key]}).to_csv('take-home-2/vectors/w2v_word_vectors.csv', index=False)
dists = []
for text in courses_df['text']:
    # distance between two list of word
    dist = kv.wmdistance(nltk.word_tokenize(sentence), nltk.word_tokenize(text))
    dists.append(dist)

w2v_wmd_dist = courses_df.iloc[np.argsort(dists), :]
w2v_wmd_dist.loc[:,['wmd_dist']] = np.sort(dists)
w2v_wmd_dist.to_csv('take-home-2/similarities/w2v_wmd_dist.csv', index=False)
w2v_wmd_dist[:11]

Unnamed: 0,number,title,level,semester,year,description,topics,instructors,department_name,preprocessed_description,preprocessed_title,preprocessed_topics,preprocessed_department_name,text,wmd_dist
1797,6.867,Machine Learning,Graduate,Fall,2006.0,6.867 is an introductory course on machine lea...,"Engineering, Science, Computer Science, Artifi...","Prof. Tommi Jaakkola, Ali Mohammad, Rohit Singh",Electrical Engineering and Computer Science,6867 introductory course machine learning give...,machine learning,"engineering, science, computer science, artifi...",electrical engineering computer science,machine learning 6867 introductory course mach...,0.0
1780,6.825,Techniques in Artificial Intelligence (SMA 5504),Graduate,Fall,2002.0,6.825 is a graduate-level introduction to arti...,"Engineering, Science, Computer Science, Artifi...","Prof. Leslie Kaelbling, Prof. Tomás Lozano-Pérez",Electrical Engineering and Computer Science,6825 graduatelevel introduction artificial int...,technique artificial intelligence sma 5504,"engineering, science, computer science, artifi...",electrical engineering computer science,technique artificial intelligence sma 5504 682...,0.693753
1218,18.465,Topics in Statistics: Statistical Learning Theory,Graduate,Spring,2007.0,The main goal of this course is to study the g...,"Engineering, Science, Computer Science, Artifi...",Prof. Dmitry Panchenko,Mathematics,main goal course study generalization ability ...,topic statistic statistical learning theory,"engineering, science, computer science, artifi...",mathematics,topic statistic statistical learning theory ma...,0.702761
154,9.520,Statistical Learning Theory and Applications,Graduate,Spring,2006.0,This course is for upper-level graduate studen...,"Science, Mathematics, Probability and Statisti...",Prof. Tomaso Poggio,Brain and Cognitive Sciences,course upperlevel graduate student planning ca...,statistical learning theory application,"science, mathematics, probability statistic, b...",brain cognitive science,statistical learning theory application course...,0.72138
2134,9.520-A,Networks for Learning: Regression and Classifi...,Graduate,Spring,2001.0,The course focuses on the problem of supervise...,"Science, Mathematics, Probability and Statisti...","Prof. Tomaso Poggio, Dr. Alessandro Verri",Brain and Cognitive Sciences,course focus problem supervised learning withi...,network learning regression classification,"science, mathematics, probability statistic, c...",brain cognitive science,network learning regression classification cou...,0.726982
715,6.036,Introduction to Machine Learning,Undergraduate,Fall,2020.0,"This course introduces principles, algorithms,...","Engineering, Computer Science, Artificial Inte...","Prof. Duane Boning, Prof. Isaac Chuang, Prof. ...",Electrical Engineering and Computer Science,course introduces principle algorithm applicat...,introduction machine learning,"engineering, computer science, artificial inte...",electrical engineering computer science,introduction machine learning course introduce...,0.729386
1697,18.065,"Matrix Methods in Data Analysis, Signal Proces...",Undergraduate,Spring,2018.0,Linear algebra concepts are key for understand...,"Engineering, Mathematics, Electrical Engineeri...",Prof. Gilbert Strang,Mathematics,linear algebra concept key understanding creat...,matrix method data analysis signal processing ...,"engineering, mathematics, electrical engineeri...",mathematics,matrix method data analysis signal processing ...,0.73394
607,9.520,Statistical Learning Theory and Applications,Graduate,Spring,2003.0,Focuses on the problem of supervised learning ...,"Engineering, Science, Computer Science, Artifi...","Dr. Sayan Mukherjee, Prof. Tomaso Poggio, Alex...",Brain and Cognitive Sciences,focus problem supervised learning perspective ...,statistical learning theory application,"engineering, science, computer science, artifi...",brain cognitive science,statistical learning theory application focus ...,0.734341
2286,6.438,Algorithms for Inference,Graduate,Fall,2014.0,This is a graduate-level introduction to the p...,"Engineering, Computer Science, Mathematics, Pr...",Prof. Devavrat Shah,Electrical Engineering and Computer Science,graduatelevel introduction principle statistic...,algorithm inference,"engineering, computer science, mathematics, pr...",electrical engineering computer science,algorithm inference graduatelevel introduction...,0.734567
2136,HST.951J,Medical Decision Support,Graduate,Fall,2005.0,This course presents the main concepts of deci...,"Engineering, Computer Science, Artificial Inte...","Prof. Lucila Ohno-Machado, Prof. Staal Vinterbo","Health Sciences and Technology, Electrical Eng...",course present main concept decision analysis ...,medical decision support,"engineering, computer science, artificial inte...","health science technology, electrical engineer...",medical decision support course present main c...,0.740701


# Word2Vec x Cosine

Instead of calculating distance by word, we can try to calculate distance by sentence. We need to convert a list of vectors for each document into a single vector by averaging the vectors value.

In [None]:
def avg_vector(vectors:list, n_words:int) -> list:
    # Calculate the average vector from a list of vectors
    vector_sum = np.sum(vectors, axis=0)
    if n_words != 0:
        return np.divide(vector_sum, n_words)

In [None]:
corpus_vectors = []

# Create a list of vectors for each text in the corpus
for text in courses_df['text']:
    tokens = nltk.word_tokenize(text)
    vectors = []
    for token in tokens:
        if token in kv:
            vectors.append(kv[token])
        else:
            # if the token is not in the model, we use a vector of ones
            vectors.append(np.ones(kv.vector_size, dtype='float32'))
    corpus_vectors.append(vectors)

# Average the word vectors for each text
corpus_vectors = [avg_vector(vector, len(vector)) for vector in corpus_vectors]

pd.DataFrame({"text": courses_df['text'], "w2v_sentence_vector": corpus_vectors}).to_csv('take-home-2/vectors/w2v_sentence_vectors.csv', index=False)

sentence = courses_df.loc[1797, 'text']
sentence_tokens = nltk.word_tokenize(sentence)
sentence_vector = [avg_vector([kv[token] if token in kv else np.ones(kv.vector_size, dtype='float32') for token in sentence_tokens], len(sentence_tokens))]

# reverse the cos similarity value, so that 0 is the most similar
dist = 1-cosine_similarity(sentence_vector, corpus_vectors).flatten()
w2v_cos_dist = courses_df.iloc[np.argsort(dist), :]
w2v_cos_dist.loc[:,['cos_dist']] = np.sort(dist)
w2v_cos_dist.to_csv('take-home-2/similarities/w2v_cos_dist.csv', index=False)
w2v_cos_dist[:11]

Unnamed: 0,number,title,level,semester,year,description,topics,instructors,department_name,preprocessed_description,preprocessed_title,preprocessed_topics,preprocessed_department_name,text,cos_dist
1797,6.867,Machine Learning,Graduate,Fall,2006.0,6.867 is an introductory course on machine lea...,"Engineering, Science, Computer Science, Artifi...","Prof. Tommi Jaakkola, Ali Mohammad, Rohit Singh",Electrical Engineering and Computer Science,6867 introductory course machine learning give...,machine learning,"engineering, science, computer science, artifi...",electrical engineering computer science,machine learning 6867 introductory course mach...,5.960464e-08
1631,6.S191,Introduction to Deep Learning,Undergraduate,January IAP,2020.0,This is MIT's introductory course on deep lear...,"Engineering, Computer Science, Artificial Inte...","Alexander Amini, Ava Soleimany",Electrical Engineering and Computer Science,mit introductory course deep learning method a...,introduction deep learning,"engineering, computer science, artificial inte...",electrical engineering computer science,introduction deep learning mit introductory co...,0.01353943
607,9.520,Statistical Learning Theory and Applications,Graduate,Spring,2003.0,Focuses on the problem of supervised learning ...,"Engineering, Science, Computer Science, Artifi...","Dr. Sayan Mukherjee, Prof. Tomaso Poggio, Alex...",Brain and Cognitive Sciences,focus problem supervised learning perspective ...,statistical learning theory application,"engineering, science, computer science, artifi...",brain cognitive science,statistical learning theory application focus ...,0.01458144
2001,MAS.622J,Pattern Recognition and Analysis,Graduate,Fall,2006.0,This class deals with the fundamentals of char...,"Engineering, Science, Mathematics, Probability...","Media Lab Faculty and Staff, Bo Morgan, Prof. ...","Media Arts and Sciences, Civil and Environment...",class deal fundamental characterizing recogniz...,pattern recognition analysis,"engineering, science, mathematics, probability...","medium art science, civil environmental engine...",pattern recognition analysis class deal fundam...,0.01546454
715,6.036,Introduction to Machine Learning,Undergraduate,Fall,2020.0,"This course introduces principles, algorithms,...","Engineering, Computer Science, Artificial Inte...","Prof. Duane Boning, Prof. Isaac Chuang, Prof. ...",Electrical Engineering and Computer Science,course introduces principle algorithm applicat...,introduction machine learning,"engineering, computer science, artificial inte...",electrical engineering computer science,introduction machine learning course introduce...,0.01668727
98,6.041,Probabilistic Systems Analysis and Applied Pro...,Undergraduate,Fall,2010.0,"Welcome to 6.041/6.431, a subject on the model...","Engineering, Mathematics, Systems Engineering,...",Prof. John Tsitsiklis,Electrical Engineering and Computer Science,welcome 60416431 subject modeling analysis ran...,probabilistic system analysis applied probability,"engineering, mathematics, system engineering, ...",electrical engineering computer science,probabilistic system analysis applied probabil...,0.01685762
2245,6.080,Great Ideas in Theoretical Computer Science,Undergraduate,Spring,2008.0,This course provides a challenging introductio...,"Engineering, Computer Science, Mathematics, Ap...",Prof. Scott Aaronson,Electrical Engineering and Computer Science,course provides challenging introduction centr...,great idea theoretical computer science,"engineering, computer science, mathematics, ap...",electrical engineering computer science,great idea theoretical computer science course...,0.01773494
1705,6.01SC,Introduction to Electrical Engineering and Com...,Undergraduate,Spring,2011.0,This course provides an integrated introductio...,"Engineering, Computer Science, Artificial Inte...","Prof. Harold Abelson, Prof. Isaac Chuang, Prof...",Electrical Engineering and Computer Science,course provides integrated introduction electr...,introduction electrical engineering computer s...,"engineering, computer science, artificial inte...",electrical engineering computer science,introduction electrical engineering computer s...,0.01817471
721,6.S980,Machine Learning for Inverse Graphics,Graduate,Fall,2022.0,This course covers fundamental and advanced te...,"Engineering, Computer Science, Artificial Inte...",Prof. Vincent Sitzmann,Electrical Engineering and Computer Science,course cover fundamental advanced technique fi...,machine learning inverse graphic,"engineering, computer science, artificial inte...",electrical engineering computer science,machine learning inverse graphic course cover ...,0.01864868
1699,RES.9-009,Introduction to Computational Neuroscience wit...,"Graduate, Non-Credit",January IAP,2025.0,"In this course, you will learn the basics of c...","Engineering, Science, Computer Science, Health...","Prof. Lilianne Mujica-Parodi, Dr. Haris Organt...",Brain and Cognitive Sciences,course learn basic computational neuroscience ...,introduction computational neuroscience neuroblox,"engineering, science, computer science, health...",brain cognitive science,introduction computational neuroscience neurob...,0.01886159


# FastText x WMD

In [14]:
sentence = courses_df.loc[1797, 'text']

ft = fasttext_model(courses_df['text'])
pd.DataFrame({"word": ft.index_to_key, "vectors": [ft[word] for word in ft.index_to_key]}).to_csv('take-home-2/vectors/ft_word_vectors.csv', index=False)
dists = []
for text in courses_df['text']:
    dist = ft.wmdistance(nltk.word_tokenize(sentence), nltk.word_tokenize(text))
    dists.append(dist)
    
ft_wmd_dist = courses_df.iloc[np.argsort(dists), :]
ft_wmd_dist.loc[:,['wmd_dist']] = np.sort(dists)
ft_wmd_dist.to_csv('take-home-2/similarities/ft_wmd_dist.csv', index=False)
ft_wmd_dist[:11]

Unnamed: 0,number,title,level,semester,year,description,topics,instructors,department_name,preprocessed_description,preprocessed_title,preprocessed_topics,preprocessed_department_name,text,wmd_dist
1797,6.867,Machine Learning,Graduate,Fall,2006.0,6.867 is an introductory course on machine lea...,"Engineering, Science, Computer Science, Artifi...","Prof. Tommi Jaakkola, Ali Mohammad, Rohit Singh",Electrical Engineering and Computer Science,6867 introductory course machine learning give...,machine learning,"engineering, science, computer science, artifi...",electrical engineering computer science,machine learning 6867 introductory course mach...,0.0
1780,6.825,Techniques in Artificial Intelligence (SMA 5504),Graduate,Fall,2002.0,6.825 is a graduate-level introduction to arti...,"Engineering, Science, Computer Science, Artifi...","Prof. Leslie Kaelbling, Prof. Tomás Lozano-Pérez",Electrical Engineering and Computer Science,6825 graduatelevel introduction artificial int...,technique artificial intelligence sma 5504,"engineering, science, computer science, artifi...",electrical engineering computer science,technique artificial intelligence sma 5504 682...,0.820359
1218,18.465,Topics in Statistics: Statistical Learning Theory,Graduate,Spring,2007.0,The main goal of this course is to study the g...,"Engineering, Science, Computer Science, Artifi...",Prof. Dmitry Panchenko,Mathematics,main goal course study generalization ability ...,topic statistic statistical learning theory,"engineering, science, computer science, artifi...",mathematics,topic statistic statistical learning theory ma...,0.820833
607,9.520,Statistical Learning Theory and Applications,Graduate,Spring,2003.0,Focuses on the problem of supervised learning ...,"Engineering, Science, Computer Science, Artifi...","Dr. Sayan Mukherjee, Prof. Tomaso Poggio, Alex...",Brain and Cognitive Sciences,focus problem supervised learning perspective ...,statistical learning theory application,"engineering, science, computer science, artifi...",brain cognitive science,statistical learning theory application focus ...,0.86717
715,6.036,Introduction to Machine Learning,Undergraduate,Fall,2020.0,"This course introduces principles, algorithms,...","Engineering, Computer Science, Artificial Inte...","Prof. Duane Boning, Prof. Isaac Chuang, Prof. ...",Electrical Engineering and Computer Science,course introduces principle algorithm applicat...,introduction machine learning,"engineering, computer science, artificial inte...",electrical engineering computer science,introduction machine learning course introduce...,0.889718
1519,6.S897,Machine Learning for Healthcare,Graduate,Spring,2019.0,This course introduces students to machine lea...,"Engineering, Computer Science, Artificial Inte...","Prof. David Sontag, Prof. Peter Szolovits","Electrical Engineering and Computer Science, H...",course introduces student machine learning hea...,machine learning healthcare,"engineering, computer science, artificial inte...","electrical engineering computer science, healt...",machine learning healthcare course introduces ...,0.909089
154,9.520,Statistical Learning Theory and Applications,Graduate,Spring,2006.0,This course is for upper-level graduate studen...,"Science, Mathematics, Probability and Statisti...",Prof. Tomaso Poggio,Brain and Cognitive Sciences,course upperlevel graduate student planning ca...,statistical learning theory application,"science, mathematics, probability statistic, b...",brain cognitive science,statistical learning theory application course...,0.913724
259,18.657,Mathematics of Machine Learning,Graduate,Fall,2015.0,"Broadly speaking, Machine Learning refers to t...","Engineering, Computer Science, Artificial Inte...",Prof. Philippe Rigollet,Mathematics,broadly speaking machine learning refers autom...,mathematics machine learning,"engineering, computer science, artificial inte...",mathematics,mathematics machine learning broadly speaking ...,0.916447
2134,9.520-A,Networks for Learning: Regression and Classifi...,Graduate,Spring,2001.0,The course focuses on the problem of supervise...,"Science, Mathematics, Probability and Statisti...","Prof. Tomaso Poggio, Dr. Alessandro Verri",Brain and Cognitive Sciences,course focus problem supervised learning withi...,network learning regression classification,"science, mathematics, probability statistic, c...",brain cognitive science,network learning regression classification cou...,0.920859
1697,18.065,"Matrix Methods in Data Analysis, Signal Proces...",Undergraduate,Spring,2018.0,Linear algebra concepts are key for understand...,"Engineering, Mathematics, Electrical Engineeri...",Prof. Gilbert Strang,Mathematics,linear algebra concept key understanding creat...,matrix method data analysis signal processing ...,"engineering, mathematics, electrical engineeri...",mathematics,matrix method data analysis signal processing ...,0.922326


# Sentence Transformers x Cosine

In [None]:
sentence = courses_df.loc[1797, 'text']

corpus_embeddings, model = sentence_transformers_embeddings(courses_df['text'])
pd.DataFrame({"text": courses_df['text'], "st_vectors": corpus_embeddings.tolist()}).to_csv('take-home-2/vectors/st_vectors.csv', index=False)
# reverse the cos similarity value, so that 0 is the most similar
dists = 1-cosine_similarity([model.encode(sentence)], corpus_embeddings).flatten()
st_cos_dist = courses_df.iloc[np.argsort(dists), :]
st_cos_dist.loc[:,['cos_dist']] = np.sort(dists)
st_cos_dist.to_csv('take-home-2/similarities/st_cos_dist.csv', index=False)
st_cos_dist[:11]

Unnamed: 0,number,title,level,semester,year,description,topics,instructors,department_name,preprocessed_description,preprocessed_title,preprocessed_topics,preprocessed_department_name,text,cos_dist
1797,6.867,Machine Learning,Graduate,Fall,2006.0,6.867 is an introductory course on machine lea...,"Engineering, Science, Computer Science, Artifi...","Prof. Tommi Jaakkola, Ali Mohammad, Rohit Singh",Electrical Engineering and Computer Science,6867 introductory course machine learning give...,machine learning,"engineering, science, computer science, artifi...",electrical engineering computer science,machine learning 6867 introductory course mach...,1.192093e-07
1218,18.465,Topics in Statistics: Statistical Learning Theory,Graduate,Spring,2007.0,The main goal of this course is to study the g...,"Engineering, Science, Computer Science, Artifi...",Prof. Dmitry Panchenko,Mathematics,main goal course study generalization ability ...,topic statistic statistical learning theory,"engineering, science, computer science, artifi...",mathematics,topic statistic statistical learning theory ma...,0.19941
1780,6.825,Techniques in Artificial Intelligence (SMA 5504),Graduate,Fall,2002.0,6.825 is a graduate-level introduction to arti...,"Engineering, Science, Computer Science, Artifi...","Prof. Leslie Kaelbling, Prof. Tomás Lozano-Pérez",Electrical Engineering and Computer Science,6825 graduatelevel introduction artificial int...,technique artificial intelligence sma 5504,"engineering, science, computer science, artifi...",electrical engineering computer science,technique artificial intelligence sma 5504 682...,0.2439162
2001,MAS.622J,Pattern Recognition and Analysis,Graduate,Fall,2006.0,This class deals with the fundamentals of char...,"Engineering, Science, Mathematics, Probability...","Media Lab Faculty and Staff, Bo Morgan, Prof. ...","Media Arts and Sciences, Civil and Environment...",class deal fundamental characterizing recogniz...,pattern recognition analysis,"engineering, science, mathematics, probability...","medium art science, civil environmental engine...",pattern recognition analysis class deal fundam...,0.2652936
715,6.036,Introduction to Machine Learning,Undergraduate,Fall,2020.0,"This course introduces principles, algorithms,...","Engineering, Computer Science, Artificial Inte...","Prof. Duane Boning, Prof. Isaac Chuang, Prof. ...",Electrical Engineering and Computer Science,course introduces principle algorithm applicat...,introduction machine learning,"engineering, computer science, artificial inte...",electrical engineering computer science,introduction machine learning course introduce...,0.2720942
259,18.657,Mathematics of Machine Learning,Graduate,Fall,2015.0,"Broadly speaking, Machine Learning refers to t...","Engineering, Computer Science, Artificial Inte...",Prof. Philippe Rigollet,Mathematics,broadly speaking machine learning refers autom...,mathematics machine learning,"engineering, computer science, artificial inte...",mathematics,mathematics machine learning broadly speaking ...,0.2878628
2134,9.520-A,Networks for Learning: Regression and Classifi...,Graduate,Spring,2001.0,The course focuses on the problem of supervise...,"Science, Mathematics, Probability and Statisti...","Prof. Tomaso Poggio, Dr. Alessandro Verri",Brain and Cognitive Sciences,course focus problem supervised learning withi...,network learning regression classification,"science, mathematics, probability statistic, c...",brain cognitive science,network learning regression classification cou...,0.2889742
173,15.097,Prediction: Machine Learning and Statistics,Graduate,Spring,2012.0,Prediction is at the heart of almost every sci...,"Engineering, Computer Science, Artificial Inte...",Prof. Cynthia Rudin,Sloan School of Management,prediction heart almost every scientific disci...,prediction machine learning statistic,"engineering, computer science, artificial inte...",sloan school management,prediction machine learning statistic predicti...,0.2956704
154,9.520,Statistical Learning Theory and Applications,Graduate,Spring,2006.0,This course is for upper-level graduate studen...,"Science, Mathematics, Probability and Statisti...",Prof. Tomaso Poggio,Brain and Cognitive Sciences,course upperlevel graduate student planning ca...,statistical learning theory application,"science, mathematics, probability statistic, b...",brain cognitive science,statistical learning theory application course...,0.3100022
1697,18.065,"Matrix Methods in Data Analysis, Signal Proces...",Undergraduate,Spring,2018.0,Linear algebra concepts are key for understand...,"Engineering, Mathematics, Electrical Engineeri...",Prof. Gilbert Strang,Mathematics,linear algebra concept key understanding creat...,matrix method data analysis signal processing ...,"engineering, mathematics, electrical engineeri...",mathematics,matrix method data analysis signal processing ...,0.3104867


# Model Evaluation

We don't have the ground truth, so we will manually check relevancy of recommended items in all methods

## Traditional Methods (BoW, TF-IDF)

In [None]:
pd.set_option('display.max_rows', 11)

print("BoW x Cosine Method:")
display(bow_cos_dist.loc[:11, ['title', 'description', 'topics', 'department_name', 'cos_dist']])

print("TFIDF x Cosine Method:")
display(tfidf_cos_dist.loc[:11, ['title', 'description', 'topics', 'department_name', 'cos_dist']])

BoW x Cosine Method:


Unnamed: 0,title,description,topics,department_name,cos_dist
1797,Machine Learning,6.867 is an introductory course on machine lea...,"Engineering, Science, Computer Science, Artifi...",Electrical Engineering and Computer Science,6.661338e-16
1218,Topics in Statistics: Statistical Learning Theory,The main goal of this course is to study the g...,"Engineering, Science, Computer Science, Artifi...",Mathematics,0.5113727
715,Introduction to Machine Learning,"This course introduces principles, algorithms,...","Engineering, Computer Science, Artificial Inte...",Electrical Engineering and Computer Science,0.5135772
1756,Algorithmic Aspects of Machine Learning,This course is organized around algorithmic is...,"Engineering, Computer Science, Mathematics, Ap...",Mathematics,0.5199206
1519,Machine Learning for Healthcare,This course introduces students to machine lea...,"Engineering, Computer Science, Artificial Inte...","Electrical Engineering and Computer Science, H...",0.5409302
1697,"Matrix Methods in Data Analysis, Signal Proces...",Linear algebra concepts are key for understand...,"Engineering, Mathematics, Electrical Engineeri...",Mathematics,0.5546331
173,Prediction: Machine Learning and Statistics,Prediction is at the heart of almost every sci...,"Engineering, Computer Science, Artificial Inte...",Sloan School of Management,0.5607724
2134,Networks for Learning: Regression and Classifi...,The course focuses on the problem of supervise...,"Science, Mathematics, Probability and Statisti...",Brain and Cognitive Sciences,0.5937594
259,Mathematics of Machine Learning,"Broadly speaking, Machine Learning refers to t...","Engineering, Computer Science, Artificial Inte...",Mathematics,0.60964
154,Statistical Learning Theory and Applications,This course is for upper-level graduate studen...,"Science, Mathematics, Probability and Statisti...",Brain and Cognitive Sciences,0.6235877


TFIDF x Cosine Method:


Unnamed: 0,title,description,topics,department_name,cos_dist
1797,Machine Learning,6.867 is an introductory course on machine lea...,"Engineering, Science, Computer Science, Artifi...",Electrical Engineering and Computer Science,0.0
1218,Topics in Statistics: Statistical Learning Theory,The main goal of this course is to study the g...,"Engineering, Science, Computer Science, Artifi...",Mathematics,0.525189
1756,Algorithmic Aspects of Machine Learning,This course is organized around algorithmic is...,"Engineering, Computer Science, Mathematics, Ap...",Mathematics,0.627604
607,Statistical Learning Theory and Applications,Focuses on the problem of supervised learning ...,"Engineering, Science, Computer Science, Artifi...",Brain and Cognitive Sciences,0.63008
715,Introduction to Machine Learning,"This course introduces principles, algorithms,...","Engineering, Computer Science, Artificial Inte...",Electrical Engineering and Computer Science,0.671841
173,Prediction: Machine Learning and Statistics,Prediction is at the heart of almost every sci...,"Engineering, Computer Science, Artificial Inte...",Sloan School of Management,0.678863
2134,Networks for Learning: Regression and Classifi...,The course focuses on the problem of supervise...,"Science, Mathematics, Probability and Statisti...",Brain and Cognitive Sciences,0.683056
1697,"Matrix Methods in Data Analysis, Signal Proces...",Linear algebra concepts are key for understand...,"Engineering, Mathematics, Electrical Engineeri...",Mathematics,0.690946
1519,Machine Learning for Healthcare,This course introduces students to machine lea...,"Engineering, Computer Science, Artificial Inte...","Electrical Engineering and Computer Science, H...",0.704394
154,Statistical Learning Theory and Applications,This course is for upper-level graduate studen...,"Science, Mathematics, Probability and Statisti...",Brain and Cognitive Sciences,0.727064


BoW and TF-IDF with cosine similarity produce good results. Most of the courses recommended are similar to original course ("Machine Learning"), featuring keywords such as "Statistics", "Artificial Intelligence", "Data Analysis", and "Mathematics". 

Both methods produce almost identical results, with the main difference being the order of similarity. However, "Algorithmic Aspects of Machine Learning" is present in TF-IDF method but not in BoW method. Conversely, "Techniques in Artificial Intelligence (SMA 5504)" is present in BoW method but not in TF-IDF method.

## Modern Methods (Word2Vec, FastText, SentenceTransformers)

In [None]:
print("Word2Vec (Pre-Trained on WikiNews) x WMD Method:")
display(w2v_wmd_dist.loc[:11, ['title', 'description', 'topics', 'department_name', 'wmd_dist']])

print("Word2Vec (Pre-Trained on WikiNews) x Cosine Method:")
display(w2v_cos_dist.loc[:11, ['title', 'description', 'topics', 'department_name', 'cos_dist']])

print("FastText (Trained on This Corpus) x WMD Method:")
display(ft_wmd_dist.loc[:11, ['title', 'description', 'topics', 'department_name', 'wmd_dist']])

print("Sentence Transformers (all-MiniLM-L6-v2) x Cosine Method:")
display(st_cos_dist.loc[:11, ['title', 'description', 'topics', 'department_name', 'cos_dist']])

Word2Vec (Pre-Trained on WikiNews) x WMD Method:


Unnamed: 0,title,description,topics,department_name,wmd_dist
1797,Machine Learning,6.867 is an introductory course on machine lea...,"Engineering, Science, Computer Science, Artifi...",Electrical Engineering and Computer Science,0.0
1780,Techniques in Artificial Intelligence (SMA 5504),6.825 is a graduate-level introduction to arti...,"Engineering, Science, Computer Science, Artifi...",Electrical Engineering and Computer Science,0.693753
1218,Topics in Statistics: Statistical Learning Theory,The main goal of this course is to study the g...,"Engineering, Science, Computer Science, Artifi...",Mathematics,0.702761
154,Statistical Learning Theory and Applications,This course is for upper-level graduate studen...,"Science, Mathematics, Probability and Statisti...",Brain and Cognitive Sciences,0.72138
2134,Networks for Learning: Regression and Classifi...,The course focuses on the problem of supervise...,"Science, Mathematics, Probability and Statisti...",Brain and Cognitive Sciences,0.726982
715,Introduction to Machine Learning,"This course introduces principles, algorithms,...","Engineering, Computer Science, Artificial Inte...",Electrical Engineering and Computer Science,0.729386
1697,"Matrix Methods in Data Analysis, Signal Proces...",Linear algebra concepts are key for understand...,"Engineering, Mathematics, Electrical Engineeri...",Mathematics,0.73394
607,Statistical Learning Theory and Applications,Focuses on the problem of supervised learning ...,"Engineering, Science, Computer Science, Artifi...",Brain and Cognitive Sciences,0.734341
2286,Algorithms for Inference,This is a graduate-level introduction to the p...,"Engineering, Computer Science, Mathematics, Pr...",Electrical Engineering and Computer Science,0.734567
2136,Medical Decision Support,This course presents the main concepts of deci...,"Engineering, Computer Science, Artificial Inte...","Health Sciences and Technology, Electrical Eng...",0.740701


Word2Vec (Pre-Trained on WikiNews) x Cosine Method:


Unnamed: 0,title,description,topics,department_name,cos_dist
1797,Machine Learning,6.867 is an introductory course on machine lea...,"Engineering, Science, Computer Science, Artifi...",Electrical Engineering and Computer Science,5.960464e-08
1631,Introduction to Deep Learning,This is MIT's introductory course on deep lear...,"Engineering, Computer Science, Artificial Inte...",Electrical Engineering and Computer Science,0.01353943
607,Statistical Learning Theory and Applications,Focuses on the problem of supervised learning ...,"Engineering, Science, Computer Science, Artifi...",Brain and Cognitive Sciences,0.01458144
2001,Pattern Recognition and Analysis,This class deals with the fundamentals of char...,"Engineering, Science, Mathematics, Probability...","Media Arts and Sciences, Civil and Environment...",0.01546454
715,Introduction to Machine Learning,"This course introduces principles, algorithms,...","Engineering, Computer Science, Artificial Inte...",Electrical Engineering and Computer Science,0.01668727
98,Probabilistic Systems Analysis and Applied Pro...,"Welcome to 6.041/6.431, a subject on the model...","Engineering, Mathematics, Systems Engineering,...",Electrical Engineering and Computer Science,0.01685762
2245,Great Ideas in Theoretical Computer Science,This course provides a challenging introductio...,"Engineering, Computer Science, Mathematics, Ap...",Electrical Engineering and Computer Science,0.01773494
1705,Introduction to Electrical Engineering and Com...,This course provides an integrated introductio...,"Engineering, Computer Science, Artificial Inte...",Electrical Engineering and Computer Science,0.01817471
721,Machine Learning for Inverse Graphics,This course covers fundamental and advanced te...,"Engineering, Computer Science, Artificial Inte...",Electrical Engineering and Computer Science,0.01864868
1699,Introduction to Computational Neuroscience wit...,"In this course, you will learn the basics of c...","Engineering, Science, Computer Science, Health...",Brain and Cognitive Sciences,0.01886159


FastText (Trained on This Corpus) x WMD Method:


Unnamed: 0,title,description,topics,department_name,wmd_dist
1797,Machine Learning,6.867 is an introductory course on machine lea...,"Engineering, Science, Computer Science, Artifi...",Electrical Engineering and Computer Science,0.0
1780,Techniques in Artificial Intelligence (SMA 5504),6.825 is a graduate-level introduction to arti...,"Engineering, Science, Computer Science, Artifi...",Electrical Engineering and Computer Science,0.820359
1218,Topics in Statistics: Statistical Learning Theory,The main goal of this course is to study the g...,"Engineering, Science, Computer Science, Artifi...",Mathematics,0.820833
607,Statistical Learning Theory and Applications,Focuses on the problem of supervised learning ...,"Engineering, Science, Computer Science, Artifi...",Brain and Cognitive Sciences,0.86717
715,Introduction to Machine Learning,"This course introduces principles, algorithms,...","Engineering, Computer Science, Artificial Inte...",Electrical Engineering and Computer Science,0.889718
1519,Machine Learning for Healthcare,This course introduces students to machine lea...,"Engineering, Computer Science, Artificial Inte...","Electrical Engineering and Computer Science, H...",0.909089
154,Statistical Learning Theory and Applications,This course is for upper-level graduate studen...,"Science, Mathematics, Probability and Statisti...",Brain and Cognitive Sciences,0.913724
259,Mathematics of Machine Learning,"Broadly speaking, Machine Learning refers to t...","Engineering, Computer Science, Artificial Inte...",Mathematics,0.916447
2134,Networks for Learning: Regression and Classifi...,The course focuses on the problem of supervise...,"Science, Mathematics, Probability and Statisti...",Brain and Cognitive Sciences,0.920859
1697,"Matrix Methods in Data Analysis, Signal Proces...",Linear algebra concepts are key for understand...,"Engineering, Mathematics, Electrical Engineeri...",Mathematics,0.922326


Sentence Transformers (all-MiniLM-L6-v2) x Cosine Method:


Unnamed: 0,title,description,topics,department_name,cos_dist
1797,Machine Learning,6.867 is an introductory course on machine lea...,"Engineering, Science, Computer Science, Artifi...",Electrical Engineering and Computer Science,1.192093e-07
1218,Topics in Statistics: Statistical Learning Theory,The main goal of this course is to study the g...,"Engineering, Science, Computer Science, Artifi...",Mathematics,0.19941
1780,Techniques in Artificial Intelligence (SMA 5504),6.825 is a graduate-level introduction to arti...,"Engineering, Science, Computer Science, Artifi...",Electrical Engineering and Computer Science,0.2439162
2001,Pattern Recognition and Analysis,This class deals with the fundamentals of char...,"Engineering, Science, Mathematics, Probability...","Media Arts and Sciences, Civil and Environment...",0.2652936
715,Introduction to Machine Learning,"This course introduces principles, algorithms,...","Engineering, Computer Science, Artificial Inte...",Electrical Engineering and Computer Science,0.2720942
259,Mathematics of Machine Learning,"Broadly speaking, Machine Learning refers to t...","Engineering, Computer Science, Artificial Inte...",Mathematics,0.2878628
2134,Networks for Learning: Regression and Classifi...,The course focuses on the problem of supervise...,"Science, Mathematics, Probability and Statisti...",Brain and Cognitive Sciences,0.2889742
173,Prediction: Machine Learning and Statistics,Prediction is at the heart of almost every sci...,"Engineering, Computer Science, Artificial Inte...",Sloan School of Management,0.2956704
154,Statistical Learning Theory and Applications,This course is for upper-level graduate studen...,"Science, Mathematics, Probability and Statisti...",Brain and Cognitive Sciences,0.3100022
1697,"Matrix Methods in Data Analysis, Signal Proces...",Linear algebra concepts are key for understand...,"Engineering, Mathematics, Electrical Engineeri...",Mathematics,0.3104867


All modern methods produce good results, recommending items similar to original course ("Machine Laerning"). Resulting items with keywords such as "Statistics", "Artificial Intelligence", "Data Analysis", and "Mathematics". 

However, there are courses recommended that are not present in traditional methods, such as:
* "Pattern Recognition and Analysis" from ST x Cosine
* "Probabilistic Systems Analysis and Applied Probability" from W2V x Cosine
* "Algorithms for Inference" from W2V X WMD

This shows that modern methods are able to capture more semantically rich context and do not solely rely on keyword frequency, unlike traditional methods.

# Conclusion

For this problem, I believe traditional methods are preferable due to their good results and fast performance. Since the task involves only text similarity based on word occurrences, and the course descriptions are descriptive (without requiring sentiment analysis), traditional methods are sufficient.

The FastText method, even when trained on a relatively small corpus, also performs well and is comparable to pre-trained models like W2V and ST.