In [2]:
import pandas as pd

In [3]:
data_5s = pd.read_json("data_5scheduler.json")
data_5s = data_5s.dropna(subset=["description", "title"]) # looks like nothing actually gets dropped by this
data_5s.head(10)

Unnamed: 0,title,identifier,description,source,credits,instructors,offered,prerequisites,corequisites,currently_offered,fee
0,Introduction to American Cultures,AMST-103-HM,An interdisciplinary introduction to principal...,HarveyMudd,300,[Staff],,,,False,0
1,Print and American Culture,AMST-115-HM,Covers numerous developments in American print...,HarveyMudd,300,[Anup Gampa],,,,True,0
2,Hyphenated Americans,AMST-120-HM,A focus on the experience of immigrants in the...,HarveyMudd,300,[Balseiro],,,,False,0
3,"Life: Knowledge, Belief, and Cultural Practices",ANTH-110-HM,An exploration of cultural attitudes toward li...,HarveyMudd,300,[de Laet],,,,False,0
4,Introduction to the Anthropology of Science an...,ANTH-111-HM,An introduction to science and technology as c...,HarveyMudd,300,[Marianne De Laet],,,,True,0
5,War and Conflict,ANTH-115-HM,“The wings of the butterfly—that cause the hur...,HarveyMudd,300,[de Laet],,,,False,0
6,Rationalities,ANTH-134-HM,What does it mean to be rational? Does it mean...,HarveyMudd,300,[de Laet],Offered alternate years,Any introductory course in anthropology or any...,,False,0
7,A History of Landscape Photography,ARHI-131-HM,This course explores how photographic landscap...,HarveyMudd,300,[Fandell],,,,False,0
8,Modern and Contemporary Art Practices,ART-002-HM,This class is an experimental lecture style ar...,HarveyMudd,300,[Fandell],,,,False,0
9,Photography,ART-033-HM,Approaching the medium from an artistic perspe...,HarveyMudd,300,[Fandell],,ART002 HM,,False,150


In [4]:
data_5s.describe()

Unnamed: 0,credits,fee
count,4443.0,4443.0
mean,85.253657,0.082152
std,70.623956,2.864374
min,0.0,0.0
25%,25.0,0.0
50%,100.0,0.0
75%,100.0,0.0
max,600.0,150.0


In [5]:
data_stripped = data_5s.drop(["source", "credits", "instructors", "offered", "prerequisites", "corequisites", "currently_offered", "fee"], axis=1)

In [6]:
data_stripped.head()

Unnamed: 0,title,identifier,description
0,Introduction to American Cultures,AMST-103-HM,An interdisciplinary introduction to principal...
1,Print and American Culture,AMST-115-HM,Covers numerous developments in American print...
2,Hyphenated Americans,AMST-120-HM,A focus on the experience of immigrants in the...
3,"Life: Knowledge, Belief, and Cultural Practices",ANTH-110-HM,An exploration of cultural attitudes toward li...
4,Introduction to the Anthropology of Science an...,ANTH-111-HM,An introduction to science and technology as c...


In [12]:
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
stop_words = set(stopwords.words('english'))

from nltk.tokenize import word_tokenize
import string

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\wwsam\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [13]:
data_tokenized = data_stripped.copy()
data_tokenized["title_tokenized"] = data_tokenized.title.apply(lambda x: [i.lower() for i in word_tokenize(x) if i not in stop_words and i not in string.punctuation])
data_tokenized["description_tokenized"] = data_tokenized.description.apply(lambda x: [i.lower() for i in word_tokenize(x) if i not in stop_words and i not in string.punctuation])
data_tokenized["tokens"] = data_tokenized.title_tokenized + data_tokenized.description_tokenized
data_tokenized.drop(["title", "description", "title_tokenized", "description_tokenized"], axis=1, inplace=True)
data_tokenized.head()

Unnamed: 0,identifier,tokens
0,AMST-103-HM,"[introduction, american, cultures, an, interdi..."
1,AMST-115-HM,"[print, american, culture, covers, numerous, d..."
2,AMST-120-HM,"[hyphenated, americans, a, focus, experience, ..."
3,ANTH-110-HM,"[life, knowledge, belief, cultural, practices,..."
4,ANTH-111-HM,"[introduction, anthropology, science, technolo..."


In [14]:
from gensim.models import Word2Vec

model = Word2Vec(sentences=data_tokenized.tokens, min_count=2)
print(model.wv)

KeyedVectors<vector_size=100, 9481 keys>


In [15]:
def get_average_vector(tokens):
    vectors = [model.wv[i] for i in tokens if i in model.wv.key_to_index]
    average_vector = sum(vectors)/len(vectors)
    return average_vector

data_vectorized = data_tokenized.copy()
data_vectorized["vector"] = data_vectorized.tokens.apply(get_average_vector)
data_vectorized.drop("tokens", axis=1, inplace=True)
data_vectorized.head()

Unnamed: 0,identifier,vector
0,AMST-103-HM,"[-0.3426588, 0.9090205, 0.7246097, 0.3284848, ..."
1,AMST-115-HM,"[-0.30454403, 0.42112118, 0.25755385, 0.166223..."
2,AMST-120-HM,"[-0.40267086, 0.63932186, 0.43701193, 0.217241..."
3,ANTH-110-HM,"[-0.33505717, 0.6570204, 0.470196, 0.20953804,..."
4,ANTH-111-HM,"[-0.2688371, 0.70981526, 0.5498713, 0.17443745..."


In [16]:
data_vectorized[data_vectorized['identifier'] == 'AMST-103-HM'].loc[0]

identifier                                          AMST-103-HM
vector        [-0.3426588, 0.9090205, 0.7246097, 0.3284848, ...
Name: 0, dtype: object

In [17]:
import numpy as np

def get_similar_courses(search_course, data_vectorized, data_5s):
    this_vector = data_vectorized[data_vectorized['identifier'] == 'AMST-103-HM'].loc[0].vector
    data_search = data_vectorized[data_vectorized.identifier != search_course]
    data_info = data_5s.copy()
    data_info["cosine_sim"] = data_search.apply(lambda x: np.inner(x.vector, this_vector) / (np.linalg.norm(x.vector) * np.linalg.norm(this_vector)), axis=1)
    data_info.sort_values("cosine_sim", inplace=True, ascending=False)
    return data_info.head(10)

In [18]:
get_similar_courses("AMST-103-HM", data_vectorized, data_5s)

Unnamed: 0,title,identifier,description,source,credits,instructors,offered,prerequisites,corequisites,currently_offered,fee,cosine_sim
3043,Seminar in Literary Theory,ENGL-180-SC,This course provides an introduction to litera...,Scripps,100,[Preston Waltrip],,,,True,0,0.99543
3771,"Fantastic Archaeology: Modern Myths, Pseudo-Sc...",CLAS-020-PZ,An exploration of popular and fantastic interp...,Pitzer,100,[Michelle Berenfeld],,,,True,0,0.995419
3885,Natural History and Naturalists: History and P...,EA-108-PZ,The interdisciplinary field of Natural History...,Pitzer,100,[Jonathan Pacheco Bell],,,,True,0,0.995115
3774,Egyptian Art and Archaeology,CLAS-141-PZ,"An introductory survey to the art, architectur...",Pitzer,0,[],,,,False,0,0.995103
4284,Introduction to African American Psychology,PSYC-012-AF,This course provides an introduction to Africa...,Pitzer,0,[],,,,False,0,0.994776
1227,Applied Anthropology,ANTH-102-PO,This course is designed to provide an overview...,Pomona,100,[Staff],Last offered fall 2017.,,,False,0,0.994723
895,Introduction to Film,LIT-130-CM,We will begin with a close analysis of a conte...,ClaremontMckenna,100,[Thomas Schur],Every year,,,True,0,0.994665
3468,Psychology and the Law,PSYC-162-SC,This course will survey issues in psychology a...,Scripps,100,[],,PSYC 052 .,,False,0,0.993879
3710,Introduction to Asian American Studies,ASAM-101-PZ,Introduction to the field of Asian American St...,Pitzer,100,[Rosanna Simons],,,,True,0,0.993765
4059,History of Algorithms,MATH-010Z-PZ,This course surveys the history of algorithms ...,Pitzer,0,[],,,,False,0,0.993434
