# Wrangling + Tf-Idf model

In [16]:
import pandas as pd

import warnings
warnings.filterwarnings('ignore')

In [17]:
# Previous loader, kept for reference: the data used to be read from a CSV file.
#data = pd.read_csv('data.csv', error_bad_lines=False, encoding="utf-8") # CSV old way
# Current loader: read the records from the JSON file into a DataFrame.
data = pd.read_json("data_5scheduler.json") # json new way

Look at the data:

In [18]:
print(data.describe(include="object"))

                title   identifier description  source instructors offered  \
count            4443         4443        4443    4443        4443    4443   
unique           3761         4234        3989       5        1231     139   
top     Senior Thesis  PHYS-178-KS              Pomona          []           
freq               56            2          95    1446        2023    1559   

       prerequisites corequisites  
count           4443         4443  
unique           716           28  
top                                
freq            3359         4411  


### Duplicates:
It looks like there are only 3989 unique course descriptions, so let's remove duplicates based on the 'description' column (and then on the 'identifier' column).
There are also rows with empty descriptions, which are not helpful, so we drop those as well.

In [19]:
# Row count before cleaning.
print(len(data))

# Build the cleaned frame in one chain: drop repeated descriptions, then
# repeated identifiers, then rows whose description is empty, and finally
# renumber the rows 0..n-1.
data = (
    data
    .drop_duplicates(subset='description')
    .drop_duplicates(subset='identifier')
    .loc[lambda frame: frame["description"] != ""]
    .reset_index(drop=True)
)

# Row count after cleaning.
print(len(data))

# Persist the cleaned frame.
data.to_json("clean_data.json")

4443
3853


In [20]:
data.head(7)

Unnamed: 0,title,identifier,description,source,credits,instructors,offered,prerequisites,corequisites,currently_offered,fee
0,Introduction to American Cultures,AMST-103-HM,An interdisciplinary introduction to principal...,HarveyMudd,300,[Staff],,,,False,0
1,Print and American Culture,AMST-115-HM,Covers numerous developments in American print...,HarveyMudd,300,[Anup Gampa],,,,True,0
2,Hyphenated Americans,AMST-120-HM,A focus on the experience of immigrants in the...,HarveyMudd,300,[Balseiro],,,,False,0
3,"Life: Knowledge, Belief, and Cultural Practices",ANTH-110-HM,An exploration of cultural attitudes toward li...,HarveyMudd,300,[de Laet],,,,False,0
4,Introduction to the Anthropology of Science an...,ANTH-111-HM,An introduction to science and technology as c...,HarveyMudd,300,[Marianne De Laet],,,,True,0
5,War and Conflict,ANTH-115-HM,“The wings of the butterfly—that cause the hur...,HarveyMudd,300,[de Laet],,,,False,0
6,Rationalities,ANTH-134-HM,What does it mean to be rational? Does it mean...,HarveyMudd,300,[de Laet],Offered alternate years,Any introductory course in anthropology or any...,,False,0


### Tf-Idf with scikit-learn
[Description](https://monkeylearn.com/blog/what-is-tf-idf/)

[Usage](https://kavita-ganesan.com/tfidftransformer-tfidfvectorizer-usage-differences/#.Y1M42ezMJhF)

Here is an example of how Tf-Idf would work if our documents were the following 4 sentences:

In [21]:
import sklearn
from sklearn.feature_extraction.text import TfidfVectorizer
import numpy as np

# Toy corpus: four short "documents" used to illustrate tf-idf weighting.
corpus = [
     'this is the first document',
     'this document is the second document',
     'and this is the third one',
     'is this the first document',
]
vectorizer = TfidfVectorizer(use_idf=True)
vectors = vectorizer.fit_transform(corpus)

# Row 0 of the sparse matrix holds the tf-idf weights of the first sentence.
firstv = vectors[0]

# get_feature_names() was removed in scikit-learn 1.2; get_feature_names_out()
# is its replacement. toarray() yields a plain ndarray instead of np.matrix.
df = pd.DataFrame(firstv.T.toarray(), index=vectorizer.get_feature_names_out(), columns=["tfidf"])
df = df.sort_values(by=["tfidf"], ascending = False)
print("TfIdf values for the first sentence")
print(df)


TfIdf values for the first sentence
             tfidf
first     0.580286
document  0.469791
is        0.384085
the       0.384085
this      0.384085
and       0.000000
one       0.000000
second    0.000000
third     0.000000


In the example above we can see the importance of each word ranked for the first sentence `'this is the first document'`. So, for example, the word `first` is important since it appears in only one other document (the fourth sentence). The word `the` is not as important since it appears in every document. And the word `third` is not important at all since it doesn't even appear in the first document.

### Rank classes based on a given word
Function `tfidf_word(word, data)` takes in the word we are interested in and the data we are looking at. The function returns an updated dataframe with a new column `"score"` that gives each class a score of importance based on the input word. 

In [22]:
def tfidf_word(word, data_1):
    """Score every row of ``data_1`` by the tf-idf weight of ``word`` in its description.

    A ``TfidfVectorizer`` (English stop words removed) is fitted on the
    ``description`` column, and the weight of ``word`` in each description is
    stored in a new ``"score"`` column.

    Parameters
    ----------
    word : str
        Term to look up. It must match a vocabulary term exactly;
        ``TfidfVectorizer`` lowercases text by default, so pass lowercase words.
    data_1 : pandas.DataFrame
        Frame with a ``description`` column of strings. It is not modified.

    Returns
    -------
    pandas.DataFrame or None
        A copy of ``data_1`` with an added float ``"score"`` column, or ``None``
        (after printing a message) when ``word`` is not in the vocabulary.
    """
    data_2 = data_1.copy() # since we don't want to be making changes to our original dataframe
    corpus = list(data_2.description)
    vectorizer = TfidfVectorizer(use_idf=True, stop_words='english')
    vectors = vectorizer.fit_transform(corpus)

    # vocabulary_ maps each term to its column in the tf-idf matrix. Looking the
    # word up there replaces the bare ``except`` around ``list.index`` and avoids
    # get_feature_names(), which was removed in scikit-learn 1.2.
    index = vectorizer.vocabulary_.get(word)
    if index is None:
        print("'" + word + "'" + " is not mentioned in any course descriptions")
        return

    # One sparse column slice gives the score of every description at once,
    # instead of densifying each row of the matrix in a Python loop.
    data_2["score"] = vectors[:, index].toarray().ravel()
    return data_2

For example, let's say we are interested in ranking all of the classes based on the word `computer`:

In [23]:
tfidf_word('computer', data).sort_values(by=["score"], ascending = False).head(10)

Unnamed: 0,title,identifier,description,source,credits,instructors,offered,prerequisites,corequisites,currently_offered,fee,score
129,Computer Science Seminar,CSCI-181-HM,Advanced topics of current interest in compute...,HarveyMudd,0,[Staff],Fall and Spring,Permission of instructor,,False,0,0.519609
506,Special Topics in Computer Science,CSCI-181-CM,Selected topics in computer science. May be re...,ClaremontMckenna,100,[],Occasionally,,,False,0,0.47891
1415,Computer Science Colloquium,CSCI-188-PO,Colloquium presentations and discussions of to...,Pomona,0,[Joseph C Osborn],Each semester.,"CSCI 051A PO , or CSCI 051G PO , or CSCI 051J ...",,True,0,0.443016
413,Introduction to Computational Neuroscience,BIOL-133L-KS,This course provides computational skills for ...,ClaremontMckenna,100,[],Every fall,,,False,0,0.364325
1416,Computer Science Senior Seminar,CSCI-190-PO,"Reading, discussion and presentation of resear...",Pomona,25,[Joseph C Osborn],Each semester.,Senior standing and two CSCI core courses (inc...,,True,0,0.36391
1027,Computational Physics and Engineering,PHYS-100-KS,This course is a comprehensive introduction to...,ClaremontMckenna,100,[Scot Gould],Every spring,,,True,0,0.343957
499,Fundamentals of Computer Science,CSCI-052-CM,"A solid foundation in functional programming, ...",ClaremontMckenna,100,[],Occasionally,,,False,0,0.331361
133,Computer Science Colloquium,CSCI-195-HM,Oral presentations and discussions of selected...,HarveyMudd,50,[Melissa E. O'Neill],Fall and Spring,Juniors and seniors only,,True,0,0.291461
103,Introduction to Biology and Computer Science,CSCI-005GR-HM,This course introduces fundamental concepts fr...,HarveyMudd,300,"[Wu, Bush (Biology)]",Fall,,,False,0,0.269131
131,Computer Science Research and Independent Study,CSCI-186-HM,A research or development project under comput...,HarveyMudd,100,[Staff],Fall and Spring,Permission of instructor,,True,0,0.265744


These are the 10 classes most related to the word `computer`, ranked in descending order (more related classes are on top). So, we could recommend that a student who is interested in `computer` take these classes.

Below are the outputs for the words `data, culture, activism, fiction, environment` and `pomona`

In [24]:
tfidf_word('data', data).sort_values(by=["score"], ascending = False).head(10)

Unnamed: 0,title,identifier,description,source,credits,instructors,offered,prerequisites,corequisites,currently_offered,fee,score
507,Advanced Projects in Data Science,DS-180-CM,This course allows teams of students to wrestl...,ClaremontMckenna,100,[Jeho Park],Every year,,,True,0,0.511807
1303,Data Analysis and Programming for the Life Sci...,BIOL-174-PO,This course explores the analysis of big data ...,Pomona,100,[Andre Cavalcanti],Last offered spring 2019.,BIOL 040 PO and one of the following CSCI 005 ...,,True,0,0.493025
494,Foundations of Data Science,CSCI-036-CM,Data science is the interdisciplinary study of...,ClaremontMckenna,100,[Sarah Cannon],Every year,,,True,0,0.464073
258,Nonlinear Data Analytics,MATH-178-HM,Analysis of nonlinear large dynamic data inclu...,HarveyMudd,300,[Gu],Fall,CSCI070 HM and (CSCI140 HM or MATH131 HM or...,,False,0,0.406377
2848,CS1: Intro to Python and Viz,MS-059-SC,This is an introduction to computer programmin...,Scripps,0,[],,,,False,0,0.399576
561,Accounting Data Analytics,ECON-160-CM,This course will introduce students to the use...,ClaremontMckenna,100,[George Batta],Every year,,,True,0,0.388354
2596,Econometrics,ECON-125-SC,Statistical techniques for testing economic mo...,Scripps,100,[Roberto Pedace],,ECON 101 and ECON 120 .,,True,0,0.375493
107,Data Structures and Program Development,CSCI-070-HM,Abstract data types including priority queues ...,HarveyMudd,300,"[Melissa E. O'Neill, Erin Talvitie]",Fall and Spring,"(CSCI060 HM or CSCI042 HM ), and at least one...",,True,0,0.37035
2587,Introduction to Data Science,DS-002-SC,This course is the second part of a two-semest...,Scripps,100,[Doug Goodwin],,,,True,0,0.366273
504,Introduction to Data Mining,CSCI-145-CM,Data mining is the process of discovering patt...,ClaremontMckenna,100,[Charles Griffiths],Every year,,,True,0,0.355956


In [25]:
tfidf_word('culture', data).sort_values(by=["score"], ascending = False).head(10)

Unnamed: 0,title,identifier,description,source,credits,instructors,offered,prerequisites,corequisites,currently_offered,fee,score
2756,Introduction to the Philosophy and History of ...,HIST-123-SC,This course will focus on some of the major wo...,Scripps,100,[],Every year,,,False,0,0.401378
2785,Introduction to the Philosophy and History of ...,HMSC-123-SC,This course will focus on some of the major wo...,Scripps,100,[],,,,False,0,0.396824
878,Film and Mass Culture,LIT-138-CM,This course will examine film as art and as me...,ClaremontMckenna,100,[],Every third year,,,False,0,0.335881
2631,Literature and Popular Culture in the Antebell...,ENGL-143S-SC,The years preceding the Civil War saw both the...,Scripps,100,[],Every other year,,,False,0,0.284453
762,Culture and Society in Weimar and Nazi Germany,HIST-139E-CM,A study of the transformation of German cultur...,ClaremontMckenna,100,[],Every other year,,,False,0,0.268526
3236,Visual Culture at the Margins,ASAM-171-PZ,This course will examine various forms of visu...,Pitzer,0,[],,,,False,0,0.267707
3572,Popular Culture,MS-125-PZ,This course will cover a broad range of histor...,Pitzer,0,[],,,,False,0,0.264758
726,Cold War America,HIST-099-CM,"The Cold War dramatically shaped the politics,...",ClaremontMckenna,100,[Lily Geismer],Occasionally,,,True,0,0.259401
741,Cold War America,HIST-118-CM,"The Cold War dramatically shaped the politics,...",ClaremontMckenna,100,[],Every other year,,,False,0,0.259401
2710,Introduction to German Culture,GERM-101-SC,Concepts of culture have long been the object ...,Scripps,100,[],,For admission to literature and culture course...,,False,0,0.255702


In [26]:
tfidf_word('activism', data).sort_values(by=["score"], ascending = False).head(10)

Unnamed: 0,title,identifier,description,source,credits,instructors,offered,prerequisites,corequisites,currently_offered,fee,score
1339,"Latinx Social Movements: Identity, Power, and ...",CHST-136-CH,Latin/o/a/xs have historically used grassroots...,Pomona,100,[A. Zimmerman],Fall 2020.,,,False,0,0.512751
3156,"Political Activism, Social Movements and the P...",ANTH-138-PZ,"By examining contemporary issues, themes, and ...",Pitzer,0,[],,,,False,0,0.434119
2482,"Art, Activism, Propaganda",ARHI-184-SC,"Explores the intersection of art, political ac...",Scripps,100,[],Occasionally,,,False,0,0.328414
352,"Activism, Vocation, Justice",RLST-168-HM,The histories of social change activism are fi...,HarveyMudd,150,[Dyson],,Instructor permission,,False,0,0.296147
2283,Social and Political Movements,SOC-075-PO,Can activism from below change society and pol...,Pomona,100,[C. Beck],Last offered fall 2018.,,,False,0,0.255936
3226,Introduction to Asian American Studies,ASAM-101-PZ,Introduction to the field of Asian American St...,Pitzer,100,[Rosanna Simons],,,,True,0,0.233396
2973,Remaking the Self,POLI-173-SC,How do social movements change the world by ch...,Scripps,100,[],Occasionally,,,False,0,0.218033
2772,African American Women in the United States,HIST-171-AF,This course explores the distinctive and diver...,Scripps,100,[],,,,False,0,0.184355
2287,"Los Angeles Communities: Transformations, Ineq...",SOC-114-CH,Use of case study approach to explore the inte...,Pomona,100,"[Jeffrey D. Groves, Frank Lubbock Miller, Fran...",Last offered spring 2018.,Any course in Chicanx-Latinx Studies or Sociology,,True,0,0.182711
3330,Poetry and Public Space,ENGL-129-PZ,This workshop is focused on findings/making po...,Pitzer,0,[],,,,False,0,0.181794


In [27]:
tfidf_word('fiction', data).sort_values(by=["score"], ascending = False).head(10)

Unnamed: 0,title,identifier,description,source,credits,instructors,offered,prerequisites,corequisites,currently_offered,fee,score
890,Advanced Creative Writing,LIT-181-CM,This is a class for the student who is serious...,ClaremontMckenna,100,[],Every year,,,False,0,0.479014
813,Introduction to Creative Writing,LIT-031-CM,This course offers the chance to explore three...,ClaremontMckenna,100,[],Occasionally,,,False,0,0.427437
3837,Journalism in Latin America,SPAN-162-PZ,Better than Fiction: Journalism in Latin America,Pitzer,0,[],,,,False,0,0.408026
2658,Advanced Fiction Writing Workshop,ENGL-194S-SC,This advanced fiction workshop is intended for...,Scripps,0,[],,,,False,0,0.317487
204,Fiction Workshop,LIT-035-HM,This course is designed as an introductory wor...,HarveyMudd,300,[Salvador Plascencia],Fall and Spring,,,True,0,0.306962
2657,Introduction to Fiction Writing,ENGL-193-SC,This is an introductory course on writing shor...,Scripps,100,[Leila Mansouri],,,,True,0,0.305707
587,The Francophone Caribbean,FREN-115-CM,A study of works of writers and artists from H...,ClaremontMckenna,100,[],Occasionally,,,False,0,0.297736
892,Advanced Fiction Writing,LIT-183-CM,This advanced fiction workshop is intended for...,ClaremontMckenna,100,[Mary Gaitskill],Every year,,,True,0,0.284325
2254,Post-Soviet Russian Culture and Society,RUSS-182-PO,The course explores the major changes in Russi...,Pomona,100,[Larissa V. Rudova],Spring 2022.,RUSS 044 PO,,True,0,0.274841
3410,"Diversity, Equity, and Inequities",FS-024-PZ,This course will examine questions surrounding...,Pitzer,0,[],,,,False,0,0.250013


In [28]:
tfidf_word('environment', data).sort_values(by=["score"], ascending = False).head(10)

Unnamed: 0,title,identifier,description,source,credits,instructors,offered,prerequisites,corequisites,currently_offered,fee,score
1693,Food and the Environment in Asia and the Pacific,HIST-101F-PO,A single question inspired this seminar: what ...,Pomona,100,[S. Yamashita],Last offered spring 2018.,,,False,0,0.401468
1016,Environmental Ethics,PHIL-187-CM,An exploration of human beings’ ethical relati...,ClaremontMckenna,100,[],Occasionally,,,False,0,0.301989
442,Microbiology,BIOL-168L-KS,In this fundamental microbiology course we wil...,ClaremontMckenna,100,[Pete Chandrangsu],Occasionally,,,True,0,0.290268
3379,Critical Environmental Analysis,EA-150-PZ,A seminar examination of how environmental iss...,Pitzer,0,[],,,,False,0,0.266208
2304,"Africa, the Environment, and the Global Economy",SOC-189H-PO,"Drawing on sociology and related disciplines, ...",Pomona,100,[S. Stefanos],Each fall.,,,False,0,0.237898
3135,Global Environmental Conflict,ANTH-082-PZ,This class uses the tools of anthropology and ...,Pitzer,0,[],,,,False,0,0.235211
1014,Topics in Aesthetics,PHIL-184-CM,Aesthetics is the philosophical study of art a...,ClaremontMckenna,100,[],Occasionally,,,False,0,0.225723
3362,Urban Ecology,EA-098-PZ,Urban ecology is a subfield of ecology that de...,Pitzer,100,[Heather Campbell],,,,True,0,0.221874
3121,Native Americans and Their Environments,ANTH-012-PZ,This course will investigate the traditional i...,Pitzer,100,[Sheryl Miller],,,,True,0,0.216429
314,Topics in Physics,PHYS-080-HM,"An area of physics is studied, together with i...",HarveyMudd,300,"[Donnelly, Saeta]",,PHYS051 HM,,False,0,0.211641


In [29]:
tfidf_word('pomona', data).sort_values(by=["score"], ascending = False).head(10)

Unnamed: 0,title,identifier,description,source,credits,instructors,offered,prerequisites,corequisites,currently_offered,fee,score
1139,Interpreting Religious Worlds,RLST-180-CM,Examines current theoretical and methodologica...,ClaremontMckenna,100,[Jamel A. Velji],Occasionally,,,True,0,0.262794
925,Math Analysis II,MATH-132-CM,A rigorous study of calculus in Euclidean Spac...,ClaremontMckenna,100,[],Every other year,,,False,0,0.220281
1152,Introduction to Latin American Literature and ...,SPAN-125A-CM,A survey of the major events and texts of Lati...,ClaremontMckenna,100,[],Every year,,,False,0,0.211064
946,Number Theory,MATH-175-CM,"Properties of integers, congruences, Diophanti...",ClaremontMckenna,100,[Jodi Day],Every other year,,,True,0,0.20898
2234,Interpreting Religious Worlds,RLST-180-PO,Required for all majors and minors. Examines s...,Pomona,100,[Staff],Each spring by rotation at the Claremont Colle...,,,False,0,0.203686
943,Abstract Algebra,MATH-171-CM,An introduction to the fundamental structures ...,ClaremontMckenna,100,[],Every spring,,,False,0,0.200718
2370,Lifting Silent Narratives,THEA-062-PO,Students in this course will investigate the h...,Pomona,100,[Rose M. Portillo],Each semester. Spring 2022.,,,True,0,0.200335
3267,Senior Thesis,CLAS-1-91,Students write a senior thesis may do so under...,Pitzer,0,[],,,,False,0,0.195023
1830,Combinatorial Mathematics,MATH-103-PO,An introduction to the techniques and ideas of...,Pomona,100,[Shahriar Shahriari],Each semester.,MATH 060 PO,,True,0,0.190115
1235,Junior/Senior Art Major Seminar,ART-190-PO,"For Pomona art majors, to be taken in the fall...",Pomona,100,[Staff],Each fall.,,,False,0,0.189125


### Rank classes based on a given class
`tfidf_id(id, data)` takes the identifier of a course (e.g. "PHIL-187-CM") and returns the courses that are most similar to the given course. To do this, we look at which word ranks highest for the given class and find other classes where the same word also ranks highly.

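Similarly to `tfidf_word`, we will add new columns that have the score of each class based on each word. In this case we would have multiple new columns holding similarity scores for the different words that were important in our input class. (Note: the implementation below currently uses only the single highest-ranked word of the input class, so it adds one `"score"` column.)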

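Additionally, if `tfidf_id` is called with a `penalty` other than `1.0` (e.g. `penalty=0.5`), the scores of courses within the same department are multiplied by that factor, and the input course itself gets a score of 0.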

In [30]:
def tfidf_id(id, data_, penalty=1.0):
    """Rank courses by the tf-idf weight of the input course's most important word.

    The description of the course identified by ``id`` is vectorized together
    with all other descriptions; its highest-weighted word is selected and every
    course is scored by the tf-idf weight of that word in its own description.

    Parameters
    ----------
    id : str
        Course identifier such as ``"AMST-120-HM"``. The text before the first
        ``"-"`` is treated as the department.
    data_ : pandas.DataFrame
        Frame with ``identifier`` and ``description`` columns. It is not modified.
    penalty : float, default 1.0
        When different from 1.0, the input course gets a score of 0 and the
        scores of the other courses in the same department are multiplied by
        this factor. With the default, no score is changed, so the input course
        itself stays in the ranking.

    Returns
    -------
    pandas.DataFrame or None
        A copy of ``data_`` with an added ``"score"`` column, or ``None`` (after
        printing a message) when the identifier is unknown or its description
        contains no scorable word.
    """
    # Work with row *positions*, not index labels: the tf-idf matrix is
    # positional, so labels are only correct when the index is exactly 0..n-1.
    is_input = (data_["identifier"] == id).to_numpy()
    if not is_input.any():
        print("Couldn't find a course " + id)
        return
    row = int(is_input.argmax())  # first matching row

    corpus = list(data_.description)
    vectorizer = TfidfVectorizer(use_idf=True, stop_words='english')
    vectors = vectorizer.fit_transform(corpus)

    # Scores of every vocabulary term for the input course.
    scores_id = vectors[row].toarray().ravel()

    # The original kept only words scoring above 0.2 and then took the best one,
    # which raised IndexError when nothing passed the threshold. Taking the
    # arg-max directly picks the same word whenever the original succeeded
    # (ties resolve to the first term in vocabulary order in both cases).
    top_col = int(scores_id.argmax())
    if scores_id[top_col] <= 0.0:
        print("Couldn't find any informative word in the description of " + id)
        return

    # Invert vocabulary_ (term -> column) for the chosen column; this avoids
    # get_feature_names(), which was removed in scikit-learn 1.2.
    word = next(term for term, col in vectorizer.vocabulary_.items() if col == top_col)
    print("Giving course recomendations based of the word: " + word)

    data__ = data_.copy() ## since we're making changes to our dataframe, we don't want to save these changes in the original dataframe
    # Reuse the matrix fitted above rather than refitting a second, identical
    # vectorizer: the chosen word's column already holds every course's score.
    score = vectors[:, top_col].toarray().ravel()

    if penalty!=1.0:
        department = id.split("-", 1)[0]
        print("penalizing " + department +" courses by a factor of " + str(penalty))

        # Prefix match on "DEPT-": a plain substring test would also penalize
        # unrelated departments (e.g. "EA" is contained in "THEA-062-PO").
        same_department = (
            data__["identifier"].str.startswith(department + "-", na=False).to_numpy()
            & ~is_input
        )
        score[same_department] *= penalty
        score[is_input] = 0.0
    else:
        print("No penalty")

    # Single column assignment instead of chained ``df["score"][i] = ...``,
    # which triggers SettingWithCopyWarning and is a no-op under copy-on-write.
    data__["score"] = score
    return data__

In [31]:
tfidf_id("AMST-120-HM", data).sort_values(by=["score"], ascending = False).head(5)

Giving course recomendations based of the word: immigrants
No penalty


Unnamed: 0,title,identifier,description,source,credits,instructors,offered,prerequisites,corequisites,currently_offered,fee,score
2129,The Politics of Immigration and Citizenship,POLI-046-PO,Examines immigration and citizenship politics ...,Pomona,100,[Staff],Last offered spring 2019.,,,False,0,0.341082
2,Hyphenated Americans,AMST-120-HM,A focus on the experience of immigrants in the...,HarveyMudd,300,[Balseiro],,,,False,0,0.296387
2808,Italians as Guests and Hosts: Intercultural En...,ITAL-136-SC,This course examines the phenomenon of exchang...,Scripps,100,[],,ITAL 044 or equivalent.,,False,0,0.270244
1151,Images of Immigration in Spanish Literature an...,SPAN-122-CM,"From an interdisciplinary perspective, this co...",ClaremontMckenna,100,[],Every other year,,,False,0,0.222157
3402,Criminalization of Immigrants,FS-016-PZ,How did immigration and the U.S. - Mexico bord...,Pitzer,100,[Steffanie Guillermo],,,,True,0,0.193295


In [32]:
tfidf_id("AMST-120-HM", data, penalty=0.5).sort_values(by=["score"], ascending = False).head(5)

Giving course recomendations based of the word: immigrants
penalizing AMST courses by a factor of 0.5


Unnamed: 0,title,identifier,description,source,credits,instructors,offered,prerequisites,corequisites,currently_offered,fee,score
2129,The Politics of Immigration and Citizenship,POLI-046-PO,Examines immigration and citizenship politics ...,Pomona,100,[Staff],Last offered spring 2019.,,,False,0,0.341082
2808,Italians as Guests and Hosts: Intercultural En...,ITAL-136-SC,This course examines the phenomenon of exchang...,Scripps,100,[],,ITAL 044 or equivalent.,,False,0,0.270244
1151,Images of Immigration in Spanish Literature an...,SPAN-122-CM,"From an interdisciplinary perspective, this co...",ClaremontMckenna,100,[],Every other year,,,False,0,0.222157
3402,Criminalization of Immigrants,FS-016-PZ,How did immigration and the U.S. - Mexico bord...,Pitzer,100,[Steffanie Guillermo],,,,True,0,0.193295
3695,US Immigration and Transnational Politics,POST-174-CH,Examines the factors shaping the size and comp...,Pitzer,0,[],,,,False,0,0.177832


In [33]:
tfidf_id("CSCI-036-CM", data).sort_values(by=["score"], ascending = False).head(5)

Giving course recomendations based of the word: data
No penalty


Unnamed: 0,title,identifier,description,source,credits,instructors,offered,prerequisites,corequisites,currently_offered,fee,score
507,Advanced Projects in Data Science,DS-180-CM,This course allows teams of students to wrestl...,ClaremontMckenna,100,[Jeho Park],Every year,,,True,0,0.511807
1303,Data Analysis and Programming for the Life Sci...,BIOL-174-PO,This course explores the analysis of big data ...,Pomona,100,[Andre Cavalcanti],Last offered spring 2019.,BIOL 040 PO and one of the following CSCI 005 ...,,True,0,0.493025
494,Foundations of Data Science,CSCI-036-CM,Data science is the interdisciplinary study of...,ClaremontMckenna,100,[Sarah Cannon],Every year,,,True,0,0.464073
258,Nonlinear Data Analytics,MATH-178-HM,Analysis of nonlinear large dynamic data inclu...,HarveyMudd,300,[Gu],Fall,CSCI070 HM and (CSCI140 HM or MATH131 HM or...,,False,0,0.406377
2848,CS1: Intro to Python and Viz,MS-059-SC,This is an introduction to computer programmin...,Scripps,0,[],,,,False,0,0.399576


In [34]:
tfidf_id("CSCI-036-CM", data, penalty=0.5).sort_values(by=["score"], ascending = False).head(5)

Giving course recomendations based of the word: data
penalizing CSCI courses by a factor of 0.5


Unnamed: 0,title,identifier,description,source,credits,instructors,offered,prerequisites,corequisites,currently_offered,fee,score
507,Advanced Projects in Data Science,DS-180-CM,This course allows teams of students to wrestl...,ClaremontMckenna,100,[Jeho Park],Every year,,,True,0,0.511807
1303,Data Analysis and Programming for the Life Sci...,BIOL-174-PO,This course explores the analysis of big data ...,Pomona,100,[Andre Cavalcanti],Last offered spring 2019.,BIOL 040 PO and one of the following CSCI 005 ...,,True,0,0.493025
258,Nonlinear Data Analytics,MATH-178-HM,Analysis of nonlinear large dynamic data inclu...,HarveyMudd,300,[Gu],Fall,CSCI070 HM and (CSCI140 HM or MATH131 HM or...,,False,0,0.406377
2848,CS1: Intro to Python and Viz,MS-059-SC,This is an introduction to computer programmin...,Scripps,0,[],,,,False,0,0.399576
561,Accounting Data Analytics,ECON-160-CM,This course will introduce students to the use...,ClaremontMckenna,100,[George Batta],Every year,,,True,0,0.388354


In [35]:
tfidf_id("LIT-138-CM", data).sort_values(by=["score"], ascending = False).head(5)

Giving course recomendations based of the word: film
No penalty


Unnamed: 0,title,identifier,description,source,credits,instructors,offered,prerequisites,corequisites,currently_offered,fee,score
874,Special Studies in Film,LIT-134-CM,A seminar designed to explore the aesthetic ac...,ClaremontMckenna,100,[Robert von Hallberg],Every other year,,,True,0,0.585039
870,Introduction to Film,LIT-130-CM,We will begin with a close analysis of a conte...,ClaremontMckenna,100,[Thomas Schur],Every year,,,True,0,0.544105
878,Film and Mass Culture,LIT-138-CM,This course will examine film as art and as me...,ClaremontMckenna,100,[],Every third year,,,False,0,0.502995
285,Film Music,MUS-067-HM,An exploration of the history and aesthetics o...,HarveyMudd,300,[Alves],,,,False,0,0.395797
1875,Film Theory,MS-148G-PO,This course develops theoretical approaches to...,Pomona,100,[T. Connelly],Spring 2019.,"MS 049 PO , MS 050 PO , or MS 051 PO",,False,0,0.371952


In [36]:
tfidf_id("LIT-138-CM", data, penalty=0.5).sort_values(by=["score"], ascending = False).head(5)

Giving course recomendations based of the word: film
penalizing LIT courses by a factor of 0.5


Unnamed: 0,title,identifier,description,source,credits,instructors,offered,prerequisites,corequisites,currently_offered,fee,score
285,Film Music,MUS-067-HM,An exploration of the history and aesthetics o...,HarveyMudd,300,[Alves],,,,False,0,0.395797
1875,Film Theory,MS-148G-PO,This course develops theoretical approaches to...,Pomona,100,[T. Connelly],Spring 2019.,"MS 049 PO , MS 050 PO , or MS 051 PO",,False,0,0.371952
3740,Cognitive Film Studies,PSYC-128-PZ,This course examines the moving image from the...,Pitzer,0,[],,,,False,0,0.367141
3564,Film Sound,MS-114-PZ,An intermediate level media history and theory...,Pitzer,0,[],,,,False,0,0.360732
1204,Visual Anthropology,ANTH-189P-PO,This course focuses on the history and develop...,Pomona,100,[Joanne Randa Nucho],Last offered fall 2018.,,,True,0,0.352444


# Cosine Similarities

In [37]:
from sklearn.metrics.pairwise import linear_kernel
from sklearn.feature_extraction.text import CountVectorizer
def cos_sim(data, id, penalty=1.0):
    """Score every course by the cosine similarity of its description to one course.

    Descriptions are converted to tf-idf vectors (English stop words removed).
    ``TfidfVectorizer`` L2-normalizes rows by default, so the linear kernel
    between two rows equals their cosine similarity.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame with ``identifier`` and ``description`` columns. It is not modified.
    id : str
        Identifier of the reference course, e.g. ``"CSCI-036-CM"``. The text
        before the first ``"-"`` is treated as the department.
    penalty : float, default 1.0
        When different from 1.0, the reference course gets a score of 0 and the
        scores of the other courses in the same department are multiplied by
        this factor. With the default, no score is changed, so the reference
        course itself ranks first with a score of 1.0.

    Returns
    -------
    pandas.DataFrame or None
        A copy of ``data`` with an added ``"score"`` column, or ``None`` (after
        printing a message) when the identifier is not present.
    """
    # Check the identifier before the expensive fit, and work with row
    # *positions*: the tf-idf matrix is positional, so index labels are only
    # correct when the frame's index is exactly 0..n-1.
    is_input = (data["identifier"] == id).to_numpy()
    if not is_input.any():
        print("Couldn't find courses")
        return
    row = int(is_input.argmax())  # first matching row

    corpus = list(data.description)
    vectorizer = TfidfVectorizer(use_idf=True, stop_words='english')
    vectors = vectorizer.fit_transform(corpus)

    # Only the reference row is needed, so compute a 1 x n kernel instead of the
    # full n x n similarity matrix.
    score = linear_kernel(vectors[row], vectors).ravel()

    data_out = data.copy()

    if penalty!=1.0:
        department = id.split("-", 1)[0]
        print("penalizing " + department +" courses by a factor of " + str(penalty))

        # Prefix match on "DEPT-": a plain substring test would also penalize
        # unrelated departments (e.g. "EA" is contained in "THEA-062-PO").
        same_department = (
            data_out["identifier"].str.startswith(department + "-", na=False).to_numpy()
            & ~is_input
        )
        score[same_department] *= penalty
        score[is_input] = 0.0
    else:
        print("No penalty")

    # Single column assignment instead of chained ``df["score"][i] = ...``,
    # which triggers SettingWithCopyWarning and is a no-op under copy-on-write.
    data_out["score"] = score
    return data_out




In [38]:
cos_sim(data, "AMST-120-HM").sort_values(by=["score"], ascending = False).head(5)

No penalty


Unnamed: 0,title,identifier,description,source,credits,instructors,offered,prerequisites,corequisites,currently_offered,fee,score
2,Hyphenated Americans,AMST-120-HM,A focus on the experience of immigrants in the...,HarveyMudd,300,[Balseiro],,,,False,0,1.0
2971,Marginalized Communities,POLI-155-SC,"This course explores definitions, mechanisms, ...",Scripps,0,[],,,,False,0,0.185035
289,Music in the United States,MUS-118-SC,A survey of the history and development of mus...,Scripps,100,[Charles W. Kamm],,,,True,0,0.177216
14,Introduction to Asian American History: 1850-P...,ASAM-125-AA,This survey course examines the history of Asi...,HarveyMudd,300,[Flores],,,,False,0,0.173544
2739,African Diaspora in the United States to 1877,HIST-050A-AF,This course examines the diverse and complex e...,Scripps,100,[],,,,False,0,0.165463


In [39]:
cos_sim(data, "CSCI-036-CM").sort_values(by=["score"], ascending = False).head(5)

No penalty


Unnamed: 0,title,identifier,description,source,credits,instructors,offered,prerequisites,corequisites,currently_offered,fee,score
494,Foundations of Data Science,CSCI-036-CM,Data science is the interdisciplinary study of...,ClaremontMckenna,100,[Sarah Cannon],Every year,,,True,0,1.0
3028,Data Science Ethics and Justice,PSYC-183-SC,Data science is a set of interdisciplinary app...,Scripps,100,[Michael L. Spezio],Every spring,,,True,0,0.319625
507,Advanced Projects in Data Science,DS-180-CM,This course allows teams of students to wrestl...,ClaremontMckenna,100,[Jeho Park],Every year,,,True,0,0.311481
1303,Data Analysis and Programming for the Life Sci...,BIOL-174-PO,This course explores the analysis of big data ...,Pomona,100,[Andre Cavalcanti],Last offered spring 2019.,BIOL 040 PO and one of the following CSCI 005 ...,,True,0,0.274248
2848,CS1: Intro to Python and Viz,MS-059-SC,This is an introduction to computer programmin...,Scripps,0,[],,,,False,0,0.253766


In [40]:
cos_sim(data, "CSCI-036-CM", penalty=0.5).sort_values(by=["score"], ascending = False).head(5)

penalizing CSCI courses by a factor of 0.5


Unnamed: 0,title,identifier,description,source,credits,instructors,offered,prerequisites,corequisites,currently_offered,fee,score
3028,Data Science Ethics and Justice,PSYC-183-SC,Data science is a set of interdisciplinary app...,Scripps,100,[Michael L. Spezio],Every spring,,,True,0,0.319625
507,Advanced Projects in Data Science,DS-180-CM,This course allows teams of students to wrestl...,ClaremontMckenna,100,[Jeho Park],Every year,,,True,0,0.311481
1303,Data Analysis and Programming for the Life Sci...,BIOL-174-PO,This course explores the analysis of big data ...,Pomona,100,[Andre Cavalcanti],Last offered spring 2019.,BIOL 040 PO and one of the following CSCI 005 ...,,True,0,0.274248
2848,CS1: Intro to Python and Viz,MS-059-SC,This is an introduction to computer programmin...,Scripps,0,[],,,,False,0,0.253766
941,Introduction to Data Mining,MATH-166-CM,Data mining is the process of discovering patt...,ClaremontMckenna,100,[],Every year,,,False,0,0.242114


## Cosine Similarities for the list of classes

In [41]:
def cos_sim_list(data, ids, penalty=1.0, top_n=5):
    """Interleave the top cosine-similarity recommendations for several courses.

    For each identifier in ``ids`` the ``top_n`` highest-scoring courses from
    ``cos_sim`` are collected. The result lists every first-place
    recommendation (in the order of ``ids``), then every second-place one, and
    so on. Duplicates are not removed.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame passed through to ``cos_sim``.
    ids : iterable of str
        Identifiers of the reference courses.
    penalty : float, default 1.0
        Passed through to ``cos_sim``.
    top_n : int, default 5
        Number of recommendations kept per reference course.

    Returns
    -------
    list of str
        Interleaved course identifiers; empty when ``ids`` is empty or none of
        the identifiers is found.
    """
    ranked = []
    for course_id in ids:
        scored = cos_sim(data, course_id, penalty)
        if scored is None:
            # cos_sim already printed a message for the unknown identifier;
            # skip it rather than crash on ``None.sort_values``.
            continue
        top = scored.sort_values(by=["score"], ascending = False).head(top_n)
        ranked.append(list(top.identifier))

    # zip(*ranked) groups the recommendations by rank; with no lists it yields
    # nothing, so an empty ``ids`` returns [] instead of failing on ranked[0].
    return [identifier for same_rank in zip(*ranked) for identifier in same_rank]

In [42]:
cos_sim_list(data,["CSCI-036-CM", "AMST-120-HM", "LIT-138-CM"], penalty=0.5)

penalizing CSCI courses by a factor of 0.5
penalizing AMST courses by a factor of 0.5
penalizing LIT courses by a factor of 0.5


['PSYC-183-SC',
 'POLI-155-SC',
 'MS-172-HM',
 'DS-180-CM',
 'MUS-118-SC',
 'MUS-067-HM',
 'BIOL-174-PO',
 'ASAM-125-AA',
 'HIST-139E-CM',
 'MS-059-SC',
 'HIST-050A-AF',
 'MS-114-PZ',
 'MATH-166-CM',
 'CHST-028-CH',
 'ANTH-189P-PO']

# Tasks:
- come up with the validation data set
- visit pomonastudents.org
- Look at Glove

The following code writes a csv file of nodes and scores between them for our graph

In [43]:
import csv
def build_nodes(path="nodes.csv", min_score=0.05):
    """Write every pair of courses and their cosine similarity to a CSV file.

    Uses the notebook-level ``data`` frame. Descriptions are converted to
    tf-idf vectors (English stop words removed), pairwise similarities are
    rounded to 4 decimals, and one ``node1,node2,score`` row is written for each
    unordered pair of distinct rows. Scores below ``min_score`` are written as
    0.0.

    Parameters
    ----------
    path : str, default "nodes.csv"
        Destination file; it is overwritten if it exists.
    min_score : float, default 0.05
        Similarities below this value are replaced by 0.0.
    """
    corpus = list(data.description)
    vectorizer = TfidfVectorizer(use_idf=True, stop_words='english')
    vectors = vectorizer.fit_transform(corpus)
    cosine_sim = linear_kernel(vectors, vectors).round(4)

    # Read the identifiers once, positionally. The similarity matrix is
    # positional, so this stays correct even if the frame's index is not
    # 0..n-1, and it avoids millions of ``data.loc`` lookups in the loop.
    identifiers = list(data["identifier"])
    length = len(identifiers)

    # The csv module requires files opened with newline=""; otherwise extra
    # blank lines appear on platforms that translate "\n" on write.
    with open(path, "w", newline="") as f:
        writer = csv.writer(f)
        writer.writerow(["node1", "node2", "score"])
        for i in range(length):
            node1 = identifiers[i]
            row = cosine_sim[i]
            # Stream the rows for this node instead of first accumulating all
            # n*(n-1)/2 pairs in one list.
            writer.writerows(
                [node1, identifiers[j], 0.0 if row[j] < min_score else row[j]]
                for j in range(i + 1, length)
            )

#build_nodes()