In [23]:
#import griddb_python as griddb
import numpy as np
import pandas as pd
import re, nltk, spacy, gensim
from sklearn.decomposition import LatentDirichletAllocation, TruncatedSVD
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import GridSearchCV
from pprint import pprint

In [4]:
df = pd.read_csv("526_data.csv")

In [13]:
# Convert to list
data = df.comments.values.tolist()
data

['This class is hard, but its a two-in-one gen-ed knockout, and the content is very stimulating. Unlike most classes, you have to actually participate to pass. Sections are easy and offer extra credit every week. Very funny dude. Not much more I can say.',
 "Definitely going to choose Prof. Looney\\'s class again! Interesting class and easy A. You can bring notes to exams so you don\\'t need to remember a lot. Lots of bonus points available and the observatory sessions are awesome!",
 "I overall enjoyed this class because the assignments were straightforward and interesting. I just didn\\'t enjoy the video project because I felt like no one in my group cared enough to help.",
 "Yes, it\\'s possible to get an A but you\\'ll definitely have to work for it. The content is pretty interesting, but you have tog get super organized in this class. You\\'ll have multiple things due every week and a ton lectures to go over. If possible, I\\'d avoid this class as an 8 week course. You\\'ll defini

### Data Cleaning

In [14]:
#Lowercase
data = df['comments'].apply(lambda x: str(x).lower())

In [15]:
# Remove new line characters
data = [re.sub(r'\s+', ' ', sent) for sent in data]

In [16]:
# Remove distracting single quotes
data = [re.sub(r"\'", "", sent) for sent in data]
pprint(data[:1])

['this class is hard, but its a two-in-one gen-ed knockout, and the content is '
 'very stimulating. unlike most classes, you have to actually participate to '
 'pass. sections are easy and offer extra credit every week. very funny dude. '
 'not much more i can say.']


### Tokenization and Lemmatization

In [17]:
def sent_to_words(sentences):
    for sentence in sentences:
        yield(gensim.utils.simple_preprocess(str(sentence), deacc=True))  # deacc=True removes punctuations
        
def lemmatization(texts, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV']): #'NOUN', 'ADJ', 'VERB', 'ADV'
    texts_out = []
    for sent in texts:
        doc = nlp(" ".join(sent)) 
        texts_out.append(" ".join([token.lemma_ if token.lemma_ not in ['-PRON-'] else '' for token in doc if token.pos_ in allowed_postags]))
    return texts_out

data_words = list(sent_to_words(data))
print(data_words[:1])

nlp = spacy.load("en_core_web_sm", disable=["parser", "ner"])
data_lemmatized = lemmatization(data_words, allowed_postags=["NOUN", "VERB"]) #select noun and verb
print(data_lemmatized[:2])

[['this', 'class', 'is', 'hard', 'but', 'its', 'two', 'in', 'one', 'gen', 'ed', 'knockout', 'and', 'the', 'content', 'is', 'very', 'stimulating', 'unlike', 'most', 'classes', 'you', 'have', 'to', 'actually', 'participate', 'to', 'pass', 'sections', 'are', 'easy', 'and', 'offer', 'extra', 'credit', 'every', 'week', 'very', 'funny', 'dude', 'not', 'much', 'more', 'can', 'say']]
['class knockout content class participate pass section offer credit week dude say', 'go choose class class bring note exam need remember lot lot bonus point observatory session']


#### The LDA topic model algorithm requires a document word matrix which is done using CountVectorizer.

In [18]:
vectorizer = CountVectorizer(analyzer='word',       
                             min_df=10,
                             stop_words='english',             
                             lowercase=True,                   
                             token_pattern='[a-zA-Z0-9]{3,}') 
data_vectorized = vectorizer.fit_transform(data_lemmatized)

#### Initializing and Constructing LDA Model

In [19]:
# Build LDA Model
lda_model = LatentDirichletAllocation(n_components=20,max_iter=10,learning_method='online',random_state=100,batch_size=128,evaluate_every = -1,n_jobs = -1,               )
lda_output = lda_model.fit_transform(data_vectorized)
print(lda_model)  # Model attributes

LatentDirichletAllocation(learning_method='online', n_components=20, n_jobs=-1,
                          random_state=100)


In [20]:
LatentDirichletAllocation(batch_size=128, doc_topic_prior=None,
 evaluate_every=-1, learning_decay=0.7,
 learning_method="online", learning_offset=10.0,
 max_doc_update_iter=100, max_iter=10, mean_change_tol=0.001,
 n_components=10, n_jobs=-1, perp_tol=0.1,
 random_state=100, topic_word_prior=None,
 total_samples=1000000.0, verbose=0)

### Diagnose model performance with perplexity and log-likelihood.
##### High log-likelihood and low perplexity (exp(-1. * log-likelihood per word)) are considered good models.

In [21]:
# Log Likelyhood: Higher the better
print("Log Likelihood: ", lda_model.score(data_vectorized))
# Perplexity: Lower the better. Perplexity = exp(-1. * log-likelihood per word)
print("Perplexity: ", lda_model.perplexity(data_vectorized))
# See model parameters
pprint(lda_model.get_params())

Log Likelihood:  -1162703.6952268006
Perplexity:  451.25384140939383
{'batch_size': 128,
 'doc_topic_prior': None,
 'evaluate_every': -1,
 'learning_decay': 0.7,
 'learning_method': 'online',
 'learning_offset': 10.0,
 'max_doc_update_iter': 100,
 'max_iter': 10,
 'mean_change_tol': 0.001,
 'n_components': 20,
 'n_jobs': -1,
 'perp_tol': 0.1,
 'random_state': 100,
 'topic_word_prior': None,
 'total_samples': 1000000.0,
 'verbose': 0}


#### Using Grid Search to determine the best LDA model

In [24]:
# Define Search Param
search_params = {'n_components': [10, 20], 'learning_decay': [0.5, 0.9]}
# Init the Model
lda = LatentDirichletAllocation(max_iter=5, learning_method='online', learning_offset=50.,random_state=0)
# Init Grid Search Class
model = GridSearchCV(lda, param_grid=search_params)
# Do the Grid Search
model.fit(data_vectorized)
GridSearchCV(cv=None, error_score='raise',
       estimator=LatentDirichletAllocation(batch_size=128, doc_topic_prior=None,
             evaluate_every=-1, learning_decay=0.7, learning_method=None,
             learning_offset=10.0, max_doc_update_iter=100, max_iter=10,
             mean_change_tol=0.001, n_components=10, n_jobs=1, perp_tol=0.1, random_state=None,
             topic_word_prior=None, total_samples=1000000.0, verbose=0),
        n_jobs=1,
       param_grid={'n_components': [10, 20], 'learning_decay': [0.5, 0.9]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=0)

In [25]:
# Best Model
best_lda_model = model.best_estimator_
# Model Parameters
print("Best Model's Params: ", model.best_params_)
# Log Likelihood Score
print("Best Log Likelihood Score: ", model.best_score_)
# Perplexity
print("Model Perplexity: ", best_lda_model.perplexity(data_vectorized))

Best Model's Params:  {'learning_decay': 0.5, 'n_components': 10}
Best Log Likelihood Score:  -235154.68731508605
Model Perplexity:  383.09946120507544


#### A logical way to determine whether a document belongs to a particular topic is to see which topic contributed the most to it and then assign it to that topic. Below table highlighted all major topics and assigned the most dominant topic its own column.

In [26]:
# Create Document — Topic Matrix
lda_output = best_lda_model.transform(data_vectorized)

topicnames = ["Topic" + str(i) for i in range(best_lda_model.n_components)]
docnames = ["Doc" + str(i) for i in range(len(data))]

# Make the pandas dataframe
df_document_topic = pd.DataFrame(np.round(lda_output, 2), columns=topicnames, index=docnames)
# Get dominant topic for each document
dominant_topic = np.argmax(df_document_topic.values, axis=1)
df_document_topic["dominant_topic"] = dominant_topic
# Styling
def color_green(val):
 color = "green" if val > .1 else "black"
 return "color: {col}".format(col=color)
def make_bold(val):
 weight = 700 if val > .1 else 400
 return "font-weight: {weight}".format(weight=weight)
# Apply Style
df_document_topics = df_document_topic.head(15).style.applymap(color_green).applymap(make_bold)
df_document_topics

Unnamed: 0,Topic0,Topic1,Topic2,Topic3,Topic4,Topic5,Topic6,Topic7,Topic8,Topic9,dominant_topic
Doc0,0.01,0.01,0.01,0.1,0.01,0.73,0.11,0.01,0.01,0.01,5
Doc1,0.01,0.29,0.01,0.01,0.19,0.47,0.01,0.01,0.01,0.01,5
Doc2,0.01,0.01,0.48,0.01,0.01,0.01,0.32,0.01,0.01,0.14,2
Doc3,0.01,0.01,0.01,0.39,0.38,0.01,0.01,0.08,0.01,0.1,3
Doc4,0.01,0.39,0.24,0.01,0.01,0.07,0.26,0.01,0.01,0.01,1
Doc5,0.18,0.72,0.01,0.01,0.01,0.07,0.01,0.01,0.01,0.01,1
Doc6,0.07,0.2,0.29,0.01,0.01,0.14,0.01,0.01,0.21,0.08,2
Doc7,0.01,0.01,0.36,0.01,0.01,0.16,0.01,0.4,0.01,0.01,7
Doc8,0.01,0.01,0.01,0.01,0.01,0.87,0.01,0.01,0.01,0.01,5
Doc9,0.11,0.41,0.01,0.31,0.01,0.01,0.01,0.01,0.01,0.11,1


In [27]:
# Topic-Keyword Matrix
df_topic_keywords = pd.DataFrame(best_lda_model.components_)
# Assign Column and Index
df_topic_keywords.columns = vectorizer.get_feature_names_out()
df_topic_keywords.index = topicnames
# View
df_topic_keywords.head()

Unnamed: 0,ability,absence,absorb,accent,accept,access,accommodate,accomplish,accord,account,...,worth,wouldn,write,writer,writing,wrong,year,yell,youtube,zone
Topic0,0.100023,0.10004,0.100042,0.100014,0.100013,0.10012,0.100004,0.100001,0.100013,0.100003,...,0.100002,0.100012,0.100025,0.10002,0.10001,0.100071,479.044246,0.100029,0.100037,0.10001
Topic1,0.100008,0.100044,0.100001,0.100016,0.10002,9.540113,0.100042,28.495131,0.100024,0.100007,...,0.100008,0.10003,537.785069,0.10001,0.10001,0.100034,0.100011,51.742354,33.066278,0.100006
Topic2,0.100023,0.10002,0.100009,0.100039,0.100008,0.10003,0.100014,0.100224,0.100017,19.223432,...,0.100005,0.100017,0.100024,0.100009,0.100007,0.100007,0.100016,0.100049,0.100054,0.100001
Topic3,0.10001,0.100029,13.228832,0.100014,0.100011,0.100044,0.100107,0.1,32.736337,0.100007,...,0.100003,34.087042,0.100026,0.100007,0.100009,0.100038,0.100012,0.100114,0.10001,6.448002
Topic4,0.100006,0.100028,0.100011,0.100011,0.100012,0.100001,0.100047,0.100461,0.100034,0.100007,...,0.100008,0.100024,398.893371,0.10004,0.100009,0.100052,0.100012,0.100066,0.100062,0.100005


In [28]:
# Show top n keywords for each topic
def show_topics(vectorizer=vectorizer, lda_model=lda_model, n_words=20):
    keywords = np.array(vectorizer.get_feature_names_out())
    topic_keywords = []
    for topic_weights in lda_model.components_:
        top_keyword_locs = (-topic_weights).argsort()[:n_words]
        topic_keywords.append(keywords.take(top_keyword_locs))
    return topic_keywords
topic_keywords = show_topics(vectorizer=vectorizer, lda_model=best_lda_model, n_words=15)
# Topic - Keywords Dataframe
df_topic_keywords = pd.DataFrame(topic_keywords)
df_topic_keywords.columns = ['Word '+str(i) for i in range(df_topic_keywords.shape[1])]
df_topic_keywords.index = ['Topic '+str(i) for i in range(df_topic_keywords.shape[0])]
df_topic_keywords

Unnamed: 0,Word 0,Word 1,Word 2,Word 3,Word 4,Word 5,Word 6,Word 7,Word 8,Word 9,Word 10,Word 11,Word 12,Word 13,Word 14
Topic 0,guy,come,year,college,require,way,mind,minute,day,provide,suggest,stay,engage,favorite,board
Topic 1,class,test,question,study,book,exam,read,time,grade,quiz,teacher,answer,ask,help,homework
Topic 2,class,make,understand,try,math,professor,fun,help,student,teach,feel,material,review,time,explain
Topic 3,comment,thing,lab,people,textbook,avoid,say,concept,job,world,hear,turn,listen,teach,self
Topic 4,class,learn,know,teach,lot,professor,make,think,talk,like,work,time,want,subject,love
Topic 5,note,prof,class,week,say,person,offer,prepare,test,reason,science,regret,pick,pass,meet
Topic 6,class,work,grade,reading,lot,paper,fail,discussion,professor,assignment,grader,school,tell,man,assign
Topic 7,lecture,problem,bore,word,summer,experience,understanding,research,story,mark,lecturer,teach,accent,lady,time
Topic 8,teacher,class,love,student,teaching,learn,recommend,history,level,style,help,major,want,art,enjoy
Topic 9,student,professor,course,care,help,need,want,miss,instructor,hand,attend,wish,time,life,work


In [37]:
Topics = [" Punctual, Engaging Class and Helpful Professor.",
 "Lots of Class test/exams,teacher helps with homework when asked.",
 "Helpful professor.Fun learning class.Provides enough review/study material.",
 "Does not listen to the world. Avoid this teacher.", 
 "Professor loves his subject. Makes time to talk after class. Lot of knowledge from work experience.",
 "Regret taking this professor's class. Weekly tests makes it difficult to pass his class.",
 "Lot of paper reading and assigns grade for that. Students in his class fail.",
 "Tough time understanding accent. Boring lectures about research.",
 "Professor recommended. Enjoyed his class and teaching style.",
 "Professor takes care of all students. Always attend, never miss. Coursework helpful."]
df_topic_keywords["Topics"]=Topics
df_topic_keywords

Unnamed: 0,Word 0,Word 1,Word 2,Word 3,Word 4,Word 5,Word 6,Word 7,Word 8,Word 9,Word 10,Word 11,Word 12,Word 13,Word 14,Topics
Topic 0,guy,come,year,college,require,way,mind,minute,day,provide,suggest,stay,engage,favorite,board,"Punctual, Engaging Class and Helpful Professor."
Topic 1,class,test,question,study,book,exam,read,time,grade,quiz,teacher,answer,ask,help,homework,"Lots of Class test/exams,teacher helps with ho..."
Topic 2,class,make,understand,try,math,professor,fun,help,student,teach,feel,material,review,time,explain,Helpful professor.Fun learning class.Provides ...
Topic 3,comment,thing,lab,people,textbook,avoid,say,concept,job,world,hear,turn,listen,teach,self,Does not listen to the world. Avoid this teacher.
Topic 4,class,learn,know,teach,lot,professor,make,think,talk,like,work,time,want,subject,love,Professor loves his subject. Makes time to tal...
Topic 5,note,prof,class,week,say,person,offer,prepare,test,reason,science,regret,pick,pass,meet,Regret taking this professor's class. Weekly t...
Topic 6,class,work,grade,reading,lot,paper,fail,discussion,professor,assignment,grader,school,tell,man,assign,Lot of paper reading and assigns grade for tha...
Topic 7,lecture,problem,bore,word,summer,experience,understanding,research,story,mark,lecturer,teach,accent,lady,time,Tough time understanding accent. Boring lectur...
Topic 8,teacher,class,love,student,teaching,learn,recommend,history,level,style,help,major,want,art,enjoy,Professor recommended. Enjoyed his class and t...
Topic 9,student,professor,course,care,help,need,want,miss,instructor,hand,attend,wish,time,life,work,Professor takes care of all students. Always a...


In [38]:
# Define function to predict topic for a given text document.
nlp = spacy.load('en_core_web_sm', disable=['parser', 'ner'])
def predict_topic(text, nlp=nlp):
    global sent_to_words
    global lemmatization
# Step 1: Clean with simple_preprocess
    mytext_2 = list(sent_to_words(text))
# Step 2: Lemmatize
    mytext_3 = lemmatization(mytext_2, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV'])
# Step 3: Vectorize transform
    mytext_4 = vectorizer.transform(mytext_3)
# Step 4: LDA Transform
    topic_probability_scores = best_lda_model.transform(mytext_4)
    topic = df_topic_keywords.iloc[np.argmax(topic_probability_scores), 1:14].values.tolist()
    
    # Step 5: Infer Topic
    infer_topic = df_topic_keywords.iloc[np.argmax(topic_probability_scores), -1]
    
    #topic_guess = df_topic_keywords.iloc[np.argmax(topic_probability_scores), Topics]
    return infer_topic, topic, topic_probability_scores
# Predict the topic
mytext = ["Recommended Professor.Good teaching resources.Lots of knowledge about courses and shares experiences from work life"]
infer_topic, topic, prob_scores = predict_topic(text = mytext)
print(topic)
print(infer_topic)

['professor', 'course', 'care', 'help', 'need', 'want', 'miss', 'instructor', 'hand', 'attend', 'wish', 'time', 'life']
Professor takes care of all students. Always attend, never miss. Coursework helpful.


In [39]:
def apply_predict_topic(text):
    text = [text]
    infer_topic, topic, prob_scores = predict_topic(text = text)
    return(infer_topic)
df["Topic_key_word"]= df['comments'].apply(apply_predict_topic)

In [40]:
df

Unnamed: 0,professor_name,school_name,department_name,local_name,state_name,year_since_first_review,star_rating,take_again,diff_index,tag_professor,...,accessible_outside_class,lecture_heavy,extra_credit,graded_by_few_things,group_projects,test_heavy,so_many_papers,beware_of_pop_quizzes,IsCourseOnline,Topic_key_word
0,Leslie Looney,University Of Illinois at Urbana-Champaign,Astronomy department,Champaign\xe2\x80\x93Urbana,IL,11.0,4.7,,2.0,Hilarious (2) GROUP PROJECTS (2) Gives good ...,...,0,0,0,0,1,0,0,0,0,Regret taking this professor's class. Weekly t...
1,Leslie Looney,University Of Illinois at Urbana-Champaign,Astronomy department,Champaign\xe2\x80\x93Urbana,IL,11.0,4.7,,2.0,Hilarious (2) GROUP PROJECTS (2) Gives good ...,...,0,0,0,0,1,0,0,0,0,Regret taking this professor's class. Weekly t...
2,Leslie Looney,University Of Illinois at Urbana-Champaign,Astronomy department,Champaign\xe2\x80\x93Urbana,IL,11.0,4.7,,2.0,Hilarious (2) GROUP PROJECTS (2) Gives good ...,...,0,0,0,0,1,0,0,0,0,Helpful professor.Fun learning class.Provides ...
3,Leslie Looney,University Of Illinois at Urbana-Champaign,Astronomy department,Champaign\xe2\x80\x93Urbana,IL,11.0,4.7,,2.0,Hilarious (2) GROUP PROJECTS (2) Gives good ...,...,0,0,0,0,1,0,0,0,0,Does not listen to the world. Avoid this teacher.
4,Leslie Looney,University Of Illinois at Urbana-Champaign,Astronomy department,Champaign\xe2\x80\x93Urbana,IL,11.0,4.7,,2.0,Hilarious (2) GROUP PROJECTS (2) Gives good ...,...,0,0,0,0,1,0,0,0,0,"Lots of Class test/exams,teacher helps with ho..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
19995,Dennis Richardson,Quinnipiac University,Biology department,Hamden,CT,15.0,4.0,,3.5,BEWARE OF POP QUIZZES (4) Amazing lectures (3...,...,1,0,1,1,0,0,0,1,0,Helpful professor.Fun learning class.Provides ...
19996,Dennis Richardson,Quinnipiac University,Biology department,Hamden,CT,15.0,4.0,,3.5,BEWARE OF POP QUIZZES (4) Amazing lectures (3...,...,1,0,1,1,0,0,0,1,0,"Lots of Class test/exams,teacher helps with ho..."
19997,Dennis Richardson,Quinnipiac University,Biology department,Hamden,CT,15.0,4.0,,3.5,BEWARE OF POP QUIZZES (4) Amazing lectures (3...,...,1,0,1,1,0,0,0,1,0,Lot of paper reading and assigns grade for tha...
19998,Dennis Richardson,Quinnipiac University,Biology department,Hamden,CT,15.0,4.0,,3.5,BEWARE OF POP QUIZZES (4) Amazing lectures (3...,...,1,0,1,1,0,0,0,1,0,Professor loves his subject. Makes time to tal...


In [41]:
df.to_csv('prof#keywd.csv')