In [348]:
%pip install pandas numpy seaborn matplotlib 

Note: you may need to restart the kernel to use updated packages.


In [349]:
# Core imports for the notebook
import pandas as pd
import warnings
# NOTE(review): this silences every warning for the rest of the session, not only noisy ones -
# confirm that no pandas / scikit-learn warning that matters is being hidden.
warnings.filterwarnings("ignore")

In [350]:
# Load the review comments from the CSV next to the notebook.
# The course metadata file (./Course_info.csv) is not needed here, so it is not loaded.
Comments = pd.read_csv(filepath_or_buffer="./Comments.csv")

In [351]:
# Peek at the first five rows of the comments table
Comments.head(n=5)

Unnamed: 0,id,course_id,rate,date,display_name,comment
0,88962892,3173036,1.0,2021-06-29T18:54:25-07:00,Rahul,I think a beginner needs more than you think.\...
1,125535470,4913148,5.0,2022-10-07T11:17:41-07:00,Marlo,Aviva is such a natural teacher and healer/hea...
2,68767147,3178386,3.5,2020-10-19T06:35:37-07:00,Yamila Andrea,Muy buena la introducción para entender la bas...
3,125029758,3175814,5.0,2022-09-30T21:13:49-07:00,Jacqueline,This course is the best on Udemy. This breakd...
4,76584052,3174896,4.5,2021-01-30T08:45:11-08:00,Anthony,I found this course very helpful. It was full ...


In [352]:
# Show the dtype pandas inferred for each column of the comments table
print(Comments.dtypes)

id                int64
course_id         int64
rate            float64
date             object
display_name     object
comment          object
dtype: object


In [353]:
# Importing NLTK to analyze the comments
%pip install nltk 

import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

Note: you may need to restart the kernel to use updated packages.


In [354]:

# Fetch the NLTK resources used for cleaning the comments.
# Stopwords are very common words ("a", "the", "is", ...) that are filtered out before modelling.
nltk.download('stopwords')
# WordNet is the lexical database for English that the lemmatizer relies on.
nltk.download('wordnet')
# Keep the stopwords in a set: preprocess_text tests membership once per token, and a set
# lookup is O(1) whereas scanning the list returned by NLTK is linear in its length.
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/fuerte/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /Users/fuerte/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [355]:
%pip install scikit-learn
from sklearn.model_selection import train_test_split

Note: you may need to restart the kernel to use updated packages.


In [356]:
# Carve a 10% sample out of the full data so the rest of the notebook can run in under 2 hours.
# The fixed random_state keeps the sample identical between runs.
Comments_big, Comments_small = train_test_split(
    Comments,
    test_size=0.1,
    random_state=666,
)

In [357]:
# Choose which set the rest of the notebook works on.
# The small set is used by default; it takes around 1h30m on my M1 machine, mostly due to cleaning.
# I estimate the full set (commented out below) takes 10h to 15h to process completely.
# Work on a copy: columns added later must not leak into Comments_small, and pandas should not
# treat TreatedComments as a slice of another frame when columns are assigned.
TreatedComments = Comments_small.copy()
# TreatedComments = Comments.copy()
print(TreatedComments.shape)

(941173, 6)


In [358]:
# A function that detects the language of a single comment.
# It returns one of:
#     the detected language code (en, es, etc.), or
#     'exception' if language detection fails, or
#     'too short' if the comment has 10 or fewer characters once surrounding whitespace is stripped

%pip install langdetect
from langdetect import detect, LangDetectException

def detect_language(text):
    """Return the detected language code for ``text``, or a sentinel label.

    Args:
        text: The comment to classify. Any value is accepted; it is converted
            with ``str()`` before being measured and before detection.

    Returns:
        The language code reported by ``langdetect.detect`` (e.g. ``'en'``),
        ``'too short'`` when the stripped text has 10 or fewer characters,
        or ``'exception'`` when langdetect raises ``LangDetectException``.

    NOTE(review): langdetect documents its detection as non-deterministic unless
    ``DetectorFactory.seed`` is set before the first call - consider seeding it
    next to the import if reproducible language tags matter.
    """
    # Convert once and use the same string for both the length check and detection.
    # Previously the raw value was handed to detect(), so a long non-string cell
    # (e.g. a large number) passed the length check and then failed inside langdetect
    # with an error this function does not catch, aborting the whole apply().
    as_text = str(text)
    if len(as_text.strip()) <= 10:
        return 'too short'
    try:
        return detect(as_text)
    except LangDetectException:
        return 'exception'

Note: you may need to restart the kernel to use updated packages.


In [359]:
# Tag every comment with its detected language.
# THIS TOOK 1h10m ON MY M1 PRO WITH THE SMALL SET, IT TAKES TIME!
# assign() builds a new frame instead of writing a column into a frame that came out of
# train_test_split, which avoids pandas' SettingWithCopyWarning and leaves the sampled
# frame itself unmodified.
TreatedComments = TreatedComments.assign(
    lang=TreatedComments['comment'].apply(detect_language)
)

In [360]:
# Keep only the comments whose detected language is English, then preview ten of them
TreatedComments = TreatedComments.loc[TreatedComments['lang'].eq('en')]
TreatedComments.head(10)

Unnamed: 0,id,course_id,rate,date,display_name,comment,lang
2250664,4205214,943620,5.0,2016-11-21T08:44:07-08:00,Ariel,The instruction is very clear. Well done!,en
9237164,34636138,1362070,4.0,2019-07-22T22:10:52-07:00,Beto,Max explanations are really good. \nBut the co...,en
5511903,10763216,1399390,5.0,2017-11-05T05:40:51-08:00,Geoff@Balfre.Com,Another excellent course from Brad,en
7154330,327502,130064,5.0,2015-03-26T17:21:51-07:00,PC,This course can benefit everybody who wants to...,en
8962739,33829950,437398,5.0,2019-07-05T10:28:56-07:00,Adebayo,"This course is a great, with clear explanation...",en
1033103,40112730,461740,4.0,2019-11-10T21:26:32-08:00,Sheri,It's a little outdated as WP is now on version...,en
7761210,32943754,1035472,4.0,2019-06-16T03:15:44-07:00,Daniele,I've just completed the course.\n\nPRO:\n- wel...,en
7193617,117673900,1299614,5.0,2022-06-30T09:36:56-07:00,Ladouce,Great lesson! Listening improves and encourage...,en
6300821,100627494,2327564,5.0,2021-11-20T04:49:56-08:00,Dr,Wow!! Amazing course which is on English Gramm...,en
1654031,71682836,2706042,3.5,2020-11-30T12:10:30-08:00,Omri,need to explain better what's each parameter w...,en


In [361]:
# Text-cleaning helper applied to every comment before vectorisation
def preprocess_text(text):
    """Normalise a comment into a space-separated string of lemmatized tokens.

    Every character outside ``[a-zA-Z0-9]`` is replaced by a space, the text is
    lower-cased and split on whitespace, tokens found in ``stop_words`` are
    dropped, and the remaining tokens are passed through ``lemmatizer.lemmatize``.

    Args:
        text: The raw comment as a string.

    Returns:
        The cleaned tokens joined by single spaces (empty string if none remain).
    """
    tokens = re.sub(r'[^a-zA-Z0-9]', ' ', text).lower().split()
    kept = []
    for token in tokens:
        # stopwords are skipped before lemmatization, so the lemmatizer never sees them
        if token in stop_words:
            continue
        kept.append(lemmatizer.lemmatize(token))
    return ' '.join(kept)

In [362]:
# Add the cleaned text as a new 'lemmacomment' column.
# TreatedComments is a filtered selection at this point, so build a new frame with assign()
# rather than assigning a column in place (which triggers pandas' SettingWithCopyWarning).
# apply(str) guarantees preprocess_text receives a string even for non-text cells.
TreatedComments = TreatedComments.assign(
    lemmacomment=TreatedComments['comment'].apply(str).apply(preprocess_text)
)

In [363]:
# Features (cleaned text) and target (rating) for the models.
# Missing values are filled per column: empty string for text, 0 for the rating.
# Filling only the column that is used avoids copying the whole frame twice just to read
# one column from each copy; the resulting values are the same.
X = TreatedComments['lemmacomment'].fillna('')
Y = TreatedComments['rate'].fillna(0)

# 70/30 train/test split with a fixed seed so the split is reproducible
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.3, random_state=666)

In [364]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score

In [365]:
# Turn the cleaned text into TF-IDF features. The vectorizer is fitted on the training
# split only and then reused, unchanged, to transform the test split.
vectorizer = TfidfVectorizer()
X_train_vec, X_test_vec = (
    vectorizer.fit_transform(X_train),
    vectorizer.transform(X_test),
)


In [366]:
# ----------------------------------------------------------------------------------------------
# Naive Bayes model: create an unfitted multinomial Naive Bayes classifier (fitted in the next cell)
clf = MultinomialNB()


In [367]:
import numpy as np
# Train on integer class labels. astype(int) truncates, so half-star ratings
# such as 4.5 end up in class 4 (and 0.5 in class 0).
Y_train_int = np.asarray(Y_train).astype(int)
clf.fit(X_train_vec, Y_train_int)

In [368]:
# Predict a rating class for every comment in the test split (accuracy is computed in the next cell)
Y_pred = clf.predict(X_test_vec)

In [369]:
# Y_test holds float ratings while the predictions are integer classes, so truncate it
# the same way as the training labels before scoring.
Y_test_int = np.asarray(Y_test).astype(int)
accuracy = accuracy_score(y_true=Y_test_int, y_pred=Y_pred)
print('Accuracy:', accuracy)

Accuracy: 0.6334153943313588


In [370]:
# Per-class precision, recall and F1 for the Naive Bayes predictions
from sklearn.metrics import classification_report
print(classification_report(y_true=Y_test_int, y_pred=Y_pred))

              precision    recall  f1-score   support

           0       0.00      0.00      0.00       296
           1       0.82      0.02      0.03      4243
           2       0.00      0.00      0.00      4890
           3       0.33      0.02      0.04     14188
           4       0.37      0.08      0.13     41236
           5       0.65      0.99      0.78    106158

    accuracy                           0.63    171011
   macro avg       0.36      0.18      0.16    171011
weighted avg       0.54      0.63      0.52    171011



In [371]:
# Try the fitted model on a hand-written comment.
# The vectorizer was fitted on text cleaned by preprocess_text, so the new comment has to go
# through the same cleaning first; otherwise un-lemmatized word forms miss the learned vocabulary.
new_comment = 'love but have a problem'
new_comment_vec = vectorizer.transform([preprocess_text(new_comment)])
new_rate = clf.predict(new_comment_vec)[0]
print('Predicted rate:', new_rate)

Predicted rate: 5


In [372]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score

In [373]:
# Build the TF-IDF features for the decision tree: fit on the training split, reuse on the test split.
# NOTE(review): this repeats the TF-IDF fit done for the Naive Bayes model on the same
# X_train / X_test - the earlier matrices could probably be reused instead.
vectorizer = TfidfVectorizer()
X_train_vec, X_test_vec = (
    vectorizer.fit_transform(X_train),
    vectorizer.transform(X_test),
)

In [374]:
# ----------------------------------------------------------------------------------------------
# Train a decision tree classifier
# A fixed random_state makes the fitted tree (and the scores reported below) reproducible
# between runs; the seed matches the one used for the train/test splits.
clf = DecisionTreeClassifier(random_state=666)

In [375]:
# Train on integer class labels. astype(int) truncates, so half-star ratings
# such as 4.5 end up in class 4 (and 0.5 in class 0).
Y_train_int = np.asarray(Y_train).astype(int)
clf.fit(X_train_vec, Y_train_int)


In [376]:
# Predict a rating class for every comment in the test split (accuracy is computed in the next cell)
Y_pred = clf.predict(X_test_vec)


In [377]:
# Y_test holds float ratings while the predictions are integer classes, so truncate it
# the same way as the training labels before scoring.
Y_test_int = np.asarray(Y_test).astype(int)
accuracy = accuracy_score(y_true=Y_test_int, y_pred=Y_pred)
print('Accuracy:', accuracy)

Accuracy: 0.5730625515317728


In [378]:
# Per-class precision, recall and F1 for the decision tree predictions
from sklearn.metrics import classification_report
print(classification_report(y_true=Y_test_int, y_pred=Y_pred))

              precision    recall  f1-score   support

           0       0.01      0.01      0.01       296
           1       0.28      0.22      0.25      4243
           2       0.16      0.11      0.13      4890
           3       0.25      0.21      0.23     14188
           4       0.31      0.26      0.28     41236
           5       0.71      0.78      0.74    106158

    accuracy                           0.57    171011
   macro avg       0.28      0.26      0.27    171011
weighted avg       0.54      0.57      0.56    171011



In [379]:
# Try the fitted decision tree on a hand-written comment.
# The vectorizer was fitted on text cleaned by preprocess_text, so the new comment has to go
# through the same cleaning first; otherwise un-lemmatized word forms miss the learned vocabulary.
new_comment = 'love but have a problem'
new_comment_vec = vectorizer.transform([preprocess_text(new_comment)])
new_rate = clf.predict(new_comment_vec)[0]
print('Predicted rate:', new_rate)

Predicted rate: 5
