In [175]:
from helper_fn import *
import re
import spacy
from spacy.tokenizer import Tokenizer
from nltk.corpus import stopwords
from gensim.utils import simple_preprocess
from nltk import WordNetLemmatizer, PorterStemmer

# TODO LIST (April 1st, 2022) 

1. Identify different frequencies of words in each video. Normalize per total words. 
2. Identify words that have highest weight for each video. Pick most common topics between the two.
3. Descriptive statistics for each class (Done)
4. Try LASSO sparse regression on all variables Xiao used.
5. Remove transition words and handle weird cases. 

In [24]:
# Load the merged/cleaned video table, drop the "Unnamed: 0" column, and keep
# only the rows whose subtitle field is not the literal string "[]".
df600 = (
    pd.read_csv("merged_and_cleaned600.csv", sep = ",")
    .drop(columns = ["Unnamed: 0"])
    .loc[lambda frame: frame["subtitle"] != "[]"]
)

In [156]:
def processText(text, custom_nlp = None):
    """
    Tokenize and lemmatize a subtitle string with spaCy, keeping only content words.

    @param text: A subtitle string in which line breaks appear as the literal characters "\\n".
    @param custom_nlp: A spaCy pipeline (callable as custom_nlp(text, disable=[...])).
    If None, the stock "en_core_web_sm" pipeline is loaded on first use and cached on the function.
    Return: A list of lower-cased lemmas with punctuation characters stripped. Stop words,
    punctuation tokens, number-like tokens, and lemmas that become empty after stripping are dropped.
    """
    import string  # local import: the notebook's import cell does not import it explicitly

    if custom_nlp is None:
        # Load lazily (not in the signature) so defining the function stays cheap and
        # the model is only read from disk when the default is actually needed.
        if not hasattr(processText, "_default_nlp"):
            processText._default_nlp = spacy.load("en_core_web_sm")
        custom_nlp = processText._default_nlp

    text = " ".join(text.split("\\n")) # Replace the literal \\n line markers with spaces
    # Use the pipeline that was passed in; the previous version silently used the global `nlp`.
    doc = custom_nlp(text, disable = ["parser"])

    strip_punct = str.maketrans('', '', string.punctuation)  # built once, reused per token
    lemmas = []
    for token in doc:
        if token.is_stop or token.is_punct or token.like_num:
            continue
        lemma = token.lemma_.translate(strip_punct).lower()
        if lemma:  # a symbol-only token (e.g. "+") would otherwise leave an empty string
            lemmas.append(lemma)
    return lemmas

In [159]:
# Words that the raw captions leave glued to the end of the preceding word
# (e.g. "widelyand", "rangebetween" in the cleanText output), grouped by rough category.
suffix = ["for","and","but","or","yet","so"]
article = ["the"]
position = ["between", "while", "this", "by","it", "across"]
qualifier = ["may"]
appendix = suffix + article + position + qualifier

# Teach the tokenizer to split these words off as suffixes.
# NOTE(review): suffix patterns are tried at the end of every token, so ordinary words
# that merely end in these letters may also be split -- verify on a sample.
nlp = spacy.load("en_core_web_sm")
# Defaults.suffixes is a tuple or a list depending on the spaCy version; coerce to a
# list so the concatenation with `appendix` works either way.
suffix = list(nlp.Defaults.suffixes) + appendix
suffix_regex = spacy.util.compile_suffix_regex(suffix)
nlp.tokenizer.suffix_search = suffix_regex.search

processText(df600["subtitle"].iloc[0], custom_nlp = nlp)

['hyperkalemia',
 'refer',
 'abnormally',
 'high',
 'level',
 'potassium',
 'blood',
 'normal',
 'circumstance',
 'total',
 'body',
 'potassium',
 'intracellular',
 'remain',
 'extracellular',
 'fluid',
 'blood',
 'plasma',
 'ratio',
 'intracellular',
 'extracellular',
 'potassium',
 'important',
 'generation',
 'actionpotential',
 'essential',
 'normal',
 'function',
 'neuron',
 'skeletal',
 'muscle',
 'cardiac',
 'muscle',
 'potassium',
 'level',
 'blood',
 'strictly',
 'regulate',
 'narrow',
 'range',
 '5mmol',
 'l',
 'normal',
 'daily',
 'dietary',
 'intake',
 'potassium',
 'varie',
 'widely',
 '100mmol',
 'day',
 'body',
 'quickly',
 'precisely',
 'react',
 'bloodpotassium',
 'level',
 'normal',
 'limit',
 'achieve',
 'mechanism',
 'excretion',
 'potassium',
 'kidney',
 'intestine',
 'kidney',
 'play',
 'predominant',
 'role',
 'shifting',
 'potassium',
 'extracellular',
 'fluid',
 'cell',
 'sodium',
 'potassiumpump',
 'pump',
 'mainly',
 'regulate',
 'hormone',
 'insulin',
 'cate

# Topic Modelling

In [169]:
# Words to REMOVE during cleaning: NLTK's English stopwords, a few caption
# leftovers ("nof", "nthe", "quot", ...), and colloquial filler words.
# `keyword` lists domain terms; it is not used in this cell.
keyword = ["intracellular", "extracellular", "myocardia"]
colloquial = ["see","going","know", "go", "really"]

stopword_lst = [
    *set(stopwords.words('english')),
    "nof","nthe", "one", "thing", "quot",
    *colloquial,
]

In [173]:
# Create a function to perform topic modelling. 
# TODO: Ask if there's any word we should remove from the caption
# NOTE: Problems with cleanText
def cleanText(text, stopword_lst = None, return_string = False):
    """
    @param text: A string of text to clean by removing punctuations and stopwords, then lemmatizing.
    Line breaks are expected as the literal characters "\\n" (YouTube subtitle format).
    @param stopword_lst: An iterable of words we wish to remove. If None, nltk's English stopwords are used.
    @param return_string: A Boolean indicating we want to return a list of words or a long string.
    Return: A list of lemmatized keywords (or one space-joined string if return_string is True).
    NOTE: Use return_string = True for word cloud; False for topic modelling (bag of words).
    NOTE: A word is dropped if either its raw form or its lemma is a stopword, so
    plurals such as "things" do not slip through when "thing" is in stopword_lst.
    """
    if stopword_lst is None:
        # Resolved at call time rather than in the signature, so defining the
        # function does not require the nltk corpus to be loaded.
        stopword_lst = stopwords.words('english')
    blocked = set(stopword_lst)  # set membership instead of scanning a list per word

    lm = WordNetLemmatizer()
    kept = []
    # Subtitles mark line breaks with a literal \\n, so split on that before tokenizing.
    for line in str(text).split("\\n"):
        for word in simple_preprocess(line, deacc=True):
            if word in blocked:
                continue
            lemma = lm.lemmatize(word)
            if lemma not in blocked:
                kept.append(lemma)

    # For word cloud or topic modelling.
    if return_string:
        return ' '.join(kept)
    return kept


def generateWordCloud(df, col_name, stopword_lst = None, to_file = False, filename = ""):
    """
    @param df: A dataframe whose column consists of texts we want to clean
    @param col_name: A string of column name whose value is a text we want to clean.
    @param stopword_lst: A list of words to exclude from our model (None or [] excludes nothing).
    @param to_file: A boolean denoting whether we save the word cloud or not.
    @param filename: A string denoting name of files we save (checked only if to_file == True)
    Usage: Generate a word cloud showing the most frequent words appearing in columns of text.
    NOTE: To generate a word cloud for a specific video, simply index by conditions,
    such as video_id, channel creator, understandable, actionable, etc.
    """
    assert col_name in df.columns.tolist(), "column %s not found in dataframe" % (col_name)
    if stopword_lst is None:
        stopword_lst = []  # fresh list per call instead of a shared mutable default

    # Pre-process the text into one long string. Join with a space so the last
    # word of one video is not fused with the first word of the next.
    text = " ".join(
        cleanText(video_subtitle, stopword_lst, return_string = True)
        for video_subtitle in df[col_name].tolist()
    )

    # Generate a word cloud
    wordcloud = WordCloud(background_color="white", max_words=5000, contour_width=3, contour_color='steelblue')
    wordcloud.generate(text)
    # Visualize a word cloud
    plt.figure(figsize = (9,6))
    plt.imshow(wordcloud, interpolation="bilinear")
    plt.axis("off")
    plt.show()
    if to_file:
        assert len(filename) >= 1, "filename must be non-empty when to_file is True"
        print("Saving %s" %(filename))
        wordcloud.to_file(filename)

In [6]:
# Building a bag of subtitles for each video
# See https://radimrehurek.com/gensim/corpora/dictionary.html
# See https://towardsdatascience.com/end-to-end-topic-modeling-in-python-latent-dirichlet-allocation-lda-35ce4ed6b3e0

temp = df600[df600["info"] == 1]["subtitle"]
# Create a list of words for each video subtitle.
subtitle_lst = [cleanText(subtitle, stopword_lst, return_string = False) for subtitle in temp]
# Map each distinct word to an integer id. This allows us to perform mapping in the future.
id2word = corpora.Dictionary(subtitle_lst)
# Build a corpus in term document frequency (bag-of-words per video)
corpus = [id2word.doc2bow(subtitle) for subtitle in subtitle_lst]

# number of topics
num_topics = 3
# Build LDA model. A fixed random_state makes the topics comparable between runs.
# NOTE(review): with several worker processes the result may still vary slightly -- confirm.
lda_model = LdaMulticore(corpus=corpus, id2word=id2word, num_topics=num_topics, random_state=42)

# Print the Keyword in the topics
# Possible interpretation: Different aspects of diabetes.
# Word counts on two classes
# print_topics() is evaluated once here instead of on every loop iteration.
for topic in lda_model.print_topics():
    print(topic)
    
# TODO: Account for frequencies in video duration.

In [177]:
# Word cloud over the rows where the indicator column `dependent` equals 0.
dependent = "info"
nu_df = df600.loc[df600[dependent] == 0]
filename = "wordcloud_NOT_{}.png".format(dependent)
generateWordCloud(
    nu_df,
    "subtitle",
    stopword_lst = stopword_lst,
    to_file = True,
    filename = filename,
)

TypeError: generateWordCloud() got an unexpected keyword argument 'stopword_lst'

In [178]:
# Word cloud over the rows where the indicator column `dependent` equals 1.
u_df = df600.loc[df600[dependent] == 1]
filename = "wordcloud_{}.png".format(dependent)
generateWordCloud(
    u_df,
    "subtitle",
    stopword_lst = stopword_lst,
    to_file = True,
    filename = filename,
)

TypeError: generateWordCloud() got an unexpected keyword argument 'stopword_lst'

In [49]:
# Spot-check the cleaner on the first subtitle (list of words, not a joined string).
first_subtitle = df600["subtitle"].iloc[0]
cleanText(first_subtitle, return_string = False)

['hyperkalemia',
 'refers',
 'abnormally',
 'high',
 'level',
 'potassium',
 'blood',
 'normal',
 'circumstance',
 'total',
 'body',
 'potassium',
 'intracellular',
 'remaining',
 'extracellular',
 'fluid',
 'blood',
 'plasma',
 'ratio',
 'intracellular',
 'extracellular',
 'potassium',
 'important',
 'generation',
 'essential',
 'normal',
 'function',
 'neuron',
 'skeletal',
 'muscle',
 'cardiac',
 'muscle',
 'potassium',
 'level',
 'blood',
 'strictly',
 'regulated',
 'within',
 'narrow',
 'rangebetween',
 'mmol',
 'normal',
 'daily',
 'dietary',
 'intake',
 'potassium',
 'varies',
 'widelyand',
 'much',
 'mmol',
 'day',
 'body',
 'must',
 'quickly',
 'precisely',
 'react',
 'keep',
 'bloodpotassium',
 'level',
 'within',
 'normal',
 'limit',
 'achieved',
 'mechanism',
 'excretion',
 'potassium',
 'kidneysand',
 'intestine',
 'kidney',
 'playing',
 'predominant',
 'role',
 'shifting',
 'potassium',
 'extracellular',
 'fluid',
 'cell',
 'sodium',
 'potassiumpump',
 'pump',
 'mainly',


In [50]:
# Show what the Porter stemmer does to a sample word.
word = "increasing"
lm = WordNetLemmatizer()
ps = PorterStemmer()
ps.stem(word)

'increas'

In [46]:
# Lemmatize whatever `word` currently holds in the kernel.
# NOTE(review): `word` is not defined in this cell. The saved output ('nigricans') does
# not look like the lemma of "increasing", so this cell was presumably run while `word`
# held a different value -- re-run after a kernel restart to confirm.
lmt = WordNetLemmatizer()
lmt.lemmatize(word)

'nigricans'

In [24]:
# Display the current value of the global `temp`.
# NOTE(review): the topic-modelling cell assigns `temp` to a subtitle column selection,
# but the saved output here is the string 'consist', so this cell presumably ran against
# different kernel state -- confirm or delete this scratch cell.
temp

'consist'