In [1]:
import pandas as pd
import re
from collections import Counter 
import matplotlib.pyplot as plt 

In [2]:
# Load the course dataset; the first CSV column is used as the row index.
df = pd.read_csv(filepath_or_buffer="data/DatasetFinal.csv", index_col=0)
# Show the first two rows as a quick sanity check of the columns.
df.head(n=2)

Unnamed: 0,Course Title,Rating,Level,Schedule,What you will learn,Skill gain,Modules,Instructor,Offered By,Keyword,Course Url,Duration to complete (Approx.),Number of Review
0,Fashion as Design,4.8,Beginner level,Flexible schedule,Not specified,"Art History, Art, History, Creativity","Introduction, Heroes, Silhouettes, Coutures, L...","Anna Burckhardt, Paola Antonelli, Michelle Mil...",The Museum of Modern Art,Arts and Humanities,https://www.coursera.org/learn/fashion-design,20.0,2813
1,Modern American Poetry,4.4,Beginner level,Flexible schedule,Not specified,Not specified,"Orientation, Module 1, Module 2, Module 3, Mod...",Cary Nelson,University of Illinois at Urbana-Champaign,Arts and Humanities,https://www.coursera.org/learn/modern-american...,34.0,100


In [3]:
# (number of rows, number of columns) of the loaded DataFrame
df.shape

(8342, 13)

In [18]:
def process_data(titles):
    """
    Tokenize a collection of course titles into lower-case words.

    Input: 
        titles: an iterable of course titles (e.g. a DataFrame column of strings).
                Entries that are not strings (such as NaN for a missing title)
                are skipped.
    Output: 
        words: a list containing all the words of all titles in lower case,
               in order of appearance.

    Prints the number of (string) titles, the number of characters of the
    combined text and the number of words found.
    """
    # Keep only real strings: a missing title typically arrives as a float NaN,
    # which cannot be concatenated with str and would raise a TypeError.
    title = [t for t in titles if isinstance(t, str)]
    print("Number of titles: ",len(title))

    # Build the combined text in a single pass. Every title is preceded by one
    # space so that the last word of a title never fuses with the first word
    # of the next one.
    text = "".join(" " + t for t in title)
    print("Number of characters: ", len(text))    

    # \w+ matches runs of letters, digits and underscores, so whitespace and
    # punctuation act as word separators.
    words = re.findall(r'\w+', text.lower())
    print('Number of words: ',len(words))

    return words

In [19]:
# Tokenize every course title into one flat list of lower-case words.
words = process_data(titles=df["Course Title"])

Number of titles:  8342
Number of characters:  362835
Number of words:  46836


# Create Dictionary with Frequencies
- The dictionary's keys are words.
- The value for each word is the number of times that word appears in the corpus.

In [20]:
# Build the vocabulary: the set of distinct words in the corpus.
vocab = set(words)
# Preview only a few entries (sorted, so the preview is stable between runs);
# printing the whole set would flood the cell output.
print(sorted(vocab)[:10])
print('Number of different words: ',len(vocab))

Number of different words:  6275


In [22]:
# Count how often each word occurs: Counter maps word -> frequency.
counts = Counter(words)
print('Number of different words: ',len(counts)) 
# Single quotes are used inside the replacement field because reusing the
# enclosing double quotes is a SyntaxError before Python 3.12. Indexing a
# Counter returns 0 for a word that never occurs.
print(f"The count for the word 'specialization' is {counts['specialization']}")

The word 'specialization' in 1360
Number of different words:  6275


# Dictionary of Probabilities 
- Compute the probability of each word, i.e. its relative frequency in the corpus:

$$P(w_i) = \frac{C(w_i)}{M} \tag{Eqn-2}$$

where 

$C(w_i)$ is the total number of times $w_i$ appears in the corpus.

$M$ is the total number of words in the corpus.

In [24]:
def get_probs(word_count_dict):
    '''
    Turn raw word counts into relative frequencies.

    Input:
        word_count_dict: mapping from each word to the number of times it occurs.
    Output:
        probs: mapping from each word to count / M, where M is the sum of all counts.
    '''
    # M: total number of word occurrences across the whole dictionary
    total = sum(word_count_dict.values())

    probs = {}
    for word, count in word_count_dict.items():
        probs[word] = count / total

    return probs

In [32]:
# Convert the word counts into probabilities (relative frequencies).
probs = get_probs(counts)
# Single quotes are used inside the replacement fields because reusing the
# enclosing double quotes is a SyntaxError before Python 3.12.
print(f"The probability for the word 'specialization' is {probs.get('specialization')}")
print(f"The probability for the word 'fashion' is {probs.get('fashion')}")

The probability for the word 'specialization' is 0.029037492527115894
The probability for the word 'fashion' is 0.0002989153642497224
