In [1]:
import pandas as pd
import re
from collections import Counter 
import matplotlib.pyplot as plt 

In [2]:
# Load the course dataset, using the first CSV column as the row index
df = pd.read_csv("data/DatasetFinal.csv", index_col=0)
# Preview the first two rows
df.head(2)

Unnamed: 0,Course Title,Rating,Level,Schedule,What you will learn,Skill gain,Modules,Instructor,Offered By,Keyword,Course Url,Duration to complete (Approx.),Number of Review
0,Fashion as Design,4.8,Beginner level,Flexible schedule,Not specified,"Art History, Art, History, Creativity","Introduction, Heroes, Silhouettes, Coutures, L...","Anna Burckhardt, Paola Antonelli, Michelle Mil...",The Museum of Modern Art,Arts and Humanities,https://www.coursera.org/learn/fashion-design,20.0,2813
1,Modern American Poetry,4.4,Beginner level,Flexible schedule,Not specified,Not specified,"Orientation, Module 1, Module 2, Module 3, Mod...",Cary Nelson,University of Illinois at Urbana-Champaign,Arts and Humanities,https://www.coursera.org/learn/modern-american...,34.0,100


In [31]:
# (rows, columns) of the loaded frame
df.shape

(8342, 13)

In [12]:
def processing_data(titles):
    """
    Tokenize a collection of course titles into lower-case words.

    Input:
        titles: iterable of title strings (e.g. the "Course Title" column of the DataFrame).
                Entries that are not strings (such as NaN for a missing title) are skipped.
    Output:
        all_words: a list containing every word of every title, in lower case and in order of appearance.
    """
    # Join once instead of growing a string inside a loop, which is quadratic in corpus size.
    # Non-string entries are filtered out so a missing title cannot raise a TypeError.
    text_lowercase = " ".join(title for title in titles if isinstance(title, str)).lower()

    # \w+ keeps runs of letters, digits and underscores; everything else acts as a separator.
    all_words = re.findall(r'\w+', text_lowercase)

    return all_words

In [13]:
words = processing_data(df["Course Title"])
# Single quotes inside the replacement field: reusing the outer quote type inside an
# f-string is a SyntaxError before Python 3.12.
print(f"Number of titles {len(df['Course Title'])}")
print('Number of words: ',len(words))

Number of titles 8342
Number of words:  46836


# Create Dictionary with Frequencies
- The dictionary's keys are words.
- The value for each word is the number of times that word appears in the corpus.


In [14]:
# Build the vocabulary: the set of distinct words found in the tokenized titles
vocab = set(words)
print('Number of different words: ',len(vocab))

Number of different words:  6275


In [15]:
def counting_words(the_words):
    '''
    Count how many times each word occurs.

    Input: 
        the_words: list (or any iterable) of word tokens, duplicates included.
                   Passing the de-duplicated vocabulary set would make every count 1.
    Output:
        dict_count: a collections.Counter (dict subclass) where the key is a word and
                    the value is the number of times it appears.
    '''
    dict_count = Counter(the_words)
    return dict_count

dict_word_count  = counting_words(words)
print('Number of different words: ',len(dict_word_count)) 
# Single quotes inside the replacement field: reusing the outer quote type inside an
# f-string is a SyntaxError before Python 3.12.
print(f"The count for the word 'specialization' is {dict_word_count.get('specialization')}")

Number of different words:  6275
The count for the word 'specialization' is 1360


# Dictionary of Probabilities 
- Compute the probability that each word will appear

$$P(w_i) = \frac{C(w_i)}{M} \tag{Eqn-2}$$

where 

$C(w_i)$ is the total number of times $w_i$ appears in the corpus.

$M$ is the total number of words in the corpus.

In [16]:
def get_probs(dict_count):
    '''
    Convert raw word counts into relative frequencies.

    Input:
        dict_count: word-count dictionary; each key is a word and each value is its frequency.
    Output:
        probs: dictionary with the same keys, where each value is the word's count divided
               by the total count of all words.
    '''
    # M in the formula: total number of word occurrences in the corpus
    total_count = sum(dict_count.values())

    probs = {}
    for token, occurrences in dict_count.items():
        probs[token] = occurrences / total_count

    return probs

In [17]:
probs = get_probs(dict_word_count)
# Single quotes inside the replacement fields: reusing the outer quote type inside an
# f-string is a SyntaxError before Python 3.12.
print(f"The probability for the word 'specialization' is {probs.get('specialization')}")
print(f"The probability for the word 'fashion' is {probs.get('fashion')}")

The probability for the word 'specialization' is 0.029037492527115894
The probability for the word 'fashion' is 0.0002989153642497224


### Summary:
* Set with all the different words: `vocab`
* Dictionary with frequencies: `dict_word_count`
* Dictionary with probabilities: `probs`

# String Manipulations
- functions to manipulate strings

In [52]:
def delete_char(word):
    '''
    Generate every string obtained by deleting exactly one character from word.

    Input:
        word: input string
    Output:
        del_letter: a list of all possible strings with one missing character from word,
                    ordered by the position of the removed character (duplicates are kept)
        splits: the list of (left, right) pairs (word[:i], word[i:]) for i in range(len(word))
        The two lists are returned together as the tuple (del_letter, splits).
    '''
    # i stops at len(word) - 1, so the right half of every split has at least one character.
    splits = [(word[:i], word[i:]) for i in range(len(word))]

    # Dropping the first character of the right half removes the character at position i.
    del_letter = [left + right[1:] for left, right in splits]

    return del_letter, splits

In [53]:
def switch_char(word):
    '''
    Generate every string obtained by swapping two adjacent characters of word.

    Input:
        word: input string
    Output:
        switch: a list of all possible strings with one pair of adjacent characters switched,
                ordered by the position of the swapped pair
        splits: the list of (left, right) pairs (word[:i], word[i:]) for i in range(len(word))
        The two lists are returned together as the tuple (switch, splits).
    '''
    splits = [(word[:i], word[i:]) for i in range(len(word))]

    # A swap needs two characters in the right half, so the last split (one character left) is skipped.
    switch = [left + right[1] + right[0] + right[2:] for left, right in splits if len(right) >= 2]

    return switch, splits

In [59]:
# Demo: show the (left, right) splits of "guapo" and every one-character deletion
del_letter, splits = delete_char("guapo")
print(f"splits = {splits}, \nword without a letter = {del_letter}")

splits = [('', 'guapo'), ('g', 'uapo'), ('gu', 'apo'), ('gua', 'po'), ('guap', 'o')], 
word without a letter = ['uapo', 'gapo', 'gupo', 'guao', 'guap']


In [58]:
# Demo: show the (left, right) splits of "guapo" and every adjacent-character swap
switch, splits = switch_char("guapo")
print(f"splits = {splits}, \nswithes = {switch}")

splits = [('', 'guapo'), ('g', 'uapo'), ('gu', 'apo'), ('gua', 'po'), ('guap', 'o')], 
swithes = ['ugapo', 'gaupo', 'gupao', 'guaop']
