In [1]:
import pandas as pd
import re
from collections import Counter 
import matplotlib.pyplot as plt 

In [2]:
# Load the course catalogue; the first CSV column is used as the row index.
df = pd.read_csv("data/DatasetFinal.csv", index_col=0)
# Preview the first two rows to check that the columns were parsed as expected.
df.head(2)

Unnamed: 0,Course Title,Rating,Level,Schedule,What you will learn,Skill gain,Modules,Instructor,Offered By,Keyword,Course Url,Duration to complete (Approx.),Number of Review
0,Fashion as Design,4.8,Beginner level,Flexible schedule,Not specified,"Art History, Art, History, Creativity","Introduction, Heroes, Silhouettes, Coutures, L...","Anna Burckhardt, Paola Antonelli, Michelle Mil...",The Museum of Modern Art,Arts and Humanities,https://www.coursera.org/learn/fashion-design,20.0,2813
1,Modern American Poetry,4.4,Beginner level,Flexible schedule,Not specified,Not specified,"Orientation, Module 1, Module 2, Module 3, Mod...",Cary Nelson,University of Illinois at Urbana-Champaign,Arts and Humanities,https://www.coursera.org/learn/modern-american...,34.0,100


In [31]:
# Dimensions of the loaded frame as (rows, columns).
df.shape

(8342, 13)

In [12]:
def processing_data(titles):
    """
    Tokenize a collection of titles into lower-case words.

    Input:
        titles: an iterable of title strings (for example a DataFrame column).
                Entries that are not strings (such as NaN or None for missing
                titles) are skipped instead of raising a TypeError.
    Output:
        words: a list containing all the words of all the titles, in lower
               case and in their order of appearance.
    """
    words = []

    for title in titles:
        # Missing values in a pandas column show up as float NaN / None.
        if not isinstance(title, str):
            continue
        # \w+ never spans whitespace, so tokenizing title by title yields the
        # same tokens as tokenizing one big joined string, without building
        # that string through repeated (quadratic) concatenation.
        words.extend(re.findall(r'\w+', title.lower()))

    return words

In [13]:
# Tokenize every course title into a flat list of lower-case words.
words = processing_data(df["Course Title"])
# Single quotes inside the f-string keep this valid on Python versions before 3.12.
print(f"Number of titles {len(df['Course Title'])}")
print('Number of words: ',len(words))

Number of titles 8342
Number of words:  46836


# Create Dictionary with Frequencies
- The dictionary's keys are words.
- The value for each word is the number of times that word appears in the corpus.


In [14]:
# Build the vocabulary: the set of distinct tokens found in the course titles.
vocab = set(words)
print('Number of different words: ',len(vocab))

Number of different words:  6275


In [15]:
def counting_words(the_words):
    '''
    Count how many times each word occurs.

    Input:
        the_words: iterable with every word of the corpus, repetitions included
    Output:
        A Counter (dict subclass) mapping each word to its number of occurrences.
    '''
    return Counter(the_words)

# Count every token of the corpus and spot-check one word.
dict_word_count  = counting_words(words)
print('Number of different words: ',len(dict_word_count)) 
# Single quotes inside the f-string keep this valid on Python versions before 3.12.
print(f"The count for the word 'specialization' is {dict_word_count.get('specialization')}")

Number of different words:  6275
The count for the word 'specialization' is 1360


# Dictionary of Probabilities 
- Compute the probability that each word will appear

$$P(w_i) = \frac{C(w_i)}{M} \tag{Eqn-2}$$

where 

$C(w_i)$ is the total number of times $w_i$ appears in the corpus.

$M$ is the total number of words in the corpus.

In [16]:
def get_probs(dict_count):
    '''
    Turn word counts into relative frequencies.

    Input:
        dict_count: mapping from word to the number of times it appears.
    Output:
        probs: dictionary mapping each word to count / total, where total is
               the sum of all the counts in dict_count.
    '''
    # Total number of word occurrences (M in the formula).
    total = sum(dict_count.values())

    probs = {}
    for word, count in dict_count.items():
        probs[word] = count / total

    return probs

In [17]:
# Convert the counts into probabilities and spot-check two words.
probs = get_probs(dict_word_count)
# Single quotes inside the f-strings keep them valid on Python versions before 3.12.
print(f"The probability for the word 'specialization' is {probs.get('specialization')}")
print(f"The probability for the word 'fashion' is {probs.get('fashion')}")

The probability for the word 'specialization' is 0.029037492527115894
The probability for the word 'fashion' is 0.0002989153642497224


### Summary:
* Set with all the different words: `vocab`
* Dictionary with frequencies: `dict_word_count`
* Dictionary with probabilities: `probs`

# String Manipulations
- functions to manipulate strings

In [130]:
def spliting(word):
    '''
    Split a word into (left, right) pairs at every position before a letter.

    Input:
        word: input string
    Output:
        A list of (left, right) tuples, one per index 0 .. len(word) - 1.
        The right part is never empty: the final split (word, '') is not
        produced, and an empty word yields an empty list.
    '''
    pairs = []
    for cut in range(len(word)):
        pairs.append((word[:cut], word[cut:]))
    return pairs

In [131]:
def delete_letter(word):
    '''
    Generate every string obtained by removing one letter from the word.

    Input:
        word: input string
    Output:
        A list with one entry per split returned by spliting(word), in split
        order: the left part followed by the right part minus its first letter.
    '''
    shortened = []
    for head, tail in spliting(word):
        # Nothing to delete when the right part is empty.
        if not tail:
            continue
        shortened.append(head + tail[1:])

    return shortened

In [164]:
def switch_letter(word):
    '''
    Generate every string obtained by swapping two adjacent letters.

    Input:
        word: input string
    Output:
        A list of strings, one per split whose right part has at least two
        letters, with the first two letters of that right part exchanged.
    '''
    swapped = []
    for head, tail in spliting(word):
        # A swap needs two letters to the right of the cut.
        if len(tail) < 2:
            continue
        swapped.append(head + tail[1] + tail[0] + tail[2:])

    return swapped

In [165]:
def replace_letter(word):
    '''
    Generate every string obtained by overwriting one letter of the word.

    Input:
        word: input string
    Output:
        A list of strings, ordered by position and then by replacement letter.
        The original word itself appears in the list whenever the replacement
        letter equals the letter already at that position.
    '''
    alphabet = 'abcdefghijklmnopqrstuvwxyzáéíóú'

    variants = []
    for head, tail in spliting(word):
        # Slicing past the end gives '', so no special case for a 1-letter tail.
        rest = tail[1:]
        for letter in alphabet:
            variants.append(head + letter + rest)
    return variants

In [166]:
def insert_letter(word):
    '''
    Generate every string obtained by inserting one letter into the word.

    Input:
        word: input string
    Output:
        insert: a list of all strings with one new letter inserted at any
                position of the word, including before the first letter and
                after the last one. Ordered by position, then by letter.
    '''
    letters = 'abcdefghijklmnopqrstuvwxyzáéíóú'

    # There are len(word) + 1 insertion points. Iterating over the indices
    # directly covers the one after the last letter ("car" -> "cars") and makes
    # the empty word yield each single letter.
    insert = [
        word[:pos] + x + word[pos:]
        for pos in range(len(word) + 1)
        for x in letters
    ]

    return insert

In [167]:
# Demonstrate each string manipulation on a short sample word.
word_manipulate = "car"
# Deletions, shown together with the raw (left, right) splits they are built from.
del_letter = delete_letter(word_manipulate)
print(f"splits = {spliting(word_manipulate)}, \n\nword without a letter = {del_letter}")
print("-------------------------------------------------------------------------------------")
# Swaps of adjacent letters.
switch = switch_letter(word_manipulate)
print(f"swithes = {switch}")
print("-------------------------------------------------------------------------------------")
# Single-letter replacements.
replaced = replace_letter(word_manipulate)
print(f"new word with one changed letter = {replaced}")
print("-------------------------------------------------------------------------------------")
# Single-letter insertions.
inserts = insert_letter(word_manipulate)
print(f"insertions = {inserts}")

splits = [('', 'car'), ('c', 'ar'), ('ca', 'r')], 

word without a letter = ['ar', 'cr', 'ca']
-------------------------------------------------------------------------------------
swithes = ['acr', 'cra']
-------------------------------------------------------------------------------------
new word with one changed letter = ['aar', 'bar', 'car', 'dar', 'ear', 'far', 'gar', 'har', 'iar', 'jar', 'kar', 'lar', 'mar', 'nar', 'oar', 'par', 'qar', 'rar', 'sar', 'tar', 'uar', 'var', 'war', 'xar', 'yar', 'zar', 'áar', 'éar', 'íar', 'óar', 'úar', 'car', 'cbr', 'ccr', 'cdr', 'cer', 'cfr', 'cgr', 'chr', 'cir', 'cjr', 'ckr', 'clr', 'cmr', 'cnr', 'cor', 'cpr', 'cqr', 'crr', 'csr', 'ctr', 'cur', 'cvr', 'cwr', 'cxr', 'cyr', 'czr', 'cár', 'cér', 'cír', 'cór', 'cúr', 'caa', 'cab', 'cac', 'cad', 'cae', 'caf', 'cag', 'cah', 'cai', 'caj', 'cak', 'cal', 'cam', 'can', 'cao', 'cap', 'caq', 'car', 'cas', 'cat', 'cau', 'cav', 'caw', 'cax', 'cay', 'caz', 'caá', 'caé', 'caí', 'caó', 'caú']
---------------------

# Corrections 
- One or two letters corrected

In [372]:
def one_letter_edited(word):
    """
    Collect every string that is one edit away from the word.

    Input:
        word: string for which we will generate all possible words.
    Output:
        A sorted list, without duplicates, of the results of delete_letter,
        switch_letter, replace_letter and insert_letter applied to the word.
    """
    candidates = (
        set(delete_letter(word))
        | set(switch_letter(word))
        | set(replace_letter(word))
        | set(insert_letter(word))
    )

    return sorted(candidates)

In [373]:
def two_letter_edited(word):
    """
    Collect every string reachable by applying two successive edits.

    Input:
        word: string for which we will generate all possible words.
    Output:
        A sorted list, without duplicates, of the one-edit variants of every
        non-empty one-edit variant of the word.
    """
    second_pass = {
        twice
        for once in one_letter_edited(word)
        if once  # an empty intermediate string is not edited further
        for twice in one_letter_edited(once)
    }

    return sorted(second_pass)

In [370]:
# Peek at the first five course titles.
df["Course Title"][:5]

0                                    Fashion as Design
1                               Modern American Poetry
2                            Pixel Art for Video Games
3      Distribución digital de la música independiente
4    The Blues: Understanding and Performing an Ame...
Name: Course Title, dtype: object

In [374]:
# Show which one-edit candidates of the sample word are real vocabulary words.
print("With one letter modified:")
incorrect_word = "Blues"
# The vocabulary is lower-case, so the word is lower-cased before editing.
for x in one_letter_edited(incorrect_word.lower()):
    if x in vocab:
        print(x)

# Same check for the two-edit candidates.
print("\nWith two letters modified:")    
for x in two_letter_edited(incorrect_word.lower()):
    # The input is already lower-cased above; this extra lower() looks redundant.
    x= x.lower()
    if x in vocab:
        print(x)

With one letter modified:
blue
blues

With two letters modified:
bases
bites
blue
blues
bones
bugs
bus
bytes
les
lies
rules
values


# Probabilities of the different corrections

In [415]:
def getting_corrections(word, probs, vocab, n=2):
    '''
    Find the vocabulary words within one or two edits of the given word.

    Input:
        word: a string to check for suggestions (lower-cased before editing)
        probs: the dictionary with words from the corpus and their probability
        vocab: a set of all the vocabulary
        n: accepted as part of the interface but not used by this function;
           every match is returned, not only the n most probable ones
    Output:
        A tuple (corrections, suggestions), both ordered alphabetically by word:
            corrections: list of [word, probability] pairs
            suggestions: list with just the words
    '''
    lowered = word.lower()

    # Union of the one-edit and two-edit candidates.
    candidates = set(one_letter_edited(lowered))
    candidates.update(two_letter_edited(lowered))

    # Candidates are unique, so ordering the words also orders the pairs.
    known = sorted(candidate for candidate in candidates if candidate in vocab)

    corrections = [[known_word, probs[known_word]] for known_word in known]
    suggestions = list(known)

    return corrections, suggestions

### The most probable word is...

In [429]:
# Look up the in-vocabulary candidates within two edits of the misspelled input.
correction, suggestions = getting_corrections("musica", probs, vocab, n=2)
print(f"Suggestions are {suggestions}")

# List every candidate with its corpus probability. `suggestions` already holds
# all the words, so it is only read here and never appended to again.
for i, word_prob in enumerate(correction):
    print(f"word {i}: {word_prob[0]} -> with a probability of {word_prob[1]:.5f}")

if correction:
    # max() keeps the first of several equally probable candidates, which is
    # the same winner a strict '>' scan over the list would pick.
    prob_word, max_prob = max(correction, key=lambda word_prob: word_prob[1])
    print(f"\n---The most probable word is '{prob_word}' with a probability of {max_prob:.5f}---")
else:
    # No candidate is in the vocabulary: report it instead of failing on an
    # undefined variable.
    prob_word, max_prob = None, 0
    print("\n---No correction found in the vocabulary---")

Suggestions are ['básica', 'fisica', 'física', 'munich', 'music', 'musical', 'música', 'músico']
word 0: básica -> with a probability of 0.00004
word 1: fisica -> with a probability of 0.00006
word 2: física -> with a probability of 0.00009
word 3: munich -> with a probability of 0.00002
word 4: music -> with a probability of 0.00117
word 5: musical -> with a probability of 0.00028
word 6: música -> with a probability of 0.00006
word 7: músico -> with a probability of 0.00009

---The most probable word is 'music' with a probability of 0.00117---
