In [1]:
import csv
import math
import string
from collections import Counter

In [2]:
# Load every row of the CSV file into memory as a list of lists of strings.
# newline="" hands line-ending handling to the csv module, as its
# documentation requires, so quoted fields containing embedded newlines are
# parsed as a single field instead of being split across rows.
with open("IMDb.csv", "r", encoding="UTF-8", newline="") as f:
    IMDb = list(csv.reader(f))

In [3]:
# Discard the first row (presumably the CSV header — confirm against IMDb.csv).
# NOTE: re-running this cell drops one more row each time.
IMDb = IMDb[1:]

In [4]:
def tokenize(docs):
    """
    Split each document into lowercase tokens.

    Every character in string.punctuation acts as a separator, and tokens are
    split on any run of whitespace (spaces, tabs, newlines), so multi-line
    documents never produce tokens with embedded whitespace.

    Input: a list of strings. Each item is a document to tokenize.
    Output: a list of lists. Each item is a list containing the tokens of the relative document.
    """
    # One translation table maps every punctuation character to a space in a
    # single pass, instead of one str.replace call per punctuation character.
    punct_to_space = str.maketrans(
        string.punctuation, " " * len(string.punctuation)
    )
    tokens = []
    for doc in docs:
        cleaned = doc.translate(punct_to_space).lower()
        # split() without an argument splits on any whitespace and never
        # yields empty strings, so no extra filtering is needed.
        tokens.append(cleaned.split())
    return tokens

In [5]:
# The text of each document is taken from the first column of every row.
docs = [row[0] for row in IMDb]
# One token list per document, in the same order as the rows.
tokens = tokenize(docs)

In [6]:
def compute_TF(tokens):
    """
    Count how many times each token occurs in each document.

    Input: a list of lists. Each item is a list containing the tokens of the relative document.
    Output: a list of dictionaries. Each item is a dictionary containing the frequency of the token in the document.
    """
    # Counter tallies a document in a single pass; calling list.count() once
    # per distinct token would rescan the whole document every time.
    # The result is converted back to a plain dict to keep the return type.
    return [dict(Counter(doc_tokens)) for doc_tokens in tokens]

In [7]:
# Per-document token counts, one dictionary per entry of `tokens`.
TF = compute_TF(tokens)

In [8]:
def compute_DF(tokens):
    """
    Count, for every token, the number of documents it appears in.

    Input: a list of lists. Each item is a list containing the tokens of the relative document.
    Output: a dictionary mapping each token to its document frequency.
    """
    doc_freq = {}
    for document in tokens:
        # A set ensures each token is counted at most once per document.
        unique_terms = set(document)
        for term in unique_terms:
            if term in doc_freq:
                doc_freq[term] += 1
            else:
                doc_freq[term] = 1
    return doc_freq

In [9]:
# Document frequency of every token across the whole corpus.
DF = compute_DF(tokens)

In [10]:
def compute_IDF(DF, N):
    """
    Turn document frequencies into inverse document frequencies.

    The IDF of a token is the natural logarithm of N divided by the number of
    documents that contain the token.

    Input: a dictionary. Each item contain the document frequency.
    Input: the total number of documents
    Output: a dictionary. Each item contain the inverse document frequency.
    """
    return {term: math.log(N / doc_count) for term, doc_count in DF.items()}

In [11]:
# N is the number of rows currently held in IMDb; it is used as the total
# number of documents in the IDF formula.
N = len(IMDb)
IDF = compute_IDF(DF, N)

In [12]:
# Vocabulary ordered by ascending IDF; ties keep their order in IDF because
# the sort is stable.
words = sorted(IDF, key=IDF.get)

In [13]:
# First ten entries of `words` (lowest IDF), then the last ten (highest IDF).
print(words[:10])
print(words[-10:])

['the', 'a', 'and', 'of', 'to', 'this', 'is', 'it', 'in', 'that']
['prousalis', 'roué', 'infantalising', 'orientalist', 'jayden', 'imy', 'camora', 'capiche', 'jowls', 'repleat']


In [14]:
def compute_TF_IDF(TF, IDF):
    """
    Weight every term frequency by the inverse document frequency of its token.

    Input: a list of dictionaries. Each item is a dictionary containing the frequency of the token in the document.
    Input: a dictionary. Each item contain the inverse document frequency.
    Output: a list of dictionaries. Each item is a dictionary containing the TF-IDF of the token in the document.

    Raises KeyError if a token of a document has no entry in IDF.
    """
    # Build and return a new list: the input TF is left untouched and must
    # not be handed back in place of the weighted result.
    return [
        {token: count * IDF[token] for token, count in doc_TF.items()}
        for doc_TF in TF
    ]

In [15]:
# Combine the per-document term frequencies with the corpus-wide IDF values.
TF_IDF = compute_TF_IDF(TF, IDF)

In [None]:
def norm(d):
    """Return the L2 (Euclidean) norm of a sparse vector stored as a dict."""
    # Only the values matter for the norm; the keys are ignored.
    squares = [weight ** 2 for weight in d.values()]
    return sum(squares) ** 0.5

In [None]:
def dot_product(d1, d2):
    """
    Compute the dot product between two vector representations.

    Input: two dictionaries mapping each token to its weight. A token that is
    missing from a dictionary is treated as having weight zero.
    Output: the sum, over the tokens present in both dictionaries, of the
    product of their weights (0 when the dictionaries share no token).
    """
    # A token absent from either vector contributes nothing to the sum, so
    # only the keys the two dictionaries have in common need to be visited.
    shared_tokens = d1.keys() & d2.keys()
    return sum(d1[token] * d2[token] for token in shared_tokens)

In [None]:
def cosine_similarity(d1, d2):
    """
    Compute the cosine similarity between documents d1 and d2.

    Input: two dictionaries representing the TF-IDF vectors for documents
    d1 and d2.
    Output: the cosine similarity, i.e. the dot product of the two vectors
    divided by the product of their L2 norms. When either vector has zero
    norm (for example an empty document) the similarity is defined as 0.0.
    """
    denominator = norm(d1) * norm(d2)
    # A zero-norm vector has no direction; return 0.0 rather than dividing
    # by zero.
    if denominator == 0:
        return 0.0
    return dot_product(d1, d2) / denominator