In [1]:

import numpy as np


# N-grams

In [64]:
# Compute n-grams and resulting instance representation
# Input: a set of messages M and n
# Output: a set of n-grams (<n_grams>) and a numpy array that contains
# the resulting instance representations for the messages

def n_gram(M, n):
  n_grams=[]
  sub_string=[[] for i in range (len(M))]
  for i in range(len(M)):
    sub_string[i]=M[i].split()
    for j in range(len(sub_string[i])-n+1):
      if tuple(sub_string[i][j:j+n]) not in n_grams:
        n_grams.append(tuple(sub_string[i][j:j+n]))
  X=np.zeros((len(M),len(n_grams)))
  for i in range(len(M)):
    sub_string[i]=M[i].split()
    for j in range(len(sub_string[i])-n+1):
      col=n_grams.index(tuple(sub_string[i][j:j+n]))
      X[i][col]+=1
  return n_grams, X

In [65]:
n=3
M=["this is a great book about a young country","this is a bad book about a great country","country music is great"]
G,X=n_gram(M,n)
print('The {} {}-grams for M are: {}'.format(len(G),n,G))
print (X)

The 14 3-grams for M are: [('this', 'is', 'a'), ('is', 'a', 'great'), ('a', 'great', 'book'), ('great', 'book', 'about'), ('book', 'about', 'a'), ('about', 'a', 'young'), ('a', 'young', 'country'), ('is', 'a', 'bad'), ('a', 'bad', 'book'), ('bad', 'book', 'about'), ('about', 'a', 'great'), ('a', 'great', 'country'), ('country', 'music', 'is'), ('music', 'is', 'great')]
[[1. 1. 1. 1. 1. 1. 1. 0. 0. 0. 0. 0. 0. 0.]
 [1. 0. 0. 0. 1. 0. 0. 1. 1. 1. 1. 1. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 1.]]


# TF-IDF

In [67]:
# Compute TF-IDF representation for a set of messages
# Input: set of messages M
# Output: np array containing instance representation

def tfidf(M):
    n_grams = set()
    sub_string = [[] for _ in range(len(M))]

    for i in range(len(M)):
        sub_string[i] = M[i].split()
        for j in range(len(sub_string[i])):
            n_grams.add(sub_string[i][j])

    # convert set to list for indexing
    n_grams = list(n_grams)
    word2idx = {w: i for i, w in enumerate(n_grams)}


    # matrices
    X = np.zeros((len(M), len(n_grams)))
    tf = np.zeros((len(M), len(n_grams)))
    idf = np.zeros(len(n_grams))


    for i in range(len(M)):
        tokens = M[i].split()
        for t in tokens:
            col = word2idx[t]
            X[i][col] += 1

    # ---- Compute TF ----
    for i in range(len(M)):
        tf[i] = X[i] / len(sub_string[i])

    # ---- Compute IDF ----
    idf = np.log(len(M) / np.count_nonzero(X, axis=0)).reshape(1, -1)

    tfidf = tf * idf
    return tf , idf, tfidf


In [68]:
import numpy as np
n=3
M=["this is a great book about a young country","this is a bad book about a great country","country music is great"]
tf,idf,tfidf=tfidf(M)
print("\nTF:")
print(tf)
print("\nIDF:")
print(idf)
print('The TF-IDF representation for M is')
print(tfidf)


TF:
[[0.11111111 0.11111111 0.         0.11111111 0.22222222 0.11111111
  0.11111111 0.11111111 0.11111111 0.        ]
 [0.11111111 0.11111111 0.11111111 0.         0.22222222 0.11111111
  0.11111111 0.11111111 0.11111111 0.        ]
 [0.25       0.25       0.         0.         0.         0.
  0.         0.         0.25       0.25      ]]

IDF:
[[0.         0.         1.09861229 1.09861229 0.40546511 0.40546511
  0.40546511 0.40546511 0.         1.09861229]]
The TF-IDF representation for M is
[[0.         0.         0.         0.12206803 0.09010336 0.04505168
  0.04505168 0.04505168 0.         0.        ]
 [0.         0.         0.12206803 0.         0.09010336 0.04505168
  0.04505168 0.04505168 0.         0.        ]
 [0.         0.         0.         0.         0.         0.
  0.         0.         0.         0.27465307]]


# OSB

In [66]:
# Compute orthogonal sparse bigrams and resulting instance representation
# Input: a set of messages M and OSB order m (denoted "n" in lecture)
# Output: the set of OSBs (<osb>) and a numpy array that contains
# the resulting instance representations for the messages
def OSB(M,m):
  return osb, X
