In [1]:
import numpy as np
import pandas as pd
import math

# <span style = 'color: green'>Part 1. Query Expansion Using Automatic Global Analysis</span>

## <span style = 'color: blue'>1.a. Write a program to create a term-by-term association matrix</span>

In [2]:
# Load the document-by-term counts. The whole frame is converted to the
# document-by-term array further down, so every CSV column is treated as a term.
doc_term = pd.read_csv('HW3.csv')

In [3]:
# Display the loaded frame (rows: documents, columns: terms).
doc_term

Unnamed: 0,A,B,C,D,E,F,G,H
0,0,3,4,0,0,2,4,0
1,5,5,0,0,4,0,4,3
2,3,0,4,3,4,0,0,5
3,0,7,0,3,2,0,4,3
4,0,1,0,0,0,5,4,2
5,2,0,2,0,0,4,0,1
6,3,5,3,4,0,0,4,2
7,0,3,0,0,0,4,4,2
8,0,0,3,3,3,0,0,1
9,0,5,0,0,0,4,4,2


In [4]:
# Leave pandas for NumPy so the matrix algebra below runs on plain arrays.

DT = doc_term.to_numpy(copy=True)  # document-by-term matrix (rows: documents)
TD = np.transpose(DT)              # term-by-document matrix (rows: terms)

In [5]:
# Term-by-term association matrix: entry (i, j) is the dot product of the
# document-count vectors of term i and term j.

TT = TD @ TD.T

In [6]:
# Display the raw (unnormalized) term-by-term association matrix.
TT

array([[ 47,  40,  25,  21,  32,   8,  32,  38],
       [ 40, 143,  27,  41,  34,  43, 116,  64],
       [ 25,  27,  54,  33,  25,  16,  28,  31],
       [ 21,  41,  33,  43,  27,   0,  28,  35],
       [ 32,  34,  25,  27,  45,   0,  24,  41],
       [  8,  43,  16,   0,   0,  77,  60,  30],
       [ 32, 116,  28,  28,  24,  60, 112,  56],
       [ 38,  64,  31,  35,  41,  30,  56,  61]], dtype=int64)

In [7]:
# Normalize the association matrix:
#     s(i, j) = c(i, j) / (c(i, i) + c(j, j) - c(i, j))
# The diagonal holds each term's association with itself. Broadcasting it down
# the rows and across the columns builds every denominator in one step, which
# replaces the element-by-element Python loops with the same arithmetic.

self_assoc = np.diag(TT)
norm_tt = TT / (self_assoc[:, np.newaxis] + self_assoc[np.newaxis, :] - TT)

In [8]:
# Display the normalized association matrix.
norm_tt

array([[1.        , 0.26666667, 0.32894737, 0.30434783, 0.53333333,
        0.06896552, 0.2519685 , 0.54285714],
       [0.26666667, 1.        , 0.15882353, 0.28275862, 0.22077922,
        0.24293785, 0.83453237, 0.45714286],
       [0.32894737, 0.15882353, 1.        , 0.515625  , 0.33783784,
        0.13913043, 0.20289855, 0.36904762],
       [0.30434783, 0.28275862, 0.515625  , 1.        , 0.44262295,
        0.        , 0.22047244, 0.50724638],
       [0.53333333, 0.22077922, 0.33783784, 0.44262295, 1.        ,
        0.        , 0.18045113, 0.63076923],
       [0.06896552, 0.24293785, 0.13913043, 0.        , 0.        ,
        1.        , 0.46511628, 0.27777778],
       [0.2519685 , 0.83453237, 0.20289855, 0.22047244, 0.18045113,
        0.46511628, 1.        , 0.47863248],
       [0.54285714, 0.45714286, 0.36904762, 0.50724638, 0.63076923,
        0.27777778, 0.47863248, 1.        ]])

In [9]:
# Wrap the normalized matrix in a DataFrame labelled by term so that scores can
# be looked up by name. The labels are taken from doc_term's columns -- the
# axis the association matrix was computed from -- instead of a hard-coded
# list that could silently fall out of step with the CSV.

term_term = pd.DataFrame(norm_tt,
                         columns = doc_term.columns,
                         index = doc_term.columns
                        )
term_term

Unnamed: 0,A,B,C,D,E,F,G,H
A,1.0,0.266667,0.328947,0.304348,0.533333,0.068966,0.251969,0.542857
B,0.266667,1.0,0.158824,0.282759,0.220779,0.242938,0.834532,0.457143
C,0.328947,0.158824,1.0,0.515625,0.337838,0.13913,0.202899,0.369048
D,0.304348,0.282759,0.515625,1.0,0.442623,0.0,0.220472,0.507246
E,0.533333,0.220779,0.337838,0.442623,1.0,0.0,0.180451,0.630769
F,0.068966,0.242938,0.13913,0.0,0.0,1.0,0.465116,0.277778
G,0.251969,0.834532,0.202899,0.220472,0.180451,0.465116,1.0,0.478632
H,0.542857,0.457143,0.369048,0.507246,0.630769,0.277778,0.478632,1.0


## <span style = 'color: blue'>1.b. Write a program that outputs an expanded query </span>

In [10]:
# Expand a query with the terms most strongly associated with each query term
# (automatic global analysis over a term-by-term matrix).

def expand_query(tt_matrix, query, N):
    """Return `query` extended with the N closest neighbours of each query term.

    Parameters
    ----------
    tt_matrix : pandas.DataFrame
        Term-by-term association scores. Every query term must be a column
        label; the row labels are the candidate expansion terms.
    query : iterable
        Query terms, processed in the order given.
    N : int
        Number of neighbours considered per query term. Values <= 0 add none.

    Returns
    -------
    list
        Each query term followed by the neighbours it contributed (weakest of
        the N first, strongest last). No term appears more than once.

    Raises
    ------
    KeyError
        If a query term is not a column of `tt_matrix`.
    """
    expanded_query = []
    for term in query:
        # A query term may already be present as an earlier term's expansion;
        # appending it unconditionally would duplicate it in the result.
        if term not in expanded_query:
            expanded_query.append(term)
        if N <= 0:
            continue
        # Remove the term itself explicitly instead of assuming it sorts last:
        # another term tied with it at the top score could otherwise take its
        # place and be skipped.
        candidates = tt_matrix[term].drop(labels=term, errors='ignore')
        # A stable sort keeps tied scores in a reproducible order.
        ranked = candidates.sort_values(kind='mergesort')
        for neighbour in ranked.index[-N:]:
            if neighbour not in expanded_query:
                expanded_query.append(neighbour)

    return expanded_query
        

In [11]:
# Expand the query with N = 1 similar term per query term.
Query = ['A', 'C', 'F']
N = 1

expand_query(term_term, Query, N)

['A', 'H', 'C', 'D', 'F', 'G']

In [12]:
# Same query with N = 2 similar terms per query term. The call uses the Query
# and N defined in this cell rather than repeating their values as literals,
# so editing either variable actually changes the result.
Query = ['A', 'C', 'F']
N = 2

expand_query(term_term, Query, N)

['A', 'E', 'H', 'C', 'D', 'F', 'G']

# <span style = 'color: green'>Part 4. Document Categorization</span>

### Loading Data

In [13]:
# Load the categorization data; the first CSV column becomes the row index.
df = pd.read_csv('HW4.csv', index_col = 0)
df

Unnamed: 0,T1,T2,T3,T4,T5,T6,T7,T8,Category
DOC1,2,0,4,3,0,1,0,2,Cat1
DOC2,0,2,4,0,2,3,0,0,Cat1
DOC3,4,0,1,3,0,1,0,1,Cat2
DOC4,0,1,0,2,0,0,1,0,Cat2
DOC5,0,0,2,0,0,4,0,0,Cat1
DOC6,1,1,0,2,0,1,1,3,Cat2
DOC7,2,1,3,4,0,2,0,2,Cat2
DOC8,3,1,0,4,1,0,2,1,?
DOC9,0,0,3,0,1,5,0,1,?


In [14]:
# Training term counts: the first 7 rows and the first 8 columns of the frame.
matrix = df.iloc[:7, :8].to_numpy(copy=True)
matrix

array([[2, 0, 4, 3, 0, 1, 0, 2],
       [0, 2, 4, 0, 2, 3, 0, 0],
       [4, 0, 1, 3, 0, 1, 0, 1],
       [0, 1, 0, 2, 0, 0, 1, 0],
       [0, 0, 2, 0, 0, 4, 0, 0],
       [1, 1, 0, 2, 0, 1, 1, 3],
       [2, 1, 3, 4, 0, 2, 0, 2]], dtype=int64)

In [15]:
# Category labels of the same 7 training rows (column position 8).
labels = df.iloc[:7, 8].to_numpy(copy=True)
labels

array(['Cat1', 'Cat1', 'Cat2', 'Cat2', 'Cat1', 'Cat2', 'Cat2'],
      dtype=object)

In [16]:
# Extract the two unlabeled documents (row positions 7 and 8) as (1, n_terms)
# row vectors, the 2-D layout passed to cosine_similarity below.
# Selecting the row with a list keeps a DataFrame, so only the first 8 (term)
# columns take part and the result is already two-dimensional. A scalar row
# selector presumably materializes the whole mixed-dtype row first and yields
# object-dtype arrays -- NOTE(review): confirm on the pandas version in use.
DOC8 = df.iloc[[7], :8].to_numpy(copy=True)
DOC9 = df.iloc[[8], :8].to_numpy(copy=True)

## <span style = 'color: blue'>4.a. Using the K-Nearest-Neighbor approach</span>

In [17]:
import sklearn.metrics.pairwise as metrics

In [18]:
# Find the k training documents most similar to a target document.

def knn(matrix, doc, k = 3):
    """Return the row positions of the k rows of `matrix` closest to `doc`.

    Similarity is the cosine similarity between `doc` and each row.

    Parameters
    ----------
    matrix : 2-D array-like
        One row per training document.
    doc : 2-D array-like
        The target document as a single row, shape (1, n_terms).
    k : int, default 3
        Number of neighbours to return. Values <= 0 return an empty array;
        a value larger than the number of rows returns every row.

    Returns
    -------
    numpy.ndarray
        Row positions ordered by increasing similarity (closest one last).
    """
    if len(matrix) == 0:
        return np.argsort([])
    # One call scores `doc` against every row; the result has shape (1, n_rows).
    sims = metrics.cosine_similarity(doc, matrix)[0]
    ranked = np.argsort(sims)
    # ranked[-0:] would be the whole array, so k <= 0 needs its own branch.
    if k <= 0:
        return ranked[:0]
    return ranked[-k:]

In [19]:
# Majority vote over the labels of the nearest neighbours.

def categorize(labels, doc, sims):
    """Return the most frequent label among the neighbours listed in `sims`.

    Parameters
    ----------
    labels : sequence
        Category label of every training document, addressable by position.
    doc : object
        Not used by the vote; kept so existing calls keep working.
    sims : iterable of int
        Positions into `labels` of the neighbours that vote.

    Returns
    -------
    The label with the most votes. Any set of labels is supported, not only
    'Cat1' and 'Cat2'. A tie goes to the label that sorts last, which keeps
    'Cat2' as the outcome of a Cat1/Cat2 tie. An empty string is returned
    when there are no neighbours, rather than inventing a category.
    """
    votes = {}
    for index in sims:
        label = labels[index]
        votes[label] = votes.get(label, 0) + 1

    if not votes:
        return ''
    # Rank by vote count first, then by label to make ties deterministic.
    return max(votes, key=lambda label: (votes[label], label))

In [20]:
# Classify DOC8: find its nearest training documents (knn's default k), then
# take the vote over their labels.
sims = knn(matrix, DOC8)
result = categorize(labels, DOC8, sims)
print('Predicted category for DOC8:', result)

Predicted category for DOC8: Cat2


In [21]:
# Classify DOC9 the same way; note that `sims` and `result` are overwritten.
sims = knn(matrix, DOC9)
result = categorize(labels, DOC9, sims)
print('Predicted category for DOC9:', result)

Predicted category for DOC9: Cat1


## <span style = 'color: blue'>4.b. Using the Rocchio-Based approach</span>

In [22]:
# Build one prototype vector per category (the training step of the
# Rocchio-based classifier).

def train(matrix, labels):
    """Sum the document vectors of each category into a prototype vector.

    Parameters
    ----------
    matrix : 2-D array-like
        One row per training document, one column per term.
    labels : 1-D array-like
        Category label of each row of `matrix`; missing labels are ignored.

    Returns
    -------
    dict
        Maps each distinct label (in sorted order) to a float array of length
        n_terms holding the element-wise sum of that category's rows.
    """
    matrix = np.asarray(matrix)
    # The element-wise `labels == category` test needs an ndarray: with a plain
    # list the comparison collapses to a single False and every prototype
    # would silently stay at zero.
    labels = np.asarray(labels)
    categories = sorted(pd.Series(labels).dropna().unique().tolist())
    result = {}

    for category in categories:
        members = matrix[labels == category]
        # Cast to float so prototypes have the same dtype for any input dtype.
        result[category] = members.sum(axis=0).astype(float)

    return result

In [23]:
# Assign a document to the category whose prototype vector it is closest to.

def classify(prototype_vectors, doc):
    """Pick the category whose prototype is most cosine-similar to `doc`.

    prototype_vectors -- dict mapping a category to its 1-D prototype array.
    doc               -- the target document as a 2-D single-row array.

    Returns a (category, similarity) pair. When no prototype scores above
    zero, the pair is ('', 0).
    """
    best_category, best_sim = '', 0

    for category, prototype in prototype_vectors.items():
        # cosine_similarity works on 2-D inputs, hence the reshape to one row.
        score = metrics.cosine_similarity(doc, prototype.reshape(1, -1))[0][0]
        if score > best_sim:
            best_category, best_sim = category, score

    return best_category, best_sim
        

In [24]:
# Get the prototype vector for each category as a dictionary

prototype_vectors = train(matrix, labels)
prototype_vectors

{'Cat1': array([ 2.,  2., 10.,  3.,  2.,  8.,  0.,  2.]),
 'Cat2': array([ 7.,  3.,  4., 11.,  0.,  4.,  2.,  6.])}

In [25]:
# Classify DOC8 against the category prototypes and report the best match.
category, similarity = classify(prototype_vectors, DOC8)

print('Predicted category: {} ; Similarity: {}'.format(category, similarity))

Predicted category: Cat2 ; Similarity: 0.8703275932210309


In [26]:
# Classify DOC9 against the category prototypes and report the best match.
category, similarity = classify(prototype_vectors, DOC9)

print('Predicted category: {} ; Similarity: {}'.format(category, similarity))

Predicted category: Cat1 ; Similarity: 0.8971179931924131
