# Text 1: Vector space models
**Internet Analytics - Lab 4**

---

**Group:** *O*

**Names:**

* *Argelaguet Franquelo, Pau*
* *du Bois de Dunilac, Vivien*

---

#### Instructions

*This is a template for part 1 of the lab. Clearly write your answers, comments and interpretations in Markdown cells. Don't forget that you can add $\LaTeX$ equations in these cells. Feel free to add or remove any cell.*

*Please properly comment your code. Code readability will be considered for grading. To avoid long cells of code in the notebook, you can also embed long Python functions and classes in a separate module. Don’t forget to hand in your module if that is the case. In multiple exercises, you are required to come up with your own method to solve various problems. Be creative and clearly motivate and explain your methods. Creativity and clarity will be considered for grading.*

In [1]:
import pickle
import numpy as np
import string
import collections
import operator
import math

from functools import reduce
from scipy.sparse import csr_matrix
from utils import load_json, load_pkl

from nltk.stem import SnowballStemmer

In [2]:
# Load the raw course records and the stopword collection from the data directory.
# NOTE(review): load_json / load_pkl come from the project-local `utils` module.
# Presumably `courses` is a list of dicts with 'courseId', 'name' and
# 'description' keys (that is how the pre-processing cell reads it) and
# `stopwords` supports `in` membership tests — confirm against utils.
courses = load_json('data/courses.txt')
stopwords = load_pkl('data/stopwords.pkl')

## Exercise 4.1: Pre-processing

In [3]:
# English Snowball stemmer, shared by clean_word() to reduce tokens to their stem.
stemmer = SnowballStemmer("english")

In [4]:
def is_number(s):
    """Return True when the token ``s`` spells a finite number.

    The token is parsed with ``float``. Spellings that ``float`` accepts but
    that are ordinary words rather than numeric literals in running text
    ("nan", "inf", "infinity", in any letter case and with an optional sign)
    are reported as non-numbers, so such words are not discarded by the
    number filter.

    Args:
        s: The token to test (a string).

    Returns:
        True if ``s`` parses to a finite float, False otherwise.
    """
    try:
        value = float(s)
    except ValueError:
        return False
    # float() also parses "nan" / "inf" / "infinity"; those are not numbers
    # for the purpose of text filtering.
    return math.isfinite(value)

def filter_word(word):
    """Return True when a cleaned token should be kept in the bag of words.

    A token is rejected when it is empty, is listed in the module-level
    ``stopwords`` collection, consists only of punctuation characters, or is
    a number according to ``is_number``.

    Args:
        word: The token to test (a string, normally the output of
            ``clean_word``).

    Returns:
        False if the token must be discarded, True otherwise.
    """
    # Empty tokens appear when clean_word() strips a punctuation-only word.
    # They were previously rejected only because "" is a substring of every
    # string; make that explicit.
    if not word:
        return False
    # Removing words in stopwords
    if word in stopwords:
        return False
    # Removing words made only of punctuation signs. A per-character test is
    # used because `word in string.punctuation` is a substring test and would
    # let tokens such as "!!" through.
    if all(char in string.punctuation for char in word):
        return False
    # Removing numbers
    if is_number(word):
        return False
    return True


def clean_word(word):
    """Normalise a raw token: strip punctuation, lowercase it, then stem it.

    Args:
        word: A whitespace-delimited token taken from a course description.

    Returns:
        The stem produced by the module-level ``stemmer`` for the lowercased,
        punctuation-free token. This is the empty string when the token held
        nothing but punctuation.
    """
    # Delete every character listed in string.punctuation in a single pass.
    without_punctuation = word.translate(
        str.maketrans("", "", string.punctuation)
    )
    # Lowercase first, then reduce to the stem.
    return stemmer.stem(without_punctuation.lower())
    

def get_bag_of_words(text):
    """Build the bag of words of a course description.

    The text is split on whitespace, every token is normalised with
    ``clean_word`` and tokens rejected by ``filter_word`` are dropped. Words
    that occur only once in the text are discarded as well.

    Args:
        text: The raw description. ``None`` or an empty string is treated as
            a description without any word.

    Returns:
        A plain ``dict`` mapping each kept word to its number of occurrences
        (always greater than 1), with keys inserted in sorted order.
    """
    # Callers pass `x.get('description')`, which is None when the key is
    # missing; there is nothing to count in that case.
    if not text:
        return {}

    cleaned = map(clean_word, text.split())
    counts = collections.Counter(w for w in cleaned if filter_word(w))

    # Removing less frequent words (seen only once) and sorting by word
    return {word: count for word, count in sorted(counts.items()) if count > 1}

In [5]:
# Working corpus: course id -> {'name': course name, 'description': bag of words}.
# When a course id appears more than once, the last record wins.
dat = {}
for course in courses:
    dat[course.get('courseId')] = {
        'name': course.get('name'),
        'description': get_bag_of_words(course.get('description')),
    }

In [6]:
# Bags of words of every course, in corpus order.
list_terms = [course.get('description') for course in dat.values()]

# Sum the occurrences of each term over the whole corpus.
terms = collections.defaultdict(int)
for bag in list_terms:
    for term, count in bag.items():
        terms[term] += count

# Re-order the totals from the most to the least frequent term
# (the sort is stable, so ties keep their first-seen order).
terms = dict(sorted(terms.items(), key=lambda item: item[1], reverse=True))

# Terms seen more than 400 times in total are treated as too common.
top_terms = [term for term, total in terms.items() if total > 400]
# Terms seen fewer than 3 times in total; not used by the cells shown here.
low_terms = [term for term, total in terms.items() if total < 3]

In [7]:
# Remove the corpus-wide most frequent terms from every course description.
# Membership is tested against a set built once, so each lookup is constant
# time instead of a scan through the top_terms list.
frequent = set(top_terms)
for course in dat.values():
    course['description'] = {
        term: count
        for term, count in course.get('description').items()
        if term not in frequent
    }

# Ensuring all documents have description: courses whose bag of words is
# empty are dropped from the corpus.
dat = {
    course_id: course
    for course_id, course in dat.items()
    if course.get('description')
}

In [8]:
# Display the pre-processed bag of words of one course as a sanity check.
dat['COM-308']['description']

{'ad': 2,
 'algebra': 2,
 'algorithm': 2,
 'analyt': 2,
 'applic': 2,
 'auction': 2,
 'base': 2,
 'class': 3,
 'cluster': 2,
 'communiti': 2,
 'comput': 2,
 'data': 6,
 'dataset': 2,
 'detect': 2,
 'ecommerc': 2,
 'explor': 5,
 'graph': 2,
 'hadoop': 2,
 'homework': 2,
 'inform': 2,
 'internet': 2,
 'lab': 3,
 'largescal': 3,
 'linear': 2,
 'machin': 2,
 'mine': 3,
 'network': 4,
 'number': 2,
 'onlin': 5,
 'practic': 2,
 'problem': 2,
 'realworld': 4,
 'recommend': 3,
 'relat': 2,
 'servic': 3,
 'session': 2,
 'social': 5,
 'stream': 2}

In [9]:
# Persist the pre-processed corpus so that it can be reloaded without
# repeating the pre-processing.
with open("data/preprocess.pckl", "wb") as out_file:
    pickle.dump(dat, out_file)

## Exercise 4.2: Term-document matrix

In [10]:
# Terms present in each course, one list per course.
list_terms = [list(course.get('description')) for course in dat.values()]

# Vocabulary: the distinct terms over all courses, as a sorted list.
terms = sorted(set().union(*list_terms))

In [11]:
# Course ids in sorted order; a document's position here is its matrix column.
documents = sorted(dat)

In [12]:
# Dimensions of the term-document matrix: M terms (rows), N documents (columns).
M = len(terms)
N = len(documents)

# Position of every term / document in its sorted list. enumerate() builds
# each mapping in a single pass, whereas calling list.index() per element
# rescans the list every time (quadratic in the vocabulary size). The result
# is the same because both lists hold unique entries (`terms` is built from
# a set, `documents` from dictionary keys).
term_idx = {term: row for row, term in enumerate(terms)}
doc_idx = {course_id: col for col, course_id in enumerate(documents)}

In [13]:
# Coordinate-format triplets of the sparse matrix: entry i is the weight
# values[i] located at (rows[i], columns[i]).
values, rows, columns = [], [], []

In [14]:
# Document frequency: in how many course descriptions each term appears.
occs = collections.defaultdict(float)
for course in dat.values():
    for term in course.get('description'):
        occs[term] += 1

# Inverse document frequency: natural log of (number of documents / document frequency).
idf = {term: math.log(N / doc_freq) for term, doc_freq in occs.items()}

In [18]:
# Start from empty triplet lists every time this cell runs. If the lists were
# only initialised in another cell, re-running this one would append every
# (row, column) pair again, and the sparse-matrix constructor adds duplicate
# coordinates together, silently multiplying the TF-IDF weights.
values = []
rows = []
columns = []

for course_id, course in dat.items():
    bag = course.get('description')
    # Term frequencies are normalised by the count of the most frequent term
    # of the document, so the largest tf in each document is 1.
    max_occur = float(max(bag.values()))
    col = doc_idx[course_id]
    for term, count in bag.items():
        # TF-IDF weight of `term` in this document.
        values.append((count / max_occur) * idf[term])
        rows.append(term_idx[term])
        columns.append(col)

In [19]:
# Sparse M x N term-document matrix built from the (value, (row, column))
# triplets: rows are indexed by term_idx, columns by doc_idx.
mat = csr_matrix((values, (rows, columns)), shape=(M, N))

In [33]:
# Print the stored entries of the column belonging to course COM-308.
print(mat[:,doc_idx['COM-308']])

  (61, 0)	5.634789603169249
  (114, 0)	3.3322045101752034
  (115, 0)	2.6729588812909397
  (140, 0)	3.900188547781143
  (180, 0)	1.8355620918864477
  (242, 0)	6.733401891837359
  (291, 0)	2.883254290127301
  (577, 0)	3.6440051979497845
  (591, 0)	5.123963979403259
  (628, 0)	5.3471075307174685
  (651, 0)	2.1484344131667874
  (806, 0)	6.667627155961528
  (810, 0)	5.3471075307174685
  (869, 0)	3.8430301339411947
  (991, 0)	6.733401891837359
  (1159, 0)	10.421131335939558
  (1385, 0)	4.168452534375823
  (1417, 0)	6.733401891837359
  (1493, 0)	3.3661060618508847
  (1616, 0)	2.6729588812909397
  (1666, 0)	5.634789603169249
  (1774, 0)	4.9983067652628055
  (1800, 0)	6.980940525236285
  (1866, 0)	2.690350624002809
  (1924, 0)	3.43756502583303
  (2071, 0)	8.020661296076202
  (2219, 0)	5.898424515838196
  (2285, 0)	4.0943445622221
  (2320, 0)	11.968729356955116
  (2577, 0)	2.3513752571634776
  (2616, 0)	1.9627172673716942
  (2730, 0)	10.247927958806518
  (2743, 0)	5.271789100453738
  (2777, 0)	2

## Exercise 4.3: Document similarity search