# Assignment 02

In [215]:
import json
import os
import nltk
import string
import re
import pandas as pd
from IPython.display import display
import numpy as np
import math

### Importing crawled documents

In [None]:
directory = 'sitespider/sites'
files = [x[2] for x in os.walk(directory)][0]
pages = []

for file in files:
    with open("%s/%s" % (directory, file)) as json_data:
        pages += [json.load(json_data)]

## c) Calculate the PageRanks of the downloaded pages

### Constants

In [216]:
n = len(pages)
t = 0.05
d =  1 - t
δ = 0.04 

### Functions

**PageRank**

In [221]:
def calculate_page_rank(page_i):
    sum_result = 0  
    for page_j in data:
        if page_i['id'] in page_j['back_links']:
            sum_result += page_j['old_rank'] / len(page_j['back_links'])
        elif len(page_j['back_links']) == 0:
            sum_result += page_j['old_rank'] / n
    return d * sum_result + t / n

**Helper Functions**

In [222]:
def initialize():
    for page in pages:
        page['rank'] = 1/n

def termination_condition():
    dif_sum = 0
    for page in pages:
        if not 'old_rank' in page:
            return False
        else:
            dif_sum += abs(page['rank'] - page['old_rank'])
    return  dif_sum <= δ

def set_old_rank():
    for page in pages:
        page['old_rank'] = page['rank']

### Calculating PageRanks for all Documents

In [223]:
initialize()
while not termination_condition():
    set_old_rank()
    for page in pages:   
        page['rank'] = calculate_page_rank(page)

### Saving PageRanks

In [224]:
with open('rank.txt', 'w') as f:
    for page in pages:
        f.write("%s: %s\n" % (instance['url'], instance['rank']))

### Checking the sum of all PageRanks

In [225]:
rank_sum = 0
for page in pages:
    rank_sum += page['rank']
print(rank_sum)

0.9999999999999997


## d) Build a tf-Index for the words contained in the documents

In [237]:
tf_dict = {}
term_set = set()
stopwords = []
exclude = set(string.punctuation)
porter = nltk.PorterStemmer()

with open('stop_words.txt') as line:
    stopwords += re.sub('[^a-zA-Z0-9,]', '', line.read()).split(',')

### Calculation of unique terms in documents

In [238]:
for page in pages:
    for term in nltk.word_tokenize(page['text']):
        if term not in exclude and term not in stopwords:
            term_set.add(porter.stem(term).lower())

### Calculation of Term Frequency

In [239]:
for instance in pages:
    tf_dict[instance['id']] = {}
    for term in term_set:
        tf_dict[instance['id']][term] = 0
    for term in nltk.word_tokenize(instance['text']):
        if term not in exclude and term not in stopwords:
            tf_dict[instance['id']][porter.stem(term).lower()] += 1

### Functions

**Weighted Term Frequency**

In [188]:
def get_weighted_tf(doc_id, term):
    if term in tf_dict[doc_id]:
        if tf_dict[doc_id][term] == 0:
            return 0
        else:
            return 1 + math.log10(tf_dict[doc_id][term])
    else:
        return 0

### Saving Term Frequency (TF-Index)

In [240]:
tf_df = pd.DataFrame(tf_dict)
tf_df.to_csv('index.txt', header=True, index=True, sep=';')

## e) Implement a function search to search for documents containing given words

In [241]:
df_dict = {}

### Calculation of Document Frequency

In [242]:
for page in pages:
    for term in nltk.word_tokenize(instance['text']):
        if term not in exclude and term not in stopwords:
            if porter.stem(term).lower() in df_dict and page['id'] not in df_dict[porter.stem(term).lower()]['documents']:
                df_dict[porter.stem(term).lower()]['count'] += 1
                df_dict[porter.stem(term).lower()]['documents'] += [page['id']]
            elif porter.stem(term).lower() not in df_dict:
                df_dict[porter.stem(term).lower()] = {'count': 1,
                                                      'documents': [instance['id']]
                                                     }          

### Functions

**Weighted Inverse Document Frequency**

In [234]:
def get_weighted_idf(term):
    if term in df_dict:
        if df_dict[term]['count'] == 0:
            return 0
        else:
            return math.log10( n / df_dict[term]['count'])
    else:
        return 0

**Weighted TF-IDF**

In [None]:
def get_weighted_tf_idf(doc_id, term):
    return get_weighted_tf(doc_id, term) * get_weighted_idf(term)

**Search**

In [183]:
def search(terms, page_rank=False):
    result = {}
    for page in pages:
        result[instance['id']] = 0
        for term in terms:
            result[page['id']] += get_weighted_tf_idf(page['id'], porter.stem(term).lower())
            if page_rank:
                result[page['id']] *= page['rank']
    return result

### Searching with TF-IDF 

In [231]:
print(search(['token']))
print(search(['index']))
print(search(['classification']))
print(search(['classification', 'token']))

{'d01': 0.26556622014976977, 'd02': 0.30150996489407533, 'd03': 0.26556622014976977, 'd04': 0.30150996489407533, 'd05': 0.0, 'd06': 0.0, 'd07': 0.0, 'd08': 0.32701245764361475}
{'d01': 0.0, 'd02': 0.0, 'd03': 0.0, 'd04': 0.6824274635301139, 'd05': 0.5541980979011975, 'd06': 0.0, 'd07': 0.0, 'd08': 0.6824274635301139}
{'d01': 0.0, 'd02': 0.0, 'd03': 0.0, 'd04': 0.0, 'd05': 0.0, 'd06': 0.4259687322722811, 'd07': 0.4259687322722811, 'd08': 0.6824274635301139}
{'d01': 0.26556622014976977, 'd02': 0.30150996489407533, 'd03': 0.26556622014976977, 'd04': 0.30150996489407533, 'd05': 0.0, 'd06': 0.4259687322722811, 'd07': 0.4259687322722811, 'd08': 1.0094399211737286}


## f) Extend your search function and include PageRank to score the documents

### Searching with TF-IDF and PageRank

In [232]:
print(search(['token'], page_rank=True))
print(search(['index'], page_rank=True))
print(search(['classification'], page_rank=True))
print(search(['classification', 'token'], page_rank=True))

{'d01': 0.031140856774395202, 'd02': 0.03782341969469127, 'd03': 0.03331439677287726, 'd04': 0.0674361001749996, 'd05': 0.0, 'd06': 0.0, 'd07': 0.0, 'd08': 0.0023838038549791668}
{'d01': 0.0, 'd02': 0.0, 'd03': 0.0, 'd04': 0.15263258980165115, 'd05': 0.06498644135406392, 'd06': 0.0, 'd07': 0.0, 'd08': 0.004974652128022696}
{'d01': 0.0, 'd02': 0.0, 'd03': 0.0, 'd04': 0.0, 'd05': 0.0, 'd06': 0.0604090817955079, 'd07': 0.0604090817955079, 'd08': 0.004974652128022696}
{'d01': 0.031140856774395202, 'd02': 0.03782341969469127, 'd03': 0.03331439677287726, 'd04': 0.0674361001749996, 'd05': 0.0, 'd06': 0.008566960170784888, 'd07': 0.008566960170784888, 'd08': 0.0024200672897284462}
