# Content-based recommendations using the TF-IDF algorithm

In [1]:
import pandas as pd
import numpy as np
import random
import math
from nltk.tokenize import word_tokenize
import multiprocessing as mp
from sklearn.feature_extraction.text import TfidfVectorizer

In [2]:
# Read the first 800 rows of Articles.csv and print a structural summary.
df_articles = pd.read_csv(filepath_or_buffer="./Articles.csv", nrows=800)
df_articles.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 800 entries, 0 to 799
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   text    800 non-null    object
 1   url     800 non-null    object
dtypes: object(2)
memory usage: 12.6+ KB


In [3]:
# Summary of the two object columns: count, unique values, most frequent value and its frequency.
df_articles.describe()

Unnamed: 0,text,url
count,800,800
unique,800,800
top,Nycticebus linglom is a fossil strepsirrhine p...,https://en.wikipedia.org/wiki/34th%20Street%E2...
freq,1,1


In [4]:
# Preview the first five rows (article text and source URL).
df_articles.head()

Unnamed: 0,text,url
0,Nycticebus linglom is a fossil strepsirrhine p...,https://en.wikipedia.org/wiki/%3F%20Nycticebus...
1,Oryzomys pliocaenicus is a fossil rodent from ...,https://en.wikipedia.org/wiki/%3F%20Oryzomys%2...
2,.hack dt hk is a series of single player actio...,https://en.wikipedia.org/wiki/.hack%20%28video...
3,The You Drive Me Crazy Tour was the second con...,https://en.wikipedia.org/wiki/%28You%20Drive%2...
4,0 8 4 is the second episode of the first seaso...,https://en.wikipedia.org/wiki/0-8-4


## Calculate the TF-IDF Score

### Own implementation

The `getTags` function returns a list of tags for each document. A tag is a word whose tf-idf score is higher than `minScore`.

In [5]:
def getTags(documents, minScore, processes=4):
    """Compute the tf-idf tags of every document in parallel.

    A tag is a ``(word, score)`` tuple whose tf-idf score is strictly greater
    than ``minScore``.

    Args:
        documents: List of documents, each document being a list of words.
        minScore: Threshold a word's tf-idf score must exceed to become a tag.
        processes: Number of worker processes in the pool. Defaults to 4,
            the value that was previously hard-coded.

    Returns:
        A list with one entry per document, in input order; each entry is the
        list of ``(word, score)`` tags produced by ``threadWorker``.
    """
    # Using the manager as a context manager shuts its server process down
    # when we are done instead of leaving it running in the background.
    with mp.Manager() as manager:
        idfDict = manager.dict()  # shared cache that maps words to their idf score of the collection
        # Reference bar: one "|" per document, so the progress marks printed
        # by the workers can be compared against the total amount of work.
        print("|" * len(documents))
        print("-" * 20)
        # The pool context manager guarantees the workers are cleaned up even
        # if submitting or waiting for the tasks raises.
        with mp.Pool(processes) as pool:
            tasks = [
                pool.apply_async(threadWorker, [doc, idfDict, documents, minScore])
                for doc in documents
            ]
            pool.close()
            pool.join()
            # .get() re-raises any exception that occurred inside a worker.
            return [task.get() for task in tasks]

def threadWorker(doc, idfDict, documents, minScore):
    """Collect the tags of a single document.

    Args:
        doc: The document to tag, as a list of words.
        idfDict: Dict-like cache mapping a word to its idf score; it is read
            and filled in by this function.
        documents: The whole collection (list of word lists), needed to
            compute idf scores that are not cached yet.
        minScore: A word becomes a tag only if its tf-idf score is strictly
            greater than this value.

    Returns:
        List of ``(word, tfidfValue)`` tuples. The order is not defined
        because the words are iterated as a set.
    """
    docTags = []
    for word in set(doc):
        # Use None as the "not cached" marker. A cached idf of 0.0 is falsy,
        # so a truthiness check would treat it as a miss and recompute the
        # (expensive) idf score on every lookup.
        idfValue = idfDict.get(word)
        if idfValue is None:
            idfValue = calculateIDF(documents, word)
            idfDict[word] = idfValue
        tfValue = calculateTF(doc, word)
        tfidfValue = calculateTFIDF(tfValue, idfValue)
        if tfidfValue > minScore:
            docTags.append((word, tfidfValue))
    print("|", end="")  # progress mark: one per finished document
    return docTags

The following function calculates the tf (term frequency) score of a word in a document. To do so, we count the number of times the word occurs in the document and divide it by the total number of words in the document.
Input:
- doc: list of the words of a document
- word: the word we would like to calculate the score for

In [6]:
def calculateTF(doc, word):
    """Return the term frequency of ``word`` in ``doc``.

    The term frequency is the number of occurrences of ``word`` divided by
    the total number of words in the document.

    Args:
        doc: List of the words of a document.
        word: The word whose term frequency is wanted.

    Returns:
        The term frequency as a float in [0, 1]; 0.0 for an empty document.
    """
    totalWords = len(doc)
    if totalWords == 0:
        # An empty document contains no occurrence of any word; returning 0.0
        # avoids a ZeroDivisionError.
        return 0.0
    wordCount = sum(1 for w in doc if w == word)
    return wordCount / totalWords

The calculateIDF function calculates the IDF score of a word in the document collection.
Input:
- documents: List of lists. List of documents where each document is represented as a list of its words
- word: the word for which the idf score should be calculated

The idf score is calculated with the following formula:
$\mathrm{idf}(t, D) = \log \frac{N}{|\{d \in D : t \in d\}|}$

with:
- $N$: count of documents
- $D$: collection of documents
- $t$: the word for which the score should be calculated

=> $\{d \in D : t \in d\}$: The documents that contain the given word $t$


In [7]:
def calculateIDF(documents, word):
    """Return the inverse document frequency of ``word`` in ``documents``.

    idf = log(N / df), where N is the number of documents and df is the
    number of documents that contain ``word`` at least once.

    Args:
        documents: List of documents, each represented as a list of words.
        word: The word whose idf score is wanted.

    Returns:
        The idf score as a float. If no document contains ``word`` (which
        includes an empty collection) the ratio is undefined and 0.0 is
        returned, so the resulting tf-idf score is 0.
    """
    n = len(documents)
    # Count each document at most once. Counting every occurrence of the word
    # would inflate the denominator and could even make the score negative.
    docsThatContainCount = sum(1 for d in documents if word in d)
    if docsThatContainCount == 0:
        return 0.0
    return math.log(n / docsThatContainCount)

In [8]:
def calculateTFIDF(tfValue, idfValue):
    """Combine a term frequency and an inverse document frequency.

    Args:
        tfValue: Term frequency of a word in a document.
        idfValue: Inverse document frequency of the same word.

    Returns:
        The product of both values, i.e. the tf-idf score.
    """
    score = tfValue * idfValue
    return score

In [9]:
# Tokenise each article and keep only purely alphabetic tokens, then tag
# every document with the words whose tf-idf score exceeds 0.005.
docsList = [
    [token for token in word_tokenize(articleText) if token.isalpha()]
    for articleText in list(df_articles.text)
]
result = getTags(docsList, 0.005)
len(result)

||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||
--------------------
||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||

800

### Scikit-learn implementation

In [10]:
# Fit scikit-learn's TfidfVectorizer on the raw article texts.
tfidf_vectorizer = TfidfVectorizer()
tfidf_vector = tfidf_vectorizer.fit_transform(df_articles.text)

# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# prefer get_feature_names_out() and fall back only on older releases.
if hasattr(tfidf_vectorizer, "get_feature_names_out"):
    tfidf_feature_names = tfidf_vectorizer.get_feature_names_out()
else:
    tfidf_feature_names = tfidf_vectorizer.get_feature_names()

# Densify the sparse matrix only once (one row per article, one column per term).
tfidf_df = pd.DataFrame(tfidf_vector.toarray(), columns=tfidf_feature_names)
# Sanity check replacing the former discarded `.shape` expression.
assert tfidf_df.shape == tfidf_vector.shape
tfidf_df.head()

Unnamed: 0,00,000,0000,0000utc,000100,0002,00020,00027,0003,000300,...,zurab,zurich,zvenigorodka,zviad,zvonaric,zvornik,zvulun,zwanetz,zwettl,zywicki
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.008507,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.001813,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.006535,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
