In [None]:
%pip install pandas numpy scikit-learn PyMuPDF gdown

In [1]:
import numpy as np
import pandas as pd
import os
import fitz
from sklearn.feature_extraction.text import TfidfVectorizer

### Extract text from all PDFs

In [222]:
def extract_text(fname):
    with fitz.open(fname) as doc:  # open document
        text = chr(12).join([page.get_text() for page in doc])
    
    return text

In [223]:
def extract_text_from_pdfs_recursively(dir):
    raw_docs = {}
    i = 1
    for root, dirs, files in os.walk(dir):
        for file in files:
            raw_docs[f"Doc{i}"] = {}
            path_to_pdf = os.path.join(root, file)
            [stem, ext] = os.path.splitext(path_to_pdf)
            if ext == '.pdf':
                # print("Processing " + path_to_pdf)
                raw_docs[f"Doc{i}"]['category'] = stem.split("/")[1]
                raw_docs[f"Doc{i}"]['path']= path_to_pdf
                raw_docs[f"Doc{i}"]['content'] = extract_text(path_to_pdf)
                i += 1
    return raw_docs

In [224]:
raw_docs = extract_text_from_pdfs_recursively("research_papers")

In [225]:
len(raw_docs)

50

### Create collection of all docs and append to list

In [226]:
collection = [doc['content'] for doc in list(raw_docs.values())]

### Fit TfidfVectorizer on Collection

#### Choice of TfidfVectorizer arguments
- **ngram_range = (1,2)** --> to include both unigram and bigram
- **max_df = 0.8** --> Ignore terms that are in more than 80% of the docs i.e. most frequent terms such as stop words
- **sublinear_td** = True --> Replace TF with 1 + Log(Tf)
- **strip_accents** = 'unicode' --> character normalization

Default Values:
- lowercase = True
- norm = 'l2'
- use_idf = True

In [227]:
vectorizer = TfidfVectorizer(ngram_range= (1,2), max_df=0.8, sublinear_tf=True, strip_accents='unicode')
tfidf = vectorizer.fit_transform(collection)

In [228]:
tfidf_table = pd.DataFrame(tfidf.T.toarray(), columns=raw_docs.keys(), index=vectorizer.get_feature_names_out())

Tfidf Table of (Vocabulary, Docs) size

In [229]:
tfidf_table

Unnamed: 0,Doc1,Doc2,Doc3,Doc4,Doc5,Doc6,Doc7,Doc8,Doc9,Doc10,...,Doc41,Doc42,Doc43,Doc44,Doc45,Doc46,Doc47,Doc48,Doc49,Doc50
00,0.005916,0.0,0.012193,0.004116,0.020069,0.0,0.0,0.0,0.0,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.006702,0.0
00 00,0.000000,0.0,0.000000,0.000000,0.035909,0.0,0.0,0.0,0.0,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0
00 000,0.000000,0.0,0.000000,0.000000,0.037150,0.0,0.0,0.0,0.0,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0
00 01,0.000000,0.0,0.000000,0.000000,0.000000,0.0,0.0,0.0,0.0,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0
00 018,0.000000,0.0,0.000000,0.000000,0.008892,0.0,0.0,0.0,0.0,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
ωωl,0.000000,0.0,0.000000,0.000000,0.000000,0.0,0.0,0.0,0.0,0.018579,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0
ωωl 2dω,0.000000,0.0,0.000000,0.000000,0.000000,0.0,0.0,0.0,0.0,0.010973,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0
ωωl ηω2,0.000000,0.0,0.000000,0.000000,0.000000,0.0,0.0,0.0,0.0,0.010973,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0
ωωll,0.000000,0.0,0.000000,0.000000,0.000000,0.0,0.0,0.0,0.0,0.010973,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0


### Testing

Replace path with file of your choice

In [238]:
doc = extract_text("test_doc.pdf")

Transforming test file

In [231]:
test_transformed = vectorizer.transform([doc]).T.toarray().flatten()

Computing dot product with each doc tfidf value

In [232]:
for docs in tfidf_table.columns:
    raw_docs[docs]['test_similarity'] = np.dot(test_transformed, tfidf_table[docs])

Dataframe with ranked values of test similarity

In [233]:
raw_docs_info = pd.DataFrame.from_dict(raw_docs, orient='index')[['category','path', 'test_similarity']]
raw_docs_info['rank'] = raw_docs_info['test_similarity'].rank(ascending=False)
raw_docs_info.sort_values("rank", inplace=True)

### Displaying top 10 most similar documents

In [234]:
raw_docs_info[:10]

Unnamed: 0,category,path,test_similarity,rank
Doc24,nlp,research_papers/nlp/2312.04649.pdf,0.222597,1.0
Doc27,nlp,research_papers/nlp/2312.05589.pdf,0.130769,2.0
Doc30,nlp,research_papers/nlp/2311.16588.pdf,0.128944,3.0
Doc26,nlp,research_papers/nlp/2312.03736.pdf,0.122977,4.0
Doc25,nlp,research_papers/nlp/2312.15020.pdf,0.106022,5.0
Doc29,nlp,research_papers/nlp/2312.10432.pdf,0.103217,6.0
Doc22,nlp,research_papers/nlp/2311.16965.pdf,0.092861,7.0
Doc28,nlp,research_papers/nlp/2311.17354.pdf,0.09216,8.0
Doc21,nlp,research_papers/nlp/2312.04944.pdf,0.089988,9.0
Doc23,nlp,research_papers/nlp/2312.01221.pdf,0.084308,10.0


### Based on highest cosine similarity the category of the Document is:

In [235]:
raw_docs_info['category'].iloc[0]

'nlp'