In [1]:
from collections import Counter
import math

import matplotlib.pyplot as plt
from nltk.corpus import stopwords
import numpy as np
import pandas as pd

%matplotlib inline
# NOTE: this rebinds the name `stopwords` from the object imported from
# `nltk.corpus` to a plain set of its English words; later cells use the set
# for `in` membership tests.
stopwords = set(stopwords.words('english'))

# Cosine similarity

First, we read in the data and process the abstracts by dropping rows whose abstract is `NaN`, converting to lowercase, replacing punctuation with spaces, splitting into words, and removing stopwords.

In [2]:
# Read the CSV, using its "Unnamed: 0" column as the index.
df = pd.read_csv("../data/full_arxiv_cs_clean.csv", index_col="Unnamed: 0", low_memory=False)
# Keep only the rows that have an abstract, and only the columns we need.
selected = df[~df.abstract.isnull()][['abstract', 'id']]
clean_abstracts = selected.abstract.str.lower()
# `regex=True` is required here: both patterns are regular expressions, and
# pandas >= 2.0 treats the pattern as a literal string by default, which would
# silently leave all punctuation and repeated spaces in place.
clean_abstracts = clean_abstracts.str.replace(r"[.()$,:;\"%{}\-\\/<>+=_~'^]", " ", regex=True)
clean_abstracts = clean_abstracts.str.replace(r" {2,}", " ", regex=True)
# Split on whitespace and drop stopwords; each abstract becomes a list of words.
clean_abstracts = clean_abstracts.apply(lambda x: [i for i in x.strip().split()
                                                   if i not in stopwords])

Next, we can generate word counts for each document with a Python `Counter` like so.

In [3]:
# One Counter (word -> number of occurrences) per tokenized abstract.
ca_counts = clean_abstracts.apply(Counter)

To get the full list of words in all documents, we can go through each `Counter` and add the keys to a `set` of words.

In [4]:
# Accumulator for the vocabulary; it is filled in place further down in this cell.
unique_words = set()

def add_words(s, doc):
    """Adds every key of `doc` to the set `s`, in place.

    Parameters
    ----------
    s -- Set that collects the words (mutated in place)
    doc -- Mapping whose keys are the words to add (e.g. a Counter)

    Returns
    -------
    None
    """
    # `set.update` adds all keys directly, without building a throwaway
    # list of `None` values the way a side-effect comprehension does.
    s.update(doc.keys())
    
# A plain loop instead of `Series.apply`: only the side effect on
# `unique_words` is wanted, not a corpus-length Series of `None` results.
for doc_counts in ca_counts:
    add_words(unique_words, doc_counts)

We then convert the `set` to a `list` to finalize the (arbitrary) order.

In [5]:
# Sort instead of calling `list(...)`: the iteration order of a set of strings
# changes between interpreter runs (string hashing is randomized per process),
# so a bare conversion would give a different word order after every kernel
# restart. Sorting still yields a list, but a reproducible one.
unique_words = sorted(unique_words)

In [142]:
def dot(a, b):
    """Computes the dot product of two sparse vectors.

    Parameters
    ----------
    a, b -- Sparse vectors represented as Python Counters

    Returns
    -------
    Sum, over the keys present in both `a` and `b`, of the product of the
    two values stored under that key (0 when no key is shared)
    """
    shared_keys = a.keys() & b.keys()
    return sum(a[key] * b[key] for key in shared_keys)

def norm(a):
    """Finds the L2 norm of a sparse vector.

    Parameters
    ----------
    a -- Sparse vector represented as a Python Counter (or any mapping
         with numeric values)

    Returns
    -------
    L2 norm of the values of `a`; 0.0 when `a` is empty
    """
    # dtype=float rather than int: an integer buffer silently truncates
    # non-integer weights (0.5 -> 0). Integer counts give the same result
    # either way, because the norm is computed in floating point.
    values = np.fromiter(a.values(), dtype=float)
    return np.linalg.norm(values, 2)

def cosine_similarity(doc1, doc2):
    """Finds the cosine similarity between two documents.

    Parameters
    ----------
    doc1, doc2 -- Sparse vectors of word counts represented as Python Counters

    Returns
    -------
    Cosine similarity between `doc1` and `doc2`: their dot product divided by
    the product of their norms. Returns 0.0 when either document has zero
    norm (e.g. it is empty), since the similarity is undefined in that case.
    """
    denominator = norm(doc1) * norm(doc2)
    if denominator == 0:
        # Avoid 0/0, which would otherwise yield NaN (and a RuntimeWarning)
        # and propagate into anything that ranks by similarity.
        return 0.0
    return dot(doc1, doc2) / denominator

def cosine_distance(doc1, doc2):
    """Computes the cosine distance between two documents.

    Parameters
    ----------
    doc1, doc2 -- Sparse vectors of word counts represented as Python Counters

    Returns
    -------
    One minus the cosine similarity of `doc1` and `doc2`
    """
    similarity = cosine_similarity(doc1, doc2)
    return 1 - similarity

In [178]:
def most_similar(doc, n=100):
    """Returns the n documents in `ca_counts` most similar to `doc`.

    Parameters
    ----------
    doc -- Document of interest, as a Counter of word counts
    n -- Number of results to return (optional)

    Returns
    -------
    similarities -- Series of at most n cosine similarities, sorted from most
    to least similar and indexed like `ca_counts`. If `doc` is itself an entry
    of `ca_counts`, that entry is left out.
    """
    similarities = ca_counts.apply(lambda x: cosine_similarity(doc, x))
    # Exclude the query by object identity rather than by assuming it sorts to
    # position 0: with duplicate abstracts (ties at the top) the first row may
    # be a different document, and a query that is not in the corpus has no
    # self-match to drop at all.
    is_query = np.fromiter((x is doc for x in ca_counts), dtype=bool,
                           count=len(ca_counts))
    ranked = similarities[~is_query].sort_values(ascending=False)
    # `.iloc` makes the positional slice explicit on an integer-labelled index.
    return ranked.iloc[:n]

In [49]:
## Running this on the full data set would take ~160 GB, so the cell is left commented out

# similarities = dict()

# def get_similarities(row):
#     id_ = row['index']
#     if id_ % 100 == 0:
#         print(id_)
#     similarities.update({id_: dict()})
#     ca_counts[id_:1000].reset_index().apply(
#         lambda doc: similarities[id_].update(
#             {doc['index']: cosine_similarity(row.abstract, doc.abstract)}
#         ), axis=1)
        
# ca_counts[:1000].reset_index().apply(get_similarities, axis=1);