In [1]:
# ! python -m spacy download en_core_web_sm

In [13]:
import pandas as pd
import ast
import numpy as np
import pickle
from copy import deepcopy
import math
from time import time
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction import stop_words
import unidecode
import spacy
import re

In [3]:
# Load spaCy's small English pipeline. The model has to be installed first
# (see the commented-out `spacy download en_core_web_sm` command in the first cell).
nlp = spacy.load('en_core_web_sm')

In [4]:
# data = pd.read_pickle("data_all_sbert_embeddings_retro.pkl")

In [5]:
# Load the papers table from CSV, restricted to a fixed subset of columns.
data = pd.read_csv(
    "papers.csv",
    usecols=[
        "fos",
        "doi",
        "authors",
        "year",
        "title",
        "id",
        "references",
        "n_citation",
        "cleaned_abstracts",
    ],
)

## Clean titles

In [8]:
def remove_whitespace_chars(text):
    """Collapse every run of whitespace in ``text`` into a single space.

    Leading and trailing whitespace is dropped as well, because ``str.split()``
    called without arguments discards empty fields. Carriage returns and
    newlines count as whitespace, so they are normalised too.
    """
    pieces = text.split()
    return " ".join(pieces)

def clean_sentence(sentence):
    """Replace every character that is not an ASCII letter or an apostrophe with a space.

    Digits, punctuation, whitespace and all non-ASCII characters are each
    replaced by one space; nothing is deleted, so the result has the same
    length as the input.

    The earlier version ran a second substitution meant to strip non-ASCII
    characters. It could never match: after the substitution below only
    ASCII letters, apostrophes and spaces remain. It has been removed.

    Parameters
    ----------
    sentence : str
        Text to clean.

    Returns
    -------
    str
        The cleaned text.
    """
    return re.sub(r'[^a-zA-Z\']', ' ', sentence)

# Extra stop words filtered out while cleaning titles. In the cleaning loop a
# token's lemma has its punctuation stripped before it is looked up here.
# NOTE(review): because of that stripping, the entries ending in "." can never
# match in the visible lookup; they are kept so the set's contents stay unchanged.
CUSTOM_STOPS = {
    "abstract",
    "abstract.",
    "background",
    "background.",
    "method",
    "result",
    "conclusion",
    "conclusions",
    "discussion",
    "PRON",
    "registration",
    "url",
}

def has_links(token):
    """Return True when ``token`` contains any of the known link-like fragments.

    The check is a plain substring test against a fixed list of markers; it
    is case-sensitive, so callers are expected to pass lower-cased text.
    """
    link_markers = ("link_type", "ref?", "=%", "html", "http", "www", ".com", "access_num")
    for marker in link_markers:
        if marker in token:
            return True
    return False
    
def check_for_mostly_numeric_string(token):
    """Return True when ``token`` has strictly more numeric than alphabetic characters.

    Each character is classified once: numeric takes precedence, and only
    non-numeric characters are tested for being alphabetic. Characters that
    are neither are ignored. Ties (including the empty string) return False.
    """
    numeric_count = 0
    alpha_count = 0

    for ch in token:
        if ch.isnumeric():
            numeric_count += 1
        elif ch.isalpha():
            alpha_count += 1

    return numeric_count > alpha_count

In [9]:
# Transliterate each title to ASCII and normalise its whitespace, skipping
# rows whose title is the "None-placeholder" sentinel.
# NOTE(review): skipped rows make this list shorter than `data`, so positions
# in `clean_titles` do not necessarily line up with DataFrame rows.
clean_titles = []
for raw_title in data.title:
    if raw_title != "None-placeholder":
        ascii_title = unidecode.unidecode(raw_title)
        clean_titles.append(remove_whitespace_chars(ascii_title))

In [10]:
clean_titles[0]

'Texture processing of extruded YBa2Cu3O6+x wires by zone melting'

In [11]:
# Run the titles through spaCy with the NER and parser components disabled;
# the cleaning loop that consumes `titles` only reads pos_, lemma_ and lower_.
# NOTE(review): Language.pipe is expected to return a lazy, single-use
# iterator. If so, re-running the cleaning cell without re-running this one
# would produce no titles. Confirm for the spaCy version in use.
titles = nlp.pipe(clean_titles, disable=["ner", "parser"])

In [14]:
# Build one lower-cased, space-joined string per title, keeping only the
# tokens that pass every filter below (checked in this order).
cleaned_titles = []
for doc in titles:
    kept_tokens = []
    for token in doc:
        # Symbols and punctuation carry no content.
        if token.pos_ in {"SYM", "PUNCT"}:
            continue
        # Single-character tokens are dropped.
        if len(token) <= 1:
            continue
        # URL / link residue.
        if has_links(token.lower_):
            continue
        # Tokens dominated by digits.
        if check_for_mostly_numeric_string(token.lower_):
            continue
        # Custom stop words, matched on the lemma with punctuation stripped.
        if re.sub(r'[^\w\s]', '', token.lemma_) in CUSTOM_STOPS:
            continue
        kept_tokens.append(token.lower_)

    cleaned_titles.append(' '.join(kept_tokens))

## TF-IDF

In [19]:
# Upper bound on the TF-IDF vocabulary size, chosen from a rough analysis of
# the word frequencies in the data.
MAX_FEATURES = 10000

In [24]:
# Corpus for fitting the vectorizer: every sentence of every abstract,
# followed by the cleaned titles. Each `cleaned_abstracts` cell is parsed with
# ast.literal_eval and its items are appended one by one.
CORPUS_MERGE = []
for serialized_abstract in data.cleaned_abstracts:
    CORPUS_MERGE.extend(ast.literal_eval(serialized_abstract))
CORPUS_MERGE.extend(cleaned_titles)

In [114]:
from collections import Counter

def getWordFrequency(corpus):
    """Count how often each whitespace-separated word occurs across ``corpus``.

    Parameters
    ----------
    corpus : iterable of str
        Sentences to tally; each one is split on whitespace.

    Returns
    -------
    collections.Counter
        Mapping from word to its number of occurrences.
    """
    return Counter(word for sentence in corpus for word in sentence.split())

In [116]:
# List every distinct token in the corpus with its count, most frequent first.
getWordFrequency(CORPUS_MERGE).most_common()

[('the', 2254287),
 ('of', 1971662),
 ('and', 1512181),
 ('in', 1231434),
 ('to', 856990),
 ('with', 536353),
 ('for', 446250),
 ('that', 367972),
 ('is', 360721),
 ('was', 310322),
 ('by', 301244),
 ('were', 280979),
 ('as', 217862),
 ('on', 211026),
 ('are', 208957),
 ('or', 206416),
 ('from', 200324),
 ('this', 195597),
 ('patients', 186040),
 ('an', 169153),
 ('at', 163072),
 ('be', 156015),
 ('cells', 152408),
 ('these', 139372),
 ('have', 122484),
 ('not', 122320),
 ('cell', 109915),
 ('which', 104210),
 ('risk', 99560),
 ('study', 97315),
 ('between', 93877),
 ('been', 84382),
 ('has', 84189),
 ('s', 83559),
 ('disease', 82083),
 ('than', 79559),
 ('but', 79511),
 ('associated', 79013),
 ('may', 76244),
 ('cancer', 76053),
 ('data', 74593),
 ('after', 72320),
 ('also', 71009),
 ('can', 70139),
 ('both', 67610),
 ('studies', 67553),
 ('protein', 66895),
 ('using', 66635),
 ('two', 66404),
 ('all', 64820),
 ('more', 64578),
 ('high', 64197),
 ('expression', 63230),
 ('gene', 61126

# Different vectorizers

In [21]:
# TODO: play around with parameters
tfidf_vectorizer = TfidfVectorizer(
    max_features=MAX_FEATURES,  # cap on vocabulary size
    stop_words="english",       # built-in English stop list
    max_df=0.9,                 # document-frequency upper threshold
    min_df=0.01,                # document-frequency lower threshold
    ngram_range=(1, 2),         # unigrams and bigrams
)

In [25]:
# Fit the vectorizer on the combined corpus of abstract sentences and cleaned titles.
tfidf_vectorizer.fit(CORPUS_MERGE)

TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
                dtype=<class 'numpy.float64'>, encoding='utf-8',
                input='content', lowercase=True, max_df=0.9, max_features=10000,
                min_df=0.01, ngram_range=(1, 2), norm='l2', preprocessor=None,
                smooth_idf=True, stop_words='english', strip_accents=None,
                sublinear_tf=False, token_pattern='(?u)\\b\\w\\w+\\b',
                tokenizer=None, use_idf=True, vocabulary=None)

In [26]:
def calculate_average_tfidf_vector(list_of_sentences, vectorizer=None):
    """Return the column-wise mean of the vectors of several sentences.

    Parameters
    ----------
    list_of_sentences : iterable of str
        Sentences to vectorize; passed to ``vectorizer.transform`` as-is.
    vectorizer : object with a ``transform`` method, optional
        Fitted vectorizer to use. When omitted, the module-level
        ``tfidf_vectorizer`` is looked up at call time. The previous default
        was bound once when the cell defining this function ran, so
        re-creating ``tfidf_vectorizer`` in a later cell silently left this
        function on the stale object.

    Returns
    -------
    numpy.ndarray
        ``np.array`` of the mean over axis 0 of the transformed vectors.
        NOTE(review): when ``transform`` returns a scipy sparse matrix the
        mean is presumably a 1 x n_features matrix, making this result 2-D.
        Confirm before mixing it with 1-D vectors.
    """
    if vectorizer is None:
        # Resolve the default lazily so the current global vectorizer is used.
        vectorizer = tfidf_vectorizer
    vectors = vectorizer.transform(list_of_sentences)
    return np.array(np.mean(vectors, axis=0))

In [27]:
def calculate_tfidf_vector_paper(row, strategy="merge", vectorizer=None):
    """Return one 1-D vector for a paper, built from its title and abstract.

    Parameters
    ----------
    row : object
        Must expose ``title`` and ``cleaned_abstracts`` attributes. Both are
        concatenated with a space for the merge strategy, so both are assumed
        to be ``str`` (TODO confirm: ``cleaned_abstracts`` may still be the
        serialized list text at this point).
    strategy : {"merge", "separate"}, default "merge"
        * ``"merge"``: vectorize ``title + " " + abstract`` as one document.
        * ``"separate"``: vectorize title and abstract independently and
          return the element-wise mean of the two vectors.
        Any other value prints a warning and falls back to ``"merge"``.
    vectorizer : object with a ``transform`` method, optional
        Fitted vectorizer to use. When omitted, the module-level
        ``tfidf_vectorizer`` is looked up at call time, not bound once
        at definition time, so a re-created vectorizer is picked up.

    Returns
    -------
    numpy.ndarray
        A 1-D array for every strategy. The fallback branch previously
        omitted the ``[0]`` index and returned a 2-D ``(1, n)`` array,
        unlike the merge strategy it claims to use.
    """
    if vectorizer is None:
        # Resolve the default lazily so the current global vectorizer is used.
        vectorizer = tfidf_vectorizer

    abstract = row.cleaned_abstracts
    title = row.title

    if strategy == "separate":
        # The title vector is only needed here, so it is no longer computed
        # for the merge path.
        title_vector = vectorizer.transform([title]).toarray()[0]
        abstract_vector = vectorizer.transform([abstract]).toarray()[0]
        return np.mean([abstract_vector, title_vector], axis=0)

    if strategy != "merge":
        print("Warning: wrong strategy is used. Use either 'merge' or 'separate'. Proceeding using the 'merge' strategy.")

    together = title + " " + abstract
    return vectorizer.transform([together]).toarray()[0]

In [34]:
# Spot-check: dense TF-IDF vector for a single title.
# NOTE(review): this indexes `clean_titles` (whitespace-normalised titles),
# not `cleaned_titles` (the token-filtered titles that went into the fitting
# corpus). Confirm which one was intended.
tfidf_vectorizer.transform([clean_titles[4]]).toarray()[0]

array([0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.61682708,
       0.        , 0.        , 0.        , 0.        , 0.64192757,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.     