In [2]:
import pandas as pd
# The dataset is a list of scientific research articles; df.info() prints the
# column names, non-null counts and dtypes so missing values are visible early.
df = pd.read_csv('completed_clean_data.csv')
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2875 entries, 0 to 2874
Data columns (total 5 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   AUTHOR    2873 non-null   object 
 1   JOURNAL   2777 non-null   object 
 2   TITLE     2875 non-null   object 
 3   YEAR      2866 non-null   float64
 4   ABSTRACT  2875 non-null   object 
dtypes: float64(1), object(4)
memory usage: 112.4+ KB


In [11]:
import string
import nltk
import re
import numpy as np

In [12]:
# Text clean-up: English stop-word list from NLTK, used by normalize_document
# to filter tokens.
# NOTE(review): presumably requires the NLTK "stopwords" corpus to be available
# locally (e.g. via nltk.download('stopwords')) -- confirm before a fresh run.
stop_words = nltk.corpus.stopwords.words("english")

def normalize_document(doc):
  """Lower-case a document and strip symbols, punctuation and stop words.

  Parameters
  ----------
  doc : str
      Raw text of a single document.

  Returns
  -------
  str
      The surviving tokens joined by single spaces.
  """
  # The flags must be passed by keyword: the fourth positional parameter of
  # re.sub is `count`, so passing re.I|re.A positionally silently capped the
  # number of removals at 258 and applied no flags at all.
  # Under re.A, `\s` matches ASCII whitespace only.
  doc = re.sub(r'[^a-zA-Z0-9\s]', '', doc, flags=re.I | re.A)
  doc = doc.lower().strip()
  # No separate string.punctuation pass is needed: after the substitution only
  # ASCII letters, digits and whitespace remain.
  tokens = nltk.word_tokenize(doc)
  filtered_tokens = [token for token in tokens if token not in stop_words]
  return ' '.join(filtered_tokens)
  
# Wrap the single-document cleaner so it can be mapped over a whole collection.
normalize_corpus = np.vectorize(normalize_document)

# Clean every abstract, then show how many documents came out.
norm_corpus = normalize_corpus(df['ABSTRACT'].tolist())
len(norm_corpus)

2875

Calculating the cosine similarity between the abstracts of the attached documents.

In [13]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Build TF-IDF features from the cleaned abstracts: ngram_range=(1, 2) keeps
# unigrams and bigrams, min_df=2 drops terms seen in fewer than two documents.
tf = TfidfVectorizer(ngram_range=(1, 2), min_df=2)
tfidf_matrix = tf.fit_transform(norm_corpus)
# Displayed as (number of documents, number of vocabulary terms).
tfidf_matrix.shape

# The TF-IDF matrix is the input to the cosine-similarity step that follows.

(2875, 30660)

In [14]:
from sklearn.metrics.pairwise import cosine_similarity

# With a single argument, cosine_similarity compares every row of the TF-IDF
# matrix with every row of the same matrix, giving a square
# document-by-document similarity matrix.
doc_sim = cosine_similarity(tfidf_matrix)
# Wrap in a DataFrame so a document's similarities can be read with .iloc.
doc_sim_df = pd.DataFrame(doc_sim)
doc_sim_df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,2865,2866,2867,2868,2869,2870,2871,2872,2873,2874
0,1.0,0.022779,0.032152,0.02844,0.062236,0.005699,0.009565,0.011017,0.00681,0.000595,...,0.001298,0.004186,0.04168,0.008736,0.002084,0.016563,0.039453,0.047954,0.02344,0.048615
1,0.022779,1.0,0.024258,0.013722,0.025037,0.018276,0.018347,0.047968,0.002435,0.012694,...,0.005068,0.007235,0.014425,0.034319,0.029571,0.023079,0.029015,0.051089,0.032311,0.011365
2,0.032152,0.024258,1.0,0.043408,0.040312,0.007826,0.026156,0.017533,0.014365,0.024885,...,0.011575,0.013655,0.01475,0.016498,0.028598,0.018393,0.022295,0.013188,0.035566,0.028252
3,0.02844,0.013722,0.043408,1.0,0.077761,0.008686,0.009958,0.008669,0.03176,0.017833,...,0.002148,0.012177,0.001676,0.019169,0.020668,0.0,0.006758,0.00461,0.001366,0.0
4,0.062236,0.025037,0.040312,0.077761,1.0,0.003917,0.008909,0.015062,0.015367,0.023163,...,0.019208,0.004549,0.012678,0.023929,0.007849,0.016027,0.038139,0.028702,0.013295,0.020856


Using our movie recommender as a template, pick a single article (by its TITLE) and recommend five other related articles.

In [15]:
#Article recommender based on movie recommender
def article_recommender(title, articles, doc_sims, top_n=5):
  """Return the titles of the articles most similar to the one named ``title``.

  Parameters
  ----------
  title : str
      Exact title of the query article as it appears in ``articles``.
  articles : numpy.ndarray
      Array of titles; the title at position i is scored by row i of
      ``doc_sims``.
  doc_sims : pandas.DataFrame
      Pairwise similarity scores, read positionally with ``.iloc``.
  top_n : int, default 5
      Maximum number of recommendations to return.

  Returns
  -------
  numpy.ndarray
      Up to ``top_n`` titles, most similar first, never including the query
      row itself.

  Raises
  ------
  IndexError
      If ``title`` does not occur in ``articles``.
  """
  matches = np.where(articles == title)[0]
  if matches.size == 0:
    # Same exception type as the bare [0][0] lookup, but with a usable message.
    raise IndexError(f'title not found in articles: {title!r}')
  # If the title occurs more than once, the first occurrence is used.
  article_idx = matches[0]
  article_similarities = doc_sims.iloc[article_idx].values
  ranked_idxs = np.argsort(-article_similarities)
  # Remove the query row by index rather than assuming it sorts first: a
  # duplicate document ties with it at the top, and a row of all-zero
  # similarities does not rank first at all.
  similar_article_idxs = ranked_idxs[ranked_idxs != article_idx][:top_n]
  return articles[similar_article_idxs]
  
article_recommender(
  title='encoding textual criticism',
  articles=df['TITLE'].values,
  doc_sims=doc_sim_df,
)

array(['some problems of tei markup and early printed books',
       'the design of the tei encoding scheme',
       'modifying the tei dtd the case of korean dictionaries',
       'the encoding of spoken texts', 'the tei history goals and future'],
      dtype=object)

Discussion Questions
Describe a set of texts and a research question that interest you and that could be explored using this method. Basically, what is a potential application of this method to another area of research?

ANSWER: Among the many fields that could use this method, my idea is to see whether there is a significant difference between short texts written by native English speakers and those written by people for whom English is a “second language”. The method would be to sample texts from both groups and then generate similar text samples. Based on the algorithm-generated similar text samples, we would then be able to identify whether the author of each was a native speaker or not. Potentially, this could then be developed further with an unsupervised learning algorithm.