In [1]:
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import pandas as pd
import spacy


In [2]:
# Read the SciRate quant-ph export; paper ids stay strings and the first CSV column becomes the index
records = pd.read_csv(
    '../input/scirate-quant-ph/scirate_quant-ph.csv',
    index_col=0,
    dtype={"id": str},
)


In [3]:
# Preview the first rows to check that the columns were read as expected
records.head()

In [4]:
# Count the missing values in each column
records.isnull().sum()

In [5]:
# Dimensions of the data as (rows, columns)
records.shape

# 1. EDA

In [6]:
# Derive a full submission date and the weekday name from the year/month/day columns
records['date_parsed'] = pd.to_datetime(records[["year", "month", "day"]], format="%Y/%m/%d")
records["dayofweek"] = records['date_parsed'].dt.day_name()

# Calendar order for the weekday panel
days = ["Monday", "Tuesday", "Wednesday", "Thursday", "Friday", "Saturday", "Sunday"]

# One strip plot of scites per date component, all sharing the same y-axis
fig, (ax1, ax2, ax3, ax4) = plt.subplots(1, 4, sharey=True, figsize=(15, 6))
for date_part, panel in zip(("day", "month", "year"), (ax1, ax2, ax3)):
    sns.stripplot(x=date_part, y="scites", data=records, jitter=True, ax=panel)
sns.stripplot(x="dayofweek", y="scites", data=records, order=days, jitter=True, ax=ax4)

In [7]:
# Preprocess the text with function processtext()

import re
import string
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

# English stop words, stored as a set so membership tests are fast
# NOTE(review): presumably needs the NLTK 'stopwords' corpus to be available locally — confirm
stop_words = set(stopwords.words('english'))
# Individual punctuation characters from string.punctuation
punctuations = list(string.punctuation)
lemma = WordNetLemmatizer() # for Lemmatisation


def processtext(text):
    """Normalise a raw text into a space-joined string of lemmatised tokens.

    Steps: replace every non-letter character with a space, lowercase,
    tokenize, drop stop words and punctuation tokens, lemmatise what remains.

    Args:
        text: the raw string to clean.

    Returns:
        The cleaned tokens joined by single spaces.
    """
    letters_only = re.sub("[^a-zA-Z]", " ", text)
    words = word_tokenize(letters_only.lower())

    kept = []
    for word in words:
        # Skip stop words and punctuation tokens
        if word in stop_words or word in punctuations:
            continue
        kept.append(lemma.lemmatize(word))
    return " ".join(kept)


In [8]:
# Apply processtext() to each abstract and store the result in the new column "processed_abstract"
records["processed_abstract"] = records.abstract.apply(processtext)

# 2. Feature Engineering

**2.1 Vectorization (word embedding) of text (abstract, title) features**

The first step of feature engineering: I need to represent the text in the data numerically. Instead of using a bag-of-words representation for the texts (abstracts, titles, authors), I will use word embeddings (word vectors). spaCy provides pretrained embeddings through its large English model (en_core_web_lg).

In [9]:
# Load spaCy's large English model; its word vectors are used below to embed the text columns
nlp = spacy.load('en_core_web_lg')


Here, I will take the abstract, title and authors of each paper and convert each of them to a 300-dimensional document vector.

In [None]:
# Embed every preprocessed abstract as one document vector (one row per paper)
with nlp.disable_pipes():
    abstract_docs = (nlp(text) for text in records.processed_abstract)
    vectors = np.array([doc.vector for doc in abstract_docs])

vectors.shape

In [None]:
# Embed every title as one document vector (one row per paper)
with nlp.disable_pipes():
    title_docs = (nlp(text) for text in records.title)
    vectors_title = np.array([doc.vector for doc in title_docs])

vectors_title.shape

In [None]:
# Embed every author string as one document vector (one row per paper)
with nlp.disable_pipes():
    author_docs = (nlp(text) for text in records.authors)
    vectors_authors = np.array([doc.vector for doc in author_docs])

vectors_authors.shape

The second step of feature engineering: I will make a new column (records.dayofweek) indicating the day of the week on which each paper was submitted.

**2.2 Submission_date_of_paper features**

In [10]:
# Weekday name -> number, Monday = 1 ... Sunday = 7
weekday_codes = {"Monday":1, "Tuesday":2, "Wednesday":3, "Thursday":4, "Friday":5, "Saturday":6, "Sunday":7}

# Rebuild the submission date from its components, then store the weekday as its numeric code
records['date_parsed'] = pd.to_datetime(records[["year", "month", "day"]], format="%Y/%m/%d")
records["dayofweek"] = records['date_parsed'].dt.day_name().map(weekday_codes)

**2.3 Daily_order_of_paper features**

The third step of feature engineering: I will make a new dataframe (daily_order) including the order of the paper per date and the total papers per date (daily_order[["total", "order"]]), and will concatenate it to the original dataset. 


In [11]:
# For every paper compute
#   total : how many papers share its submission date
#   order : its 0-based position among that date's papers when sorted by id
#
# The previous version built daily_order in (date, id) order with a fresh
# 0..n-1 index and then concatenated it to `records` by index label, so the
# values only landed on the right papers if `records` already happened to be
# in that exact order with that exact index. Here the original index labels
# are kept throughout, so the values are attached to the paper they were
# computed for. Grouping replaces the per-date boolean scan of the whole frame.
papers_by_day = records.sort_values(["date_parsed", "id"], kind="stable")
same_day = papers_by_day.groupby("date_parsed", sort=False)

daily_order = pd.DataFrame(
    {
        "total": same_day["id"].transform("size"),
        "order": same_day.cumcount(),
        "scites_2": papers_by_day["scites"],
        "id": papers_by_day["id"],
    }
)

# Drop any columns left by a previous run so the cell can be re-executed,
# then attach the new features by index label (assumes the index of
# `records` is unique).
records = records.drop(columns=["total", "order"], errors="ignore")
records["total"] = daily_order["total"]
records["order"] = daily_order["order"]

In [12]:
# One panel per value of "total", three panels per row, points coloured by "order":
# scites against the paper's position within its day
g = sns.FacetGrid(daily_order, col="total", col_wrap=3, hue="order")
g.map(sns.scatterplot, "order", "scites_2")

# All days pooled: scites against the paper's position within its day
fig, ax5 = plt.subplots(1, figsize=(15, 6))
sns.stripplot(x="order", y="scites_2", data=daily_order, jitter=True, ax=ax5)

**2.4 Is any frequent author in the list of authors**

The fourth step of feature engineering: I will count how often each individual author appears in the dataset, and then create a series of the most frequent authors (frequent_authors) by keeping only the authors with a frequency of more than 15.  
Next, by defining a function (any_frequent_authors), I will check whether the list of authors of each paper includes any of the frequent authors, and finally I will add a binary column to the records data (records['any_frequent_authors']) based on this check.

In [13]:
# Count how often each author name occurs across all ";"-separated author lists
all_author_names = ";".join(records["authors"]).split(";")
author_counts = pd.Series(all_author_names).value_counts()
# Keep the 37 most frequent authors
# NOTE(review): presumably 37 is where the "more than 15 papers" cut-off fell for this dataset — confirm
frequent_authors = author_counts.iloc[:37]

def any_frequent_authors(authors, frequent_authors):
    """Flag whether a paper has at least one frequent author.

    Args:
        authors: container of author names for one paper.
        frequent_authors: Series whose index holds the frequent author names.

    Returns:
        1 if any name in ``frequent_authors.index`` is found in ``authors``,
        otherwise 0.
    """
    return int(any(name in authors for name in frequent_authors.index))


# Binary feature: 1 when a paper's author list contains at least one frequent author
author_lists = records.authors.str.split(";")
records['any_frequent_authors'] = author_lists.apply(any_frequent_authors, args=(frequent_authors,))


Now I will concatenate all the new engineered features to the existing numerical features to create X. 

In [15]:
# Numeric (non-text) features: date parts, daily-order features and the frequent-author flag
numeric_feature_columns = [
    'year', 'month', 'day', 'dayofweek',
    'total', 'order', 'any_frequent_authors',
]
X_ = records[numeric_feature_columns].values

In [None]:
# concatenate all the numerical features to create X
X = np.concatenate([vectors, vectors_title, vectors_authors, X_], axis=1)

Here, I define the project as a classification task between hot and not-hot topics in quantum physics, based on the number of scites a paper received.
I will treat papers with scites>16 as hot topics (label 1) and papers with scites<=16 as not-hot topics (label 0).
Note that 16 is the median number of scites (records.scites.median()=16). I chose the median so that the two classes are approximately balanced.
Based on this, I will create the target series.

In [44]:
# Median number of scites; it is used as the hot / not-hot threshold for the target
records.scites.median()

In [48]:
# Create the target series
y = (records.scites > 16).astype(int)

In [46]:
# Bar chart of the class sizes. The target is passed as `x=` explicitly:
# current seaborn treats the first positional argument as `data`, so the bare
# `sns.countplot(y)` no longer draws one bar per class.
g = sns.countplot(x=y)
g.set_xticklabels(["not-hot", "hot"])
g.set_xlabel("topic")

In [47]:
# Share of each class as a pie chart, labelled with percentages
class_counts = y.value_counts()
class_counts.plot.pie(autopct='%1.1f')

46.7% of the papers are in the hot-topic class and 53.3% are in the not-hot class, so the classes are close to, but not perfectly, balanced.

I will first split the data (X, y) to train-data and test-data, and then train different classification ML models.

In [23]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import recall_score, precision_score, f1_score, roc_auc_score, confusion_matrix

In [None]:
# Hold out 25% of the papers for testing, keeping the class proportions in both parts
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    stratify=y,
    test_size=0.25,
    random_state=1,
)


# 3. Machine Learning Models

In [None]:
def ML_models_performance(model, X_train, y_train ,X_test ,y_test, y_pred, model_name):
    """Summarise a fitted classifier's scores in a one-row DataFrame.

    Args:
        model: fitted estimator exposing ``score(X, y)``.
        X_train, y_train: training features and labels.
        X_test, y_test: test features and true test labels.
        y_pred: the model's predictions for ``X_test``.
        model_name: label used as the index of the returned row.

    Returns:
        DataFrame with columns Train_accuracy, Test_accuracy, Precision,
        Recall and F1_Score, indexed by ``model_name``.
    """
    performance_df = pd.DataFrame(
        {
            'Train_accuracy': model.score(X_train, y_train),
            "Test_accuracy": model.score(X_test, y_test),
            # scikit-learn metrics take (y_true, y_pred). With the arguments
            # reversed, precision and recall swap places.
            "Precision": precision_score(y_test, y_pred),
            "Recall": recall_score(y_test, y_pred),
            "F1_Score": f1_score(y_test, y_pred),
        },
        index=[model_name],
    )
    return performance_df

In [49]:
from sklearn.linear_model import LogisticRegression
# Create the LogisticRegression model; 'sag' is a stochastic solver, so fix the seed for repeatable results
lr = LogisticRegression(solver='sag', random_state=1)
# Fit the model
lr.fit(X_train, y_train)
# Use the trained model to predict
y_pred = lr.predict(X_test)
# model accuracy
print(f'Model train accuracy: {lr.score(X_train, y_train)*100:.3f}%')
print(f'Model test accuracy: {lr.score(X_test, y_test)*100:.3f}%')
# scikit-learn metrics take (y_true, y_pred); reversed arguments swap precision and recall
print(f'Model test precision: {precision_score(y_test, y_pred):.3f}')
print(f'Model test recall: {recall_score(y_test, y_pred):.3f}')
print(f'Model test f1_score: {f1_score(y_test, y_pred):.3f}')

In [None]:
# One-row score summary for the logistic regression model
lr_performance = ML_models_performance(
    model=lr,
    X_train=X_train,
    y_train=y_train,
    X_test=X_test,
    y_test=y_test,
    y_pred=y_pred,
    model_name="Logisitc Regression",
)
lr_performance

In [None]:
from sklearn.svm import LinearSVC
# Create the LinearSVC model with some regularization
LSVC = LinearSVC(random_state=1, dual=False, C=1/2)
# Fit the model
LSVC.fit(X_train, y_train)
# Use the trained model to predict
y_pred = LSVC.predict(X_test)
# model accuracy
print(f'Model train accuracy: {LSVC.score(X_train, y_train)*100:.3f}%')
print(f'Model test accuracy: {LSVC.score(X_test, y_test)*100:.3f}%')
# scikit-learn metrics take (y_true, y_pred); reversed arguments swap precision and recall
print(f'Model test precision: {precision_score(y_test, y_pred):.3f}')
print(f'Model test recall: {recall_score(y_test, y_pred):.3f}')
print(f'Model test f1_score: {f1_score(y_test, y_pred):.3f}')


In [None]:
# One-row score summary for the LinearSVC model
LSVC_performance = ML_models_performance(
    model=LSVC,
    X_train=X_train,
    y_train=y_train,
    X_test=X_test,
    y_test=y_test,
    y_pred=y_pred,
    model_name="LinearSVC",
)
LSVC_performance

In [None]:
from sklearn.ensemble import RandomForestClassifier
# Create the RandomForest model
rfc = RandomForestClassifier(random_state=0)
# Fit the model
rfc.fit(X_train, y_train)
# Use the trained model to predict
y_pred = rfc.predict(X_test)
# model accuracy
print(f'Model train accuracy: {rfc.score(X_train, y_train)*100:.3f}%')
print(f'Model test accuracy: {rfc.score(X_test, y_test)*100:.3f}%')
# scikit-learn metrics take (y_true, y_pred); reversed arguments swap precision and recall
print(f'Model test precision: {precision_score(y_test, y_pred):.3f}')
print(f'Model test recall: {recall_score(y_test, y_pred):.3f}')
print(f'Model test f1_score: {f1_score(y_test, y_pred):.3f}')

In [None]:
# One-row score summary for the random forest model
rfc_performance = ML_models_performance(
    model=rfc,
    X_train=X_train,
    y_train=y_train,
    X_test=X_test,
    y_test=y_test,
    y_pred=y_pred,
    model_name="Random Forest",
)
rfc_performance

In [None]:
from sklearn.svm import SVC
# Create the SVC model with some regularization
svcl = SVC(kernel='linear', random_state=1, C=1/2)
# Fit the model
svcl.fit(X_train, y_train)
# Use the trained model to predict
y_pred = svcl.predict(X_test)
# model accuracy
print(f'Model train accuracy: {svcl.score(X_train, y_train)*100:.3f}%')
print(f'Model test accuracy: {svcl.score(X_test, y_test)*100:.3f}%')
# scikit-learn metrics take (y_true, y_pred); reversed arguments swap precision and recall
print(f'Model test precision: {precision_score(y_test, y_pred):.3f}')
print(f'Model test recall: {recall_score(y_test, y_pred):.3f}')
print(f'Model test f1_score: {f1_score(y_test, y_pred):.3f}')

In [None]:
# The linear-kernel SVC fitted above had no summary row, so it was missing from
# the comparison. Predictions are recomputed here instead of reusing `y_pred`,
# so the row cannot pick up another model's stale predictions.
svcl_performance = ML_models_performance(
    svcl, X_train, y_train, X_test, y_test, svcl.predict(X_test), "SVC_Linear_Kernel"
)

# Side-by-side scores of all four models trained on the word-embedding features
comparison_df = pd.concat([lr_performance, LSVC_performance, rfc_performance, svcl_performance])
comparison_df

# 4. Finding similar papers

One of the advantages of using word vectors is that we can find papers with similar content, because papers (more precisely, their abstracts) with similar content generally have similar vectors. Similar papers can therefore be found by measuring the similarity between the vectors of their abstracts. A common metric for this is the cosine similarity, which measures the angle between two vectors: it is the inner product of the two vectors divided by the product of their magnitudes, $\cos\theta = \frac{a \cdot b}{\lVert a \rVert \, \lVert b \rVert}$. The cosine similarity varies between -1 and 1, which correspond to exactly opposite and perfectly aligned vectors, respectively.

In [None]:
# Query abstract: the dataset is searched below for the abstract most similar to this text
abstract = """Free-space channels provide the possibility of establishing continuous-variable quantum key 
distribution in global communication networks. However, the fluctuating nature of transmissivity in these 
channels introduces an extra noise which reduces the achievable secret key rate. 
We consider two classical postprocessing strategies, postselection of high-transmissivity data and data 
clusterization, to reduce the fluctuation-induced noise of the channel. We undertake the investigation of
such strategies utilizing a composable security proof in a realistic finite-size regime against both
collective and individual attacks. We also present an efficient parameter estimation approach to
estimate the effective Gaussian parameters over the postselected data or the clustered data.
Although the composable finite-size effects become more significant with the postselection
and clusterization both reducing the size of the data, our results show that these strategies are
still able to enhance the finite-size key rate against both individual and collective attacks with
a remarkable improvement against collective attacks, 
even moving the protocol from an insecure regime to a secure regime under certain conditions.."""

def cosine_similarity(a, b):
    """Cosine of the angle between two 1-D NumPy vectors.

    Args:
        a, b: 1-D arrays of equal length.

    Returns:
        ``a·b / (|a| |b|)``, a value between -1 and 1. If either vector has
        zero length the similarity is undefined; 0.0 is returned instead of
        NaN so that a later ``argmax`` over many similarities is not
        hijacked by a NaN entry.
    """
    denominator = np.sqrt(a.dot(a) * b.dot(b))
    if denominator == 0:
        return 0.0
    return np.dot(a, b) / denominator

# Get the vector of the given abstract. The dataset vectors were computed from
# the preprocessed abstracts (processtext output), so the query goes through
# the same preprocessing; otherwise the two sides are not comparable.
abstract_vec = nlp(processtext(abstract)).vector

# Calculate the mean of the abstract vectors
vec_mean = vectors.mean(axis=0)
# Subtract the mean from the vectors
centered = vectors - vec_mean
# The query vector must be centred with the same mean (done once, outside the loop)
abstract_vec_centered = abstract_vec - vec_mean

# Calculate similarities between the given abstract and each abstract in the dataset
sims = np.array([cosine_similarity(abstract_centered, abstract_vec_centered) for abstract_centered in centered])

# Get the index for the most similar abstract
most_similar = sims.argmax()
print(records.iloc[most_similar].abstract)
print(f"cosine_similarity is {sims[most_similar]:.3f}")


Now I will change the strategy for the first step of the feature engineering, in which the text in the data is represented numerically. Instead of using word embeddings (word vectors), I will use a bag-of-words representation for the texts (abstracts, titles, authors). Specifically, I will convert each collection of text features to a matrix of TF-IDF features using TfidfVectorizer.


# 5. Use a different vectorization technique to vectorize text features

In [50]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Split the row positions first, so the TF-IDF vocabularies and IDF weights
# are learned from the training rows only and nothing leaks from the test set.
row_positions = np.arange(len(records))
train_rows, test_rows, y_train, y_test = train_test_split(row_positions, y,
                                                          test_size=0.25, stratify=y, random_state=1)

# One vectorizer per text column, each fitted on the training rows. Refitting a
# single vectorizer three times would keep only the last column's vocabulary.
tfidf_authors = TfidfVectorizer().fit(records.authors.iloc[train_rows])
tfidf_title = TfidfVectorizer().fit(records.title.iloc[train_rows])
tfidf_abstract = TfidfVectorizer().fit(records.processed_abstract.iloc[train_rows])
# `tfidf` previously ended up fitted on the abstracts; keep the name available
tfidf = tfidf_abstract

# convert each text column to a dense matrix of TF-IDF features (not raw word counts)
X1 = tfidf_authors.transform(records.authors).toarray()
X2 = tfidf_title.transform(records.title).toarray()
X3 = tfidf_abstract.transform(records.processed_abstract).toarray()

# concatenate all the numerical features to create new X
X = np.concatenate([X1, X2, X3, X_], axis=1)
# select the train and test rows of X chosen by the split above
X_train, X_test = X[train_rows], X[test_rows]

I will now train the same classification models (used earlier) on the new X.

In [51]:
def ML_models_performance(model, X_train, y_train ,X_test ,y_test, y_pred, model_name):
    """Summarise a fitted classifier's scores in a one-row DataFrame.

    Args:
        model: fitted estimator exposing ``score(X, y)``.
        X_train, y_train: training features and labels.
        X_test, y_test: test features and true test labels.
        y_pred: the model's predictions for ``X_test``.
        model_name: label used as the index of the returned row.

    Returns:
        DataFrame with columns Train_accuracy, Test_accuracy, Precision,
        Recall and F1_Score, indexed by ``model_name``.
    """
    performance_df = pd.DataFrame(
        {
            'Train_accuracy': model.score(X_train, y_train),
            "Test_accuracy": model.score(X_test, y_test),
            # scikit-learn metrics take (y_true, y_pred). With the arguments
            # reversed, precision and recall swap places.
            "Precision": precision_score(y_test, y_pred),
            "Recall": recall_score(y_test, y_pred),
            "F1_Score": f1_score(y_test, y_pred),
        },
        index=[model_name],
    )
    return performance_df

In [52]:
from sklearn.linear_model import LogisticRegression
# Create the LogisticRegression model; 'sag' is a stochastic solver, so fix the seed for repeatable results
lr = LogisticRegression(solver='sag', random_state=1)
# Fit the model
lr.fit(X_train, y_train)
# Use the trained model to predict
y_pred = lr.predict(X_test)
# model accuracy
print(f'Model train accuracy: {lr.score(X_train, y_train)*100:.3f}%')
print(f'Model test accuracy: {lr.score(X_test, y_test)*100:.3f}%')
# scikit-learn metrics take (y_true, y_pred); reversed arguments swap precision and recall
print(f'Model test precision: {precision_score(y_test, y_pred):.3f}')
print(f'Model test recall: {recall_score(y_test, y_pred):.3f}')
print(f'Model test f1_score: {f1_score(y_test, y_pred):.3f}')

In [53]:
# One-row score summary for the logistic regression model
lr_performance = ML_models_performance(
    model=lr,
    X_train=X_train,
    y_train=y_train,
    X_test=X_test,
    y_test=y_test,
    y_pred=y_pred,
    model_name="Logisitc Regression",
)
lr_performance

In [54]:
from sklearn.svm import LinearSVC
# Create the LinearSVC model with some regularization
LSVC = LinearSVC(random_state=1, dual=False, C=1/1.2)
# Fit the model
LSVC.fit(X_train, y_train)
# Use the trained model to predict
y_pred = LSVC.predict(X_test)
# model accuracy
print(f'Model train accuracy: {LSVC.score(X_train, y_train)*100:.3f}%')
print(f'Model test accuracy: {LSVC.score(X_test, y_test)*100:.3f}%')
# scikit-learn metrics take (y_true, y_pred); reversed arguments swap precision and recall
print(f'Model test precision: {precision_score(y_test, y_pred):.3f}')
print(f'Model test recall: {recall_score(y_test, y_pred):.3f}')
print(f'Model test f1_score: {f1_score(y_test, y_pred):.3f}')

In [55]:
# One-row score summary for the LinearSVC model
LSVC_performance = ML_models_performance(
    model=LSVC,
    X_train=X_train,
    y_train=y_train,
    X_test=X_test,
    y_test=y_test,
    y_pred=y_pred,
    model_name="LinearSVC",
)
LSVC_performance

In [56]:
from sklearn.ensemble import RandomForestClassifier
# Create the RandomForest model
rfc = RandomForestClassifier(random_state=0)
# Fit the model
rfc.fit(X_train, y_train)
# Use the trained model to predict
y_pred = rfc.predict(X_test)
# model accuracy
print(f'Model train accuracy: {rfc.score(X_train, y_train)*100:.3f}%')
print(f'Model test accuracy: {rfc.score(X_test, y_test)*100:.3f}%')
# scikit-learn metrics take (y_true, y_pred); reversed arguments swap precision and recall
print(f'Model test precision: {precision_score(y_test, y_pred):.3f}')
print(f'Model test recall: {recall_score(y_test, y_pred):.3f}')
print(f'Model test f1_score: {f1_score(y_test, y_pred):.3f}')

In [30]:
# One-row score summary for the random forest model
rfc_performance = ML_models_performance(
    model=rfc,
    X_train=X_train,
    y_train=y_train,
    X_test=X_test,
    y_test=y_test,
    y_pred=y_pred,
    model_name="Random Forest",
)
rfc_performance

In [58]:
from sklearn.svm import SVC
# Create the SVC model with some regularization
svcl = SVC(kernel='linear', random_state=1, C=1/2)
# Fit the model
svcl.fit(X_train, y_train)
# Use the trained model to predict
y_pred = svcl.predict(X_test)
# model accuracy
print(f'Model train accuracy: {svcl.score(X_train, y_train)*100:.3f}%')
print(f'Model test accuracy: {svcl.score(X_test, y_test)*100:.3f}%')
# scikit-learn metrics take (y_true, y_pred); reversed arguments swap precision and recall
print(f'Model test precision: {precision_score(y_test, y_pred):.3f}')
print(f'Model test recall: {recall_score(y_test, y_pred):.3f}')
print(f'Model test f1_score: {f1_score(y_test, y_pred):.3f}')

In [60]:
# One-row score summary for the linear-kernel SVC model
svcl_performance = ML_models_performance(
    model=svcl,
    X_train=X_train,
    y_train=y_train,
    X_test=X_test,
    y_test=y_test,
    y_pred=y_pred,
    model_name="SVC_Linear_Kernel",
)
svcl_performance

In [61]:
# Side-by-side scores of the four models trained on the TF-IDF features
model_summaries = [lr_performance, LSVC_performance, rfc_performance, svcl_performance]
comparison_df = pd.concat(model_summaries)
comparison_df

It seems that the bag-of-words representation of the texts, combined with either LinearSVC or the SVM model with a linear kernel, gives the best model in terms of accuracy and f1_score.
To improve the ML models for this classification, we could experiment with the scites threshold that separates hot from not-hot topics.
We could also tune the hyperparameters of the different models (for example the regularization parameter), since the models are overfitting.
Finally, the amount of data is limited here (only 1932 papers). Overfitting can be reduced by adding more data.