## LOAD DATA

In [1]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
# Import Pandas
import pandas as pd

# Load Movies Metadata
metadata = pd.read_csv("/content/drive/MyDrive/Buom/ver4/Paper.csv", low_memory=False)

metadata.head(10)


Unnamed: 0,paper_id,title,abstract,published_year
0,1,Computers and Intractability: A Guide to the T...,\N,1979
1,2860,Substructure similarity search in graph databases,Advanced database systems face a great challen...,2005
2,2884,Inferring AS Relationships: Dead End or Lively...,Recent techniques for inferring business relat...,2005
3,2885,Bidimensionality: new connections between FPT ...,We demonstrate a new connection between fixed-...,2005
4,2892,Hardness of the Undirected Edge-Disjoint Paths...,In the Edge-Disjoint Paths problem with Conges...,2005
5,2899,Aggregate operators in probabilistic databases,Though extensions to the relational data model...,2005
6,2920,"Multicommodity flow, well-linked terminals, an...",We study multicommodity routing problems in bo...,2005
7,2953,Optimal Component Composition for Scalable Str...,Stream processing has become increasingly impo...,2005
8,2968,Constrained Bandwidth Allocation in MultiSenso...,Sensor networks are increasingly seen as a so-...,2005
9,3178,A dissimilarity-based approach for Classification,The Nearest Neighbor classifier has shown to b...,2005


## GET NEWEST PAPERS



In [3]:
df = metadata.copy()
df = df.query("published_year != r'\\N'")
df['published_year'] = df['published_year'].astype(int)
df = df.sort_values('published_year', ascending=False)
df[['paper_id', 'title', 'published_year']].head(20)

Unnamed: 0,paper_id,title,published_year
16038,1832722,Implicators based on binary aggregation operat...,2005
19528,2735743,The Challenge of Visualizing Patient Histories...,2005
6506,507928,Multi-Server Queueing Systems with Multiple Pr...,2005
13439,1530576,An Ontology-Based Pattern Mining System for Ex...,2005
6492,505622,A Web Services-Based Universal Approach to Het...,2005
19537,2738129,The Bunch-Active Shape Model,2005
13455,1535232,Cryptanalysis of an MPEG-Video Encryption Sche...,2005
19532,2736480,Fuzzy logic based multi-optimum programming in...,2005
19531,2736422,Meta-Toys Workshop and Environment with Squeak,2005
13457,1535531,A generic framework of user attention model an...,2005


## Get Papers published in a period

In [4]:
def papers_in_period(year_begin, year_end):
    q_papers = df.copy().loc[(df['published_year'] >= year_begin) & (df['published_year'] <= year_end)]
    return q_papers


In [5]:
papers = papers_in_period(2000,2001)
papers[['paper_id', 'title', 'published_year']].head(20)

Unnamed: 0,paper_id,title,published_year
22597,3453023,Persistent conversation: a dialog between rese...,2001
10681,1234402,Controlling Overfitting in Classification-Tree...,2001
22208,3392768,The Propel Distributed Services Platform,2001
22070,3349978,S cilab to S cilab //: The O uragan project,2001
22789,3471951,Mapping Database Content to XML Pages - A Meta...,2001
22094,3355496,Parallel Processing for Branching Simulation,2001
22681,3460446,Dynamic content acceleration: a caching soluti...,2001
10611,1223590,An Adaptive Probe-Based Technique to Optimize ...,2001
10783,1251779,A New Control Scheme for Combustion Processes ...,2001
10488,1209448,A Review of the First Cooperative Projects in ...,2001


## CONTENT-BASED RECOMMENDER


In [6]:
# Print plot overviews of the first 5 movies.
metadata['abstract'].head(20)


0                                                    \N
1     Advanced database systems face a great challen...
2     Recent techniques for inferring business relat...
3     We demonstrate a new connection between fixed-...
4     In the Edge-Disjoint Paths problem with Conges...
5     Though extensions to the relational data model...
6     We study multicommodity routing problems in bo...
7     Stream processing has become increasingly impo...
8     Sensor networks are increasingly seen as a so-...
9     The Nearest Neighbor classifier has shown to b...
10    The WWW is a new advertising media in recent y...
11    The study of genome rearrangements, the evolut...
12    As an effective way for dimensionality reducti...
13    Extending the single optimized spaced seed of ...
14    Combinatorial optimization is often used to "p...
15    We show that link interferences in multihop wi...
16    Data design has been characterized as a proces...
17    Routing in wireless network is challenging

In [7]:
#Import TfIdfVectorizer from scikit-learn
from sklearn.feature_extraction.text import TfidfVectorizer

#Define a TF-IDF Vectorizer Object. Remove all english stop words such as 'the', 'a'
tfidf = TfidfVectorizer(stop_words='english')

#Replace NaN with an empty string
metadata['title_abstract'] = metadata.apply(lambda row: f"{row['title']} {row['abstract']}", axis=1)
metadata['title_abstract'] = metadata['title_abstract'].fillna('')

#Construct the required TF-IDF matrix by fitting and transforming the data
tfidf_matrix = tfidf.fit_transform(metadata['title_abstract'])

#Output the shape of tfidf_matrix
tfidf_matrix.shape


(23587, 49482)

Number of papers: 23587 \
Number of terms: 49482

### Using cosine similarity to find similar papers for a paper

In [8]:
# Import linear_kernel
from sklearn.metrics.pairwise import linear_kernel

# Compute the cosine similarity matrix
cosine_sim = linear_kernel(tfidf_matrix, tfidf_matrix) #not enough memory


In [14]:
# Save cosine sim
import numpy as np
np.save('/content/drive/MyDrive/Buom/ver4/cosine_similarity.npy', cosine_sim)

In [10]:
# Mapping paper_id to row number in dataframe
id_mapping = {index: i for i, index in enumerate(metadata['paper_id'].unique())}

In [11]:
# Function that takes in paper_id as input and outputs most similar papers
def get_recommendations(idx, cosine_sim=cosine_sim):
    # Get the pairwsie similarity scores of all papers with that movie
    sim_scores = list(enumerate(cosine_sim[idx]))

    # Sort the papers based on the similarity scores
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)

    # Get the scores of the 10 most similar papers
    sim_scores = sim_scores[1:11]

    # Get the paper id
    paper_ids = [i[0] for i in sim_scores]

    # Return the top 10 most similar movies
    return metadata['title'].iloc[paper_ids]


In [12]:
id_mapping

{1: 0,
 2860: 1,
 2884: 2,
 2885: 3,
 2892: 4,
 2899: 5,
 2920: 6,
 2953: 7,
 2968: 8,
 3178: 9,
 3247: 10,
 3265: 11,
 3449: 12,
 3472: 13,
 3473: 14,
 3519: 15,
 3533: 16,
 3536: 17,
 3549: 18,
 3553: 19,
 3556: 20,
 3576: 21,
 3645: 22,
 3655: 23,
 3666: 24,
 3678: 25,
 3683: 26,
 3716: 27,
 3826: 28,
 3905: 29,
 3985: 30,
 4070: 31,
 4080: 32,
 4087: 33,
 4088: 34,
 4120: 35,
 4203: 36,
 4217: 37,
 4230: 38,
 4347: 39,
 4380: 40,
 4435: 41,
 4511: 42,
 4602: 43,
 4603: 44,
 4621: 45,
 4660: 46,
 4663: 47,
 4673: 48,
 4688: 49,
 4720: 50,
 4729: 51,
 4736: 52,
 4746: 53,
 4761: 54,
 4805: 55,
 4808: 56,
 4831: 57,
 4892: 58,
 5036: 59,
 5043: 60,
 5051: 61,
 5052: 62,
 5053: 63,
 5066: 64,
 5075: 65,
 5083: 66,
 5096: 67,
 5149: 68,
 5150: 69,
 5241: 70,
 5407: 71,
 5421: 72,
 5438: 73,
 5443: 74,
 5445: 75,
 5446: 76,
 5448: 77,
 5472: 78,
 5473: 79,
 5485: 80,
 5492: 81,
 5506: 82,
 5539: 83,
 5570: 84,
 5597: 85,
 5625: 86,
 5724: 87,
 5796: 88,
 5799: 89,
 5800: 90,
 5803: 91,
 

In [13]:
input_paper_idx = 6135
# 3338907
# 469306
# 170
# 202
get_recommendations(id_mapping[input_paper_idx])

2391    A tight bound on approximating arbitrary metri...
2397      On approximating planar metrics by tree metrics
3044    Approximating min-sum k -clustering in metric ...
4930    On approximating arbitrary metrics by tree met...
3876    Approximation algorithms for the metric labeli...
4261    Probabilistic approximation of metric spaces a...
4653              The Finite Capacity Dial-A-Ride Problem
4506                   On Approximation Algorithms for #P
6171    Deterministic Polylog Approximation for Minimu...
6225    A Linear Programming Formulation and Approxima...
Name: title, dtype: object