In [None]:
!pip install feedparser scikit-learn numpy pandas



In [None]:
import urllib.request
import feedparser
import pandas as pd
import time
from datetime import datetime

def search_arxiv(keyword, start=0, max_results=1000, timeout=30):
    """
    Fetch paper metadata from the arXiv API for a free-text keyword.

    The keyword is searched across all fields (``all:``); the query is NOT
    restricted to CS categories, so results from any arXiv category can be
    returned. Results are requested sorted by submission date, newest first.

    Args:
        keyword: Search phrase placed after the ``all:`` field prefix.
        start: Offset of the first result to request.
        max_results: Maximum number of entries to request.
        timeout: Seconds to wait for the HTTP response before giving up.

    Returns:
        A list of dicts with the keys ``title``, ``authors``,
        ``published_date``, ``summary``, ``link``, ``categories`` and
        ``main_category`` (missing feed fields become empty strings);
        an empty list if the feed has no entries; ``None`` if the request
        or the parsing raised an exception (the error is printed).
    """
    # Local import keeps this block self-contained.
    from urllib.parse import urlencode

    print(f"Mencari paper dengan keyword: {keyword} (start: {start}, max: {max_results})")

    base_url = 'http://export.arxiv.org/api/query?'
    params = {
        'search_query': f'all:{keyword}',
        'start': start,
        'max_results': max_results,
        'sortBy': 'submittedDate',
        'sortOrder': 'descending',
    }
    # urlencode escapes reserved characters (&, #, +, ...) that would otherwise
    # corrupt the query string; ':' is kept literal so plain keywords yield the
    # same URL as simple space-to-plus replacement.
    url = base_url + urlencode(params, safe=':')

    try:
        # The context manager closes the connection even if read/parse fails.
        with urllib.request.urlopen(url, timeout=timeout) as response:
            feed = feedparser.parse(response.read())

        papers = []
        for entry in feed.entries:
            # Every field is read with a default so that one incomplete entry
            # does not raise and discard the whole batch.
            categories = [tag['term'] for tag in entry.get('tags', [])]
            author_names = [author.get('name', '') for author in entry.get('authors', [])]
            primary_category = entry.get('arxiv_primary_category', {})

            papers.append({
                'title': entry.get('title', ''),
                'authors': ', '.join(author_names),
                'published_date': entry.get('published', ''),
                'summary': entry.get('summary', ''),
                'link': entry.get('link', ''),
                'categories': ', '.join(categories),
                'main_category': primary_category.get('term', ''),
            })

        return papers

    except Exception as e:
        # Best-effort: report the problem and let the caller skip this keyword.
        print(f"Error: {str(e)}")
        return None

def main(papers_per_keyword=100, request_delay=3):
    """
    Collect arXiv papers for a fixed set of keywords and save them to CSV.

    For every keyword, one request is sent through ``search_arxiv``. All
    results are merged, de-duplicated by title, given a parsed ``timestamp``
    column and written to ``arxiv_cs_papers_<YYYYmmdd_HHMM>.csv`` (relative
    path). A preview and a few summary statistics are printed/displayed.

    Args:
        papers_per_keyword: Maximum number of papers requested per keyword.
        request_delay: Seconds to sleep after every request.

    Returns:
        The de-duplicated DataFrame, or ``None`` when nothing was fetched.
    """
    # Search keywords grouped by topic; the group name is only used for logging.
    keywords_by_category = {
        'Machine Learning & AI': [
            'deep learning neural networks',
            'reinforcement learning',
            'machine learning algorithms',
            'natural language processing',
            'computer vision recognition'
        ],
        'Data Science': [
            'big data analytics',
            'data mining techniques',
            'predictive modeling',
            'statistical learning'
        ],
        'Software Engineering': [
            'software development methodology',
            'agile development',
            'software testing',
            'code analysis'
        ],
        'Security': [
            'cybersecurity methods',
            'network security',
            'cryptography algorithms',
            'blockchain technology'
        ],
        'Systems': [
            'distributed systems',
            'cloud computing architecture',
            'operating systems',
            'embedded systems design'
        ],
        'Networks': [
            'computer networks protocols',
            'wireless networking',
            'network optimization',
            'internet protocols'
        ],
        'Database': [
            'database management systems',
            'query optimization',
            'data warehouse',
            'NoSQL databases'
        ],
        'Graphics & Vision': [
            '3D computer graphics',
            'image processing algorithms',
            'virtual reality systems',
            'augmented reality'
        ]
    }

    all_papers = []

    for category, keywords in keywords_by_category.items():
        print(f"\nMengambil data untuk kategori: {category}")
        for keyword in keywords:
            # Fetch everything for this keyword in a single batch.
            papers = search_arxiv(keyword, start=0, max_results=papers_per_keyword)

            if papers:
                all_papers.extend(papers)
                print(f"Berhasil mengambil {len(papers)} paper untuk keyword '{keyword}'")
            else:
                # search_arxiv returns None on error and [] on an empty feed;
                # both are reported the same way here.
                print(f"Tidak ada hasil untuk keyword '{keyword}'")

            # Pause after EVERY request, including failed or empty ones, so
            # that errors do not turn into a burst of back-to-back requests.
            time.sleep(request_delay)

    if not all_papers:
        print("Tidak ada data yang berhasil diambil")
        return None

    # Drop duplicates by title (the same paper can match several keywords).
    df = pd.DataFrame(all_papers)
    df = df.drop_duplicates(subset=['title'])

    # Add a parsed datetime column to make time-based analysis easier.
    df['timestamp'] = pd.to_datetime(df['published_date'])

    # Save the result; the file name carries the time of the run.
    filename = f'arxiv_cs_papers_{datetime.now().strftime("%Y%m%d_%H%M")}.csv'
    df.to_csv(filename, index=False)

    print("\nContoh data yang berhasil diambil:")
    display(df.head())

    print(f"\nTotal paper yang terkumpul (setelah menghapus duplikat): {len(df)}")

    # Most frequent primary categories.
    print("\nDistribusi kategori utama:")
    print(df['main_category'].value_counts().head(10))

    # Paper counts for the most recent publication years.
    print("\nDistribusi tahun publikasi:")
    print(df['timestamp'].dt.year.value_counts().sort_index().tail())

    return df

# Run the scraper; the returned DataFrame is bound to `df` at top level,
# where the later analysis cells read it.
if __name__ == "__main__":
    print("Memulai pengambilan data dari ArXiv...")
    df = main()

Memulai pengambilan data dari ArXiv...

Mengambil data untuk kategori: Machine Learning & AI
Mencari paper dengan keyword: deep learning neural networks (start: 0, max: 100)
Berhasil mengambil 100 paper untuk keyword 'deep learning neural networks'
Mencari paper dengan keyword: reinforcement learning (start: 0, max: 100)
Berhasil mengambil 100 paper untuk keyword 'reinforcement learning'
Mencari paper dengan keyword: machine learning algorithms (start: 0, max: 100)
Berhasil mengambil 100 paper untuk keyword 'machine learning algorithms'
Mencari paper dengan keyword: natural language processing (start: 0, max: 100)
Berhasil mengambil 100 paper untuk keyword 'natural language processing'
Mencari paper dengan keyword: computer vision recognition (start: 0, max: 100)
Berhasil mengambil 100 paper untuk keyword 'computer vision recognition'

Mengambil data untuk kategori: Data Science
Mencari paper dengan keyword: big data analytics (start: 0, max: 100)
Berhasil mengambil 100 paper untuk key

Unnamed: 0,title,authors,published_date,summary,link,categories,main_category,timestamp
0,Retrieving Semantics from the Deep: an RAG Sol...,"M. Hamza Mughal, Rishabh Dabral, Merel C. J. S...",2024-12-09T18:59:46Z,Non-verbal communication often comprises of se...,http://arxiv.org/abs/2412.06786v1,cs.CV,cs.CV,2024-12-09 18:59:46+00:00
1,P3-PO: Prescriptive Point Priors for Visuo-Spa...,"Mara Levy, Siddhant Haldar, Lerrel Pinto, Abhi...",2024-12-09T18:59:42Z,Developing generalizable robot policies that c...,http://arxiv.org/abs/2412.06784v1,"cs.RO, cs.AI, cs.CV, cs.LG",cs.RO,2024-12-09 18:59:42+00:00
2,CARP: Visuomotor Policy Learning via Coarse-to...,"Zhefei Gong, Pengxiang Ding, Shangke Lyu, Site...",2024-12-09T18:59:18Z,"In robotic visuomotor policy learning, diffusi...",http://arxiv.org/abs/2412.06782v1,"cs.RO, cs.CV",cs.RO,2024-12-09 18:59:18+00:00
3,Around the World in 80 Timesteps: A Generative...,"Nicolas Dufour, David Picard, Vicky Kalogeiton...",2024-12-09T18:59:04Z,Global visual geolocation predicts where an im...,http://arxiv.org/abs/2412.06781v1,"cs.CV, cs.LG",cs.CV,2024-12-09 18:59:04+00:00
4,Driv3R: Learning Dense 4D Reconstruction for A...,"Xin Fei, Wenzhao Zheng, Yueqi Duan, Wei Zhan, ...",2024-12-09T18:58:03Z,Realtime 4D reconstruction for dynamic scenes ...,http://arxiv.org/abs/2412.06777v1,"cs.CV, cs.AI, cs.LG",cs.CV,2024-12-09 18:58:03+00:00



Total paper yang terkumpul (setelah menghapus duplikat): 755

Distribusi kategori utama:
main_category
cs.CV             122
cs.LG              66
quant-ph           42
cs.CL              32
cs.CR              30
cs.DB              28
eess.SP            20
eess.IV            19
cs.AI              19
physics.optics     18
Name: count, dtype: int64

Distribusi tahun publikasi:
timestamp
2024    755
Name: count, dtype: int64


In [None]:
# Show the number of papers per primary category
print(df['main_category'].value_counts())

# Keep only papers whose primary category contains 'cs.AI'.
# regex=False makes the '.' a literal dot; as a regex it would match any
# character (e.g. 'cs-AI' or 'csXAI').
ml_papers = df[df['main_category'].str.contains('cs.AI', na=False, regex=False)]

main_category
cs.CV              122
cs.LG               66
quant-ph            42
cs.CL               32
cs.CR               30
                  ... 
physics.acc-ph       1
physics.ao-ph        1
cs.CE                1
cs.DM                1
cond-mat.dis-nn      1
Name: count, Length: 99, dtype: int64


In [None]:
# Publication trend: number of papers per publication year, in ascending
# year order (taken from the parsed `timestamp` column).
yearly_counts = df['timestamp'].dt.year.value_counts().sort_index()
display(yearly_counts)

Unnamed: 0_level_0,count
timestamp,Unnamed: 1_level_1
2024,755


In [None]:
!pip install nltk scikit-learn



In [None]:
!pip install nltk pandas numpy scikit-learn streamlit

Collecting streamlit
  Downloading streamlit-1.41.0-py2.py3-none-any.whl.metadata (8.5 kB)
Collecting watchdog<7,>=2.1.5 (from streamlit)
  Downloading watchdog-6.0.0-py3-none-manylinux2014_x86_64.whl.metadata (44 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m44.3/44.3 kB[0m [31m1.7 MB/s[0m eta [36m0:00:00[0m
Collecting pydeck<1,>=0.8.0b4 (from streamlit)
  Downloading pydeck-0.9.1-py2.py3-none-any.whl.metadata (4.1 kB)
Downloading streamlit-1.41.0-py2.py3-none-any.whl (23.4 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m23.4/23.4 MB[0m [31m70.3 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading pydeck-0.9.1-py2.py3-none-any.whl (6.9 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m6.9/6.9 MB[0m [31m85.7 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading watchdog-6.0.0-py3-none-manylinux2014_x86_64.whl (79 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m79.1/79.1 kB[0m [31m5.9 MB/s[0m eta [36m0:00:00[0m

In [None]:
import nltk
# Tokenizer models and stopword list used by the preprocessing step.
nltk.download('punkt')
# The tokenizer in this environment looks up 'punkt_tab'; without it the
# preprocessing fails with "Resource punkt_tab not found".
nltk.download('punkt_tab')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [None]:
!ls *.csv

arxiv_cs_papers_20241211_0331.csv


In [None]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import nltk
import re
from datetime import datetime

# Download the NLTK data needed by preprocess_text.
nltk.download('punkt')
# word_tokenize looks up 'punkt_tab' in this environment; without it the
# preprocessing fails with "Resource punkt_tab not found".
nltk.download('punkt_tab')
nltk.download('stopwords')

# Matches every character that is neither an ASCII letter nor whitespace.
_NON_LETTER_RE = re.compile(r'[^a-zA-Z\s]')

# Stopword sets by language, filled on first use.
_STOPWORD_CACHE = {}


def _get_stop_words(language='english'):
    """Return the stopword set for `language`, loading it only once."""
    if language not in _STOPWORD_CACHE:
        _STOPWORD_CACHE[language] = frozenset(stopwords.words(language))
    return _STOPWORD_CACHE[language]


def preprocess_text(text):
    """
    Clean a text and return it as a space-separated string of tokens.

    Steps: lowercase, delete every character that is not an ASCII letter or
    whitespace (digits and punctuation are removed, not replaced by a space),
    tokenize with ``word_tokenize`` and drop English stopwords.

    Args:
        text: The raw text. Any non-string value (e.g. NaN, None) is treated
            as missing.

    Returns:
        The cleaned tokens joined by single spaces, or ``''`` for non-string
        input.
    """
    if not isinstance(text, str):
        return ''

    lowered = text.lower()
    letters_only = _NON_LETTER_RE.sub('', lowered)
    tokens = word_tokenize(letters_only)

    # The stopword set is cached: loading it once per document is wasted work
    # when this function is applied to a whole DataFrame column.
    stop_words = _get_stop_words('english')
    return ' '.join(token for token in tokens if token not in stop_words)

def create_feature_matrix(df):
    """
    Build a TF-IDF feature matrix from the papers' text columns.

    The frame is modified in place: missing ``title``/``summary``/
    ``categories`` columns are created as empty strings, and the columns
    ``combined_features`` and ``processed_features`` are added.

    Args:
        df: DataFrame of papers.

    Returns:
        A tuple ``(feature_matrix, tfidf)``: the result of
        ``TfidfVectorizer.fit_transform`` on the processed text, and the
        fitted vectorizer (limited to 5000 features).
    """
    # Make sure the required columns exist.
    required_columns = ['title', 'summary', 'categories']
    for col in required_columns:
        if col not in df.columns:
            df[col] = ''

    # Combine the text columns. Missing values are replaced by '' first:
    # string + NaN is NaN, so a single missing summary or category would
    # otherwise wipe out the whole combined text, including the title.
    text_parts = [df[col].fillna('').astype(str) for col in required_columns]
    df['combined_features'] = text_parts[0] + ' ' + text_parts[1] + ' ' + text_parts[2]

    # Preprocessing
    print("Melakukan preprocessing teks...")
    df['processed_features'] = df['combined_features'].apply(preprocess_text)

    # Build the TF-IDF matrix; max_features caps the vocabulary size.
    print("Membuat TF-IDF matrix...")
    tfidf = TfidfVectorizer(max_features=5000)
    feature_matrix = tfidf.fit_transform(df['processed_features'])

    return feature_matrix, tfidf

def calculate_similarity_matrix(feature_matrix):
    """
    Compute the pairwise cosine similarity between the rows of
    ``feature_matrix`` and return the resulting matrix.
    """
    print("Menghitung similarity matrix...")
    pairwise_similarity = cosine_similarity(feature_matrix)
    return pairwise_similarity

def get_recommendations(paper_idx, similarity_matrix, df, n_recommendations=5):
    """
    Return the papers most similar to the paper at position ``paper_idx``.

    Args:
        paper_idx: Row position of the query paper in ``similarity_matrix``
            and in ``df`` (positional, as used by ``df.iloc``).
        similarity_matrix: Square matrix of pairwise similarity scores.
        df: DataFrame with at least ``title``, ``authors`` and ``link``.
        n_recommendations: Maximum number of papers to return.

    Returns:
        A DataFrame with the columns ``title``, ``authors``,
        ``similarity_score`` and ``link``, ordered from most to least
        similar. The query paper itself is never included.
    """
    paper_similarities = similarity_matrix[paper_idx]

    # Descending order; the stable sort keeps ties in row order.
    ranked = (-paper_similarities).argsort(kind='stable')

    # Remove the query paper explicitly instead of assuming it is ranked
    # first: with an all-zero feature row or with tied scores it is not
    # necessarily at position 0, and slicing [1:] would then drop a real
    # match and recommend the paper itself.
    self_position = paper_idx % len(paper_similarities)
    ranked = ranked[ranked != self_position]
    similar_indices = ranked[:n_recommendations]

    selected = df.iloc[similar_indices]
    recommendations = pd.DataFrame({
        'title': selected['title'],
        'authors': selected['authors'],
        'similarity_score': paper_similarities[similar_indices],
        'link': selected['link']
    })

    return recommendations

def prepare_recommendation_system(filename):
    """
    Load a scraped CSV and build everything the recommender needs.

    Reads ``filename``, drops rows without a title, builds the TF-IDF
    feature matrix and the similarity matrix, and saves the processed frame
    (CSV) and the similarity matrix (``.npy``) under time-stamped names.

    Returns:
        ``(df, feature_matrix, similarity_matrix, tfidf)`` on success, or
        ``(None, None, None, None)`` if the file is missing or any step
        raised (the error is printed).
    """
    print(f"Membaca file: {filename}")
    try:
        # Load the scraped papers
        papers = pd.read_csv(filename)
        print(f"Berhasil membaca file dengan {len(papers)} baris data")

        # Rows without a title are discarded
        papers = papers.dropna(subset=['title'])
        print(f"Data setelah dibersihkan: {len(papers)} baris")

        # TF-IDF features
        features, vectorizer = create_feature_matrix(papers)
        print(f"Feature matrix shape: {features.shape}")

        # Pairwise similarities
        similarities = calculate_similarity_matrix(features)
        print(f"Similarity matrix shape: {similarities.shape}")

        # Persist the results under names that carry the current time
        run_stamp = datetime.now().strftime("%Y%m%d_%H%M")
        processed_filename = f'processed_papers_{run_stamp}.csv'
        similarity_filename = f'similarity_matrix_{run_stamp}.npy'

        papers.to_csv(processed_filename, index=False)
        np.save(similarity_filename, similarities)

        print("\nHasil preprocessing disimpan di:")
        for saved_path in (processed_filename, similarity_filename):
            print(f"- {saved_path}")

        return papers, features, similarities, vectorizer

    except FileNotFoundError:
        print(f"Error: File {filename} tidak ditemukan")
        print("Pastikan nama file sesuai dengan hasil scraping sebelumnya")
        return None, None, None, None
    except Exception as err:
        print(f"Error: {str(err)}")
        return None, None, None, None

# Run the recommendation pipeline
def main():
    """
    Build the recommender from the CSV file that sorts last among
    ``arxiv*.csv`` and show sample recommendations for the first paper.

    Note: this definition replaces the scraper's ``main`` defined earlier
    in the notebook.

    Returns:
        ``(df_processed, feature_matrix, similarity_matrix, tfidf)``; all
        four are ``None`` when no CSV file exists or preparation failed.
    """
    import glob

    # Look for scraped CSV files
    candidates = glob.glob('arxiv*.csv')
    if len(candidates) == 0:
        print("Tidak ada file CSV yang ditemukan")
        return None, None, None, None

    # The time-stamped file names sort chronologically, so the last name
    # is the newest file
    latest_file = sorted(candidates)[-1]
    print(f"Menggunakan file: {latest_file}")

    # Preprocessing
    df_processed, feature_matrix, similarity_matrix, tfidf = prepare_recommendation_system(latest_file)

    if df_processed is None:
        return df_processed, feature_matrix, similarity_matrix, tfidf

    # Sample recommendations
    print("\nContoh rekomendasi untuk paper pertama:")
    sample = get_recommendations(0, similarity_matrix, df_processed)
    display(sample)

    print("\nStatistik data:")
    print(f"Total paper: {len(df_processed)}")
    print(f"Jumlah fitur: {feature_matrix.shape[1]}")
    if 'main_category' in df_processed.columns:
        print("\nDistribusi kategori:")
        print(df_processed['main_category'].value_counts().head())

    return df_processed, feature_matrix, similarity_matrix, tfidf

# Run the pipeline and keep the four results at top level, where the later
# cells read them.
if __name__ == "__main__":
    df_processed, feature_matrix, similarity_matrix, tfidf = main()

Menggunakan file: arxiv_cs_papers_20241211_0331.csv
Membaca file: arxiv_cs_papers_20241211_0331.csv
Berhasil membaca file dengan 755 baris data
Data setelah dibersihkan: 755 baris
Melakukan preprocessing teks...
Error: 
**********************************************************************
  Resource [93mpunkt_tab[0m not found.
  Please use the NLTK Downloader to obtain the resource:

  [31m>>> import nltk
  >>> nltk.download('punkt_tab')
  [0m
  For more information see: https://www.nltk.org/data.html

  Attempted to load [93mtokenizers/punkt_tab/english/[0m

  Searched in:
    - '/root/nltk_data'
    - '/usr/nltk_data'
    - '/usr/share/nltk_data'
    - '/usr/lib/nltk_data'
    - '/usr/share/nltk_data'
    - '/usr/local/share/nltk_data'
    - '/usr/lib/nltk_data'
    - '/usr/local/lib/nltk_data'
**********************************************************************



[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [None]:
# Run the pipeline again and rebind the four top-level result names.
df_processed, feature_matrix, similarity_matrix, tfidf = main()

Menggunakan file: arxiv_cs_papers_20241211_0331.csv
Membaca file: arxiv_cs_papers_20241211_0331.csv
Berhasil membaca file dengan 755 baris data
Data setelah dibersihkan: 755 baris
Melakukan preprocessing teks...
Error: 
**********************************************************************
  Resource [93mpunkt_tab[0m not found.
  Please use the NLTK Downloader to obtain the resource:

  [31m>>> import nltk
  >>> nltk.download('punkt_tab')
  [0m
  For more information see: https://www.nltk.org/data.html

  Attempted to load [93mtokenizers/punkt_tab/english/[0m

  Searched in:
    - '/root/nltk_data'
    - '/usr/nltk_data'
    - '/usr/share/nltk_data'
    - '/usr/lib/nltk_data'
    - '/usr/share/nltk_data'
    - '/usr/local/share/nltk_data'
    - '/usr/lib/nltk_data'
    - '/usr/local/lib/nltk_data'
**********************************************************************



In [None]:
# Recommendations for a specific paper
if df_processed is not None:
    # Show the titles of the first five papers together with their positions
    print("Daftar 5 paper pertama:")
    for idx, title in enumerate(df_processed['title'][:5]):
        print(f"{idx}: {title}")

    # Choose the paper to get recommendations for
    paper_idx = 0  # Replace with the desired index
    recommendations = get_recommendations(paper_idx, similarity_matrix, df_processed)
    display(recommendations)