In [None]:
!pip install -q pymongo pandas numpy matplotlib seaborn

In [None]:
import math
import os
from getpass import getpass
from statistics import mean, stdev

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import pymongo
import seaborn as sns
from pymongo import MongoClient

In [None]:
# Open the MongoDB connection used by the rest of the notebook.
#
# The connection string embeds a username and password, so it must never be
# committed in the notebook. It is read from the MONGODB_URI environment
# variable; when that is unset or empty, the user is prompted for it without
# echoing the input.
# NOTE(review): the URI that used to be hardcoded here exposed a live database
# password — rotate that credential, since it is already in the file history.
mongodb_uri = os.environ.get("MONGODB_URI") or getpass("MongoDB connection URI: ")

client = pymongo.MongoClient(mongodb_uri)
db = client.get_database('DS_DIY')

# Despite the plural name, this is a handle to the single `paper` collection.
# The name is kept because later cells refer to `collections`.
collections = db.paper
print('Collections in DB:', collections)

In [None]:
def _text_lengths(text):
    """Return ``(characters, characters_without_whitespace, words)`` for ``text``.

    A missing value (``None`` or any other falsy value) is treated as an empty
    string, so all three counts are 0 instead of raising on ``len(None)``.
    """
    text = text or ''
    words = text.split()
    return len(text), len(''.join(words)), len(words)


def _paper_to_row(paper_id, paper_data):
    """Flatten one paper entry (a dict of fields) into a single feature row.

    Args:
        paper_id: Key under which the paper is stored in the main document.
        paper_data: Dict of the paper's fields; ``title``, ``abstract``,
            ``venue``, ``year``, ``authors`` and ``submission_date`` are read,
            and any of them may be absent or null.

    Returns:
        Dict with one entry per output CSV column, in column order.
    """
    title_length, title_length_without_space, title_length_in_words = _text_lengths(
        paper_data.get('title')
    )
    # Abstract (not in references.json, so will be empty)
    abstract_length, abstract_length_without_space, abstract_length_in_words = _text_lengths(
        paper_data.get('abstract')
    )
    # `.get(key, '')` still returns None when the key exists with a null value,
    # so normalise with `or` before measuring.
    venue = paper_data.get('venue') or ''
    venue_length, venue_length_without_space, venue_length_in_words = _text_lengths(venue)

    authors = paper_data.get('authors') or []

    return {
        'id': paper_id,
        # Fields below that are set to ''/None/0 are not available from this
        # data source yet; they are kept as placeholders so the CSV schema is stable.
        'corpusId': '',
        'title_length': title_length,
        'title_length_without_space': title_length_without_space,
        'title_length_in_words': title_length_in_words,
        'abstract_length': abstract_length,
        'abstract_length_without_space': abstract_length_without_space,
        'abstract_length_in_words': abstract_length_in_words,
        'venue_length': venue_length,
        'venue_length_without_space': venue_length_without_space,
        'venue_length_in_words': venue_length_in_words,
        'venue': venue,
        'year': paper_data.get('year'),
        'referenceCount': None,
        'citationCount': None,
        'influentialCitationCount': None,
        'isOpenAccess': None,
        'fieldsOfStudy': None,
        's2FieldsOfStudy_count': 0,
        'publicationTypes_count': None,
        # The submission date doubles as the publication date, and the venue
        # doubles as the journal name.
        'publicationDate': paper_data.get('submission_date', ''),
        'journal_name': venue,
        'author_count': len(authors),
        'first_author_name': authors[0] if authors else '',
        'embedding_dimension': None,
        'embedding_mean': None,
        'embedding_stddev': None,
    }


# NOTE(review): `rows` is not used by any visible cell; kept in case a later cell relies on it.
rows = []
tabular_data = []

# The collection stores a single "main" document whose keys (other than `_id`)
# are paper ids and whose values are the per-paper field dicts.
main_doc = collections.find_one({})

if main_doc:
    paper_entries = {key: value for key, value in main_doc.items() if key != '_id'}
    print(f" Main document with {len(paper_entries)} papers")

    for arxiv_id, paper_data in paper_entries.items():
        if not isinstance(paper_data, dict):
            # A stray scalar/list field at the top level is not a paper entry;
            # skip it instead of crashing on `.get`.
            print(f"Skipping {arxiv_id!r}: expected a dict of paper fields, got {type(paper_data).__name__}")
            continue
        tabular_data.append(_paper_to_row(arxiv_id, paper_data))
else:
    print("No documents found in collection")

df = pd.DataFrame(tabular_data)
# Save to CSV
df.to_csv('paper.csv', index=False)
print("\nData saved to paper.csv")

In [None]:
# Exploratory look at the raw collection: how many documents it holds and what
# the first few look like. Long strings and long lists are abbreviated so the
# output stays readable; every other value is printed in full.
print(f"Total documents in collection: {collections.count_documents({})}")

sample_docs = list(collections.find({}).limit(3))
for doc_number, doc in enumerate(sample_docs, start=1):
    print(f"\n--- Document {doc_number} ---")
    print("Keys in document:", list(doc.keys()))
    print("Document structure:")
    for key, value in doc.items():
        value_type = type(value)
        is_long_text = isinstance(value, str) and len(value) > 100
        is_long_list = isinstance(value, list) and len(value) > 5
        if is_long_text:
            # Show only the first 100 characters of long strings.
            print(f"  {key}: {value_type} (length: {len(value)}) - {value[:100]}...")
        elif is_long_list:
            # Show only the first 3 items of long lists.
            print(f"  {key}: {value_type} (length: {len(value)}) - {value[:3]}...")
        else:
            print(f"  {key}: {value_type} - {value}")
    print("-" * 50)

In [None]:
# Sanity-check the first document of the collection: report the document count,
# list (up to 20 of) its keys, and probe a set of field names we would expect
# on a paper record.
total_docs = collections.count_documents({})
print(f"Total documents: {total_docs}")

first_doc = collections.find_one({})
if first_doc:
    print(f"\nFirst document ID: {first_doc.get('_id')}")
    # Only the first 20 keys are printed; an ellipsis marks truncation.
    print("Available keys:", list(first_doc.keys())[:20], "..." if len(first_doc.keys()) > 20 else "")

    # Check for key fields expected to have
    key_fields = ['title', 'abstract', 'authors', 'venue', 'year', 'paperId', 'corpusId']
    print(f"\nChecking key fields:")
    for field in key_fields:
        value = first_doc.get(field)
        if value is not None:
            if isinstance(value, str):
                # Strings are previewed at 50 characters, with their full length alongside.
                print(f"  {field}: '{value[:50]}{'...' if len(value) > 50 else ''}' (len: {len(value)})")
            else:
                print(f"  {field}: {type(value)} - {value}")
        else:
            print(f"  {field}: None/Missing")

    if 'arxiv_id' in first_doc:
        # The emoji was previously stored as mojibake (UTF-8 bytes decoded as cp1252).
        print(f"\n🎯 Found arxiv_id: {first_doc['arxiv_id']} - This looks like references.json data!")
else:
    print("No documents found in collection")

In [None]:
# Reload the feature table from the CSV written by the extraction cell, so the
# analysis below works from the persisted file, then preview the first rows.
csv_path = 'paper.csv'
df = pd.read_csv(csv_path)
df.head()

In [None]:
# --- Completeness: missing values per column ---------------------------------
print("Quick data quality assessment")
missing_counts = df.isnull().sum()
missing_frac = (missing_counts / len(df)).round(4)
completeness_df = pd.DataFrame({'missing_count': missing_counts, 'missing_fraction': missing_frac})
print("Missing values per column:")
# Only columns that actually have gaps are listed.
print(completeness_df.loc[completeness_df['missing_count'] > 0])

print(f"Total papers: {len(df)}")
print(f"Total columns: {len(df.columns)}")

# --- Text length distributions: 2x2 grid of histograms -----------------------
print("\ntext length distribution")
fig, axes = plt.subplots(2, 2, figsize=(15, 10))

# (axes, column, panel title, x-axis label): title lengths on the top row,
# venue lengths on the bottom row.
histogram_panels = [
    (axes[0, 0], 'title_length', 'Title Length (characters)', 'Characters'),
    (axes[0, 1], 'title_length_in_words', 'Title Length (words)', 'Words'),
    (axes[1, 0], 'venue_length', 'Venue Length (characters)', 'Characters'),
    (axes[1, 1], 'venue_length_in_words', 'Venue Length (words)', 'Words'),
]
for panel_ax, column, panel_title, x_label in histogram_panels:
    sns.histplot(df[column].dropna(), bins=20, ax=panel_ax)
    panel_ax.set_title(panel_title)
    panel_ax.set_xlabel(x_label)

plt.tight_layout()
plt.show()

# --- Papers per year ----------------------------------------------------------
print("\nyear distrubution")
# dropna=False keeps a bucket for papers with no year.
year_counts = df['year'].value_counts(dropna=False).sort_index()
print("Papers by year:")
print(year_counts)

plt.figure(figsize=(10, 6))
year_ax = year_counts.plot.bar()
year_ax.set_title('Papers by Publication Year')
year_ax.set_xlabel('Year')
year_ax.set_ylabel('Number of Papers')
plt.xticks(rotation=45)
plt.show()