# In [None]:
# ==========================================
# Part 1: Data Loading and Basic Exploration
# ==========================================

import pandas as pd

# Load dataset
df = pd.read_csv("metadata.csv", low_memory=False)

# Look at first 5 rows. A bare `df.head()` expression is discarded unless it
# is the last expression of a notebook cell, so print it explicitly.
print(df.head())

# Shape of dataset
print("Rows, Columns:", df.shape)

# Info about datatypes. DataFrame.info() writes its report itself and returns
# None, so wrapping it in print() would emit a stray "None" line.
df.info()

# Missing values check: null count per column, limited to the first 20 columns
print(df.isnull().sum().head(20))

# ==========================================
# Part 2: Data Cleaning and Preparation
# ==========================================

# Convert publish_time to datetime; values that cannot be parsed become NaT
df["publish_time"] = pd.to_datetime(df["publish_time"], errors="coerce")

# Extract year from publish_time (missing wherever publish_time is NaT)
df["year"] = df["publish_time"].dt.year

# Add abstract word count; missing abstracts are treated as empty text (0 words)
df["abstract_word_count"] = df["abstract"].fillna("").apply(lambda x: len(x.split()))

# Preview the derived columns. Printed explicitly because a bare expression
# is discarded unless it is the last expression of a notebook cell.
print(df[["title", "year", "abstract_word_count"]].head())

# ==========================================
# Part 3: Data Analysis and Visualization
# ==========================================

import matplotlib.pyplot as plt
from wordcloud import WordCloud
from collections import Counter
import re

# --- Publications per year
# Count papers for each year and order the result chronologically.
year_counts = df["year"].value_counts().sort_index()

# Draw the bar chart through the Axes object instead of the pyplot state machine.
_fig, ax = plt.subplots(figsize=(8, 5))
ax.bar(year_counts.index, year_counts.values)
ax.set_title("Publications by Year")
ax.set_xlabel("Year")
ax.set_ylabel("Number of Papers")
plt.show()

# --- Top journals
# value_counts() sorts by descending frequency, so the first ten entries
# are the ten most frequent journals.
journal_counts = df["journal"].value_counts()
top_journals = journal_counts.iloc[:10]
print(top_journals)

top_journals.plot.bar(figsize=(8, 5), title="Top Journals")
plt.show()

# --- Frequent words in titles
# Concatenate every non-missing title into one space-separated string.
title_strings = df["title"].dropna().astype(str)
titles = " ".join(title_strings)

# Tokenise the lower-cased text into runs of word characters and tally them.
lowered_titles = titles.lower()
words = re.findall(r"\w+", lowered_titles)
word_freq = Counter(words)

top_words = word_freq.most_common(20)
print("Most common words:", top_words)

# --- Word cloud of titles
# Configure the cloud first, then build it from the joined title text.
cloud_builder = WordCloud(width=800, height=400, background_color="white")
wordcloud = cloud_builder.generate(titles)

# Render the cloud as an image with the axes hidden.
_fig, ax = plt.subplots(figsize=(10, 5))
ax.imshow(wordcloud, interpolation="bilinear")
ax.axis("off")
plt.show()

# ==========================================
# Part 4: Reflection
# ==========================================
"""
Reflection:
- The dataset contains thousands of research papers about COVID-19.
- I noticed many missing values in journal and abstract fields, which required cleaning.
- Publication counts peak in 2020, which makes sense since COVID-19 research surged then.
- The most common journals and frequent words in titles highlight the main research themes.
- I learned how to clean messy datasets, extract new features, and visualize patterns.
"""
