In [None]:

{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# CORD-19 Analysis Notebook\n",
"\n",
"This notebook walks through the assignment step by step: loading `metadata.csv`, cleaning, analysis, and visualizations. Save outputs to the `outputs/` folder."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# 1. Imports\n",
"import pandas as pd\n",
"import numpy as np\n",
"import matplotlib.pyplot as plt\n",
"from collections import Counter\n",
"import os\n",
"from wordcloud import WordCloud\n",
"%matplotlib inline\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# 2. Read the metadata CSV into `df`.\n",
"# `path` is relative to the directory the notebook runs from; edit it if your layout differs.\n",
"path = '../data/metadata.csv'\n",
"# `nrows = None` reads the whole file; use e.g. 50000 for a quick trial run.\n",
"nrows = None\n",
"df = pd.read_csv(path, nrows=nrows, low_memory=False)\n",
"print('Loaded rows:', df.shape[0])\n",
"df.head()\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# 3. Basic exploration: shape, per-column dtypes, and the 40 columns with the most missing values\n",
"print('Shape:', df.shape)\n",
"print('\\nColumn dtypes:')\n",
"print(df.dtypes)\n",
"print('\\nMissing values (top 40):')\n",
"print(df.isnull().sum().sort_values(ascending=False).head(40))\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# 4. Cleaning decisions\n",
"# Columns with >80% missing will be dropped by default (change threshold as you like)\n",
"missing_frac = df.isnull().mean().sort_values(ascending=False)\n",
"drop_cols = missing_frac[missing_frac > 0.8].index.tolist()\n",
"print('Dropping columns (>80% missing):', drop_cols)\n",
"df_clean = df.drop(columns=drop_cols).copy()\n",
"# Drop rows without title (we need titles for text analysis)\n",
"df_clean = df_clean.dropna(subset=['title']).copy()\n",
"# Parse publish_time; unparseable values become NaT (errors='coerce').\n",
"# The deprecated `infer_datetime_format` argument is intentionally not passed.\n",
"if 'publish_time' in df_clean.columns:\n",
"    df_clean['publish_time_parsed'] = pd.to_datetime(df_clean['publish_time'], errors='coerce')\n",
"    df_clean['year'] = df_clean['publish_time_parsed'].dt.year\n",
"else:\n",
"    df_clean['year'] = np.nan\n",
"# Abstract word count. `abstract` may be absent (never present, or removed by the\n",
"# missing-value threshold above); fall back to empty strings so the count is 0.\n",
"if 'abstract' in df_clean.columns:\n",
"    df_clean['abstract'] = df_clean['abstract'].fillna('')\n",
"else:\n",
"    df_clean['abstract'] = ''\n",
"df_clean['abstract_word_count'] = df_clean['abstract'].str.split().str.len()\n",
"print('After cleaning shape:', df_clean.shape)\n",
"df_clean.head()\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# 5. Analysis: papers by year\n",
"os.makedirs('../outputs', exist_ok=True)\n",
"if df_clean['year'].notna().any():\n",
"    # `year` is float whenever some rows have no parsed date; cast the non-missing\n",
"    # values to int so the printed index and bar labels read 2020 rather than 2020.0.\n",
"    year_counts = df_clean['year'].dropna().astype(int).value_counts().sort_index()\n",
"    print(year_counts.head(20))\n",
"    # Plot and save\n",
"    fig, ax = plt.subplots(figsize=(10,4))\n",
"    year_counts.plot(kind='bar', ax=ax)\n",
"    ax.set_xlabel('Year'); ax.set_ylabel('Count'); ax.set_title('Publications by Year')\n",
"    fig.tight_layout()\n",
"    fig.savefig('../outputs/publications_by_year.png')\n",
"    # Render once: display(fig) combined with the inline backend drew the figure twice.\n",
"    plt.show()\n",
"else:\n",
"    print('No publish_time information available to compute year counts.')\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Top journals (rows with no journal are counted under 'Unknown')\n",
"if 'journal' in df_clean.columns:\n",
"    # Make sure the output folder exists even if this cell is run on its own.\n",
"    os.makedirs('../outputs', exist_ok=True)\n",
"    top_journals = df_clean['journal'].fillna('Unknown').value_counts().head(30)\n",
"    print(top_journals.head(20))\n",
"    fig2, ax2 = plt.subplots(figsize=(8,6))\n",
"    # Sort ascending so the largest bar ends up at the top of the horizontal chart.\n",
"    top_journals.sort_values().plot(kind='barh', ax=ax2)\n",
"    ax2.set_xlabel('Count'); ax2.set_ylabel('Journal')\n",
"    ax2.set_title('Top 30 journals (paper count)')\n",
"    fig2.tight_layout()\n",
"    fig2.savefig('../outputs/top_journals.png')\n",
"    # Render once: display(fig2) combined with the inline backend drew the figure twice.\n",
"    plt.show()\n",
"else:\n",
"    print('Column journal not found')\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Most frequent words in titles (simple frequency)\n",
"titles = df_clean['title'].astype(str).str.lower().str.replace(r'[^a-z0-9\\s]', ' ', regex=True)\n",
"words = titles.str.split().explode()\n",
"stopwords = set(['the','and','of','to','in','a','for','on','with','by','from','is','this','that','covid','covid-19','sars','coronavirus'])\n",]
"words = words[~words.isin(stopwords)]}