Add standalone CPEx scraper #3622

Merged · 13 commits · Oct 8, 2023
22 changes: 22 additions & 0 deletions scrapers/cpex-scraper/.eslintrc.js
@@ -0,0 +1,22 @@
module.exports = {
"env": {
"browser": true,
"es2021": true
},
"extends": [
"eslint:recommended",
"plugin:@typescript-eslint/recommended"
],
"overrides": [
],
"parser": "@typescript-eslint/parser",
"parserOptions": {
"ecmaVersion": "latest",
"sourceType": "module"
},
"plugins": [
"@typescript-eslint"
],
"rules": {
}
}
4 changes: 4 additions & 0 deletions scrapers/cpex-scraper/.gitignore
@@ -0,0 +1,4 @@
data/
mpeModules.json
cpexModules.json

22 changes: 22 additions & 0 deletions scrapers/cpex-scraper/ecosystem.config.js
@@ -0,0 +1,22 @@
/* eslint-disable camelcase */

module.exports = {
apps: [
{
name: 'CPEx Scraper',
script: 'scripts/run.sh',

instances: 1,
// Can't get pm2 cron or system cron to work, and since the API is so unpredictably bad,
// we just restart the script every hour regardless of whether it is successful or not
autorestart: true,
restart_delay: 60 * 60 * 1000,

watch: false,

env: {
NODE_ENV: 'production',
},
},
],
};
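The config above leans on autorestart plus a one-hour restart_delay to emulate an hourly cron. A rough TypeScript sketch of the equivalent behaviour, illustrative only and not part of this PR:

// Illustrative only: rerun a job hourly regardless of success, mirroring
// the pm2 autorestart + restart_delay combination above.
const HOUR_MS = 60 * 60 * 1000;

async function runHourly(job: () => Promise<void>): Promise<never> {
  for (;;) {
    try {
      await job();
    } catch (error) {
      console.error(`Run failed: ${error}`);
    }
    // Wait an hour before the next run, mirroring restart_delay.
    await new Promise<void>((resolve) => setTimeout(resolve, HOUR_MS));
  }
}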
1 change: 1 addition & 0 deletions scrapers/cpex-scraper/env.json
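The env.json body is collapsed in this view. From the keys that src/index.ts reads, its shape is presumably the following (an inference from the code, not the actual file):

// Inferred shape of env.json — an assumption based on the keys used in src/index.ts.
type Env = {
  baseUrl: string; // API base URL; a trailing slash is stripped by the scraper
  studentKey: string; // sent as the X-STUDENT-API request header
  appKey: string; // sent as the X-APP-API request header
};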
27 changes: 27 additions & 0 deletions scrapers/cpex-scraper/package.json
@@ -0,0 +1,27 @@
{
"name": "nusmods-cpex-scraper",
"version": "1.0.0",
"description": "NUSMods scraper for NUS Course Planning Exercise (CPEx)",
"author": "Christopher Goh <chris@nusmods.com>",
"license": "MIT",
"main": "src/index.ts",
"repository": "https://github.com/nusmodifications/nusmods",
"devDependencies": {
"@types/node": "^18.14.4",
"@typescript-eslint/eslint-plugin": "^5.54.0",
"@typescript-eslint/parser": "^5.54.0",
"eslint": "^8.35.0",
"prettier": "^2.8.4",
"typescript": "^4.9.5"
},
"scripts": {
"scrape": "node build/src/index.js",
"dev": "yarn build && node build/src/index.js",
"build": "tsc",
"lint": "tsc && yarn lint:code",
"lint:code": "eslint --ext .js,.ts src"
},
"dependencies": {
"axios": "^1.3.4"
}
}
25 changes: 25 additions & 0 deletions scrapers/cpex-scraper/scripts/run.sh
@@ -0,0 +1,25 @@
#!/usr/bin/env bash
set -e

# Print date for logs
date

# Ensure cwd is the root of the CPEx scraper project
cd "$(dirname "$0")"
cd ..

# Build the scraper
rm -rf build
yarn build

# Run the scraper
echo "Running CPEx scraper"
node build/src/index.js

# Sync with live data
echo "Syncing data"
\cp data/cpexModules.json ../../../api.nusmods.com/v2

# pm2 doesn't restart processes that have stopped, so this just noops until
# the next cron restart
echo "Finished syncing data. Sleeping."
199 changes: 199 additions & 0 deletions scrapers/cpex-scraper/src/index.ts
@@ -0,0 +1,199 @@
import axios from 'axios';
import fs from 'fs';
import path from 'path';

import env from '../env.json';

// Configure this!
const term = '2310';
Member Author commented: Need to set this to 2320
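For context, NUS term codes appear to follow a YYS0 pattern, so '2310' would be AY2023/24 Semester 1 and '2320' Semester 2 (an assumption, not confirmed in this PR). A hypothetical helper under that assumption:

// Hypothetical helper illustrating the assumed YYS0 term-code scheme.
// termCode(2023, 1) === '2310'; termCode(2023, 2) === '2320'.
function termCode(acadYearStart: number, semester: 1 | 2): string {
  return `${acadYearStart % 100}${semester}0`;
}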


// Sanity check to see if there are at least this many modules before overwriting cpexModules.json
// The last time I ran this fully there were 3418 modules
const threshold = 1500;

const baseUrl = env['baseUrl'].endsWith('/') ? env['baseUrl'].slice(0, -1) : env['baseUrl'];

const FETCH_OK = '00000';

axios.defaults.headers.common = {
'X-STUDENT-API': env['studentKey'],
'X-APP-API': env['appKey'],
};

function getTimestampForFilename(): string {
function pad2(n: number): string {
return n < 10 ? '0' + n : String(n);
}

const date = new Date();

return (
date.getFullYear().toString() +
pad2(date.getMonth() + 1) +
pad2(date.getDate()) +
pad2(date.getHours()) +
pad2(date.getMinutes()) +
pad2(date.getSeconds())
);
}

type ApiResponse<T> = {
msg: string;
code: string;
ts: string;
data: T;
};

type GetDepartmentsResponseData = {
AcademicOrganisation: string;
Description: string;
};

// Set everything to optional because we cannot trust the API to be consistent
type Module = {
CourseTitle?: string;
ModularCredit?: string;
Subject?: string;
CatalogNumber?: string;
PrintCatalog?: string;
ModuleAttributes?: {
CourseAttribute: string;
CourseAttributeValue: string;
}[];
};

export type CPExModule = {
title: string;
moduleCode: string;
moduleCredit: string;
inS1CPEx?: boolean;
inS2CPEx?: boolean;
};

async function scraper() {
const getDepartmentsResponse = await axios.post<ApiResponse<GetDepartmentsResponseData[]>>(
`${baseUrl}/config/get-acadorg`,
{
eff_status: 'A',
acad_org: '%',
},
);
const departmentsData = getDepartmentsResponse.data.data;
console.log(`Total departments: ${departmentsData.length}`);

const collatedCPExModulesMap = new Map<string, CPExModule>();

for (let i = 0; i < departmentsData.length; i++) {
const department = departmentsData[i];

console.log(
`[${i + 1}/${departmentsData.length}] Fetching modules for ${
department.Description
} with acadorg: ${department.AcademicOrganisation}...`,
);

const getModulesResponse = await axios.post<ApiResponse<Module[]>>(`${baseUrl}/module`, {
acadorg: department.AcademicOrganisation,
term,
});

if (getModulesResponse.data.code !== FETCH_OK) {
console.log(
`Error fetching modules for ${department.Description} with acadorg: ${department.AcademicOrganisation}`,
);
continue;
}

const modulesData = getModulesResponse.data.data;

for (const module of modulesData) {
if (
!module.CourseTitle ||
!module.ModularCredit ||
!module.Subject ||
!module.CatalogNumber ||
!module.ModuleAttributes ||
!module.PrintCatalog
) {
continue;
}

// Filter out hidden modules
if (module.PrintCatalog !== 'Y') {
continue;
}

const moduleTitle = module.CourseTitle;
const moduleCode = `${module.Subject}${module.CatalogNumber}`;

// Filter duplicate modules
if (collatedCPExModulesMap.has(moduleCode)) {
continue;
}

const moduleCredit = module.ModularCredit;
const cpexAttribute = module.ModuleAttributes.find(
(attribute) => attribute.CourseAttribute === 'MPE', // upstream still uses the old 'MPE' attribute name, not CPEx
);

if (!cpexAttribute) {
continue;
}

const cpexModuleToAdd: CPExModule = {
title: moduleTitle,
moduleCode,
moduleCredit,
};

switch (cpexAttribute.CourseAttributeValue) {
case 'S1':
cpexModuleToAdd.inS1CPEx = true;
break;
case 'S2':
cpexModuleToAdd.inS2CPEx = true;
break;
case 'S1&S2':
cpexModuleToAdd.inS1CPEx = true;
cpexModuleToAdd.inS2CPEx = true;
break;
default:
console.log(
`Unknown CPEx attribute value: ${cpexAttribute.CourseAttributeValue} for ${moduleCode} ${moduleTitle}`,
);
break;
}
collatedCPExModulesMap.set(moduleCode, cpexModuleToAdd);
}
}

const collatedCPExModules = Array.from(collatedCPExModulesMap.values());
console.log(`Collated ${collatedCPExModules.length} modules.`);

const DATA_DIR = path.join(__dirname, '../../data');
if (!fs.existsSync(DATA_DIR)) {
fs.mkdirSync(DATA_DIR);
}
const OLD_DATA_DIR = path.join(DATA_DIR, '/old');
if (!fs.existsSync(OLD_DATA_DIR)) {
fs.mkdirSync(OLD_DATA_DIR);
}

if (collatedCPExModules.length >= threshold) {
fs.writeFileSync(path.join(DATA_DIR, 'cpexModules.json'), JSON.stringify(collatedCPExModules));
console.log(`Wrote ${collatedCPExModules.length} modules to cpexModules.json.`);
} else {
console.log(
`Not writing to cpexModules.json because the number of modules ${collatedCPExModules.length} is less than the threshold of ${threshold}.`,
);
}

const archiveFilename = `cpexModules-${getTimestampForFilename()}.json`;
fs.writeFileSync(path.join(OLD_DATA_DIR, archiveFilename), JSON.stringify(collatedCPExModules));
console.log(`Wrote ${collatedCPExModules.length} modules to archive ${archiveFilename}.`);
console.log('Done!');
}

scraper().catch((error) => {
console.error(`Failed to scrape: ${error}`);
});
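For illustration, a successful run leaves data/cpexModules.json holding entries shaped like this (the module shown is hypothetical, not taken from a real run):

// Hypothetical cpexModules.json entry matching the CPExModule type above.
const example: CPExModule = {
  title: 'Programming Methodology',
  moduleCode: 'CS1010',
  moduleCredit: '4',
  inS1CPEx: true,
  inS2CPEx: true,
};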