Skip to content
Permalink
Browse files

Merge pull request #1 from MichaelBone/master

Changed to include the state and postcode in addresses.
  • Loading branch information
MichaelBone committed May 12, 2019
2 parents 6edc8a0 + f1d85cb commit 2d51b9f5878e6b0d29acf61f846ae26c9577584e
Showing with 372 additions and 42 deletions.
  1. +23 −23 hundrednames.txt
  2. +1 −0 package.json
  3. +157 −10 scraper.js
  4. +187 −9 scraper.ts
  5. +3 −0 streetnames.txt
  6. +1 −0 suburbnames.txt
@@ -1,23 +1,23 @@
BOCKELBERG
CAMPBELL
CHANDADA
CUNGENA
FINLAYSON
FORREST
HASLAM
INKSTER
KALDOONERA
KARCULTABY
KOOLGERA
MURRAY
PERLUBIE
RIPON
ROUNSEVELL
SCOTT
TARLTON
WALLALA
WALPUPPIE
WITERA
WRENFORDSLEY
WRIGHT
YANTANABIE
BOCKELBERG,BOCKELBERG;LOCKES CLAYPAN
CAMPBELL,CHANDADA;INKSTER;MARYVALE;STREAKY BAY
CHANDADA,CHANDADA;CUNGENA;POOCHERA
CUNGENA,CUNGENA;KALDOONERA;YANTANABIE
FINLAYSON,HASLAM;PERLUBIE;PETINA
FORREST,MARYVALE;STREAKY BAY;YANERBIE
HASLAM,CARAWA;HASLAM;PETINA
INKSTER,CHANDADA;INKSTER;MOUNT COOPER
KALDOONERA,KALDOONERA
KARCULTABY,KARCULTABY;POOCHERA
KOOLGERA,KOOLGERA;PUREBA;WALLALA
MURRAY,CHANDADA;CHILPENUNDA;CUNGENA;PIEDNIPPIE
PERLUBIE,PETINA;PIMBAACLA;WIRRULLA
RIPON,STREAKY BAY;WESTALL;YANERBIE
ROUNSEVELL,COLLEY;MORTANA;TYRINGA;WITERA
SCOTT,EBA ANCHORAGE;PERLUBIE;PIEDNIPPIE;STREAKY BAY
TARLTON,CHILPENUNDA;CUNGENA;PETINA
WALLALA,PUREBA;WALLALA;WIRRULLA
WALPUPPIE,WALLALA;WIRRULLA;YANTANABIE
WITERA,COLLEY;INKSTER;MOUNT COOPER;WITERA
WRENFORDSLEY,BAIRD BAY;CALCA;MORTANA;SCEALE BAY;YANERBIE
WRIGHT,COLLEY;WITERA
YANTANABIE,KALDOONERA;KOOLGERA;PUREBA;WALLALA;YANTANABIE
@@ -6,6 +6,7 @@
"main": "scraper.js",
"dependencies": {
"cheerio": "1.0.0-rc.2",
"didyoumean2": "^2.0.4",
"fs": "0.0.1-security",
"moment": "2.24.0",
"pdfjs-dist": "2.0.943",

Large diffs are not rendered by default.

@@ -13,6 +13,7 @@ import * as sqlite3 from "sqlite3";
import * as urlparser from "url";
import * as moment from "moment";
import * as pdfjs from "pdfjs-dist";
import didYouMean, * as didyoumean from "didyoumean2";

sqlite3.verbose();

@@ -44,7 +45,7 @@ async function initializeDatabase() {

async function insertRow(database, developmentApplication) {
return new Promise((resolve, reject) => {
let sqlStatement = database.prepare("insert or ignore into [data] values (?, ?, ?, ?, ?, ?, ?)");
let sqlStatement = database.prepare("insert or replace into [data] values (?, ?, ?, ?, ?, ?, ?)");
sqlStatement.run([
developmentApplication.applicationNumber,
developmentApplication.address,
@@ -58,10 +59,7 @@ async function insertRow(database, developmentApplication) {
console.error(error);
reject(error);
} else {
if (this.changes > 0)
console.log(` Inserted: application \"${developmentApplication.applicationNumber}\" with address \"${developmentApplication.address}\", description \"${developmentApplication.description}\" and received date \"${developmentApplication.receivedDate}\" into the database.`);
else
console.log(` Skipped: application \"${developmentApplication.applicationNumber}\" with address \"${developmentApplication.address}\", description \"${developmentApplication.description}\" and received date \"${developmentApplication.receivedDate}\" because it was already present in the database.`);
console.log(` Saved application \"${developmentApplication.applicationNumber}\" with address \"${developmentApplication.address}\", description \"${developmentApplication.description}\" and received date \"${developmentApplication.receivedDate}\" to the database.`);
sqlStatement.finalize(); // releases any locks
resolve(row);
}
@@ -184,6 +182,7 @@ function parseOldFormatApplicationElements(elements: Element[], informationUrl:
height: propertyAddressHeadingElement.height
};
let address = elements.filter(element => getPercentageOfElementInRectangle(element, addressBounds) > 10).map(element => element.text).join(" ").trim().replace(/\s\s+/g, " ");
address = formatAddress(applicationNumber, address);

if (address === "") {
let elementSummary = elements.map(element => `[${element.text}]`).join("");
@@ -288,6 +287,7 @@ function parseNewFormatApplicationElements(elements: Element[], informationUrl:
height: (developmentDescriptionHeadingElement === undefined) ? 2 * assessmentNumberHeadingElement.height : (developmentDescriptionHeadingElement.y - (assessmentNumberHeadingElement.y + assessmentNumberHeadingElement.height))
};
let address = elements.filter(element => getPercentageOfElementInRectangle(element, addressBounds) > 10).map(element => element.text).join(" ").trim().replace(/\s\s+/g, " ");
address = formatAddress(applicationNumber, address);

if (address === "") {
let elementSummary = elements.map(element => `[${element.text}]`).join("");
@@ -319,6 +319,182 @@ function parseNewFormatApplicationElements(elements: Element[], informationUrl:
}
}

// Formats (and corrects) an address.

function formatAddress(applicationNumber: string, address: string) {
address = address.trim().replace(/[-–]+$/, "").replace(/\s\s+/g, " ").trim(); // remove trailing dashes and multiple white space characters
if (address.replace(/[\s,0-]/g, "") === "" || address.startsWith("No Residential Address")) // ignores addresses such as "0 0, 0" and "-"
return "";

// Remove the comma in house numbers larger than 1000. For example, the following addresses:
//
// 4,665 Princes HWY MENINGIE 5264
// 11,287 Princes HWY SALT CREEK 5264
//
// would be converted to the following:
//
// 4665 Princes HWY MENINGIE 5264
// 11287 Princes HWY SALT CREEK 5264

if (/^\d,\d\d\d/.test(address))
address = address.substring(0, 1) + address.substring(2);
else if (/^\d\d,\d\d\d/.test(address))
address = address.substring(0, 2) + address.substring(3);

let tokens = address.split(" ");

let postCode = undefined;
let token = tokens.pop();
if (token === undefined)
return address;
if (/^\d\d\d\d$/.test(token))
postCode = token;
else
tokens.push(token);

// Ensure that a state code is added before the post code if a state code is not present.

let state = "SA";
token = tokens.pop();
if (token === undefined)
return address;
if ([ "ACT", "NSW", "NT", "QLD", "SA", "TAS", "VIC", "WA" ].includes(token.toUpperCase()))
state = token.toUpperCase();
else
tokens.push(token);

// Construct a fallback address to be used if the suburb name cannot be determined later.

let fallbackAddress = (postCode === undefined) ? address : [ ...tokens, state, postCode].join(" ").trim();

// Pop tokens from the end of the array until a valid suburb name is encountered (allowing
// for a few spelling errors). Note that this starts by examining for longer matches
// (consisting of four tokens) before examining shorter matches. This approach ensures
// that the following address:
//
// 2,800 Woods Well RD COLEBATCH 5266
//
// is correctly converted to the following address:
//
// 2800 WOODS WELL ROAD, COLEBATCH SA 5266
//
// rather than (incorrectly) to the following address (notice that the street name has "BELL"
// instead of "WELL" because there actually is a street named "BELL ROAD").
//
// 2800 Woods BELL ROAD, COLEBATCH SA 5266
//
// This also allows for addresses that contain hundred names such as the following:
//
// Sec 26 Hd Palabie
// Lot no 1, Standley Road, Sect 16, Hundred of Pygery

let suburbName = undefined;
let hasHundredName = false;

for (let index = 4; index >= 1; index--) {
let tryHundredName = tokens.slice(-index).join(" ").toUpperCase();
if (tryHundredName.startsWith("HD OF ") || tryHundredName.startsWith("HUNDRED OF") || tryHundredName.startsWith("HD ") || tryHundredName.startsWith("HUNDRED ")) {
tryHundredName = tryHundredName.replace(/^HD OF /, "").replace(/^HUNDRED OF /, "").replace(/^HD /, "").replace(/^HUNDRED /, "").trim();
let hundredNameMatch = <string>didYouMean(tryHundredName, Object.keys(HundredNames), { caseSensitive: false, returnType: didyoumean.ReturnTypeEnums.FIRST_CLOSEST_MATCH, thresholdType: didyoumean.ThresholdTypeEnums.EDIT_DISTANCE, threshold: 1, trimSpaces: true });
if (hundredNameMatch !== null) {
hasHundredName = true;
let suburbNames = HundredNames[hundredNameMatch];
if (suburbNames.length === 1) { // if a unique suburb exists for the hundred then use that suburb
suburbName = SuburbNames[suburbNames[0]];
tokens.splice(-index, index); // remove elements from the end of the array
}
break;
}
}
}

// Only search for a suburb name if there was no hundred name (because a suburb name is
// unlikely to appear before a hundred name).

if (!hasHundredName) {
for (let index = 4; index >= 1; index--) {
let trySuburbName = tokens.slice(-index).join(" ");
let suburbNameMatch = <string>didYouMean(trySuburbName, Object.keys(SuburbNames), { caseSensitive: false, returnType: didyoumean.ReturnTypeEnums.FIRST_CLOSEST_MATCH, thresholdType: didyoumean.ThresholdTypeEnums.EDIT_DISTANCE, threshold: 1, trimSpaces: true });
if (suburbNameMatch !== null) {
suburbName = SuburbNames[suburbNameMatch];
tokens.splice(-index, index); // remove elements from the end of the array
break;
}
}
}

// Expand any street suffix (for example, this converts "ST" to "STREET").

token = tokens.pop();
if (token !== undefined) {
token = token.trim().replace(/,+$/, "").trim(); // removes trailing commas
let streetSuffix = StreetSuffixes[token.toUpperCase()];
if (streetSuffix === undefined)
streetSuffix = Object.values(StreetSuffixes).find(streetSuffix => streetSuffix === token.toUpperCase()); // the street suffix is already expanded
if (streetSuffix === undefined)
tokens.push(token); // unrecognised street suffix
else
tokens.push(streetSuffix); // add back the expanded street suffix
}

// Pop tokens from the end of the array until a valid street name is encountered (allowing
// for a few spelling errors). Similar to the examination of suburb names, this examines
// longer matches before examining shorter matches (for the same reason).

let streetName = undefined;
for (let index = 5; index >= 1; index--) {
let tryStreetName = tokens.slice(-index).join(" ").trim().replace(/,+$/, "").trim(); // allows for commas after the street name
let streetNameMatch = <string>didYouMean(tryStreetName, Object.keys(StreetNames), { caseSensitive: false, returnType: didyoumean.ReturnTypeEnums.FIRST_CLOSEST_MATCH, thresholdType: didyoumean.ThresholdTypeEnums.EDIT_DISTANCE, threshold: 1, trimSpaces: true });
if (streetNameMatch !== null) {
streetName = streetNameMatch;
let suburbNames = StreetNames[streetNameMatch];
tokens.splice(-index, index); // remove elements from the end of the array

// If the suburb was not determined earlier then attempt to obtain the suburb based
// on the street (ie. if there is only one suburb associated with the street). For
// example, this would automatically add the suburb to "22 Jefferson CT 5263",
// producing the address "22 JEFFERSON COURT, WELLINGTON EAST SA 5263".

if (suburbName === undefined && suburbNames.length === 1)
suburbName = SuburbNames[suburbNames[0]];

break;
}
}

// If a post code was included in the original address then use it to override the post code
// included in the suburb name (because the post code in the original address is more likely
// to be correct).

if (postCode !== undefined && suburbName !== undefined)
suburbName = suburbName.replace(/\s+\d\d\d\d$/, " " + postCode);

// Do not allow an address that does not have a suburb name.

if (suburbName === undefined) {
console.log(`Ignoring the development application "${applicationNumber}" because a suburb name could not be determined for the address: ${address}`);
return "";
}

// Reconstruct the address with a comma between the street address and the suburb.

if (suburbName === undefined || suburbName.trim() === "")
address = fallbackAddress;
else {
if (streetName !== undefined && streetName.trim() !== "")
tokens.push(streetName);
let streetAddress = tokens.join(" ").trim().replace(/,+$/, "").trim(); // removes trailing commas
address = streetAddress + (streetAddress === "" ? "" : ", ") + suburbName;
}

// Ensure that the address includes the state "SA".

if (address !== "" && !/\bSA\b/g.test(address))
address += " SA";

return address;
}

// Parses the development applications in the specified date range.

async function parsePdf(url: string) {
@@ -427,9 +603,11 @@ async function main() {
SuburbNames[suburbTokens[0].trim()] = suburbTokens[1].trim();
}

HundredNames = [];
for (let line of fs.readFileSync("hundrednames.txt").toString().replace(/\r/g, "").trim().split("\n"))
HundredNames.push(line.trim().toUpperCase());
HundredNames = {};
for (let line of fs.readFileSync("hundrednames.txt").toString().replace(/\r/g, "").trim().split("\n")) {
let hundredNameTokens = line.toUpperCase().split(",");
HundredNames[hundredNameTokens[0].trim()] = hundredNameTokens[1].trim().split(";");
}

// Read the main page that has links to each year of development applications.

@@ -525,7 +703,7 @@ async function main() {
if (global.gc)
global.gc();

console.log(`Inserting development applications into the database.`);
console.log(`Saving development applications to the database.`);
for (let developmentApplication of developmentApplications)
await insertRow(database, developmentApplication);
}
@@ -223,6 +223,9 @@ MORONEY ROAD,CHANDADA
MOUNT COOPER ROAD,COLLEY
MOUNT HALL ROAD,MORTANA
MOUNT MARIA DRIVE,STREAKY BAY
MT COOPER ROAD,COLLEY
MT HALL ROAD,MORTANA
MT MARIA DRIVE,STREAKY BAY
MUDGE ROAD,CHANDADA
MUDGE ROAD,STREAKY BAY
MUDGE TERRACE,STREAKY BAY
@@ -15,6 +15,7 @@ KOOLGERA,KOOLGERA SA 5661
MARYVALE,MARYVALE SA 5680
MORTANA,MORTANA SA 5671
MOUNT COOPER,MOUNT COOPER SA 5671
MT COOPER,MOUNT COOPER SA 5671
PERLUBIE,PERLUBIE SA 5680
PETINA,PETINA SA 5680
PIEDNIPPIE,PIEDNIPPIE SA 5680

0 comments on commit 2d51b9f

Please sign in to comment.
You can’t perform that action at this time.