diff --git a/.gitignore b/.gitignore index 9a3dd8b..dff3c71 100644 --- a/.gitignore +++ b/.gitignore @@ -1,8 +1,6 @@ -ZIP_CODES.txt -zip_codes.zip -zip_dump.csv -zip_codes.db /.bundle /free_zipcode_data.sqlite3 /data /build +/stubs +/vendor/bundle/ diff --git a/.rspec b/.rspec new file mode 100644 index 0000000..c99d2e7 --- /dev/null +++ b/.rspec @@ -0,0 +1 @@ +--require spec_helper diff --git a/.rubocop.yml b/.rubocop.yml index f0322f5..9f11713 100644 --- a/.rubocop.yml +++ b/.rubocop.yml @@ -1,4 +1,6 @@ AllCops: + TargetRubyVersion: 2.3 + # Include gemspec and Rakefile Include: - '**/*.gemspec' @@ -15,6 +17,7 @@ AllCops: Exclude: - 'vendor/**/*' - 'stubs/**/*' + - 'spec/support/shared_contexts/*' # Checks formatting of special comments CommentAnnotation: @@ -49,32 +52,16 @@ Style/RaiseArgs: Style/DoubleNegation: Enabled: false -Style/SignalException: - EnforcedStyle: semantic - -Style/ClassAndModuleChildren: - Enabled: false - -Style/MultilineMethodCallIndentation: - EnforcedStyle: indented - IndentationWidth: 4 - -Style/CaseIndentation: - Enabled: false - -Style/TrivialAccessors: - Enabled: false - -Style/NumericLiterals: +Style/PerlBackrefs: Enabled: false ######################################## # Lint Cops Lint/Eval: - Enabled: true + Enabled: false -Lint/AssignmentInCondition: +Lint/HandleExceptions: Enabled: false ######################################## @@ -85,7 +72,14 @@ Metrics/LineLength: Metrics/MethodLength: CountComments: false # count full line comments? - Max: 20 + Max: 30 Metrics/ClassLength: Max: 120 + +Metrics/AbcSize: + Enabled: false + +# rubocop:disable Metrics/AbcSize +# rubocop:disable Metrics/MethodLength +# rubocop:disable Metrics/BlockLength diff --git a/Gemfile b/Gemfile index 833365f..e967149 100644 --- a/Gemfile +++ b/Gemfile @@ -1,7 +1,6 @@ -source 'http://rubygems.org' +# frozen_string_literal: true -gem 'midwire_common' -gem 'pry-nav' -gem 'rake' -gem 'rubyzip', '~> 1.2.1' -gem 'sqlite3' +source 'https://rubygems.org' +git_source(:github) { |repo| "https://github.com/#{repo}.git" } + +gemspec diff --git a/Gemfile.lock b/Gemfile.lock index a94bc5e..04c225c 100644 --- a/Gemfile.lock +++ b/Gemfile.lock @@ -1,31 +1,79 @@ +PATH + remote: . + specs: + free_zipcode_data (1.0.0) + colored (~> 1.2) + kiba (~> 2.0) + rubyzip (~> 1.2) + sqlite3 (~> 1.3) + trollop (~> 2.1) + GEM - remote: http://rubygems.org/ + remote: https://rubygems.org/ specs: + ast (2.4.0) coderay (1.1.2) + colored (1.2) + diff-lcs (1.3) + docile (1.3.0) + json (2.1.0) + kiba (2.0.0) method_source (0.8.2) - midwire_common (0.1.16) - thor (~> 0.19) + parallel (1.12.1) + parser (2.5.1.0) + ast (~> 2.4.0) + powerpack (0.1.1) pry (0.10.4) coderay (~> 1.1.0) method_source (~> 0.8.1) slop (~> 3.4) pry-nav (0.2.4) pry (>= 0.9.10, < 0.11.0) + rainbow (3.0.0) rake (12.2.1) + rspec (3.7.0) + rspec-core (~> 3.7.0) + rspec-expectations (~> 3.7.0) + rspec-mocks (~> 3.7.0) + rspec-core (3.7.1) + rspec-support (~> 3.7.0) + rspec-expectations (3.7.0) + diff-lcs (>= 1.2.0, < 2.0) + rspec-support (~> 3.7.0) + rspec-mocks (3.7.0) + diff-lcs (>= 1.2.0, < 2.0) + rspec-support (~> 3.7.0) + rspec-support (3.7.1) + rubocop (0.55.0) + parallel (~> 1.10) + parser (>= 2.5) + powerpack (~> 0.1) + rainbow (>= 2.2.2, < 4.0) + ruby-progressbar (~> 1.7) + unicode-display_width (~> 1.0, >= 1.0.1) + ruby-progressbar (1.9.0) rubyzip (1.2.1) + simplecov (0.16.1) + docile (~> 1.1) + json (>= 1.8, < 3) + simplecov-html (~> 0.10.0) + simplecov-html (0.10.2) slop (3.6.0) - sqlite3 (1.3.11) - thor (0.19.1) + sqlite3 (1.3.13) + trollop (2.1.2) + unicode-display_width (1.3.2) PLATFORMS ruby DEPENDENCIES - midwire_common - pry-nav - rake - rubyzip (~> 1.2.1) - sqlite3 + bundler (~> 1.16) + free_zipcode_data! + pry-nav (~> 0.2) + rake (~> 12.0) + rspec (~> 3.7) + rubocop + simplecov BUNDLED WITH - 1.15.4 + 1.16.1 diff --git a/README.md b/README.md index 7f127e3..8b40f17 100644 --- a/README.md +++ b/README.md @@ -90,6 +90,10 @@ create table zipcodes ( Both `lat` and `lon`, geocodes, are populated for each zipcode record. +## Development + +If you want to run the specs or do development work, set `GEM_ENV='development'` + ## Data License The zipcode data is licensed under a Creative Commons Attribution 3.0 Unported License, carried forward from [GeoNames](http://www.geonames.org).
diff --git a/bin/free_zipcode_data b/bin/free_zipcode_data new file mode 100755 index 0000000..40e067d --- /dev/null +++ b/bin/free_zipcode_data @@ -0,0 +1,9 @@ +#!/usr/bin/env ruby + +require 'rubygems' +require 'bundler/setup' + +require 'free_zipcode_data' +require 'free_zipcode_data/runner' + +FreeZipcodeData::Runner.instance.start diff --git a/country_lookup_table.yml b/country_lookup_table.yml index b4f10d0..18d9c84 100644 --- a/country_lookup_table.yml +++ b/country_lookup_table.yml @@ -10,7 +10,7 @@ AX: AL: :name: Albania :alpha3: ALB - :iso: 008 + :iso: '008' DZ: :name: Algeria :alpha3: DZA diff --git a/free_zipcode_data.gemspec b/free_zipcode_data.gemspec new file mode 100644 index 0000000..9fcc5fa --- /dev/null +++ b/free_zipcode_data.gemspec @@ -0,0 +1,36 @@ +# frozen_string_literal: true +# coding: utf-8 + +lib = File.expand_path('../lib', __FILE__) +$LOAD_PATH.unshift(lib) unless $LOAD_PATH.include?(lib) +require 'free_zipcode_data/version' + +Gem::Specification.new do |spec| + spec.name = 'free_zipcode_data' + spec.version = FreeZipcodeData::VERSION + spec.authors = ['Chris Blackburn', 'Chris McKnight'] + spec.email = ['87a1779b@opayq.com'] + spec.summary = 'Free US postal codes in CSV and SQLite3 format.' + spec.description = spec.summary + spec.homepage = 'https://github.com/midwire/free_zipcode_data' + spec.license = 'MIT' + + spec.required_ruby_version = '>= 2.3.0' + spec.files = `git ls-files -z`.split("\x0") + spec.executables = spec.files.grep(%r{^bin/}) { |f| File.basename(f) } + spec.test_files = spec.files.grep(%r{^(test|spec|features)/}) + spec.require_paths = ['lib'] + + spec.add_development_dependency 'bundler', '~> 1.16' + spec.add_development_dependency 'pry-nav', '~> 0.2' + spec.add_development_dependency 'rake', '~> 12.0' + spec.add_development_dependency 'rspec', '~> 3.7' + spec.add_development_dependency 'rubocop' + spec.add_development_dependency 'simplecov' + + spec.add_runtime_dependency 'colored', '~> 1.2' + spec.add_runtime_dependency 'kiba', '~> 2.0' + spec.add_runtime_dependency 'rubyzip', '~> 1.2' + spec.add_runtime_dependency 'sqlite3', '~> 1.3' + spec.add_runtime_dependency 'trollop', '~> 2.1' +end diff --git a/lib/etl/common.rb b/lib/etl/common.rb new file mode 100644 index 0000000..b1a1a7d --- /dev/null +++ b/lib/etl/common.rb @@ -0,0 +1,24 @@ +# frozen_string_literal: true + +require_relative 'csv_source' +require_relative '../free_zipcode_data/country_table' +require_relative '../free_zipcode_data/state_table' +require_relative '../free_zipcode_data/county_table' +require_relative '../free_zipcode_data/zipcode_table' + +def show_me + transform do |row| + ap row + row + end +end + +def limit(count) + count = Integer(count || -1) + return if count == -1 + transform do |row| + @counter ||= 0 + @counter += 1 + @counter > count ? nil : row + end +end diff --git a/lib/etl/csv_source.rb b/lib/etl/csv_source.rb new file mode 100644 index 0000000..0cbd162 --- /dev/null +++ b/lib/etl/csv_source.rb @@ -0,0 +1,26 @@ +# frozen_string_literal: true + +require 'csv' + +class CsvSource + attr_reader :filename, :delimeter, :quote_char, :headers + + def initialize(filename:, headers: true, delimeter: "\t", quote_char: '"') + @filename = filename + @headers = headers + @delimeter = delimeter + @quote_char = quote_char + end + + def each + CSV.open(filename, + col_sep: delimeter, + headers: headers, + header_converters: :symbol, + quote_char: quote_char) do |csv| + csv.each do |row| + yield(row.to_hash) + end + end + end +end diff --git a/lib/etl/free_zipcode_data_job.rb b/lib/etl/free_zipcode_data_job.rb new file mode 100644 index 0000000..d691ee0 --- /dev/null +++ b/lib/etl/free_zipcode_data_job.rb @@ -0,0 +1,36 @@ +# frozen_string_literal: true + +require_relative 'common' + +module ETL + module FreeZipcodeDataJob + module_function + + def setup(country_file, database, logger, options) + Kiba.parse do + pre_process do + logger.info("Processing '#{country_file}' data, please be patient...") + end + + source CsvSource, filename: country_file, quote_char: '"', delimeter: ',' + + destination FreeZipcodeData::CountryTable, + database: database, + tablename: options[:country_tablename] + + destination FreeZipcodeData::StateTable, + database: database, + tablename: options[:state_tablename] + + destination FreeZipcodeData::CountyTable, + database: database, + tablename: options[:county_tablename] + + destination FreeZipcodeData::ZipcodeTable, + database: database, + tablename: options[:zipcode_tablename] + + end + end + end +end diff --git a/lib/free_zipcode_data.rb b/lib/free_zipcode_data.rb new file mode 100644 index 0000000..24c8074 --- /dev/null +++ b/lib/free_zipcode_data.rb @@ -0,0 +1,43 @@ +# frozen_string_literal: true + +require 'readline' + +require 'free_zipcode_data/version' + +module FreeZipcodeData + def self.root + Pathname.new(File.dirname(__FILE__)).parent + end + + def self.current_environment + ENV.fetch('APP_ENV', 'development') + end + + #:nocov: + def self.config_file(filename = '.free_zipcode_data.yml') + return root.join('spec', 'fixtures', filename) if current_environment == 'test' + home = ENV.fetch('HOME') + file = ENV.fetch('FZD_CONFIG_FILE', File.join(home, '.free_zipcode_data.yml')) + FileUtils.touch(file) + file + end + #:nocov: + + def self.os + if RUBY_PLATFORM.match?(/cygwin|mswin|mingw|bccwin|wince|emx/) + :retarded + else + :normal + end + end + + autoload :CountryTable, 'free_zipcode_data/country_table' + autoload :StateTable, 'free_zipcode_data/state_table' + autoload :CountyTable, 'free_zipcode_data/county_table' + autoload :ZipcodeTable, 'free_zipcode_data/zipcode_table' + autoload :DataSource, 'free_zipcode_data/data_source' + autoload :Logger, 'free_zipcode_data/logger' + autoload :Options, 'free_zipcode_data/options' + autoload :Settings, 'free_zipcode_data/settings' + autoload :SqliteRam, 'free_zipcode_data/sqlite_ram' +end diff --git a/lib/free_zipcode_data/country_table.rb b/lib/free_zipcode_data/country_table.rb new file mode 100644 index 0000000..4b27bcf --- /dev/null +++ b/lib/free_zipcode_data/country_table.rb @@ -0,0 +1,44 @@ +# frozen_string_literal: true + +require_relative 'db_table' + +module FreeZipcodeData + class CountryTable < DbTable + def build + schema = <<-SQL + create table #{tablename} ( + id integer not null primary key, + alpha2 varchar(2) not null, + alpha3 varchar(3), + iso varchar(3), + name varchar(255) not null + ) + SQL + database.execute_batch(schema) + + ndx = <<-SQL + CREATE UNIQUE INDEX "main"."unique_country_alpha2" + ON #{tablename} (alpha2 COLLATE NOCASE ASC); + SQL + database.execute_batch(ndx) + end + + def write(row) + country_hash = country_lookup_table[row[:country]] + + sql = <<-SQL + INSERT INTO countries (alpha2, alpha3, iso, name) + VALUES ('#{row[:country]}', + '#{country_hash[:alpha3]}', + '#{country_hash[:iso]}', + '#{country_hash[:name]}') + SQL + + begin + database.execute(sql) + rescue SQLite3::ConstraintException + # Swallow duplicates + end + end + end +end diff --git a/lib/free_zipcode_data/county_table.rb b/lib/free_zipcode_data/county_table.rb new file mode 100644 index 0000000..6b1429b --- /dev/null +++ b/lib/free_zipcode_data/county_table.rb @@ -0,0 +1,47 @@ +# frozen_string_literal: true + +require_relative 'db_table' + +module FreeZipcodeData + class CountyTable < DbTable + def build + schema = <<-SQL + create table #{tablename} ( + id integer not null primary key, + state_id integer, + abbr varchar(255), + name varchar(255), + county_seat varchar(255) + ) + SQL + database.execute_batch(schema) + + ndx = <<-SQL + CREATE UNIQUE INDEX "main"."unique_county" + ON #{tablename} (state_id, abbr, name COLLATE NOCASE ASC); + SQL + database.execute_batch(ndx) + end + + def write(row) + return nil unless row[:county] + state_id = get_state_id(row[:short_state]) + raise "Could not find state: #{row[:short_state]}" unless state_id + sql = <<-SQL + INSERT INTO counties (state_id, abbr, name) + VALUES ('#{state_id}', + '#{row[:short_county]}', + '#{escape_single_quotes(row[:county])}' + ) + SQL + + begin + database.execute(sql) + rescue SQLite3::ConstraintException + # swallow duplicates + rescue StandardError => err + raise "Please file an issue at #{ISSUE_URL}: [#{err}] -> SQL: [#{sql}]" + end + end + end +end diff --git a/lib/free_zipcode_data/data_source.rb b/lib/free_zipcode_data/data_source.rb new file mode 100644 index 0000000..41cfdbd --- /dev/null +++ b/lib/free_zipcode_data/data_source.rb @@ -0,0 +1,88 @@ +# frozen_string_literal: true + +require 'csv' +require 'open-uri' +require 'zip' + +module FreeZipcodeData + class DataSource + BASE_URL = 'http://download.geonames.org/export/zip' + + attr_reader :country, :options + + def initialize(country = nil) + @country = country + @options = Options.instance.hash + @logger = Logger.instance + end + + def download + return nil if !options.clobber && File.exist?(zipfile_path) + FileUtils.mkdir_p(options.work_dir) + @logger.info("Downloading: #{zipfile} from GeoNames...") + open(zipfile_path, 'wb') do |file| + file << open("#{BASE_URL}/#{zipfile}").read + end + end + + def datafile + @datafile ||= begin + datafile_with_headers + end + end + + private + + def zipfile + @zipfile ||= begin + filename = country.nil? ? 'allCountries' : country.upcase + filename += '.zip' unless filename =~ /\.zip$/ + filename + end + end + + def zipfile_path + @zipfile_path ||= File.join(options.work_dir, zipfile) + end + + def unzipped_datafile + @unzipped_datafile ||= begin + country_file = nil + Zip::File.open(zipfile_path) do |zip| + zip.each do |entry| + next if entry.name =~ /readme/i + country_file = File.join(options.work_dir, entry.name) + if File.exist?(country_file) + if options[:clobber] + Zip.on_exists_proc = true + Logger.instance.verbose("Extracting: #{zipfile}...") + entry.extract(country_file) + end + else + Logger.instance.verbose("Extracting: #{zipfile}...") + entry.extract(country_file) + end + break + end + end + country_file + end + end + + def datafile_with_headers + filename = "#{unzipped_datafile}.csv" + if File.exist?(filename) && !options[:clobber] + @logger.verbose("File: #{filename} already exists, skipping...") + return filename + end + @logger.verbose("Preparing: #{filename} for processing...") + CSV.open(filename, 'w') do |outfile| + outfile << %w[COUNTRY POSTAL_CODE CITY STATE SHORT_STATE COUNTY SHORT_COUNTY COMMUNITY SHORT_COMMUNITY LATITUDE LONGITUDE ACCURACY] + CSV.foreach(unzipped_datafile, headers: false, col_sep: "\t", quote_char: '|') do |row| + outfile << row + end + end + filename + end + end +end diff --git a/lib/free_zipcode_data/db_table.rb b/lib/free_zipcode_data/db_table.rb new file mode 100644 index 0000000..559cba5 --- /dev/null +++ b/lib/free_zipcode_data/db_table.rb @@ -0,0 +1,45 @@ +# frozen_string_literal: true + +require 'yaml' + +module FreeZipcodeData + class DbTable + ISSUE_URL = 'https://github.com/midwire/free_zipcode_data/issues/new' + + attr_reader :database, :tablename + + def initialize(database:, tablename:) + @database = database + @tablename = tablename + end + + private + + def country_lookup_table + @country_lookup_table ||= YAML.load_file('country_lookup_table.yml') + end + + def get_country_id(country) + rows = database.execute("SELECT id FROM countries WHERE alpha2 = '#{country}'") + rows[0].nil? ? nil : rows[0].first + end + + def get_state_id(state) + rows = database.execute("SELECT id FROM states WHERE abbr = '#{state}'") + rows[0].nil? ? nil : rows[0].first + end + + def get_county_id(county) + return nil if county.nil? + sql = "SELECT id FROM counties WHERE name = '#{escape_single_quotes(county)}'" + rows = database.execute(sql) + rows[0].nil? ? nil : rows[0].first + rescue SQLite3::SQLException => err + raise "Please file an issue at #{ISSUE_URL}: [#{err}] -> SQL: [#{sql}]" + end + + def escape_single_quotes(string) + string&.gsub(/[']/, '\'\'') || '' + end + end +end diff --git a/lib/free_zipcode_data/logger.rb b/lib/free_zipcode_data/logger.rb new file mode 100644 index 0000000..59e6af1 --- /dev/null +++ b/lib/free_zipcode_data/logger.rb @@ -0,0 +1,57 @@ +# frozen_string_literal: true + +require 'singleton' +require 'logger' + +module FreeZipcodeData + class Logger + include Singleton + + attr_accessor :log_provider + + def initialize(provider = default_logger) + @log_provider = provider + end + + def log_exception(e, data = {}) + msg = "EXCEPTION : #{e.class.name} : #{e.message}" + msg += "\n data : #{data.inspect}" if data && !data.empty? + msg += "\n #{e.backtrace[0, 6].join("\n ")}" + log_provider.error(msg) + end + + def method_missing(meth, *args, &block) + if log_provider.respond_to?(meth) + log_provider.send(meth, *args, &block) + else + super + end + end + + def respond_to?(meth, include_private = false) + if log_provider.respond_to?(meth) + true + else + super + end + end + + def verbose(msg) + info(msg) if options&.verbose + end + + private + + def default_logger + logger = ::Logger.new(STDOUT) + logger.formatter = proc do |_, _, _, msg| + "#{msg}\n" + end + logger + end + + def options + Options.instance.hash + end + end +end diff --git a/lib/free_zipcode_data/options.rb b/lib/free_zipcode_data/options.rb new file mode 100644 index 0000000..10d82b7 --- /dev/null +++ b/lib/free_zipcode_data/options.rb @@ -0,0 +1,21 @@ +# frozen_string_literal: true + +require 'singleton' + +module FreeZipcodeData + class Options + include Singleton + + def initialize_hash(hash) + @@_options = hash + end + + def [](key) + @@_options[key] + end + + def hash + @@_options + end + end +end diff --git a/lib/free_zipcode_data/runner.rb b/lib/free_zipcode_data/runner.rb new file mode 100644 index 0000000..d96a43d --- /dev/null +++ b/lib/free_zipcode_data/runner.rb @@ -0,0 +1,132 @@ +# frozen_string_literal: true + +require 'colored' +require 'trollop' +require 'kiba' + +require_relative '../etl/free_zipcode_data_job' + +require 'pry' if ENV.fetch('APP_ENV') == 'development' + +module FreeZipcodeData + class Runner + attr_accessor :logger, :options + + # Make a singleton but allow the class to be instantiated for easier testing + def self.instance + @instance || new + end + + def initialize + @logger = Logger.instance + end + + def start + start_time = Time.now + options = FreeZipcodeData::Options.instance + options.initialize_hash(collect_args) + + logger.info('Starting FreeZipcodeData...'.green) + + datasource = DataSource.new(options.hash.country) + datasource.download + + database = SqliteRam.new(File.join(options.hash.work_dir, 'free_zipcode_data.sqlite3')) + + %i[country state county zipcode].each { |t| initialize_table(t, database) } + + extract_transform_load(datasource, database) + + database.save_to_disk + + elapsed = Time.now - start_time + logger.info("Finished in [#{elapsed}] seconds.".yellow) + end + + private + + def initialize_table(table_sym, database) + options = Options.instance.hash + tablename = options["#{table_sym}_tablename".to_sym] + logger.verbose("Initializing #{table_sym} table: '#{tablename}'...") + klass = instance_eval("#{titleize(table_sym)}Table", __FILE__, __LINE__) + table = klass.new( + database: database.conn, + tablename: tablename + ) + table.build + end + + def extract_transform_load(datasource, database) + job = ETL::FreeZipcodeDataJob.setup( + datasource.datafile, + database.conn, + logger, + FreeZipcodeData::Options.instance.hash + ) + Kiba.run(job) + end + + # rubocop:disable Metrics/BlockLength + # rubocop:disable Metrics/MethodLength + def collect_args + Trollop.options do + opt( + :country, + 'Specify the country code for processing, or all countries if not specified', + type: :string, required: false, short: '-g' + ) + opt( + :work_dir, + 'Specify your work/build directory, where the SQLite and .csv files will be built', + type: :string, required: true, short: '-w' + ) + opt( + :country_tablename, + 'Specify the name for the `countries` table', + type: :string, required: false, default: 'countries' + ) + opt( + :state_tablename, + 'Specify the name for the `states` table', + type: :string, required: false, default: 'states' + ) + opt( + :county_tablename, + 'Specify the name for the `counties` table', + type: :string, required: false, default: 'counties' + ) + opt( + :zipcode_tablename, + 'Specify the name for the `zipcodes` table', + type: :string, required: false, default: 'zipcodes' + ) + opt( + :clobber, + 'Overwrite existing files', + type: :boolean, required: false, short: '-c', default: false + ) + opt( + :dry_run, + 'Do not actually move or copy files', + type: :boolean, required: false, short: '-d', + default: false + ) + opt( + :verbose, + 'Be verbose with output', + type: :boolean, required: false, short: '-v', + default: false + ) + end + end + # rubocop:enable Metrics/MethodLength + # rubocop:enable Metrics/BlockLength + + def titleize(string) + ret = string.to_s.dup + ret[0] = ret[0].capitalize + ret + end + end +end diff --git a/lib/free_zipcode_data/sqlite_ram.rb b/lib/free_zipcode_data/sqlite_ram.rb new file mode 100644 index 0000000..f16a818 --- /dev/null +++ b/lib/free_zipcode_data/sqlite_ram.rb @@ -0,0 +1,21 @@ +# frozen_string_literal: true + +require 'sqlite3' + +# Open a SQlite DB, work with it in-memory and save back to disk +class SqliteRam + attr_reader :filename, :conn + + def initialize(sqlite_filename) + @filename = sqlite_filename + @ram_db = SQLite3::Database.new(':memory:') + @file_db = SQLite3::Database.new(sqlite_filename) + @conn = @ram_db + end + + def save_to_disk + backup = SQLite3::Backup.new(@file_db, 'main', @ram_db, 'main') + backup.step(-1) + backup.finish + end +end diff --git a/lib/free_zipcode_data/state_table.rb b/lib/free_zipcode_data/state_table.rb new file mode 100644 index 0000000..497d96f --- /dev/null +++ b/lib/free_zipcode_data/state_table.rb @@ -0,0 +1,43 @@ +# frozen_string_literal: true + +require_relative 'db_table' + +module FreeZipcodeData + class StateTable < DbTable + def build + schema = <<-SQL + create table #{tablename} ( + id integer not null primary key, + country_id integer not null, + abbr varchar(2) not null, + name varchar(255) + ) + SQL + database.execute_batch(schema) + + ndx = <<-SQL + CREATE UNIQUE INDEX "main"."unique_state" + ON #{tablename} (abbr, country_id COLLATE NOCASE ASC); + SQL + database.execute_batch(ndx) + end + + def write(row) + return nil unless row[:short_state] + row[:state] = 'Marshall Islands' if row[:short_state] == 'MH' && row[:state].nil? + country_id = get_country_id(row[:country]) + sql = <<-SQL + INSERT INTO states (abbr, name, country_id) + VALUES ('#{row[:short_state]}', + '#{escape_single_quotes(row[:state])}', + #{country_id} + ) + SQL + begin + database.execute(sql) + rescue SQLite3::ConstraintException + # Swallow duplicates + end + end + end +end diff --git a/lib/free_zipcode_data/version.rb b/lib/free_zipcode_data/version.rb new file mode 100644 index 0000000..0b28193 --- /dev/null +++ b/lib/free_zipcode_data/version.rb @@ -0,0 +1,5 @@ +# frozen_string_literal: true + +module FreeZipcodeData + VERSION = '1.0.0' +end diff --git a/lib/free_zipcode_data/zipcode_table.rb b/lib/free_zipcode_data/zipcode_table.rb new file mode 100644 index 0000000..5cb6987 --- /dev/null +++ b/lib/free_zipcode_data/zipcode_table.rb @@ -0,0 +1,58 @@ +# frozen_string_literal: true + +require_relative 'db_table' + +module FreeZipcodeData + class ZipcodeTable < DbTable + def build + schema = <<-SQL + create table #{tablename} ( + id integer not null primary key, + code varchar(10) not null, + state_id integer, + county_id integer, + city varchar(255), + area_code varchar(3), + lat float, + lon float, + accuracy varchar(8) + ) + SQL + database.execute_batch(schema) + + ndx = <<-SQL + CREATE UNIQUE INDEX "main"."unique_zipcode" + ON #{tablename} (state_id, code, city COLLATE NOCASE ASC); + SQL + database.execute_batch(ndx) + end + + def write(row) + return nil unless row[:postal_code] + + state_id = get_state_id(row[:short_state]) + county_id = get_county_id(row[:county]) + city_name = escape_single_quotes(row[:city]) + + sql = <<-SQL + INSERT INTO zipcodes (code, state_id, county_id, city, lat, lon, accuracy) + VALUES ('#{row[:postal_code]}', + '#{state_id}', + '#{county_id}', + '#{city_name}', + '#{row[:latitude]}', + '#{row[:longitude]}', + '#{row[:accuracy]}' + ) + SQL + + begin + database.execute(sql) + rescue SQLite3::ConstraintException => _err + # there are some duplicates - swallow them + rescue StandardError => err + raise "Please file an issue at #{ISSUE_URL}: [#{err}] -> SQL: [#{sql}]" + end + end + end +end diff --git a/lib/tasks/data.rake b/lib/tasks/data.rake deleted file mode 100644 index 4405a04..0000000 --- a/lib/tasks/data.rake +++ /dev/null @@ -1,325 +0,0 @@ -require 'rubygems' -require 'sqlite3' -require 'csv' -require 'open-uri' -require 'zip' -require 'yaml' -require 'midwire_common/string' -# require 'pry' - -# rubocop:disable Metrics/BlockLength -namespace :data do - - desc 'Download the specified data from GeoNames' - task :download, [:country] do |_t, args| - BASE_URL = 'http://download.geonames.org/export/zip'.freeze - - # create the download directory - FileUtils.mkdir_p(data_dir) - - # determine which file to get - zipfile = country_zipfile(args[:country]) - puts(">>> Downloading: #{zipfile} from GeoNames...") - - # download the file - open("#{data_dir}/#{zipfile}", 'wb') do |file| - file << open("#{BASE_URL}/#{zipfile}").read - end - end - - # desc 'Delete the sqlite db' - task :kill_db do - FileUtils.rm_f('free_zipcode_data.sqlite3') - end - - desc 'Build the data files. Downloads missing files.' - task :build, [:country] do |_t, args| - # create the build directory - FileUtils.mkdir_p(build_dir) - - # determine the zipfile path - zipfile = File.join(data_dir, country_zipfile(args[:country])) - - # download the zipfile if it doesn't exist - Rake::Task['data:download'].invoke(args[:country]) unless File.exist?(zipfile) - - # extract the .tsv files - puts('>>> Extracting zipfile...') - Zip.on_exists_proc = true - country_file = nil - Zip::File.open(zipfile) do |zip| - zip.each do |entry| - next if entry.name =~ /readme/i - country_file = File.join(build_dir, entry.name) - entry.extract(country_file) - break - end - end - - # country code 0: iso country code, 2 characters - # postal code 1: varchar(20) - # place name 2: varchar(180) - # admin name1 3: 1. order subdivision (state) varchar(100) - # admin code1 4: 1. order subdivision (state) varchar(20) - # admin name2 5: 2. order subdivision (county/province) varchar(100) - # admin code2 6: 2. order subdivision (county/province) varchar(20) - # admin name3 7: 3. order subdivision (community) varchar(100) - # admin code3 8: 3. order subdivision (community) varchar(20) - # latitude 9: estimated latitude (wgs84) - # longitude 10: estimated longitude (wgs84) - # accuracy 11: accuracy of lat/lng from 1=estimated to 6=centroid - - puts('>>> Writing CSV file...') - CSV.open(output_file(args[:country]), 'w') do |outfile| - # write the header - outfile << %w[COUNTRY POSTAL_CODE CITY STATE SHORT_STATE COUNTY SHORT_COUNTY COMMUNITY SHORT_COMMUNITY LATITUDE LONGITUDE ACCURACY] - CSV.foreach(country_file, headers: false, col_sep: "\t", quote_char: '|') do |row| - outfile << row - end - end - - # delete the extracted file - FileUtils.rm(country_file) - - # build sqlite db - FileUtils.rm_f 'free_zipcode_data.sqlite3' - end - - # desc 'Create countries table' - task create_countries_table: :kill_db do - schema = <<-STOP.here_with_pipe - |create table countries ( - | id integer not null primary key, - | alpha2 varchar(2) not null, - | alpha3 varchar(3), - | iso varchar(3), - | name varchar(255) not null - |) - STOP - database.execute_batch(schema) - ndx = <<-STOP.here_with_pipe(' ') - |CREATE UNIQUE INDEX "main"."unique_country_alpha2" - |ON countries (alpha2 COLLATE NOCASE ASC); - STOP - database.execute_batch(ndx) - end - - # desc 'Create states table' - task create_states_table: :create_countries_table do - schema = <<-STOP.here_with_pipe - |create table states ( - | id integer not null primary key, - | country_id integer not null, - | abbr varchar(2) not null, - | name varchar(255) - |) - STOP - database.execute_batch(schema) - ndx = <<-STOP.here_with_pipe(' ') - |CREATE UNIQUE INDEX "main"."unique_state" - |ON states (abbr, country_id COLLATE NOCASE ASC); - STOP - database.execute_batch(ndx) - end - - # desc 'Create counties table' - task create_counties_table: :create_states_table do - schema = <<-STOP.here_with_pipe - |create table counties ( - | id integer not null primary key, - | state_id integer, - | abbr varchar(255), - | name varchar(255), - | county_seat varchar(255) - |) - STOP - database.execute_batch(schema) - end - - # desc 'Create zipcodes table' - task create_zipcodes_table: :create_counties_table do - schema = <<-STOP.here_with_pipe - |create table zipcodes ( - | id integer not null primary key, - | code varchar(10) not null, - | state_id integer, - | county_id integer, - | city varchar(255), - | area_code varchar(3), - | lat float, - | lon float, - | accuracy varchar(8) - |) - STOP - database.execute_batch(schema) - ndx = <<-STOP.here_with_pipe(' ') - |CREATE UNIQUE INDEX "main"."unique_zipcode" - |ON zipcodes (state_id, code, city COLLATE NOCASE ASC); - STOP - database.execute_batch(ndx) - end - - desc 'Populate an sqlite DB' - task :populate_db, [:country] => [:create_zipcodes_table] do |_t, args| - start_time = Time.now - puts '>>> Building SQLite3 DB...' - - csvfile = output_file(args[:country]) - - # run the build task if the data is missing for the passed country - Rake::Task['data:build'].invoke(args[:country]) unless File.exist?(csvfile) - - last_country = nil - count = 0 - - # Countries - CSV.foreach(csvfile, headers: true) do |row| - country_hash = country_lookup_table[row['COUNTRY']] - puts(">>> #{country_hash[:name]}") if last_country != country_hash[:name] - last_country = country_hash[:name] - puts(">>> COUNT: #{count}") if (count % 10000).zero? - count += 1 - - # insert country - sql = <<-STOP.here_with_pipe(' ') - |INSERT INTO countries (alpha2, alpha3, iso, name) - |VALUES ('#{row['COUNTRY']}', - | '#{country_hash[:alpha3]}', - | '#{country_hash[:iso]}', - | '#{country_hash[:name]}') - STOP - begin - database.execute(sql) - rescue SQLite3::ConstraintException - # next - end - - # state - if row['STATE'] - country_id = get_country_id(row['COUNTRY']) - sql = <<-STOP.here_with_pipe(' ') - |INSERT INTO states (abbr, name, country_id) - |VALUES ('#{row['SHORT_STATE']}', - | '#{escape_single_quotes(row['STATE'])}', - | #{country_id} - |) - STOP - begin - database.execute(sql) - rescue StandardError => err - # next - end - end - - # county - if row['COUNTY'] - state_id = get_state_id(row['SHORT_STATE']) - sql = <<-STOP.here_with_pipe(' ') - |INSERT INTO counties (state_id, abbr, name) - |VALUES ('#{state_id}', - | '#{row['SHORT_COUNTY']}', - | '#{escape_single_quotes(row['COUNTY'])}' - |) - STOP - begin - database.execute(sql) - rescue StandardError => err - raise "Please file an issue at https://github.com/midwire/free_zipcode_data/issues/new: [#{err}] -> SQL: [#{sql}]" - end - end - - # zipcode - if row['POSTAL_CODE'] - state_id = get_state_id(row['SHORT_STATE']) - county_id = get_county_id(row['COUNTY']) - city_name = escape_single_quotes(row['CITY']) - sql = <<-STOP.here_with_pipe(' ') - |INSERT INTO zipcodes (code, state_id, county_id, city, lat, lon, accuracy) - |VALUES ('#{row['POSTAL_CODE']}', - | '#{state_id}', - | '#{county_id}', - | '#{city_name}', - | '#{row['LATITUDE']}', - | '#{row['LONGITUDE']}', - | '#{row['ACCURACY']}' - |) - STOP - begin - database.execute(sql) - rescue SQLite3::ConstraintException => err - # there are some duplicates - rescue StandardError => err - raise "Please file an issue at https://github.com/midwire/free_zipcode_data/issues/new: [#{err}] -> SQL: [#{sql}]" - end - else - puts(">>> Missing Postal Code: #{row}") - end - end - - end_time = Time.now - start_time - puts ">>>> Completed in #{end_time} seconds" - end - - private - - def database - @db ||= SQLite3::Database.new('free_zipcode_data.sqlite3') - end - - def get_country_id(country) - rows = database.execute("SELECT id FROM countries WHERE alpha2 = '#{country}'") - rows[0].nil? ? nil : rows[0].first - end - - def get_state_id(state) - rows = database.execute("SELECT id FROM states WHERE abbr = '#{state}'") - rows[0].nil? ? nil : rows[0].first - end - - def get_county_id(county) - return nil if county.nil? - sql = "SELECT id FROM counties WHERE name = '#{escape_single_quotes(county)}'" - rows = database.execute(sql) - rows[0].nil? ? nil : rows[0].first - rescue SQLite3::SQLException => err - raise "Please file an issue at https://github.com/midwire/free_zipcode_data/issues/new: [#{err}] -> SQL: [#{sql}]" - end - - def escape_single_quotes(string) - string.gsub(/[']/, '\'\'') - end - - def root - Pathname.new(File.dirname(__FILE__)).parent.parent - end - - def data_dir - File.join(root, 'data') - end - - def build_dir - File.join(root, 'build') - end - - def country_zipfile(country) - filename = country.nil? ? 'allCountries' : country.upcase - filename += '.zip' unless filename =~ /\.zip$/ - filename - end - - def country_csvfile(country) - filename = country.nil? ? 'all_countries' : country.downcase - filename += '.csv' unless filename =~ /\.csv$/ - filename - end - - def output_file(country) - filename = country.nil? ? 'all_countries.csv' : "#{country.downcase}.csv" - File.join(build_dir, filename) - end - - def country_lookup_table - @country_lookup_table ||= YAML.load_file('country_lookup_table.yml') - end -end -# rubocop:enable Metrics/BlockLength diff --git a/spec/spec_helper.rb b/spec/spec_helper.rb new file mode 100644 index 0000000..5b83a36 --- /dev/null +++ b/spec/spec_helper.rb @@ -0,0 +1,35 @@ +# frozen_string_literal: true + +ENV['GEM_ENV'] = 'test' + +require 'pry' + +Dir[root.join('spec/support/**/*.rb')].each { |f| require f } + +RSpec.configure do |config| + config.expect_with :rspec do |expectations| + expectations.include_chain_clauses_in_custom_matcher_descriptions = true + end + + config.mock_with :rspec do |mocks| + mocks.verify_partial_doubles = true + end + + config.shared_context_metadata_behavior = :apply_to_host_groups + + config.filter_run_when_matching :focus + + config.example_status_persistence_file_path = 'spec/examples.txt' + + config.disable_monkey_patching! + + config.warnings = true + + config.default_formatter = 'doc' if config.files_to_run.one? + + config.profile_examples = 3 + + config.order = :random + + Kernel.srand config.seed +end