Skip to content

Commit

Permalink
Finish refactor
Browse files Browse the repository at this point in the history
* Add a progressbar
* Fix a bug when looking up state_id
* Add a ‘name’ index on states table
* Add a switch to generate individual csv files
* Refactor a few more things
  • Loading branch information
cblackburn-ajla committed Apr 23, 2018
1 parent ba54bc6 commit ada8e60
Show file tree
Hide file tree
Showing 11 changed files with 127 additions and 28 deletions.
3 changes: 3 additions & 0 deletions .rubocop.yml
Expand Up @@ -31,6 +31,9 @@ CommentAnnotation:
########################################
# Style Cops

Style/ClassVars:
Enabled: false

Style/Documentation:
Enabled: false

Expand Down
3 changes: 3 additions & 0 deletions Gemfile.lock
Expand Up @@ -4,6 +4,7 @@ PATH
free_zipcode_data (1.0.0)
colored (~> 1.2)
kiba (~> 2.0)
ruby-progressbar (~> 1.9)
rubyzip (~> 1.2)
sqlite3 (~> 1.3)
trollop (~> 2.1)
Expand Down Expand Up @@ -51,6 +52,7 @@ GEM
rainbow (>= 2.2.2, < 4.0)
ruby-progressbar (~> 1.7)
unicode-display_width (~> 1.0, >= 1.0.1)
ruby-prof (0.17.0)
ruby-progressbar (1.9.0)
rubyzip (1.2.1)
simplecov (0.16.1)
Expand All @@ -73,6 +75,7 @@ DEPENDENCIES
rake (~> 12.0)
rspec (~> 3.7)
rubocop
ruby-prof
simplecov

BUNDLED WITH
Expand Down
6 changes: 5 additions & 1 deletion free_zipcode_data.gemspec
Expand Up @@ -5,11 +5,12 @@ lib = File.expand_path('../lib', __FILE__)
$LOAD_PATH.unshift(lib) unless $LOAD_PATH.include?(lib)
require 'free_zipcode_data/version'

# rubocop:disable Metrics/BlockLength
Gem::Specification.new do |spec|
spec.name = 'free_zipcode_data'
spec.version = FreeZipcodeData::VERSION
spec.authors = ['Chris Blackburn', 'Chris McKnight']
spec.email = ['87a1779b@opayq.com']
spec.email = ['87a1779b@opayq.com', 'fixme@mcknight.bogus']
spec.summary = 'Free US postal codes in CSV and SQLite3 format.'
spec.description = spec.summary
spec.homepage = 'https://github.com/midwire/free_zipcode_data'
Expand All @@ -26,11 +27,14 @@ Gem::Specification.new do |spec|
spec.add_development_dependency 'rake', '~> 12.0'
spec.add_development_dependency 'rspec', '~> 3.7'
spec.add_development_dependency 'rubocop'
spec.add_development_dependency 'ruby-prof'
spec.add_development_dependency 'simplecov'

spec.add_runtime_dependency 'colored', '~> 1.2'
spec.add_runtime_dependency 'kiba', '~> 2.0'
spec.add_runtime_dependency 'ruby-progressbar', '~> 1.9'
spec.add_runtime_dependency 'rubyzip', '~> 1.2'
spec.add_runtime_dependency 'sqlite3', '~> 1.3'
spec.add_runtime_dependency 'trollop', '~> 2.1'
end
# rubocop:enable Metrics/BlockLength
3 changes: 3 additions & 0 deletions lib/etl/free_zipcode_data_job.rb
Expand Up @@ -30,6 +30,9 @@ def setup(country_file, database, logger, options)
database: database,
tablename: options[:zipcode_tablename]

post_process do
logger.verbose('Finished generating table data...')
end
end
end
end
Expand Down
2 changes: 2 additions & 0 deletions lib/free_zipcode_data/country_table.rb
Expand Up @@ -39,6 +39,8 @@ def write(row)
rescue SQLite3::ConstraintException
# Swallow duplicates
end

update_progress
end
end
end
7 changes: 5 additions & 2 deletions lib/free_zipcode_data/county_table.rb
Expand Up @@ -25,8 +25,9 @@ def build

def write(row)
return nil unless row[:county]
state_id = get_state_id(row[:short_state])
raise "Could not find state: #{row[:short_state]}" unless state_id
state_id = get_state_id(row[:short_state], row[:state])
return nil unless state_id

sql = <<-SQL
INSERT INTO counties (state_id, abbr, name)
VALUES ('#{state_id}',
Expand All @@ -42,6 +43,8 @@ def write(row)
rescue StandardError => err
raise "Please file an issue at #{ISSUE_URL}: [#{err}] -> SQL: [#{sql}]"
end

update_progress
end
end
end
31 changes: 22 additions & 9 deletions lib/free_zipcode_data/db_table.rb
@@ -1,16 +1,24 @@
# frozen_string_literal: true

require 'yaml'
require 'ruby-progressbar'

module FreeZipcodeData
class DbTable
ISSUE_URL = 'https://github.com/midwire/free_zipcode_data/issues/new'

attr_reader :database, :tablename
@@progressbar = nil

# Binds this table wrapper to a database handle and table name, then creates
# the progress bar stored in the @@progressbar class variable.
#
# The bar's total is the 'line_count' value read from the `meta` table,
# multiplied by four. A missing row yields nil, and `nil.to_i` makes the
# total 0.
#
# NOTE(review): the multiplier presumably matches the four table writers that
# call update_progress once per row — confirm.
# NOTE(review): every instantiation replaces the class-level bar with a new
# one; confirm that this is intended when several tables are created.
#
# @param database [Object] handle that responds to `execute(sql)`
# @param tablename [String] name of the table this object manages
def initialize(database:, tablename:)
  @database = database
  @tablename = tablename
  # 'line_count' is single-quoted so SQLite always reads it as a string
  # literal; double quotes denote an identifier and only fall back to a
  # string through a legacy compatibility quirk.
  lc = select_first("SELECT value FROM meta WHERE name = 'line_count'")
  @@progressbar = ProgressBar.create(total: lc.to_i * 4, format: '%t: |%B| %e')
end

# Advances the progress bar held in the @@progressbar class variable by one step.
def update_progress
@@progressbar.increment
end

private
Expand All @@ -19,23 +27,28 @@ def country_lookup_table
@country_lookup_table ||= YAML.load_file('country_lookup_table.yml')
end

def get_country_id(country)
rows = database.execute("SELECT id FROM countries WHERE alpha2 = '#{country}'")
# Runs +sql+ and returns the first column of the first result row.
#
# @param sql [String] the statement to execute
# @return [Object, nil] the first value of the first row, or nil when the
#   query produced no rows
# @raise [RuntimeError] wrapping any SQLite3::SQLException, with the issue
#   tracker URL and the offending SQL included in the message
def select_first(sql)
  first_row = database.execute(sql).first
  first_row&.first
rescue SQLite3::SQLException => e
  raise "Please file an issue at #{ISSUE_URL}: [#{e}] -> SQL: [#{sql}]"
end

def get_state_id(state)
rows = database.execute("SELECT id FROM states WHERE abbr = '#{state}'")
rows[0].nil? ? nil : rows[0].first
# Looks up the id of the country whose alpha2 code equals +country+.
#
# @param country [String] value matched against the `alpha2` column
# @return [Object, nil] whatever select_first returns for the lookup query
def get_country_id(country)
  select_first("SELECT id FROM countries WHERE alpha2 = '#{country}'")
end

# Looks up a state id, matching either the abbreviation or the state name.
#
# NOTE(review): only +state_name+ goes through escape_single_quotes;
# +state_abbr+ is interpolated as given.
#
# @param state_abbr [String] value matched against the `abbr` column
# @param state_name [String] value matched against the `name` column
# @return [Object, nil] whatever select_first returns for the lookup query
def get_state_id(state_abbr, state_name)
  quoted_name = escape_single_quotes(state_name)
  select_first("SELECT id FROM states\nWHERE abbr = '#{state_abbr}' OR name = '#{quoted_name}'")
end

def get_county_id(county)
return nil if county.nil?
sql = "SELECT id FROM counties WHERE name = '#{escape_single_quotes(county)}'"
rows = database.execute(sql)
rows[0].nil? ? nil : rows[0].first
rescue SQLite3::SQLException => err
raise "Please file an issue at #{ISSUE_URL}: [#{err}] -> SQL: [#{sql}]"
select_first(sql)
end

def escape_single_quotes(string)
Expand Down
68 changes: 54 additions & 14 deletions lib/free_zipcode_data/runner.rb
Expand Up @@ -6,9 +6,10 @@

require_relative '../etl/free_zipcode_data_job'

require 'pry' if ENV.fetch('APP_ENV') == 'development'
require 'pry' if ENV.fetch('APP_ENV', '') == 'development'

module FreeZipcodeData
# rubocop:disable Metrics/ClassLength
class Runner
attr_accessor :logger, :options

Expand All @@ -23,30 +24,38 @@ def initialize

def start
start_time = Time.now
options = FreeZipcodeData::Options.instance
options.initialize_hash(collect_args)
opt = FreeZipcodeData::Options.instance
opt.initialize_hash(collect_args)
@options = opt.hash

logger.info('Starting FreeZipcodeData...'.green)
logger.info("Starting FreeZipcodeData v#{VERSION}...".green)

datasource = DataSource.new(options.hash.country)
datasource = DataSource.new(options.country)
datasource.download

database = SqliteRam.new(File.join(options.hash.work_dir, 'free_zipcode_data.sqlite3'))
db_file = File.join(options.work_dir, 'free_zipcode_data.sqlite3')
database = SqliteRam.new(db_file)
configure_meta(database.conn, datasource.datafile)

%i[country state county zipcode].each { |t| initialize_table(t, database) }

extract_transform_load(datasource, database)

logger.info("Saving database to disk '#{db_file}'...")
database.save_to_disk

elapsed = Time.now - start_time
logger.info("Finished in [#{elapsed}] seconds.".yellow)
if options.generate_files
logger.info('Generating .csv files...')
database.dump_tables(options.work_dir)
end

elapsed = Time.at(Time.now - start_time).utc.strftime('%H:%M:%S')
logger.info("Processed #{datasource_line_count} zipcodes in [#{elapsed}].".yellow)
end

private

def initialize_table(table_sym, database)
options = Options.instance.hash
tablename = options["#{table_sym}_tablename".to_sym]
logger.verbose("Initializing #{table_sym} table: '#{tablename}'...")
klass = instance_eval("#{titleize(table_sym)}Table", __FILE__, __LINE__)
Expand All @@ -57,12 +66,37 @@ def initialize_table(table_sym, database)
table.build
end

# Returns the number of lines in the data file, counting them only once.
#
# The first call must pass +filename+; the result is memoized in
# @datasource_line_count. Later calls may omit the argument to read the cached
# value back, which is how `start` calls it when logging the final summary.
#
# @param filename [String, nil] path of the file to count; optional once the
#   count has been cached
# @return [Integer] number of lines in the file
# @raise [ArgumentError] if no filename is given before a count is cached
def datasource_line_count(filename = nil)
  @datasource_line_count ||= begin
    raise ArgumentError, 'filename is required until the line count is cached' if filename.nil?

    count = File.foreach(filename).count
    logger.verbose("Processing #{count} zipcodes in '#{filename}'...")
    count
  end
end

# Creates the `meta` table and inserts a row named 'line_count' whose value is
# the line count of the data file.
#
# `database` receives the execute_batch/execute calls; `datasource` is passed
# straight to datasource_line_count.
# NOTE(review): `start` passes `database.conn` and `datasource.datafile` here,
# so despite the parameter names these look like a raw connection and a file
# path — confirm.
def configure_meta(database, datasource)
# Schema for a small name/value store.
schema = <<-SQL
create table meta (
id integer not null primary key,
name varchar(255),
value varchar(255)
)
SQL
database.execute_batch(schema)

# The count is interpolated unquoted, as a numeric literal.
sql = <<-SQL
INSERT INTO meta (name, value)
VALUES ('line_count', #{datasource_line_count(datasource)})
SQL
database.execute(sql)
end

def extract_transform_load(datasource, database)
job = ETL::FreeZipcodeDataJob.setup(
datasource.datafile,
database.conn,
logger,
FreeZipcodeData::Options.instance.hash
options
)
Kiba.run(job)
end
Expand All @@ -71,15 +105,20 @@ def extract_transform_load(datasource, database)
# rubocop:disable Metrics/MethodLength
def collect_args
Trollop.options do
opt(
:work_dir,
'REQUIRED: Specify your work/build directory, where the SQLite and .csv files will be built',
type: :string, required: true, short: '-w'
)
opt(
:country,
'Specify the country code for processing, or all countries if not specified',
type: :string, required: false, short: '-g'
type: :string, required: false, short: '-f'
)
opt(
:work_dir,
'Specify your work/build directory, where the SQLite and .csv files will be built',
type: :string, required: true, short: '-w'
:generate_files,
'Generate CSV files: [counties.csv, states.csv, countries.csv, zipcodes.csv]',
type: :boolean, required: false, short: '-g', default: false
)
opt(
:country_tablename,
Expand Down Expand Up @@ -129,4 +168,5 @@ def titleize(string)
ret
end
end
# rubocop:enable Metrics/ClassLength
end
18 changes: 18 additions & 0 deletions lib/free_zipcode_data/sqlite_ram.rb
@@ -1,6 +1,7 @@
# frozen_string_literal: true

require 'sqlite3'
require 'csv'

# Open a SQlite DB, work with it in-memory and save back to disk
class SqliteRam
Expand All @@ -18,4 +19,21 @@ def save_to_disk
backup.step(-1)
backup.finish
end

# Writes every table listed in sqlite_master to "<path>/<table>.csv".
#
# Each file starts with a header row of column names (the second field of each
# `pragma table_info` row), followed by one CSV row per table row.
#
# @param path [String] directory the .csv files are written into
# @return [Array] the rows returned by the sqlite_master query
def dump_tables(path)
  # 'table' is single-quoted so SQLite reads it as a string literal rather
  # than as an identifier with a legacy fallback.
  tables = conn.execute("select name from sqlite_master where type = 'table'")
  tables.each do |table_array|
    table = table_array.first
    columns = conn.execute("pragma table_info('#{table}')").map { |info| info[1] }
    # Quote the identifier so table names that need quoting still select cleanly.
    quoted_table = %("#{table.gsub('"', '""')}")
    CSV.open(File.join(path, "#{table}.csv"), 'w') do |csv|
      csv << columns
      conn.execute("select * from #{quoted_table}").each { |row| csv << row }
    end
  end
end
end
8 changes: 8 additions & 0 deletions lib/free_zipcode_data/state_table.rb
Expand Up @@ -20,6 +20,12 @@ def build
ON #{tablename} (abbr, country_id COLLATE NOCASE ASC);
SQL
database.execute_batch(ndx)

ndx = <<-SQL
CREATE UNIQUE INDEX "main"."state_name"
ON #{tablename} (name COLLATE NOCASE ASC);
SQL
database.execute_batch(ndx)
end

def write(row)
Expand All @@ -38,6 +44,8 @@ def write(row)
rescue SQLite3::ConstraintException
# Swallow duplicates
end

update_progress
end
end
end
6 changes: 4 additions & 2 deletions lib/free_zipcode_data/zipcode_table.rb
Expand Up @@ -30,12 +30,12 @@ def build
def write(row)
return nil unless row[:postal_code]

state_id = get_state_id(row[:short_state])
state_id = get_state_id(row[:short_state], row[:state])
county_id = get_county_id(row[:county])
city_name = escape_single_quotes(row[:city])

sql = <<-SQL
INSERT INTO zipcodes (code, state_id, county_id, city, lat, lon, accuracy)
INSERT INTO zipcodes (code, state_id, city, lat, lon, accuracy)
VALUES ('#{row[:postal_code]}',
'#{state_id}',
'#{county_id}',
Expand All @@ -53,6 +53,8 @@ def write(row)
rescue StandardError => err
raise "Please file an issue at #{ISSUE_URL}: [#{err}] -> SQL: [#{sql}]"
end

update_progress
end
end
end

0 comments on commit ada8e60

Please sign in to comment.