-
Notifications
You must be signed in to change notification settings - Fork 1
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Import authorities #4
Comments
It would be good to document these import scripts on the wiki somehow |
Didn't use an ignore file or do any manual cleanup (other than what you see in the script below): #!/usr/bin/env ruby
# -*- encoding : utf-8 -*-
require 'csv'
# gem install unicode_utils
require 'unicode_utils/downcase'
require 'unicode_utils/titlecase'
# Cleans the source CSV and prints the resulting CSV to standard output.
#
# @param csv [String] path of the CSV file to clean
# @param ignore [String, nil] optional path of a file whose lines are passed
#   to clean_csv as the ignore list
def main(csv, ignore)
  ignore_list = []
  ignore_list = File.read(ignore).split("\n") if ignore

  cleaned_rows = clean_csv(csv, ignore_list)
  puts print_new_csv(cleaned_rows)
end
# Reads the source CSV and converts every row into a hash of cleaned fields.
#
# The first CSV row is treated as the header row and header names are
# converted to symbols. Each output hash has the keys :name, :short_name,
# :request_email, :home_page, :notes and :tag_string, in that order.
#
# @param csv [String] path of the CSV file to read
# @param ignore_list [Array<String>] institution names whose rows are skipped
# @return [Array<Hash>] cleaned rows; rows for which manual_fixups returns
#   nil are left out
def clean_csv(csv, ignore_list = [])
  rows = []

  CSV.foreach(csv, headers: true, header_converters: :symbol) do |row|
    # Legacy check, kept so input that really has a `name` column behaves as before.
    next if ignore_list.include?(row[:name])

    raw_name = row[:nombre_institucion].to_s
    cleaned_name = clean_name(raw_name)

    # The name is read from :nombre_institucion everywhere else in this
    # method, so the ignore list has to be matched against that column too;
    # otherwise entries in the ignore file can never match. Both the raw and
    # the cleaned spelling are accepted. Blank names are never matched so a
    # stray empty line in the ignore file cannot drop rows.
    name_spellings = [raw_name, cleaned_name].reject(&:empty?)
    next if name_spellings.any? { |spelling| ignore_list.include?(spelling) }

    cleaned_data = {
      name: cleaned_name,
      short_name: clean_short_name(row[:sigla].to_s),
      request_email: clean_email(row[:email].to_s),
      home_page: clean_homepage(row[:url].to_s),
      notes: make_notes(row[:direccion].to_s),
      tag_string: make_tag_string(row[:sector].to_s)
    }

    # manual_fixups returns nil for rows that must be dropped.
    fixed_up = manual_fixups(cleaned_data)
    rows << fixed_up unless fixed_up.nil?
  end

  rows
end
# Title-cases an institution name.
#
# @param str [String] the raw name
# @return whatever UnicodeUtils.titlecase returns for str
def clean_name(str)
UnicodeUtils.titlecase(str)
end
# Produces the short name for an institution; currently always nil.
#
# An earlier approach (nil for an empty string, otherwise
# UnicodeUtils.downcase(str)) is disabled because the source data contains
# too many duplicate short names to handle.
#
# @param str [String] the raw short name (currently unused)
# @return [nil]
def clean_short_name(str)
  nil
end
# Extracts a single email address from a free-form field.
#
# Only the first whitespace-separated token is kept; commas are removed from
# it, anything after the first ';' is discarded, and one trailing non-word
# character is stripped.
#
# Blank input, and input that reduces to nothing (for example "," or ";"),
# yields nil instead of raising NoMethodError or returning an empty string.
#
# @param str [String] the raw email field
# @return [String, nil] the cleaned address, or nil when nothing usable remains
# @raise [RuntimeError] if the address still ends in a non-word character
#   after one trailing non-word character has been removed
def clean_email(str)
  first_token = str.split(' ').first
  return nil if first_token.nil? # empty or whitespace-only input

  candidate = first_token.strip.delete(',').split(';').first
  return nil if candidate.nil? # the token consisted only of separators

  candidate = candidate.strip.sub(/\W\z/, '')
  return nil if candidate.empty?

  raise "Email included non-word at end: #{candidate}" if candidate =~ /\W\z/

  candidate
end
# Normalises a home page URL so that it always carries a scheme.
#
# Surrounding whitespace is removed before the scheme is inspected, so a
# value such as " http://example.com" is not prefixed a second time, and a
# whitespace-only value yields nil rather than a bare "http://". The scheme
# check is case-insensitive and requires the full "http://" or "https://"
# prefix, so a host that merely begins with "http" still gets a scheme.
#
# @param str [String] the raw URL field
# @return [String, nil] the URL with a scheme, or nil for blank input
def clean_homepage(str)
  url = str.strip
  return nil if url.empty?

  if url =~ %r{\Ahttps?://}i
    url
  else
    "http://#{url}"
  end
end
# Builds the HTML notes snippet holding an institution's address.
#
# @param address [String] the raw address field
# @return [String, nil] the address, stripped of surrounding whitespace and
#   preceded by a bold "Direccion:" label, or nil when address is empty
def make_notes(address)
  return nil if address.empty?

  "<strong>Direccion:</strong> #{address.strip}"
end
# Converts a comma-separated sector field into a space-separated tag string.
#
# The field is split on ", "; within each piece spaces become underscores and
# the piece is lower-cased with UnicodeUtils.downcase.
#
# @param str [String] the raw sector field
# @return [String, nil] the tag string, or nil when str is empty or reads
#   "no aplica" (ignoring case and surrounding whitespace)
def make_tag_string(str)
  not_applicable = str.strip.downcase == 'no aplica'
  return nil if str.empty? || not_applicable

  tags = str.split(', ').map do |sector|
    UnicodeUtils.downcase(sector.tr(' ', '_'))
  end
  tags.join(' ')
end
# Records that get a new name. Each entry pairs the field values a record
# must have with the replacement name.
FIXUP_RENAMES = [
  [{ name: 'Personeria Municipal Jericó', home_page: 'http://www.jerico-antioquia.gov.co/' },
   'Personeria Municipal Jericó - Antioquia'],
  [{ name: 'Personeria Municipal Jericó', home_page: 'http://www.jerico-boyaca.gov.co' },
   'Personeria Municipal Jericó - Boyacá'],
  [{ name: 'Personeria Municipal San Francisco', home_page: 'http://sanfrancisco-antioquia.gov.co/Personeria.shtml' },
   'Personeria Municipal San Francisco - Antioquia'],
  [{ name: 'Personeria Municipal San Francisco', home_page: 'http://www.sanfrancisco-putumayo.gov.co' },
   'Personeria Municipal San Francisco - Putumayo'],
  # Not sure about this one, so just add a tag-like suffix.
  [{ name: 'E.s.e Hospital Nuestra Señora Del Carmen' },
   'E.s.e Hospital Nuestra Señora Del Carmen tabio']
].freeze

# Records that are dropped as duplicates. Each entry lists the field values a
# record must have; a nil value means the field must be nil.
FIXUP_DUPLICATES = [
  { name: 'Secretaria Distrital De Salud', request_email: 'contabilidad@shd.gov.co' },
  { name: 'E.s.e. Hospital El Carmen -Amalfi', request_email: 'paramillo209@gmail.com' },
  { name: 'E.s.e Hospital Local Del Bolivar -Santander', request_email: nil },
  { name: 'E.s.e. Hospital San Antonio - Natagaima', notes: '<strong>Direccion:</strong> Calle 5 con Carrera 11' },
  { name: 'E.s.e Hospital Universitario Del Caribe', request_email: nil },
  { name: 'E.s.p Empresa De Servicios Publicos La Union' },
  { name: 'Personeria Municipal De Belén Boyacá' },
  { name: 'Personería Municipal El Zulia' },
  { name: 'Personeria Municipal Soplaviento', request_email: 'miladisbc25@yahoo.es' },
  # Duplicate that also carries an invalid email.
  { name: 'E.s.e Centro De Salud San Miguel', request_email: '0@0' },
  { name: 'E.s.e. Centro De Salud San Sebastián', notes: '<strong>Direccion:</strong> calle 2 # 4 - 64' },
  # Looks like a duplicate.
  { name: 'E.s.e Hospital Nuevo Horizonte', request_email: 'luisfernandovargas2005@hotmail.com' },
  { name: 'Personería Municipal Murillo', request_email: nil }
].freeze

# Values of request_email that are not email addresses and are blanked.
FIXUP_INVALID_EMAILS = [
  'alcaldia@sandona_narino.gov.co',
  'elbanco-magadalena.gov.co',
  'Carrera',
  'janella-hotmail.com',
  'http://indeportesguajira.gov.co/contactenos',
  'personeriamomil',
  'www.electrohuila.com.co/Login.aspx?ReturnUrl=%2fP_Q_R.aspx',
  'http://www.urra.com.co/Contacteno.php',
  'www.conif.org.co/contactenos.php'
].freeze

# Addresses containing non-ASCII characters (see https://git.io/vrKTJ). They
# are blanked and the record is tagged "unicode_email".
FIXUP_UNICODE_EMAILS = [
  'alcaldía@nocaima-cundinamarca.gov.co',
  'haroldcaleño2993@hotmail.com',
  'fmuñoz@esepaf.gov.co',
  'alcaldía@gama-cundinamarca.gov.co',
  'corporación@parquearvi.org'
].freeze

# True when every listed field of data equals the expected value.
def fixup_conditions_met?(data, conditions)
  conditions.all? { |field, expected| data[field] == expected }
end

# Applies hand-maintained corrections to one cleaned record.
#
# Name-based rules (renames and duplicate removal) are consulted first; a
# record that matches one of them is returned, or dropped, without its email
# being examined. Only records matching no name-based rule go through the
# email rules.
#
# @param data [Hash] a cleaned record; it is modified in place
# @return [Hash, nil] the same hash, possibly modified, or nil when the
#   record is a known duplicate and must be dropped
def manual_fixups(data)
  _conditions, new_name = FIXUP_RENAMES.find do |conditions, _replacement|
    fixup_conditions_met?(data, conditions)
  end
  if new_name
    # dup so the record never shares a string object with the lookup table
    data[:name] = new_name.dup
    return data
  end

  return nil if FIXUP_DUPLICATES.any? { |conditions| fixup_conditions_met?(data, conditions) }

  email = data[:request_email]
  if FIXUP_UNICODE_EMAILS.include?(email)
    data[:request_email] = nil
    data[:tag_string] = data[:tag_string].to_s.split(' ').push('unicode_email').join(' ')
  elsif FIXUP_INVALID_EMAILS.include?(email)
    data[:request_email] = nil
  end

  data
end
# Serialises the cleaned records as CSV text.
#
# The header row is taken from the keys of the first record, with "#"
# prepended to the first header. Every record is written in that same key
# order, so a record whose keys were inserted in a different order still
# lines up with the header.
#
# @param data [Array<Hash>] the cleaned records
# @return [String] the CSV text; an empty string when there are no records
#   (previously this raised NoMethodError)
def print_new_csv(data)
  return '' if data.empty?

  keys = data.first.keys
  headers = keys.dup
  headers[0] = "##{headers[0]}"

  CSV.generate(headers: headers) do |csv|
    csv << headers
    data.each { |record| csv << record.values_at(*keys) }
  end
end
# Command-line arguments: the CSV to clean and an optional ignore file.
csv, ignore = ARGV.values_at(0, 1)

# Prints the missing-file message and terminates with exit status 1.
report_missing = lambda do |path|
  puts "File does not exist: #{ path }"
  exit 1
end

# The CSV path is mandatory; the ignore file is checked only when given.
report_missing.call(csv) if csv.nil? || !File.exist?(csv)
report_missing.call(ignore) if ignore && !File.exist?(ignore)
main(csv, ignore) |
A handful of authorities couldn't be imported because of mysociety/alaveteli#2957. |
|
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
Source will need cleanup similar to mysociety/bilmehakki-theme#11
The text was updated successfully, but these errors were encountered: