diff --git a/README.md b/README.md new file mode 100644 index 0000000..089fd40 --- /dev/null +++ b/README.md @@ -0,0 +1,25 @@ +# Welcome to the CSV Normalizer + +Using the normalizer is as simple as naming the CSV you'd like to read from and write to: + +```sh +ruby ./normalizer.rb < sample.csv > output.csv +``` + +To test the broken UTF-8 CSV, the command is: + +```sh +ruby ./normalizer.rb < broken_sample.csv > output.csv +``` + +This PR should achieve the following conversions: + +- [ ] The entire CSV is in the UTF-8 character set. +- [ ] The Timestamp column should be formatted in ISO-8601 format. +- [ ] The Timestamp column should be assumed to be in US/Pacific time; please convert it to US/Eastern. +- [ ] All ZIP codes should be formatted as 5 digits. If there are fewer than 5 digits, assume 0 as the prefix. +- [ ] The FullName column should be converted to uppercase. There will be non-English names. +- [ ] The Address column should be passed through as is, except for Unicode validation. Please note there are commas in the Address field; your CSV parsing will need to take that into account. Commas will only be present inside a quoted string. +- [ ] The FooDuration and BarDuration columns are in HH:MM:SS.MS format (where MS is milliseconds); please convert them to the total number of seconds expressed in floating point format. You should not round the result. +- [ ] The TotalDuration column is filled with garbage data. For each row, please replace the value of TotalDuration with the sum of FooDuration and BarDuration. +- [ ] The Notes column is free form text input by end-users; please do not perform any transformations on this column. If there are invalid UTF-8 characters, please replace them with the Unicode Replacement Character. 
\ No newline at end of file diff --git a/broken_sample.csv b/broken_sample.csv new file mode 100644 index 0000000..6e80948 --- /dev/null +++ b/broken_sample.csv @@ -0,0 +1,10 @@ +Timestamp,Address,ZIP,FullName,FooDuration,BarDuration,TotalDuration,Notes +4/1/11 11:00:00 AM,"123 4th St, Anywhere, AA",94121,Monkey Alberto,1:23:32.123,1:32:33.123,zzsasdfa,I am the very model of a modern major general +3/12/14 12:00:00 AM,"Somewhere Else, In Another Time, BB",1,Superman übertan,111:23:32.123,1:32:33.123,zzsasdfa,This is some Unicode right h�xxx ü ¡! 😀 +2/29/16 12:11:11 PM,111 Ste. #123123123,1101,Résumé Ron,31:23:32.123,1:32:33.123,zzsasdfa,🏳️🏴🏳️🏴 +1/1/11 12:00:01 AM,"This Is Not An Address, BusyTown, BT",94121,Mary 1,1:23:32.123,0:00:00.000,zzsasdfa,I like Emoji! 🍏🍎😍 +12/31/16 11:59:59 PM,"123 Gangnam Style Lives Here, Gangnam Town",31403,Anticipation of Unicode Failure,1:23:32.123,1:32:33.123,zzsasdfa,I like Math Symbols! ≱≰⨌⊚ +11/11/11 11:11:11 AM,überTown,10001,Prompt Negotiator,1:23:32.123,1:32:33.123,zzsasdfa,"I’m just gonna say, this is AMAZING. WHAT NEGOTIATIONS." 
require 'csv'
require 'date'
require 'time'

# Reads a CSV document from standard input, normalizes each column, and writes
# the normalized CSV to standard output.
#
# Every public formatter accepts the raw field value (a String, or nil for an
# empty field) and returns the normalized value for the output row.
class Normalizer
  # Column order of the generated CSV; also written as its header row.
  HEADERS = %w[Timestamp Address ZIP FullName FooDuration BarDuration TotalDuration Notes].freeze

  # Input timestamps look like "4/1/11 11:00:00 AM" (month/day/two-digit year).
  TIMESTAMP_FORMAT = '%m/%d/%y %I:%M:%S %p'

  # Unicode Replacement Character substituted for invalid byte sequences.
  REPLACEMENT = "\uFFFD"

  SECONDS_PER_HOUR = 3600
  PACIFIC_STANDARD_OFFSET = -8 # hours from UTC (PST); PDT is one hour more
  EASTERN_STANDARD_OFFSET = -5 # hours from UTC (EST); EDT is one hour more

  # Parses the CSV on standard input and normalizes every data row.
  #
  # @return [Array<Array>] one array of normalized fields per input row,
  #   in HEADERS order (the header row itself is not included)
  def read_file
    text = encode_utf8($stdin.read)
    # skip_blanks: a stray empty line must not become a row of nil fields.
    CSV.parse(text, headers: true, skip_blanks: true).map { |row| normalize_row(row) }
  end

  # Returns a valid UTF-8 copy of the given text.
  #
  # Re-encoding UTF-8 to UTF-8 is a no-op that keeps invalid bytes, which makes
  # CSV.parse raise later on, so invalid sequences are scrubbed explicitly.
  #
  # @param file [String, nil] raw text
  # @return [String] valid UTF-8 text with invalid bytes replaced by U+FFFD
  def encode_utf8(file)
    text = file.to_s
    utf8_compatible = [Encoding::UTF_8, Encoding::US_ASCII, Encoding::ASCII_8BIT]
    if utf8_compatible.include?(text.encoding)
      # The bytes are meant to be UTF-8 already: retag a copy and scrub it.
      text.dup.force_encoding(Encoding::UTF_8).scrub(REPLACEMENT)
    else
      text.encode(Encoding::UTF_8, invalid: :replace, undef: :replace, replace: REPLACEMENT)
    end
  end

  # Normalizes standard input and prints the resulting CSV, header row first,
  # to standard output.
  #
  # @return [nil]
  def write_file
    rows = read_file
    output = CSV.generate do |csv|
      csv << HEADERS
      rows.each { |row| csv << row }
    end
    $stdout.print output
  end

  # Interprets a US/Pacific wall-clock timestamp and renders it as an ISO-8601
  # string in US/Eastern time.
  #
  # An explicit format is used because free-form date parsing reads "4/1/11" as
  # year/month/day, which yields wrong dates and rejects valid ones such as
  # "2/29/16".
  #
  # @param time [String, nil] timestamp in TIMESTAMP_FORMAT
  # @return [String] ISO-8601 timestamp with the Eastern UTC offset, or
  #   "Invalid Date" when the input cannot be parsed
  def format_timestamp(time)
    parsed = DateTime.strptime(time.to_s.strip, TIMESTAMP_FORMAT)
    # Hold the wall-clock fields in a zone-less (UTC-tagged) Time for arithmetic.
    wall = Time.utc(parsed.year, parsed.month, parsed.day, parsed.hour, parsed.minute, parsed.second)
    utc_to_eastern(pacific_to_utc(wall)).iso8601
  rescue ArgumentError
    'Invalid Date'
  end

  # Left-pads a ZIP code with zeros to five characters without mutating the
  # argument; longer values are returned unchanged.
  #
  # @param zipcode [String, nil]
  # @return [String]
  def format_zip(zipcode)
    zipcode.to_s.rjust(5, '0')
  end

  # Upcases a name (Unicode-aware).
  #
  # @param fullname [String, nil]
  # @return [String, nil] nil when the field is empty
  def format_fullname(fullname)
    fullname&.upcase
  end

  # Passes the address through unchanged apart from replacing invalid UTF-8.
  #
  # @param address [String, nil]
  # @return [String, nil] nil when the field is empty
  def format_address(address)
    remove_invalid_chars(address)
  end

  # Converts an "HH:MM:SS.MS" duration to seconds.
  #
  # @param duration [String, nil]
  # @return [Float] total seconds
  def format_duration(duration)
    duration_in_seconds(duration).to_f
  end

  # Sums two "HH:MM:SS.MS" durations.
  #
  # The sum is computed exactly and converted to Float once, so no floating
  # point noise is added on top of the two operands.
  #
  # @param foo [String, nil]
  # @param bar [String, nil]
  # @return [Float] total seconds
  def total_duration(foo, bar)
    (duration_in_seconds(foo) + duration_in_seconds(bar)).to_f
  end

  # Replaces invalid UTF-8 sequences with U+FFFD and leaves the text otherwise
  # untouched.
  #
  # @param notes [String, nil]
  # @return [String, nil] nil when the field is empty
  def remove_invalid_chars(notes)
    encode_utf8(notes) unless notes.nil?
  end

  private

  # Builds one output row, in HEADERS order, from a parsed CSV row.
  def normalize_row(row)
    [
      format_timestamp(row['Timestamp']),
      format_address(row['Address']),
      format_zip(row['ZIP']),
      format_fullname(row['FullName']),
      format_duration(row['FooDuration']),
      format_duration(row['BarDuration']),
      total_duration(row['FooDuration'], row['BarDuration']),
      remove_invalid_chars(row['Notes'])
    ]
  end

  # Exact number of seconds in an "HH:MM:SS.MS" string, as a Rational.
  # Unparseable components count as zero (String#to_r is lenient).
  def duration_in_seconds(duration)
    duration.to_s.split(':').inject(0r) { |total, part| (total * 60) + part.to_r }
  end

  # Converts Pacific wall-clock time (held in a UTC-tagged Time) to the real
  # UTC instant. A wall-clock value repeated at the end of DST is treated as
  # its first (daylight) occurrence.
  def pacific_to_utc(wall)
    dst_start, dst_end = dst_window(wall.year)
    offset_hours = PACIFIC_STANDARD_OFFSET
    offset_hours += 1 if (dst_start...dst_end).cover?(wall)
    wall - (offset_hours * SECONDS_PER_HOUR)
  end

  # Returns the given UTC instant as a Time carrying the Eastern UTC offset.
  def utc_to_eastern(instant)
    dst_start, dst_end = dst_window(instant.year)
    # The window edges are 02:00 local wall-clock; express them in UTC.
    starts_at = dst_start - (EASTERN_STANDARD_OFFSET * SECONDS_PER_HOUR)     # 02:00 EST
    ends_at = dst_end - ((EASTERN_STANDARD_OFFSET + 1) * SECONDS_PER_HOUR)   # 02:00 EDT
    offset_hours = EASTERN_STANDARD_OFFSET
    offset_hours += 1 if (starts_at...ends_at).cover?(instant)
    instant.getlocal(offset_hours * SECONDS_PER_HOUR)
  end

  # US daylight-saving window for a year, as the 02:00 wall-clock switch times.
  # Since 2007: second Sunday of March to first Sunday of November.
  # 1987-2006: first Sunday of April to last Sunday of October.
  # NOTE(review): years before 1987 followed other rules and are approximated
  # with the 1987-2006 rule here.
  def dst_window(year)
    first_day, last_day =
      if year >= 2007
        [nth_sunday(year, 3, 2), nth_sunday(year, 11, 1)]
      else
        [nth_sunday(year, 4, 1), last_sunday(year, 10)]
      end
    [first_day, last_day].map { |day| Time.utc(day.year, day.month, day.day, 2) }
  end

  # Date of the nth Sunday (1-based) of the given month.
  def nth_sunday(year, month, nth)
    first = Date.new(year, month, 1)
    first + ((7 - first.wday) % 7) + (7 * (nth - 1))
  end

  # Date of the last Sunday of the given month.
  def last_sunday(year, month)
    last = Date.new(year, month, -1)
    last - last.wday
  end
end

# Run only when executed as a script, so that requiring this file (for example
# from a spec) does not start reading standard input.
Normalizer.new.write_file if __FILE__ == $PROGRAM_NAME
# require 'spec_helper'
require_relative '../normalizer'

# Unit specs for the individual column formatters of Normalizer.
# Each example feeds one raw field value to a formatter and checks the result.
describe Normalizer do
  subject(:normalizer) { described_class.new }

  describe '#format_timestamp' do
    it 'returns a timestamp formatted for ISO-8601' do
      iso8601 = /\A\d{4}-\d{2}-\d{2}T\d{2}:\d{2}:\d{2}(Z|[+-]\d{2}:\d{2})\z/

      expect(normalizer.format_timestamp('4/1/11 11:00:00 AM')).to match(iso8601)
    end
  end

  describe '#format_zip' do
    it 'formats zipcode with leading zeros when under 5 digits in length' do
      expect(normalizer.format_zip('121')).to eq('00121')
    end

    it 'leaves a 5-digit zipcode unchanged' do
      expect(normalizer.format_zip('94121')).to eq('94121')
    end
  end

  describe '#format_fullname' do
    it 'formats fullname with all uppercase' do
      expect(normalizer.format_fullname('Monkey Alberto')).to eq('MONKEY ALBERTO')
    end

    it 'upcases non-English characters' do
      expect(normalizer.format_fullname('Résumé Ron')).to eq('RÉSUMÉ RON')
    end
  end

  describe '#format_address' do
    it 'returns a valid UTF-8 address with its commas intact' do
      address = normalizer.format_address('123 4th St, Anywhere, AA')

      expect(address).to eq('123 4th St, Anywhere, AA')
      expect(address.encoding).to eq(Encoding::UTF_8)
      expect(address).to be_valid_encoding
    end
  end

  describe '#format_duration' do
    it 'converts duration to total seconds' do
      # 1 h + 23 min + 32.123 s; a tolerance keeps the check independent of
      # how the float is accumulated.
      expect(normalizer.format_duration('1:23:32.123')).to be_within(1e-9).of(5012.123)
    end

    it 'returns zero seconds for a zero duration' do
      expect(normalizer.format_duration('0:00:00.000')).to eq(0.0)
    end
  end
end