#!/usr/bin/env ruby

require 'rubygems'

require 'json'

# Summarizes baby-name counts read from standard input, one CSV row per name.
#
# Each input line is expected to look like "name,gender,count,filename",
# where the filename contains "yobYYYY.txt" and so carries the year.
# NOTE(review): rows for the same name must be adjacent (e.g. the input was
# sorted by name) because grouping is done on consecutive lines only.
#
# Output columns: name, total count, male/female ratio, median year,
# 5th-percentile year, 95th-percentile year.

START_YEAR = 1880
# Exclusive upper bound: the per-year table covers START_YEAR...END_YEAR.
END_YEAR = 2080
NUMBER_OF_YEARS = (END_YEAR - START_YEAR)

# Returns the first year in which the cumulative count reaches +threshold+,
# or nil when it never does (for example when every count is zero).
#
# year_counts - Array of Integer counts indexed by (year - START_YEAR).
# threshold   - Numeric cumulative count to reach.
def percentile_year(year_counts, threshold)
  running_total = 0
  year_counts.each_with_index do |value, offset_year|
    total_before = running_total
    running_total += value
    return START_YEAR + offset_year if total_before < threshold && running_total >= threshold
  end
  nil
end

# Prints one comma-separated summary row for a single name to stdout.
#
# previous_name         - String name being summarized.
# previous_male_count   - Integer total of rows whose gender was 'M'.
# previous_female_count - Integer total of all other rows.
# previous_year_counts  - Array of Integer counts indexed by (year - START_YEAR).
#
# The ratio is a Float division, so a name with no female rows prints
# "Infinity" (or "NaN" when both counts are zero) rather than raising.
def output_row(previous_name, previous_male_count, previous_female_count, previous_year_counts)
  count = previous_male_count + previous_female_count
  male_to_female_ratio = previous_male_count.to_f / previous_female_count.to_f

  # The thresholds depend only on the total, so compute each exactly once.
  earliest_year = percentile_year(previous_year_counts, count * 0.05)
  median_year = percentile_year(previous_year_counts, count * 0.5)
  latest_year = percentile_year(previous_year_counts, count * 0.95)

  puts [
    previous_name,
    count,
    male_to_female_ratio,
    median_year,
    earliest_year,
    latest_year
  ].join(',')
end

# Reads "name,gender,count,filename" lines from +input+ (anything responding
# to each_line) and prints one summary row per run of identical names.
# Blank lines are skipped silently; rows whose year cannot be determined or
# lies outside START_YEAR...END_YEAR are skipped with a warning on stderr.
def analyze_names(input)
  previous_name = nil
  previous_male_count = 0
  previous_female_count = 0
  # Zero-filled so that += works; Array.new(n) alone would be all nil.
  previous_year_counts = Array.new(NUMBER_OF_YEARS, 0)

  input.each_line do |line|
    name, gender, count_text, filename = line.chomp.split(',')
    next if name.nil? || name.empty?

    # Extract only the captured digits so that a directory prefix or
    # trailing characters in the filename cannot corrupt the year.
    year = filename.to_s[/yob([0-9]+)\.txt/, 1].to_i
    offset_year = year - START_YEAR
    unless (0...NUMBER_OF_YEARS).cover?(offset_year)
      warn "Skipping row with missing or out-of-range year: #{line.chomp}"
      next
    end

    # Fields arrive as strings; convert before doing arithmetic.
    count = count_text.to_i

    if name != previous_name
      if previous_name
        output_row(previous_name, previous_male_count, previous_female_count, previous_year_counts)
      end
      previous_name = name
      previous_male_count = 0
      previous_female_count = 0
      previous_year_counts = Array.new(NUMBER_OF_YEARS, 0)
    end

    if gender == 'M'
      previous_male_count += count
    else
      previous_female_count += count
    end
    previous_year_counts[offset_year] += count
  end

  # Flush the final group, but emit nothing when there was no input at all.
  return unless previous_name

  output_row(previous_name, previous_male_count, previous_female_count, previous_year_counts)
end

analyze_names($stdin)
-- Creates the ethnicity_of_surnames table and bulk-loads it from a CSV file.
-- NOTE(review): the column names look like those of a Census surname
-- file; confirm against the header row of the CSV being loaded.
CREATE TABLE ethnicity_of_surnames (
    name          CHAR(32) PRIMARY KEY,
    rank          INT,
    count         INT,
    prop100k      FLOAT,
    cum_prop100k  FLOAT,
    pctwhite      FLOAT,
    pctblack      FLOAT,
    pctapi        FLOAT,
    pctaian       FLOAT,
    pct2prace     FLOAT,
    pcthispanic   FLOAT
);

-- The path is read by the database server, so the file must exist on the
-- server host. The first line of the file is treated as a header and skipped.
COPY ethnicity_of_surnames (
    name,
    rank,
    count,
    prop100k,
    cum_prop100k,
    pctwhite,
    pctblack,
    pctapi,
    pctaian,
    pct2prace,
    pcthispanic
)
FROM '/home/ubuntu/sources/dstkdata/ethnicityofsurnames.csv'
WITH DELIMITER AS ',' CSV HEADER;