From 349e717b17a4613b62ea67042e958e9d02b93f84 Mon Sep 17 00:00:00 2001
From: Pete Warden <pete@jetpac.com>
Date: Thu, 23 May 2013 18:33:58 -0700
Subject: [PATCH] Work on analyzing baby names

---
 dataconversion/analyzebabynames.rb | 86 ++++++++++++++++++++++++++++++
 dstk.xcodeproj/project.pbxproj     |  2 +
 sql/loadnames.sql                  | 17 ++++++
 views/developerdocs.haml           |  2 +-
 4 files changed, 106 insertions(+), 1 deletion(-)
 create mode 100755 dataconversion/analyzebabynames.rb
 create mode 100755 sql/loadnames.sql
diff --git a/dataconversion/analyzebabynames.rb b/dataconversion/analyzebabynames.rb
new file mode 100755
index 0000000..e3b902b
--- /dev/null
+++ b/dataconversion/analyzebabynames.rb
@@ -0,0 +1,86 @@
+#!/usr/bin/env ruby
+
+require 'rubygems'
+
+require 'json'
+
+START_YEAR = 1880 # year that maps to offset 0 in the per-year count arrays
+END_YEAR = 2080 # NOTE(review): upper bound lies far in the future - confirm this is intended
+NUMBER_OF_YEARS = (END_YEAR - START_YEAR) # length of each per-year count array
+
+# Prints one CSV row summarising a name: name, total count, male/female
+# ratio, then the years by which 50%, 5% and 95% of its births had occurred.
+#
+# previous_name         - the name being summarised.
+# previous_male_count   - total male births recorded for the name.
+# previous_female_count - total female births recorded for the name.
+# previous_year_counts  - per-year counts indexed by (year - START_YEAR);
+#                         nil entries are treated as zero.
+def output_row(previous_name, previous_male_count, previous_female_count, previous_year_counts)
+  count = previous_male_count + previous_female_count
+  # Float division: Infinity when there are no female births, NaN when both are zero.
+  male_to_female_ratio = previous_male_count.to_f / previous_female_count.to_f
+
+  # Thresholds are loop-invariant, so compute them once up front.
+  percentile_05 = count * 0.05
+  percentile_50 = count * 0.5
+  percentile_95 = count * 0.95
+
+  median_year = nil
+  earliest_year = nil
+  latest_year = nil
+  running_total = 0
+  previous_year_counts.each_with_index do |value, offset_year|
+    year = START_YEAR + offset_year
+    # Years without data may be stored as nil; count those as zero.
+    new_running_total = running_total + (value || 0)
+    # Record the first year in which the cumulative total crosses each threshold.
+    earliest_year ||= year if running_total < percentile_05 && new_running_total >= percentile_05
+    median_year ||= year if running_total < percentile_50 && new_running_total >= percentile_50
+    latest_year ||= year if running_total < percentile_95 && new_running_total >= percentile_95
+    running_total = new_running_total
+  end
+
+  puts [
+    previous_name,
+    count,
+    male_to_female_ratio,
+    median_year,
+    earliest_year,
+    latest_year,
+  ].join(',')
+end
+
+# Streams CSV rows of the form name,gender,count,filename from stdin and
+# prints one summary row per name via output_row. A name's rows must be
+# adjacent in the input, since totals reset whenever the name changes.
+previous_name = nil
+previous_male_count = 0
+previous_female_count = 0
+# Zero-filled (not nil) so each year's slot can be incremented with +=.
+previous_year_counts = Array.new(NUMBER_OF_YEARS, 0)
+
+$stdin.each_line do |line|
+  name, gender, count_text, filename = line.chomp.split(',')
+  count = count_text.to_i # split yields strings; arithmetic needs an Integer
+  # The year is embedded in the source file name, e.g. yob1880.txt.
+  year = filename.gsub(/yob([0-9]+)\.txt/, '\1').to_i
+
+  if name != previous_name
+    # A new name begins: flush the finished one and reset the accumulators.
+    output_row(previous_name, previous_male_count, previous_female_count, previous_year_counts) if previous_name
+    previous_name = name
+    previous_male_count = 0
+    previous_female_count = 0
+    previous_year_counts = Array.new(NUMBER_OF_YEARS, 0)
+  end
+
+  if gender == 'M'
+    previous_male_count += count
+  else
+    previous_female_count += count
+  end
+  previous_year_counts[year - START_YEAR] += count
+end
+# Flush the last name; skipped when stdin was empty.
+output_row(previous_name, previous_male_count, previous_female_count, previous_year_counts) if previous_name
diff --git a/dstk.xcodeproj/project.pbxproj b/dstk.xcodeproj/project.pbxproj
index 316621a..06acd22 100644
--- a/dstk.xcodeproj/project.pbxproj
+++ b/dstk.xcodeproj/project.pbxproj
@@ -78,6 +78,7 @@
 		59F0A29213525B4C00098746 /* test_street2coordinates.py */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = text.script.python; path = test_street2coordinates.py; sourceTree = "<group>"; };
 		59F0A293135265BF00098746 /* buildukindexes.sql */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = text; path = buildukindexes.sql; sourceTree = "<group>"; };
 		59F85DBE174DD08500DD695F /* loadnames.sql */ = {isa = PBXFileReference; lastKnownFileType = text; path = loadnames.sql; sourceTree = "<group>"; };
+		59F85DC0174EF4CF00DD695F /* analyzebabynames.rb */ = {isa = PBXFileReference; lastKnownFileType = text.script.ruby; path = analyzebabynames.rb; sourceTree = "<group>"; };
 /* End PBXFileReference section */
 
 /* Begin PBXGroup section */
@@ -259,6 +260,7 @@
 		59D0BE0A17247F02007AAE04 /* dataconversion */ = {
 			isa = PBXGroup;
 			children = (
+				59F85DC0174EF4CF00DD695F /* analyzebabynames.rb */,
 				59D0BE0E1725F99B007AAE04 /* analyzepostcodes.rb */,
 				59D0BE0C172492E1007AAE04 /* createpostcodes.rb */,
 				59D0BE0B17247F32007AAE04 /* simplegeo2postcodes.rb */,
diff --git a/sql/loadnames.sql b/sql/loadnames.sql
new file mode 100755
index 0000000..df315e6
--- /dev/null
+++ b/sql/loadnames.sql
@@ -0,0 +1,17 @@
+CREATE TABLE ethnicity_of_surnames( -- one row per surname; columns match the COPY column list below
+  name CHAR(32) PRIMARY KEY,
+  rank INT,
+  count INT,
+  prop100k FLOAT,
+  cum_prop100k FLOAT,
+  pctwhite FLOAT,
+  pctblack FLOAT,
+  pctapi FLOAT,
+  pctaian FLOAT,
+  pct2prace FLOAT,
+  pcthispanic FLOAT
+);
+-- Bulk-load the comma-delimited CSV at the path below; HEADER marks its first line as column names.
+COPY ethnicity_of_surnames(name, rank, count, prop100k, cum_prop100k, pctwhite, pctblack, pctapi, pctaian, pct2prace, pcthispanic)
+  FROM '/home/ubuntu/sources/dstkdata/ethnicityofsurnames.csv'
+  WITH DELIMITER AS ',' CSV HEADER;
\ No newline at end of file
diff --git a/views/developerdocs.haml b/views/developerdocs.haml
index 3ed9456..c251d8c 100644
--- a/views/developerdocs.haml
+++ b/views/developerdocs.haml
@@ -585,7 +585,7 @@
 
       The text is passed in either as a JSON-encoded array in the remainder of the URL for a GET call, or in the raw body of the request for POST. The usual 8,000 character limit on GET URLs applies, so use POST for larger texts.
 
-      This API uses my Ruby port of Eamon Daly and Jon Orwant's original [GenderFromName Perl module](http://search.cpan.org/~edaly/Text-GenderFromName-0.32/GenderFromName.pm) to classify first names.
+      This API uses my Ruby port of Eamon Daly and Jon Orwant's original [GenderFromName Perl module](http://search.cpan.org/~edaly/Text-GenderFromName-0.32/GenderFromName.pm) to classify first names. It also uses information from [the US Social Security Administration's analysis of baby names](http://www.ssa.gov/oact/babynames/limits.html).
 
     %a{:name=>"text2times"}
     :markdown