Skip to content

Commit

Permalink
Work on analyzing baby names
Browse files Browse the repository at this point in the history
  • Loading branch information
jetpaccomputer committed May 24, 2013
1 parent 9eb53a1 commit 349e717
Show file tree
Hide file tree
Showing 4 changed files with 106 additions and 1 deletion.
86 changes: 86 additions & 0 deletions dataconversion/analyzebabynames.rb
@@ -0,0 +1,86 @@
#!/usr/bin/env ruby

require 'rubygems'

require 'json'

# Calendar year that maps to index 0 of each per-name year-count table.
START_YEAR = 1880
# Upper bound of the year range; only used to size the year-count table.
END_YEAR = 2080
# Number of slots in each per-name year-count table.
NUMBER_OF_YEARS = END_YEAR - START_YEAR

# Prints one CSV summary row for a single name to standard output.
#
# Columns, in order: name, total count, male-to-female ratio, median year,
# earliest year, latest year. The three years are the first years at which the
# running total of the per-year counts reaches 50%, 5% and 95% of the total
# count respectively. A threshold that is never crossed (for example when the
# total is zero) leaves its year nil, which is printed as an empty field.
#
# previous_name         - the name being summarised
# previous_male_count   - number of male records for the name
# previous_female_count - number of female records for the name
# previous_year_counts  - per-year counts; index 0 corresponds to START_YEAR,
#                         and nil entries are treated as zero
#
# The ratio is a Float division, so a name with no female records prints
# Infinity (or NaN when both counts are zero).
def output_row(previous_name, previous_male_count, previous_female_count, previous_year_counts)
  count = previous_male_count + previous_female_count
  male_to_female_ratio = previous_male_count.to_f / previous_female_count.to_f

  # The thresholds depend only on the total, so compute them once up front.
  percentile_05 = count * 0.05
  percentile_50 = count * 0.5
  percentile_95 = count * 0.95

  median_year = nil
  earliest_year = nil
  latest_year = nil
  running_total = 0
  previous_year_counts.each_with_index do |value, offset_year|
    year = START_YEAR + offset_year
    # Tolerate unfilled (nil) slots so a sparse table does not raise.
    new_running_total = running_total + (value || 0)

    # A threshold is "crossed" in the year whose count takes the running
    # total from strictly below it to at or above it.
    if running_total < percentile_05 && new_running_total >= percentile_05
      earliest_year ||= year
    end
    if running_total < percentile_50 && new_running_total >= percentile_50
      median_year ||= year
    end
    if running_total < percentile_95 && new_running_total >= percentile_95
      latest_year ||= year
    end

    running_total = new_running_total
  end

  puts [
    previous_name,
    count,
    male_to_female_ratio,
    median_year,
    earliest_year,
    latest_year,
  ].join(',')
end

# Driver: reads comma-separated rows of the form
#   name,gender,count,filename
# from standard input, accumulates totals for each run of consecutive rows
# sharing the same name, and prints one summary row per name via output_row.
# NOTE(review): rows for a name must be adjacent (input sorted or grouped by
# name); a name that reappears later is reported as a separate row.

previous_name = nil
previous_male_count = 0
previous_female_count = 0
# Zero-filled so that per-year counts can be accumulated with +=.
previous_year_counts = Array.new(NUMBER_OF_YEARS, 0)

$stdin.each_line do |line|
  row = line.chomp.split(',')
  # Skip blank or truncated lines rather than crashing on a missing field.
  next if row.length < 4

  name = row[0]
  gender = row[1]
  count = row[2].to_i
  filename = row[3]
  # Pull the digits out of "yobNNNN.txt"; a capture (rather than gsub) still
  # works when the filename carries a directory prefix.
  year = filename[/yob([0-9]+)\.txt/, 1].to_i

  offset_year = year - START_YEAR
  # A year outside the table would either wrap around (negative index) or
  # grow the array with nils, so report and skip the row instead.
  if offset_year < 0 || offset_year >= NUMBER_OF_YEARS
    warn "Skipping row with out-of-range year: #{line.chomp}"
    next
  end

  if name != previous_name
    # A new name starts: flush the totals gathered for the previous one.
    if previous_name
      output_row(previous_name, previous_male_count, previous_female_count, previous_year_counts)
    end
    previous_name = name
    previous_male_count = 0
    previous_female_count = 0
    previous_year_counts = Array.new(NUMBER_OF_YEARS, 0)
  end

  # Anything that is not 'M' is counted as female, as in the original script.
  if gender == 'M'
    previous_male_count += count
  else
    previous_female_count += count
  end
  previous_year_counts[offset_year] += count
end

# Flush the final name; nothing to print when the input had no usable rows.
if previous_name
  output_row(previous_name, previous_male_count, previous_female_count, previous_year_counts)
end
2 changes: 2 additions & 0 deletions dstk.xcodeproj/project.pbxproj
Expand Up @@ -78,6 +78,7 @@
59F0A29213525B4C00098746 /* test_street2coordinates.py */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = text.script.python; path = test_street2coordinates.py; sourceTree = "<group>"; };
59F0A293135265BF00098746 /* buildukindexes.sql */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = text; path = buildukindexes.sql; sourceTree = "<group>"; };
59F85DBE174DD08500DD695F /* loadnames.sql */ = {isa = PBXFileReference; lastKnownFileType = text; path = loadnames.sql; sourceTree = "<group>"; };
59F85DC0174EF4CF00DD695F /* analyzebabynames.rb */ = {isa = PBXFileReference; lastKnownFileType = text.script.ruby; path = analyzebabynames.rb; sourceTree = "<group>"; };
/* End PBXFileReference section */

/* Begin PBXGroup section */
Expand Down Expand Up @@ -259,6 +260,7 @@
59D0BE0A17247F02007AAE04 /* dataconversion */ = {
isa = PBXGroup;
children = (
59F85DC0174EF4CF00DD695F /* analyzebabynames.rb */,
59D0BE0E1725F99B007AAE04 /* analyzepostcodes.rb */,
59D0BE0C172492E1007AAE04 /* createpostcodes.rb */,
59D0BE0B17247F32007AAE04 /* simplegeo2postcodes.rb */,
Expand Down
17 changes: 17 additions & 0 deletions sql/loadnames.sql
@@ -0,0 +1,17 @@
-- Creates the ethnicity_of_surnames table, keyed by surname, and bulk-loads
-- it from a CSV file.
-- NOTE(review): the pct* columns are presumably percentages per ethnic group
-- and prop100k / cum_prop100k proportions per 100,000 people; confirm
-- against the source data set.
CREATE TABLE ethnicity_of_surnames(
name CHAR(32) PRIMARY KEY,
rank INT,
count INT,
prop100k FLOAT,
cum_prop100k FLOAT,
pctwhite FLOAT,
pctblack FLOAT,
pctapi FLOAT,
pctaian FLOAT,
pct2prace FLOAT,
pcthispanic FLOAT
);

-- Load every column from the CSV at a fixed absolute path. The file is
-- comma-delimited and is declared to have a header line (CSV HEADER).
-- NOTE(review): this looks like PostgreSQL COPY, which reads the path on the
-- database server; confirm the file exists there before running.
COPY ethnicity_of_surnames(name, rank, count, prop100k, cum_prop100k, pctwhite, pctblack, pctapi, pctaian, pct2prace, pcthispanic)
FROM '/home/ubuntu/sources/dstkdata/ethnicityofsurnames.csv'
WITH DELIMITER AS ',' CSV HEADER;
2 changes: 1 addition & 1 deletion views/developerdocs.haml
Expand Up @@ -585,7 +585,7 @@
The text is passed in either as a JSON-encoded array in the remainder of the URL for a GET call, or in the raw body of the request for POST. The usual 8,000 character limit on GET URLs applies, so use POST for larger texts.
This API uses my Ruby port of Eamon Daly and Jon Orwant's original [GenderFromName Perl module](http://search.cpan.org/~edaly/Text-GenderFromName-0.32/GenderFromName.pm) to classify first names.
This API uses my Ruby port of Eamon Daly and Jon Orwant's original [GenderFromName Perl module](http://search.cpan.org/~edaly/Text-GenderFromName-0.32/GenderFromName.pm) to classify first names. It also uses information from [the US Social Security Administration's analysis of baby names](http://www.ssa.gov/oact/babynames/limits.html).
%a{:name=>"text2times"}
:markdown
Expand Down

0 comments on commit 349e717

Please sign in to comment.