-
Notifications
You must be signed in to change notification settings - Fork 0
/
make_occurrences.pl
executable file
·80 lines (68 loc) · 2.32 KB
/
make_occurrences.pl
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
#!/usr/bin/perl
use strict;
use warnings;
use utf8;
use Data::Dumper;
use Getopt::Long;
use Archive::Zip;
use MY::Schema::Synonyms;
use File::Temp qw(tempfile);
# Extracts occurrence.txt from a DarwinCore archive downloaded from GBIF and
# writes selected columns as TSV, plus a taxon label looked up via taxonKey.
# process command line arguments
my $infile;  # zip file downloaded from GBIF
my $outfile; # simplified occurrences as TSV
my $sdb;     # GBIF backbone, i.e. https://www.dropbox.com/s/kxfnag9k1imb0le/gbif-backbone-synonyms.db?dl=0
my $usage = "Usage: $0 --infile <gbif.zip> --outfile <occurrences.tsv> --sdb <synonyms.db>\n";
GetOptions(
    'infile=s'  => \$infile,
    'outfile=s' => \$outfile,
    'sdb=s'     => \$sdb,
) or die $usage;

# all three options are needed below; fail early with a readable message
# instead of an "uninitialized value" warning further down
my %required = ( infile => $infile, outfile => $outfile, sdb => $sdb );
for my $name ( sort keys %required ) {
    die "Missing required option --$name\n$usage" if not defined $required{$name};
}

my $syn = MY::Schema::Synonyms->connect( "dbi:SQLite:$sdb" );

# columns to retain: DarwinCore term => output column name
my %columns = (
    'gbifID'              => 'gbif_id',               # integer (pk)
    'type'                => 'occurrence_type',       # text
    'basisOfRecord'       => 'basis_of_record',       # text (index)
    'eventDate'           => 'event_date',            # text ISO-8601 date
    'decimalLatitude'     => 'decimal_latitude',      # real
    'decimalLongitude'    => 'decimal_longitude',     # real
    'datasetKey'          => 'dataset_key',           # text (UUID)
    'hasGeospatialIssues' => 'has_geospatial_issues', # boolean (true/false), transform to 0/1
);

# extract occurrences from archive
# (Archive::Zip->new returns undef when the file cannot be read as a zip)
my $zip = Archive::Zip->new($infile)
    or die "Cannot read zip archive '$infile'\n";

# UNLINK => 1 removes the temp file at program exit, so it is also cleaned up
# when one of the die() calls below fires
my ( $wfh, $tempfile ) = tempfile( UNLINK => 1 );
close $wfh; # Archive::Zip writes to the file by name, not through this handle
$zip->extractMember( 'occurrence.txt', $tempfile ) == Archive::Zip::AZ_OK()
    or die "Cannot extract 'occurrence.txt' from '$infile'\n";

# start reading occurrences and writing output
open my $in,  '<', $tempfile or die "Cannot read '$tempfile': $!\n";
open my $out, '>', $outfile  or die "Cannot write '$outfile': $!\n";

my @header; # column names from the first line of occurrence.txt
my @kept;   # the subset of @header present in %columns, in file order
my %labels; # cache: taxonKey => label, so each key is looked up only once

LINE: while ( my $row = <$in> ) {
    chomp $row;
    next LINE if $row eq '';

    # LIMIT of -1 keeps trailing empty fields, which split drops by default
    my @fields = split /\t/, $row, -1;

    # process header
    if ( not @header ) {
        @header = @fields;
        @kept   = grep { exists $columns{$_} } @header;
        grep { $_ eq 'taxonKey' } @header
            or die "No 'taxonKey' column in occurrence.txt of '$infile'\n";

        # print header
        print {$out} join( "\t", ( map { $columns{$_} } @kept ), 'label' ), "\n";
        next LINE;
    }

    # write record; fields missing from a short line become undef
    my %record;
    @record{@header} = @fields;

    # transform boolean words to 1/0 (anything other than 'true' becomes 0)
    my $issues = $record{'hasGeospatialIssues'};
    $record{'hasGeospatialIssues'} = ( defined $issues and $issues eq 'true' ) ? 1 : 0;

    # create label
    my $key   = $record{'taxonKey'};
    my $label = '';
    if ( defined $key and $key ne '' ) {

        # exists() rather than truthiness, so failed lookups are cached too
        if ( not exists $labels{$key} ) {
            my $taxon = $syn->resultset('Longname')->find($key);
            my $name  = $taxon ? $taxon->completename : undef;
            warn "No label found for taxonKey '$key' (line $.)\n" if not defined $name;
            $labels{$key} = defined $name ? $name : '';
        }
        $label = $labels{$key};
    }

    # write output
    my @values = map { defined $record{$_} ? $record{$_} : '' } @kept;
    print {$out} join( "\t", @values, $label ), "\n";
}

close $in;

# buffered write errors (e.g. disk full) only surface at close
close $out or die "Cannot close '$outfile': $!\n";
unlink $tempfile;