In [32]:
"""
In this problem set you work with another type of infobox data, audit it,
clean it, come up with a data model, insert it into MongoDB and then run some
queries against your database. The set contains data about Arachnid class
animals.

Your task in this exercise is to parse the file, process only the fields that
are listed in the FIELDS dictionary as keys, and return a list of dictionaries
of cleaned values. 

The following things should be done:
- keys of the dictionary changed according to the mapping in FIELDS dictionary
- trim out the redundant description in parentheses from the 'rdf-schema#label'
  field, like "(spider)"
- if 'name' is "NULL" or contains non-alphanumeric characters, set it to the
  same value as 'label'.
- if a value of a field is "NULL", convert it to None
- if there is a value in 'synonym', it should be converted to an array (list)
  by stripping the "{}" characters and splitting the string on "|". Rest of the
  cleanup is up to you, e.g. removing "*" prefixes etc. If there is a singular
  synonym, the value should still be formatted in a list.
- strip leading and ending whitespace from all fields, if there is any
- the output structure should be as follows:

[ { 'label': 'Argiope',
    'uri': 'http://dbpedia.org/resource/Argiope_(spider)',
    'description': 'The genus Argiope includes rather large and spectacular spiders that often ...',
    'name': 'Argiope',
    'synonym': ["One", "Two"],
    'classification': {
                      'family': 'Orb-weaver spider',
                      'class': 'Arachnid',
                      'phylum': 'Arthropod',
                      'order': 'Spider',
                      'kingdom': 'Animal',
                      'genus': None
                      }
  },
  { 'label': ... , }, ...
]

  * Note that the value associated with the classification key is a dictionary
    with taxonomic labels.
"""
import codecs
import csv
import json
import pprint
import re

In [33]:
# Location of the arachnid CSV; a relative path, so it is resolved against the
# directory the notebook kernel is running in.
filename = "../ud032-master/Lesson_4_Problem_Set/01-Preparing_Data/arachnid.csv"

In [34]:
# Maps each raw CSV column header (key) to the short name used for that value
# in the cleaned records (value). Only columns listed here are processed.
fields = {
    # Descriptive columns.
    'rdf-schema#label': 'label',
    'URI': 'uri',
    'rdf-schema#comment': 'description',
    'synonym': 'synonym',
    'name': 'name',
    # Taxonomy columns; these end up nested under 'classification'.
    'family_label': 'family',
    'class_label': 'class',
    'phylum_label': 'phylum',
    'order_label': 'order',
    'kingdom_label': 'kingdom',
    'genus_label': 'genus',
}

In [38]:
# Short names of the taxonomy values nested under 'classification'.
_CLASSIFICATION_RANKS = ("family", "class", "phylum", "order", "kingdom", "genus")


def _clean_value(value):
    """Strip surrounding whitespace from ``value``; map "NULL" to None."""
    if value is None:
        # csv.DictReader fills columns missing from a short row with None.
        return None
    value = value.strip()
    return None if value == "NULL" else value


def _clean_label(label):
    """Remove a parenthesised qualifier such as "(spider)" from ``label``."""
    if label is None:
        return None
    return re.sub(r"\s*\([^)]*\)", "", label).strip()


def _parse_synonyms(raw):
    """Convert "{a|b}" or "a" into a list of synonyms; None if there are none."""
    if raw is None:
        return None
    synonyms = []
    for part in raw.strip("{}").split("|"):
        # Drop the "*" bullet prefix some entries carry, then re-trim.
        part = part.strip().lstrip("*").strip()
        if part:
            synonyms.append(part)
    return synonyms or None


def process_file(filename, fields, skip_rows=3):
    """Read the CSV at ``filename`` and return a list of cleaned records.

    Args:
        filename: Path of the CSV file to read.
        fields: Mapping of raw CSV column name -> short name. It must provide
            the short names 'label', 'uri', 'description', 'name', 'synonym'
            and the six taxonomy names in ``_CLASSIFICATION_RANKS``.
        skip_rows: Number of rows directly after the header to discard before
            reading records (assumed to be non-record metadata rows in this
            export -- confirm against the CSV). Defaults to 3.

    Returns:
        A list with one dict per record, with the keys 'label', 'uri',
        'description', 'name', 'synonym' and 'classification' (a dict keyed by
        taxonomic rank). Every value is whitespace-stripped and "NULL" becomes
        None; 'synonym' is a list of strings or None.

    Raises:
        KeyError: If a column named in ``fields`` is absent from the CSV, or
            ``fields`` does not provide one of the required short names.
    """
    data = []
    with open(filename, "r") as f:
        reader = csv.DictReader(f)
        for _ in range(skip_rows):
            # The default keeps a short file from raising StopIteration.
            next(reader, None)
        for line in reader:
            cleaned = {}
            for source_key, target_key in fields.items():
                cleaned[target_key] = _clean_value(line[source_key])

            label = _clean_label(cleaned["label"])

            # A missing name, or one containing anything but letters and
            # digits, falls back to the cleaned label.
            name = cleaned["name"]
            if name is None or not name.isalnum():
                name = label

            classification = {}
            for rank in _CLASSIFICATION_RANKS:
                classification[rank] = cleaned[rank]

            data.append({"label": label,
                         "uri": cleaned["uri"],
                         "description": cleaned["description"],
                         "name": name,
                         "synonym": _parse_synonyms(cleaned["synonym"]),
                         "classification": classification})

    return data

In [39]:
# Run the cleaning pass over the CSV; `data` holds the list it returns.
data = process_file(filename, fields)

In [40]:
# Spot-check a single cleaned record (the one at index 48).
pprint.pprint(data[48])

{'classification': {'class': '{Acari|Arachnid}',
                    'family': None,
                    'genus': None,
                    'kingdom': 'Animal',
                    'order': '{Prostigmata|Trombidiformes}',
                    'phylum': 'Arthropod'},
 'description': 'Hydrachnidiae are a group of mites containing over 5000 species found in freshwater habitat.',
 'label': 'Hydrachnidiae',
 'name': 'Hydrachnidiae',
 'synonym': ['Hydracarina', 'Hydrachnellae', 'Hydrachnidia'],
 'uri': 'http://dbpedia.org/resource/Hydrachnidiae'}


###### Doodles below:

In [15]:
# Exploratory cell: print what the synonym regex extracts from each row.
with open(filename, "r") as csv_file:
    rows = csv.DictReader(csv_file)
    # Discard the three rows that follow the header.
    for _ in range(3):
        next(rows)
    synonymRegex = re.compile(r'\w+\.?\s?\(?\w+\)?\s?\&?\s?\w+\s?\(?\w+\s?\w+\)?')
    for row in rows:
        if "synonym" not in row:
            continue
        raw_synonym = row["synonym"]
        # "NULL" means no synonym; otherwise keep every regex match.
        row["synonym"] = None if raw_synonym == 'NULL' else synonymRegex.findall(raw_synonym)
        print(row["synonym"])

None
None
None
None
None
None
None
None
None
None
None
None
None
None
['Cyrene Peckham & Peckham']
None
None
['Rooseveltia mutilla']
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
['Hydracarina', 'Hydrachnellae', 'Hydrachnidia']
None
None
None
None
None
None
None
None
['H. tamulus concanensis (Pocock 1900)', 'H. tamulus gangeticus (Pocock 1900)', 'H. tamulus gujaratensis (Pocock 1900)', 'H. tamulus sindicus (Pocock 1900)']
None
None
None
['Zealanapis australis (Forster 1951)', 'Platnick & Forster 1989', 'Chasmocephalon australis Forster 1951']
None
['Heterometrus (Chersonesometrus) Couzijn 1978', 'Heterometrus (Gigantometrus) Couzijn 1978', 'Heterometrus (Javanimetrus) Couzijn 1981', 'Heterometrus (Srilankametrus) Couzijn 1981', 'Palamnaeus Thorell 1876']
None
None
None
None
None
None
None
None
None
None
None
None


In [None]:
# Exploratory cell: print the cleaned 'name' value for each row.
with open(filename, "r") as csv_file:
    rows = csv.DictReader(csv_file)
    # Discard the three rows that follow the header.
    for _ in range(3):
        next(rows)
    nameRegex = re.compile(r'\w+\s?\w+')
    for row in rows:
        if "name" not in row:
            continue
        raw_name = row["name"]
        if raw_name == 'NULL':
            row["name"] = None
        else:
            # NOTE: .group() raises AttributeError when the pattern finds
            # nothing (it needs at least two word characters).
            row["name"] = nameRegex.search(raw_name).group()
        print(row["name"])

In [None]:
# Exploratory cell: rename the raw CSV columns to their short names in place
# and collect the rows. Relies on `process_fields` and `data` already being
# defined in the kernel.
with open(filename, "r") as f:
    reader = csv.DictReader(f)
    # Discard the three rows that follow the header.
    for _ in range(3):
        next(reader)
    for line in reader:
        for key in process_fields:
            # Pop first, then assign: columns whose short name equals the raw
            # name ('synonym', 'name') survive. Assigning and then deleting
            # the raw key removed those columns from the row entirely.
            line[fields[key]] = line.pop(key)

        data.append(line)

In [None]:
# Start from an empty list of collected rows.
data = []

In [5]:
# Raw CSV column names to process: the keys of the `fields` mapping.
process_fields = fields.keys()

In [28]:
# Exploratory cell: print each label before and after the regex clean-up.
with open(filename, "r") as csv_file:
    rows = csv.DictReader(csv_file)
    # Discard the three rows that follow the header.
    for _ in range(3):
        next(rows)
    for row in rows:
        # Copy every processed column to its short name.
        for source_key in process_fields:
            row[fields[source_key]] = row[source_key]
        if "label" not in row:
            continue
        raw_label = row["label"]
        print(raw_label)
        labelRegex = re.compile(r'\w+\-?\w+\s?\w+\s?\w+')
        # NOTE: .group() raises AttributeError when the pattern finds nothing.
        row["label"] = labelRegex.search(raw_label).group()
        print(row["label"])

Argiope (spider)
Argiope
Tick
Tick
Pseudoscorpion
Pseudoscorpion
Amblypygi
Amblypygi
Schizomida
Schizomida
Opiliones
Opiliones
Neaetha
Neaetha
Nebridia
Nebridia
Neobrettus
Neobrettus
Neonella
Neonella
Neon (spider)
Neon
Nimbarus
Nimbarus
Noegus
Noegus
Nosferattus
Nosferattus
Nycerella
Nycerella
Ocnotelus
Ocnotelus
Ocrisiona
Ocrisiona
Ogdenia
Ogdenia
Omoedus
Omoedus
Onomastus
Onomastus
Opisthoncana
Opisthoncana
Opisthoncus
Opisthoncus
Orissania
Orissania
Orsima
Orsima
Orvilleus
Orvilleus
Orthrus (spider)
Orthrus
Apopyllus
Apopyllus
Galeodes
Galeodes
Liponyssoides sanguineus
Liponyssoides sanguineus
Sarcoptidae
Sarcoptidae
Pyroglyphidae
Pyroglyphidae
Psoroptidae
Psoroptidae
Argas
Argas
Trombidiformes
Trombidiformes
Sarcoptiformes
Sarcoptiformes
Haemaphysalis
Haemaphysalis
Anystis
Anystis
Eupodes
Eupodes
Eleutherengonides
Eleutherengonides
Parasitengona
Parasitengona
Opilioacarus
Opilioacarus
Parasitus
Parasitus
Heterostigmatina
Heterostigmatina
Abacarus
Abacarus
Androlaelaps
Androlaelaps

In [None]:
# Display the second collected row.
data[1]

In [29]:
# Label pattern under test: word characters with an optional hyphen and up to
# two optional single-whitespace breaks. The four `\w+` groups mean it needs at
# least four word characters to match.
labelRegex = re.compile(r'\w+\-?\w+\s?\w+\s?\w+')

In [31]:
# Check how much of a label with a parenthesised qualifier the pattern keeps,
# by printing the length of the matched text.
print len(labelRegex.search("Neon (spider)").group())

4
