In [78]:
# Given a year (e.g. 2009),
# we have N cells.
# Each cell contains information [{sst, wind_dir, chlorophyll, index, month, month_index}] (array is sorted by month)


In [68]:
import math
import json
import csv
import re
import ast
from pprint import pprint
import pandas as pd
from datetime import datetime

In [69]:
def drange(start, stop, step):
    """Yield start, start + step, start + 2*step, ... while the value is < stop.

    A float-friendly counterpart of ``range``. The running value is built by
    repeated addition, so with fractional steps the rounding error accumulates
    and the number of values produced close to ``stop`` can be off by one;
    callers in this notebook round the results to four decimals.

    Parameters:
        start: first value produced (if it is below ``stop``).
        stop: exclusive upper bound.
        step: positive increment.

    Raises:
        ValueError: if ``step`` is not strictly positive. The error surfaces
            when the generator is first advanced. Without this check a zero,
            negative or NaN step would loop forever whenever start < stop.
    """
    # `not step > 0` (rather than `step <= 0`) also rejects NaN.
    if not step > 0:
        raise ValueError("drange() step must be positive, got {!r}".format(step))
    r = start
    while r < stop:
        yield r
        r += step

In [70]:
def speciesMap():
    """Build the species-name -> group lookup from ``../data/species_codes.csv``.

    The first CSV row is treated as a header and skipped. For every other row,
    column 1 is used as the key and the lower-cased column 3 as the value.
    Five hard-coded entries are then written on top of whatever the file gave.

    Returns:
        dict mapping species name (str) to group name (str).
    """
    # Entries applied after the CSV rows, so they win on a key collision.
    overrides = (
        ("OTHER ROCKFISH YOY", "other rockfish"),
        ('WOLF EEL YOY', "other rockfish"),
        ('PAINTED GREENLING YOY', "other rockfish"),
        ('CURLFIN SOLE', "flatfish"),
        ('PACIFIC ARGENTINE', "fish"),
    )
    with open("../data/species_codes.csv") as codefile:
        records = csv.reader(codefile, delimiter=",")
        next(records, None)  # discard the header row
        lookup = {str(record[1]): str(record[3]).lower() for record in records}
    for name, group in overrides:
        lookup[name] = group
    return lookup

In [71]:
def readTopLevelCodes():
    """Return the parsed contents of ``../data/speciesClass.json``.

    Callers in this notebook index the result by group name, so the file is
    expected to hold a JSON object.
    """
    with open("../data/speciesClass.json") as handle:
        return json.load(handle)

In [72]:
def readTopLevelColors():
    """Return the parsed contents of ``../data/speciesColor.json``.

    Callers in this notebook index the result by top-level group name, so the
    file is expected to hold a JSON object.
    """
    with open("../data/speciesColor.json") as handle:
        return json.load(handle)

In [73]:
def readSSTData(year):
    """Return the parsed contents of ``../data/sstmay<year>.json``.

    Parameters:
        year: value whose ``str()`` form is spliced into the file name.
    """
    path = "../data/sstmay{}.json".format(str(year))
    with open(path) as handle:
        return json.load(handle)

In [74]:
def readCholorData(year):
    """Return the parsed contents of ``../data/chlorophyllmay<year>.json``.

    Parameters:
        year: value whose ``str()`` form is spliced into the file name.
    """
    path = "../data/chlorophyllmay{}.json".format(str(year))
    with open(path) as handle:
        return json.load(handle)

In [75]:
def readWindData():
    """Return the parsed contents of ``../data/wind.json``."""
    with open("../data/wind.json") as handle:
        return json.load(handle)
    

In [76]:
def initMatrix(yearRange):
    """Create the flat list of grid cells covering the fixed study area.

    Latitude and longitude edges are generated with ``drange`` using a step of
    0.0125 * 27 degrees and rounded to four decimals. The eastern bound is
    appended explicitly to the longitude edges; the latitude edges get no such
    extra entry. Each pair of neighbouring edges defines one cell, and cells
    are emitted latitude-major (all longitudes of the first latitude band,
    then the next band, ...).

    Parameters:
        yearRange: iterable of keys; every cell gets one entry per key holding
            the placeholders ``{'sst': -9999, 'windDegree': -9999, 'chloro': -9999}``.

    Returns:
        list of dicts with 'latRange', 'lonRange' and one entry per year.
    """
    south, north = 35.775, 39.15
    west, east = 235.5625, 238.9375
    step = 0.0125 * 27

    latEdges = [float("{0:.4f}".format(v)) for v in drange(south, north, step)]
    lonEdges = [float("{0:.4f}".format(v)) for v in drange(west, east, step)]
    lonEdges.append(east)

    grid = []
    # zip(edges, edges[1:]) walks the consecutive (lower, upper) edge pairs.
    for latLow, latHigh in zip(latEdges, latEdges[1:]):
        for lonLow, lonHigh in zip(lonEdges, lonEdges[1:]):
            cell = {'latRange': [latLow, latHigh], 'lonRange': [lonLow, lonHigh]}
            for year in yearRange:
                # -9999 marks "no value computed yet" for each measurement.
                cell[year] = {'sst':-9999,'windDegree':-9999, 'chloro': -9999}
            grid.append(cell)
    return grid
    

In [120]:
def diversityDataByYear(year):
    """Total the catch per species for hauls dated month "5" of ``year``.

    Reads ``../data/haul_catch_years_clean.csv``. Column 2 of each row is the
    date: its first run of digits is taken as the month and its first run of
    four digits as the year. Columns 3 onward are per-species values whose
    species names come from the header row.

    Parameters:
        year: year to keep; compared to the date's year via ``str(year)``.

    Returns:
        list of dicts ``{'name', 'year', 'group', 'count', 'colors'}``, one per
        species with a positive value, sorted by 'group'.
    """
    noaaGroups = speciesMap()
    groupLabels = readTopLevelCodes()
    palette = readTopLevelColors()
    totals = {}
    with open("../data/haul_catch_years_clean.csv") as haulfile:
        hauls = csv.reader(haulfile, delimiter=",")
        header = next(hauls, None)
        for haul in hauls:
            stamp = haul[2]
            firstDigits = re.search(r'\d+', stamp)
            fourDigits = re.search(r'\d{4}', stamp)
            month = firstDigits.group()
            haulYear = fourDigits.group()
            if not (month == "5" and haulYear == str(year)):
                continue
            for col in range(3, len(haul)):
                if haul[col] == '':
                    continue
                amount = ast.literal_eval(haul[col])
                if not amount > 0:
                    continue
                species = header[col]
                label = groupLabels[noaaGroups[header[col]]]
                if species in totals:
                    totals[species]['count'] = totals[species]['count'] + amount
                else:
                    totals[species] = {'name': species,'year':year, 'group':label  ,'count':amount,'colors':palette[label]}

    # sorted() is stable, so species within a group keep first-seen order.
    return sorted(totals.values(), key=lambda entry: entry['group'])

In [125]:
# Write one JSON file per year into the working directory; each file holds the
# list returned by diversityDataByYear(year).
# NOTE: `sstYears` is reassigned to a longer list in a later cell of this notebook.
sstYears = [2011,2012,2013,2014,2015]
# NOTE(review): nothing in this cell appends to `stats`; it stays an empty list here.
stats = []
for year in sstYears:
    data = diversityDataByYear(year)
    filename = "abundance-data-" + str(year) + ".json"
    with open(filename, 'w') as outfile:
        json.dump(data, outfile)
# print(stats)

In [77]:
# latRange and lonRange match: merge data
# map concatenated data to new species
# map to matrixObject (saves time for processing step...)
def populationForCell(sol):
    """Merge JSON-encoded records by cell and attach group roll-ups.

    Parameters:
        sol: non-empty sequence of JSON strings, each decoding to an object
            with at least 'cellID' and 'uniqueSpecies'. The field names of the
            first record decide which fields are merged.

    Returns:
        list with one dict per distinct 'cellID'. Records sharing a cellID are
        combined by ``+`` on every field except latRange, lonRange, cellID,
        lat and lon. Each result additionally carries 'uniqueSpecies'
        (de-duplicated), 'uniqueSpeciesCount', 'levelTwo', 'levelOne' and
        'levelOneMap' (level-two values summed per level-one group).
    """
    fieldNames = list(json.loads(sol[0]).keys())
    # Identifying fields: kept from the first record of a cell, never summed.
    notSummed = ["latRange","lonRange","cellID","lat","lon"]

    byCell = {}
    for encoded in sol:
        record = json.loads(encoded)
        cellKey = record["cellID"]
        if cellKey not in byCell:
            byCell[cellKey] = record
            continue
        for name in fieldNames:
            if name not in notSummed:
                byCell[cellKey][name] = byCell[cellKey][name] + record[name]

    rTLC = readTopLevelCodes()
    levelOne = list(set(rTLC.values()))
    levelTwo = list(set(speciesMap().values()))

    species = list(byCell.values())
    for cell in species:
        cell['uniqueSpecies'] = list(set(cell['uniqueSpecies']))
        cell['uniqueSpeciesCount'] = len(cell['uniqueSpecies'])
        # The same two list objects are shared by every cell.
        cell['levelTwo'] = levelTwo
        cell['levelOne'] = levelOne
        rollup = {}
        for group in levelTwo:
            parent = rTLC[group]
            if parent not in rollup:
                rollup[parent] = cell[group]
            else:
                rollup[parent] = rollup[parent] + cell[group]
        cell['levelOneMap'] = rollup
    return species

In [94]:
def processRow(row, firsthaulrow, datadict, year, speciesmap=None, topLevel=None):
    """Record the species caught in one haul row into ``datadict``.

    Only rows dated month "5" of ``year`` are processed. Column 2 of the row
    is the date: its first run of digits is taken as the month and its first
    run of four digits as the year. For every column from 3 on with a positive
    value, the species name from the header row is appended to
    ``datadict["UniqueCntForSimple"][<top-level group>]`` and to
    ``datadict["uniqueSpecies"]``.

    Parameters:
        row: one data row of the haul CSV (list of str).
        firsthaulrow: the CSV header row; gives the species name per column.
        datadict: accumulator, mutated in place. Must already contain the
            "UniqueCntForSimple" dict. "uniqueSpecies" is created on demand.
        year: year to keep; compared to the date's year via ``str(year)``.
        speciesmap: optional species -> group dict. Loaded with ``speciesMap()``
            when omitted.
        topLevel: optional group -> top-level group dict. Loaded with
            ``readTopLevelCodes()`` when omitted.

    Returns:
        None.

    Raises:
        AttributeError: if the date field contains no digits / no 4-digit run.
        KeyError: if a species or group is missing from the lookup tables.
    """
    date = row[2]
    monthmatch = re.search(r'\d+', date)
    yearmatch = re.search(r'\d{4}', date)
    monthvalue = monthmatch.group()
    yearvalue = yearmatch.group()
    if monthvalue != "5" or yearvalue != str(year):
        return

    # Load the lookup tables only for rows that pass the date filter, and only
    # if the caller did not supply them. Previously all three data files
    # (including the unused colour table) were re-read for every single row.
    if speciesmap is None:
        speciesmap = speciesMap()
    if topLevel is None:
        topLevel = readTopLevelCodes()

    # Keep accumulating across rows of the same cell; the list used to be
    # reset on every matching row, which discarded earlier hauls.
    caught = datadict.setdefault("uniqueSpecies", [])
    for k in range(3, len(row)):
        if row[k] == '':
            continue
        value = ast.literal_eval(row[k])
        if value > 0:
            speciesTrueName = firsthaulrow[k]
            simplifiedName = topLevel[speciesmap[speciesTrueName]]
            datadict["UniqueCntForSimple"].setdefault(simplifiedName, []).append(speciesTrueName)
            caught.append(speciesTrueName)

In [95]:
def processPopulation(year):
    """Count distinct species per top-level group for every grid cell.

    Builds the same latitude/longitude edges as the matrix grid (step of
    0.0125 * 27 degrees, rounded to four decimals, eastern bound appended),
    reads ``../data/haul_catch_years_clean.csv`` and feeds every haul that
    falls inside a cell to ``processRow``, which applies the month/year filter.

    Parameters:
        year: year passed through to ``processRow`` and stored in the results.

    Returns:
        list of dicts, one per cell that had at least one recorded species:
        ``{<group>: <number of distinct species>, ..., 'year': year,
        'cellID': str(latIndex) + str(lonIndex)}``.
    """
    smallestLat = 35.775
    largestLat = 39.15
    smallestLon = 235.5625
    largestLon = 238.9375
    delta = 0.0125 * 27

    latInc = [float("{0:.4f}".format(x)) for x in drange(smallestLat, largestLat, delta)]
    lonInc = [float("{0:.4f}".format(x)) for x in drange(smallestLon, largestLon, delta)]
    lonInc.append(largestLon)

    # Parse each haul's coordinates once, instead of once per grid cell.
    hauls = []
    with open("../data/haul_catch_years_clean.csv") as haulfile:
        reader = csv.reader(haulfile, delimiter=",")
        firsthaulrow = next(reader, None)
        for row in reader:
            lat = ast.literal_eval(row[0])
            # presumably the CSV stores longitude as degrees west and the grid
            # uses degrees east (0-360) - TODO confirm against the data source
            lon = 360 - ast.literal_eval(row[1])
            hauls.append((lat, lon, row))

    popData = []
    for i in range(len(latInc) - 1):
        for j in range(len(lonInc) - 1):
            # cellID is the two indices concatenated as text, not i * width + j.
            datadict = {"UniqueCntForSimple": {}, 'cellID': str(i) + str(j)}
            for lat, lon, row in hauls:
                if latInc[i] <= lat < latInc[i+1] and lonInc[j] <= lon < lonInc[j+1]:
                    processRow(row, firsthaulrow, datadict, year)
            sol = {}
            for key, value in datadict['UniqueCntForSimple'].items():
                sol[key] = len(set(value))
            if sol:
                sol['year'] = year
                sol['cellID'] = datadict['cellID']
                popData.append(sol)
    return popData

In [80]:
#extend this to handle days for that month.
# e.g sst:(float) -> sst:[float,float,...] (sorted by day)
def processFile(years,matrix):
    """Fill every cell with sea-surface-temperature data for each year.

    For each year the rows of ``readSSTData(year)["table"]["rows"]`` are
    scanned; columns 2, 3 and 4 are read as latitude, longitude and
    temperature. Rows inside a cell (lower bound inclusive, upper bound
    exclusive) contribute to that cell.

    Parameters:
        years: iterable of years to load; each must be a key of every cell.
        matrix: list of cell dicts with 'latRange' and 'lonRange'; mutated in place.

    Returns:
        the same ``matrix``. Per cell and year, 'sstHD' holds every reading in
        the cell (missing or zero readings stored as -9999) and 'sst' is set to
        the mean of the valid readings rounded to four decimals, when there is
        at least one.
    """
    for year in years:
        rows = readSSTData(year)["table"]["rows"]
        for cell in matrix:
            validSum = 0
            validCount = 0
            readings = []
            for row in rows:
                lat, lon, sst = row[2], row[3], row[4]
                inLatBand = cell['latRange'][0] <= lat < cell['latRange'][1]
                if not (inLatBand and cell['lonRange'][0] <= lon < cell['lonRange'][1]):
                    continue
                if sst is None or sst == 0:
                    # missing and zero readings are kept as the sentinel value
                    sst = -9999
                else:
                    validCount = validCount + 1
                    validSum = validSum + sst
                readings.append(sst)
            if validCount != 0:
                cell[year]['sst'] = float("{0:.4f}".format(validSum/validCount))
            cell[year]['sstHD'] = readings

    return matrix

In [81]:
def processWindData(matrix, windData=None):
    """Store a wind angle, in degrees, on every cell/year covered by the wind table.

    Each wind row is read as: column 0 an ISO timestamp
    (``%Y-%m-%dT%H:%M:%SZ``), columns 2 and 3 latitude and longitude, columns
    4 and 5 the x and y wind components. A row inside a cell (lower bound
    inclusive, upper bound exclusive) sets ``cell[year]['windDegree']`` for
    the row's year; when several rows hit the same cell and year, the last
    one wins.

    The angle is ``degrees(atan2(yWind, xWind))``, i.e. in (-180, 180] measured
    from the positive x axis. The previous ``atan(yWind / xWind)`` raised
    ZeroDivisionError for xWind == 0 and mapped opposite wind vectors to the
    same angle.
    NOTE(review): confirm whether consumers expect this mathematical angle or
    a meteorological bearing.

    Parameters:
        matrix: list of cell dicts with 'latRange', 'lonRange' and one entry
            per year; mutated in place.
        windData: optional already-parsed wind table
            (``{'table': {'rows': [...]}}``). Loaded with ``readWindData()``
            when omitted.

    Returns:
        the same ``matrix``.

    Raises:
        KeyError: if a matching row's year is not a key of the cell.
    """
    if windData is None:
        windData = readWindData()

    # Parse every timestamp once up front rather than once per grid cell.
    samples = []
    for wRow in windData['table']['rows']:
        year = datetime.strptime(wRow[0], '%Y-%m-%dT%H:%M:%SZ').year
        samples.append((year, wRow[2], wRow[3], wRow[4], wRow[5]))

    for cell in matrix:
        for year, lat, lon, xWind, yWind in samples:
            if((cell['latRange'][0] <= lat < cell['latRange'][1] ) and (cell['lonRange'][0] <= lon < cell['lonRange'][1])):
                if xWind is not None and yWind is not None:
                    cell[year]['windDegree'] = math.degrees(math.atan2(yWind, xWind))
    return matrix

In [82]:
def processChlorophyll(years,matrix):
    """Fill every cell with the mean chlorophyll value for each year.

    For each year the rows of ``readCholorData(year)["table"]["rows"]`` are
    scanned; columns 2, 3 and 4 are read as latitude, longitude and
    chlorophyll. Non-null readings inside a cell (lower bound inclusive, upper
    bound exclusive) are averaged.

    Parameters:
        years: iterable of years to load; each must be a key of every cell
            that receives a value.
        matrix: list of cell dicts with 'latRange' and 'lonRange'; mutated in place.

    Returns:
        the same ``matrix``; ``cell[year]['chloro']`` is set to the mean rounded
        to four decimals for cells with at least one reading and left
        untouched otherwise.
    """
    for year in years:
        rows = readCholorData(year)["table"]["rows"]
        for cell in matrix:
            runningTotal = 0
            samples = 0
            for row in rows:
                lat, lon, chloro = row[2], row[3], row[4]
                inLatBand = cell['latRange'][0] <= lat < cell['latRange'][1]
                if not (inLatBand and cell['lonRange'][0] <= lon < cell['lonRange'][1]):
                    continue
                if chloro is None:
                    continue
                samples = samples + 1
                runningTotal = runningTotal + chloro
            if samples != 0:
                cell[year]['chloro'] = float("{0:.4f}".format(runningTotal/samples))

    return matrix

In [92]:
def calculateAverage(data):
    """Sum, count and average the per-cell values of every group.

    Parameters:
        data: iterable of cell dicts. Each must provide 'levelOne' (list of
            level-one names), 'levelOneMap' (level-one name -> value),
            'levelTwo' (list of level-two names) and one value per level-two
            name stored directly under that name.

    Returns:
        ``[level1Totals, level2Totals]``; each maps a name to
        ``{'sum': ..., 'count': ..., 'average': sum / count}`` over all cells
        that list the name.

    Raises:
        KeyError: if a listed name has no value in the cell.
    """
    def _accumulate(totals, name, value):
        # Create the entry if needed, then always add the value. Previously
        # the first value seen for a name was dropped, so every sum, count
        # and average was missing one cell.
        entry = totals.setdefault(name, {'sum': 0, 'count': 0, 'average': 0})
        entry['sum'] = entry['sum'] + value
        entry['count'] = entry['count'] + 1
        entry['average'] = entry['sum'] / entry['count']

    level2Totals = {}
    level1Totals = {}

    for d in data:
        for l1_name in d['levelOne']:
            _accumulate(level1Totals, l1_name, d['levelOneMap'][l1_name])

        for l2_name in d['levelTwo']:
            _accumulate(level2Totals, l2_name, d[l2_name])

    return [level1Totals,level2Totals]

In [83]:
def processPops(matrix,years):
    """Attach the per-cell population summaries to ``matrix``.

    For every year, each record returned by ``processPopulation(year)`` is
    stored under ``matrix[int(record['cellID'])][year]['popInfo']``.

    Parameters:
        matrix: list of cell dicts keyed by year; mutated in place.
        years: iterable of years to process.

    Returns:
        the same ``matrix``.
    """
    for year in years:
        for record in processPopulation(year):
            # NOTE(review): cellID is two indices concatenated as text; using
            # int() of it as a flat index assumes that matches the row-major
            # position in matrix - confirm for grids that are not 10 wide.
            matrix[int(record['cellID'])][year]['popInfo'] = record
    return matrix

In [105]:
# Build the empty grid with placeholder entries for every year in `years`.
# `sstYears` is the subset passed to the SST, chlorophyll and population steps
# in the cells below.
years = [2009,2010,2011,2012,2013,2014,2015,2016,2017,2018]
sstYears = [2009,2010,2011,2012,2013,2014,2015]
matrix = initMatrix(years)

In [106]:
# Sanity check: number of grid cells produced by initMatrix.
len(matrix)

100

In [107]:
# Fill 'sst' and 'sstHD' per cell and year; processFile mutates and returns the same list.
matrix = processFile(sstYears,matrix)

In [108]:
# Fill 'windDegree' (for every year present in the wind table) and 'chloro'
# (for sstYears only). Both calls mutate and return the same list.
matrix = processWindData(matrix) #for all years
matrix = processChlorophyll(sstYears,matrix)

In [109]:
# NOTE: this binds a second name to the same list; it is not a copy, so later
# changes made through `newMatrix` are visible through `matrix` as well.
newMatrix = matrix

In [110]:
# Attach 'popInfo' to the cells in place; processPops returns its first
# argument, so `temp` refers to the same list as `newMatrix`.
temp = processPops(newMatrix,sstYears)

In [111]:
# Inspect one populated cell as a spot check.
newMatrix[26]

{2009: {'chloro': 11.9485,
  'popInfo': {'Cephalopod': 5,
   'Crustacean': 1,
   'Forage': 5,
   'Ground Fish': 5,
   'Krill': 1,
   'Rockfish': 12,
   'cellID': '26',
   'year': 2009},
  'sst': 11.5989,
  'sstHD': [11.608572,
   11.715714,
   11.758571,
   11.727857,
   11.7628565,
   11.682143,
   11.518126,
   11.4875,
   11.476875,
   11.455001,
   11.454286,
   11.576874,
   11.569375,
   11.493126,
   11.690001,
   11.771428,
   11.570625,
   11.597502,
   11.513124,
   11.514376,
   11.46375,
   11.397501,
   11.372499,
   11.35875,
   11.30375,
   11.37,
   11.373125,
   11.8375,
   11.665,
   11.678572,
   11.690001,
   11.707143,
   11.739286,
   11.455,
   11.509375,
   11.639375,
   11.541249,
   11.57,
   11.633126,
   11.523125,
   11.485624,
   11.694376,
   11.602501,
   11.591249,
   11.595625,
   11.615625,
   11.460625,
   11.37875,
   11.428125,
   11.4725,
   11.478125,
   11.478125,
   11.468125,
   11.438749,
   11.742501,
   11.8225,
   11.673571,
   11.675,
   

In [116]:
# NOTE(review): in the cells visible here `stats` is only ever assigned an
# empty list and never appended to, so this would write "[]" - confirm whether
# a different variable was meant to be exported.
with open('abundance-data.json', 'w') as outfile:
    json.dump(stats, outfile)