In [1]:
#Given a year 2009
# we have N cells 
# Each cell contains information [{sst,wind_dir, cholorphyll, index, month, month_index}] (array is sorted by month)


In [2]:
import math
import json
import csv
import re
import ast
from pprint import pprint
import pandas as pd
from datetime import datetime

In [3]:
def drange(start, stop, step):
    r = start
    while r < stop:
        yield r
        r += step

In [4]:
def speciesMap():
    speciesmap = {}
    firstspeciesrow = ""
    with open("../data/species_codes.csv") as codefile:
        codereader = csv.reader(codefile, delimiter=",")
        linecount = 0
        for row in codereader:
            if linecount == 0:
                linecount = linecount + 1
                firstspeciesrow = row
                continue
            group = row[3]
            speciesmap[str(row[1])] = str(group).lower()
        speciesmap["OTHER ROCKFISH YOY"] = "other rockfish"
        speciesmap['WOLF EEL YOY'] = "other rockfish"
        speciesmap['PAINTED GREENLING YOY'] = "other rockfish"
        speciesmap['CURLFIN SOLE'] = "flatfish"
        speciesmap['PACIFIC ARGENTINE'] = "fish"
    return speciesmap

In [5]:
def readTopLevelCodes():
    data = []
    filename = "../data/speciesClass.json"
    with open(filename) as f:
        data = json.load(f)
    return data

In [6]:
def readSSTData(year):
    data = []
    filename = "../data/sstmay" + str(year) + '.json'
    with open(filename) as f:
        data = json.load(f)
    return data

In [7]:
def readCholorData(year):
    data = []
    filename = "../data/chlorophyllmay" + str(year) + '.json'
    with open(filename) as f:
        data = json.load(f)
    return data

In [8]:
def readWindData():
    data = []
    filename = "../data/wind.json"
    with open(filename) as f:
        data = json.load(f)
    return data
    

In [9]:
def initMatrix(yearRange):
    matrix = []
    smallestLat = 35.775
    largestLat = 39.15
    smallestLon = 235.5625
    largestLon = 238.9375
    latRange = largestLat - smallestLat
    lonRange = largestLon - smallestLon

    latStep = latRange/11
    lonStep = lonRange/11

    latInterval = drange(smallestLat, largestLat, latStep)
    latInc = [float("{0:.4f}".format(x)) for x in latInterval]
    latInc = latInc[:-1]

    lonInterval = drange(smallestLon, largestLon, lonStep)
    lonInc = [float("{0:.4f}".format(x)) for x in lonInterval]
    for lat_i, lat_ in enumerate(latInc):
        if(lat_i+1 < len(latInc)):
            for lon_i,lon_ in enumerate(lonInc):
                if(lon_i+1 < len(lonInc)):
                    latRnge =[lat_,latInc[lat_i+1]]
                    lonRnge =[lon_,lonInc[lon_i+1]]
                    cell = {'latRange': latRnge, 'lonRange': lonRnge}
                    for year in yearRange:
                        cell[year] = {'sst':-9999,'windDegree':-9999, 'chloro': -9999}
                    matrix.append(cell)
        
    return matrix
    

In [10]:
# latRange and lonRange match merge data
# map concated data to new species :o
# map to matrixObject (saves time for processing step...)
def populationForCell(sol):
    keyDump = json.loads(sol[0])
    keys = []
    for key, values in keyDump.items() :
        keys.append(key)

    cellIDs = {}
    for row in sol:
        pops = json.loads(row)
        id_ = pops["cellID"]
        if id_ in cellIDs:
            for key in keys:
                if key not in ["latRange","lonRange","cellID","lat","lon"]:
                    cellIDs[id_][key] = cellIDs[id_][key] + pops[key]
        else:
            cellIDs[id_] = pops

    rTLC = readTopLevelCodes()
    levelOne = [] 
    for key, value in rTLC.items():
        levelOne.append(value) 
    levelOne = list(set(levelOne))
    
    speciesmap = speciesMap()
    levelTwo = []
    for key, value in speciesmap.items():
        levelTwo.append(value) 
    levelTwo = list(set(levelTwo))
    
    species = []
    for key, value in cellIDs.items():
        species.append(value) 
        
    for val in species:
        val['uniqueSpecies'] = list(set(val['uniqueSpecies']))
        val['uniqueSpeciesCount'] = len(val['uniqueSpecies'])
        val['levelTwo'] = levelTwo
        val['levelOne'] = levelOne
        levelOneMap = {}
        for spec in levelTwo:
            if rTLC[spec] in levelOneMap:
                levelOneMap[rTLC[spec]] = levelOneMap[rTLC[spec]] + val[spec]
            else:
                levelOneMap[rTLC[spec]] = val[spec]
        val['levelOneMap'] = levelOneMap
    return species

In [11]:
def processPopulation(year):
    smallestLat = 35.775
    largestLat = 39.15
    smallestLon = 235.5625
    largestLon = 238.9375
    latRange = largestLat - smallestLat
    lonRange = largestLon - smallestLon

    latStep = latRange/11
    lonStep = lonRange/11
    
    latInterval = drange(smallestLat, largestLat, latStep)
    latInc = [float("{0:.4f}".format(x)) for x in latInterval]
    latInc = latInc[:-1]

    lonInterval = drange(smallestLon, largestLon, lonStep)
    lonInc = [float("{0:.4f}".format(x)) for x in lonInterval]
    
    speciesmap = speciesMap()
    firsthaulrow = ""
    datadict = {}
    sol = []
    with open("../data/haul_catch_years_clean.csv") as haulfile:
        reader = csv.reader(haulfile, delimiter=",")
        firsthaulrow = next(reader, None)
        for row in reader:
            date = row[2]
            monthmatch = re.search(r'\d+', date)
            yearmatch = re.search(r'\d{4}', date)
            monthvalue = monthmatch.group()
            yearvalue = yearmatch.group()
            for i in range(3, len(row)):
                speciescodes = speciesmap[firsthaulrow[i]]
                datadict[speciescodes] = 0

            if monthvalue == "5" and yearvalue == str(year):
                 for i, lat_ in enumerate(latInc):
                    lat = ast.literal_eval(row[0])
                    if(i+1 < len(latInc)):
                        if lat >= lat_ and lat < latInc[i+1]:
                            for j,lon_ in enumerate(lonInc):
                                lon = 360-ast.literal_eval(row[1])
                                if(j+1 < len(lonInc)):
                                    if lon >= lon_ and lon < lonInc[j+1]:
                                        latRnge =[lat_,latInc[i+1]]
                                        lonRnge =[lon_,lonInc[j+1]]
                                        datadict['latRange'] = latRnge
                                        datadict['lonRange'] = lonRnge
                                        datadict['cellID'] = str(i) + str(j)
                                        datadict["lat"] = float("{0:.4f}".format(lat))
                                        datadict["lon"] = float("{0:.4f}".format(lon))
                                        datadict["uniqueSpecies"] = []
                                        for k in range(3, len(row)):
                                            if row[k] != '':
                                                value = ast.literal_eval(row[k])
                                                if value > 0:
                                                    datadict["uniqueSpecies"].append(firsthaulrow[k])
                                                    datadict[speciesmap[firsthaulrow[k]]] += value
                                        datadictjson = json.dumps(datadict)
                                        sol.append(datadictjson)
    #                                 print(datadictjson, end=",\n")
    if len(sol) > 0:
        return populationForCell(sol)
    else:
        return []

In [12]:
#extend this to handle days for that month.
# e.g sst:(float) -> sst:[float,float,...] (sorted by day)
def processFile(years,matrix):
    for year in years:
        sstData = readSSTData(year)
        rows = sstData["table"]["rows"]
        for idx, cell in enumerate(matrix):
            cell = matrix[idx]
            totalIntTemp = 0
            count = 0
            avgIntTemp = 0
            for row in rows:
                lat = row[2]
                lon = row[3]
                sst = row[4]
                if((cell['latRange'][0] <= lat <= cell['latRange'][1] ) and (cell['lonRange'][0] <= lon <= cell['lonRange'][1])):
                    if(sst != None and sst != 0):
                        count = count + 1
                        totalIntTemp = totalIntTemp + sst
            if count != 0:
                avgIntTemp = totalIntTemp/count
                cell[year]['sst'] = float("{0:.4f}".format(avgIntTemp))
        
    return matrix

In [13]:
def processWindData(matrix):
    windData = readWindData()
    windRows = windData['table']['rows']
    for idx, cell in enumerate(matrix):
        cell = matrix[idx]
        for wRow in windRows:
            datetime_object = datetime.strptime(wRow[0], '%Y-%m-%dT%H:%M:%SZ')
            year = datetime_object.year
            lat = wRow[2]
            lon = wRow[3]
            xWind = wRow[4]
            yWind = wRow[5]
            if((cell['latRange'][0] <= lat <= cell['latRange'][1] ) and (cell['lonRange'][0] <= lon <= cell['lonRange'][1])):
                if(xWind != None and yWind != None):
                    cell[year]['windDegree'] = wind_degree = math.degrees(math.atan(yWind/xWind))
    return matrix

In [14]:
def processChlorophyll(years,matrix):
    for year in years:
        cholorData = readCholorData(year)
        rows = cholorData["table"]["rows"]
        for idx, cell in enumerate(matrix):
            cell = matrix[idx]
            totalIntTemp = 0
            count = 0
            avgIntTemp = 0
            for row in rows:
                lat = row[2]
                lon = row[3]
                chloro = row[4]
                if((cell['latRange'][0] <= lat <= cell['latRange'][1] ) and (cell['lonRange'][0] <= lon <= cell['lonRange'][1])):
                    if(chloro != None ):
                        count = count + 1
                        totalIntTemp = totalIntTemp + chloro
            if count != 0:
                avgIntTemp = totalIntTemp/count
                cell[year]['chloro'] = float("{0:.4f}".format(avgIntTemp))
        
    return matrix

In [15]:
def calculateAverage(data):
    #given data for that year for each species... sum up  divide by cells
    level2Totals = {}
    level1Totals = {}

    for d in data:
        for l1_name in d['levelOne']:
            if l1_name in level1Totals:
                level1Totals[l1_name]['sum'] = level1Totals[l1_name]['sum'] + d['levelOneMap'][l1_name]
                level1Totals[l1_name]['count'] = level1Totals[l1_name]['count'] + 1
                level1Totals[l1_name]['average'] = level1Totals[l1_name]['sum'] / level1Totals[l1_name]['count']
            else:
                level1Totals[l1_name] = {'sum':0,'count':0,'average':0}
                
        
        for l2_name in d['levelTwo']:
            if l2_name in level2Totals:
                level2Totals[l2_name]['sum'] = level2Totals[l2_name]['sum'] + d[l2_name]
                level2Totals[l2_name]['count'] = level2Totals[l2_name]['count'] + 1
                level2Totals[l2_name]['average'] = level2Totals[l2_name]['sum'] / level2Totals[l2_name]['count']
            else:
                level2Totals[l2_name] = {'sum':0,'count':0,'average':0}
    
    return [level1Totals,level2Totals]

In [16]:
def processPops(matrix,years):
    yearAvg = {}
    for year in years:
        popData = processPopulation(year)
        yearlyAverage = calculateAverage(popData)
        yearAvg[year] = {'L1':yearlyAverage[0],'L2':yearlyAverage[1]}
        for data in popData:
            matrix[int(data['cellID'])][year]['popInfo'] = data
    return [matrix,yearAvg]

In [184]:
#run this function for all 
years = [2009,2010,2011,2012,2013,2014,2015]
sstYears = [2011,2015]
matrix = initMatrix(years)
matrix = processFile(sstYears,matrix)
matrix = processWindData(matrix) #for all years
matrix = processChlorophyll(sstYears,matrix)


In [186]:
newMatrix = matrix
print(len(matrix))


100


In [187]:
temp = processPops(newMatrix,years)

In [192]:
temp[1]

{2009: {'L1': {'Cephalopod': {'average': 81.5625, 'count': 16, 'sum': 1305},
   'Crustacean': {'average': 407.6875, 'count': 16, 'sum': 6523},
   'Euphausiid': {'average': 252001.9375, 'count': 16, 'sum': 4032031},
   'Fish': {'average': 70.0625, 'count': 16, 'sum': 1121},
   'Gelatinous': {'average': 0.0, 'count': 16, 'sum': 0},
   'Ground Fish': {'average': 304.875, 'count': 16, 'sum': 4878},
   'Jellyfish': {'average': 9.375, 'count': 16, 'sum': 150},
   'Salmonid': {'average': 0.1875, 'count': 16, 'sum': 3},
   'Small Fish': {'average': 809.9375, 'count': 16, 'sum': 12959}},
  'L2': {'cephalopod': {'average': 81.5625, 'count': 16, 'sum': 1305},
   'clupeoid': {'average': 1.0, 'count': 16, 'sum': 16},
   'cottid': {'average': 1.5625, 'count': 16, 'sum': 25},
   'crustacean': {'average': 407.6875, 'count': 16, 'sum': 6523},
   'deep-sea smelt': {'average': 5.6875, 'count': 16, 'sum': 91},
   'elasmobranch': {'average': 0.375, 'count': 16, 'sum': 6},
   'euphausiid': {'average': 25200

In [193]:
with open('average_data.json', 'w') as outfile:
    json.dump(temp[1], outfile)