In [1]:
import pandas as pd
from pymongo import MongoClient

import config
import overalls
import weather
import benedict

In [2]:
# Notebook-wide pandas display settings.
# `None` means "never truncate cell contents"; the old sentinel -1 is deprecated
# (it triggers a FutureWarning) and is rejected outright by newer pandas releases.
pd.set_option('display.max_colwidth', None)
# Render floats with thousands separators and two decimals.
pd.set_option('display.float_format', '{:,.2f}'.format)

  pd.set_option('display.max_colwidth', -1)


In [3]:
def _connect_mongo(uri, db):
    """ A util for making a connection to mongo.

    :param uri: MongoDB connection URI, e.g.
        ``mongodb://user:password@host:port/db``. When falsy, the client is
        created with the driver's default host and port.
    :param db: name of the database to select on the connection
    :return: the database handle ``client[db]``
    """

    # The previous fallback called MongoClient(host, port) with names that are
    # not defined anywhere in scope, so it raised NameError. Calling
    # MongoClient() with no arguments uses PyMongo's defaults instead.
    client = MongoClient(uri) if uri else MongoClient()
    return client[db]

def read_mongo(uri, db, collection, query=None, no_id=True, limit=None):
    """ Read from Mongo and Store into DataFrame.

    :param uri: MongoDB connection URI passed to ``_connect_mongo``
    :param db: database name
    :param collection: collection name
    :param query: filter document passed to ``find``; ``None`` (the default)
        matches every document
    :param no_id: accepted for backward compatibility; the ``_id`` column is
        currently always kept in the returned frame
    :param limit: optional maximum number of documents to retrieve (handy for
        dev and testing); falsy means no limit
    :return: a pandas DataFrame with one row per retrieved document
    """

    # Connect to MongoDB
    con = _connect_mongo(uri, db)

    # A fresh dict per call avoids sharing one mutable default between calls.
    cursor = con[collection].find({} if query is None else query)

    # Cap the number of documents. The earlier `cursor[limit]` indexed the
    # cursor and returned only the single document at position `limit`.
    if limit:
        cursor = cursor.limit(limit)

    # Expand the cursor and construct the DataFrame
    df = pd.DataFrame(list(cursor))

    if limit:
        print(f'number of indexes created has been limited to {limit} ..........................')

    return df

def forecast_fields(row):
    """ Replace the nested ``rain`` mapping of a record with two flat fields.

    ``rain_3h`` is the mapping's ``'3h'`` entry (0 when absent) and ``rain_1h``
    is its ``'1h'`` entry, falling back to one third of ``rain_3h``.

    :param row: a mutable mapping (dict or pandas Series) holding a ``rain``
        mapping; it is modified in place
    :return: the same ``row`` object, without ``rain`` and with both new fields
    """
    rain = row['rain']
    three_hour = rain.get('3h', 0)
    row['rain_3h'] = three_hour
    row['rain_1h'] = rain.get('1h', three_hour / 3)
    del row['rain']
    return row

def read_mongo_a(uri, db, collection, limit=None):
    ''' Retrieve data from the Mongo database and transform it to a pandas
    DataFrame; return the DataFrame.

    Each document contributes one row of type ``'weather'`` (built from
    ``doc['weather']``) and one row of type ``'forecast'`` per entry in
    ``doc['forecasts']``, so the frame usually has more rows than ``limit``.

    :param uri: MongoDB connection URI; ``config.uri`` is used when falsy
    :type uri: string
    :param db: the database name
    :type db: string
    :param collection: the collection name
    :type collection: string
    :param limit: optional limiter to the number of documents retrieved.
    :return: DataFrame indexed by ``(zipcode, instant, time_to_instant)``, or
        an empty DataFrame when no documents were retrieved.
    '''

    # Honour the caller's uri; it used to be ignored in favour of config.uri.
    database = _connect_mongo(uri or config.uri, db)
    cursor = database[collection].find({})

    # Collect plain dict records and build the frame once, instead of creating
    # and concatenating a small DataFrame per document.
    records = []
    # Shorten the cursor length if limit is given, otherwise get everything.
    for doc in cursor[:limit]:
        keys = {'zipcode': doc['zipcode'], 'instant': doc['instant']}
        records.append(
            forecast_fields({**keys, 'type': 'weather', **doc['weather']}))
        records.extend(
            forecast_fields({**keys, 'type': 'forecast', **forecast})
            for forecast in doc['forecasts'])

    if limit:
        print(f'The length of your df has been limited to {limit}.')

    if not records:
        # Let the user know that even though there were no errors the
        # DataFrame was not created (this message used to sit after the
        # return statement and could never run).
        print('From read_mongo_a(): There were no errors, but your dataframe '
              'was not created.')
        return pd.DataFrame()

    return pd.DataFrame(records).set_index(
        ['zipcode', 'instant', 'time_to_instant'], drop=True)

def errors(casts, obs):
    ''' Compare every forecast against one observation.

    Each argument is passed through ``overalls.flatten_dict`` first; every
    flattened forecast is then handed to ``overalls.compare_dicts`` together
    with the flattened observation.

    :param casts: a list of dictionaries (the forecasts)
    :param obs: a dictionary (the observation)
    :return: a list with one ``compare_dicts`` result per forecast, in the
        order of ``casts``

    * For best results all dicts should have all the same keys and subkeys.
    '''

    flat_casts = list(map(overalls.flatten_dict, casts))
    flat_obs = overalls.flatten_dict(obs)

    comparisons = []
    for flat_cast in flat_casts:
        comparisons.append(overalls.compare_dicts(flat_cast, flat_obs))
    return comparisons

def gen_errs_df(df):
    ''' Create an errors dataframe from the argument.

    :param df: Must be a pandas DataFrame with ``_id``, ``weather`` and
        ``forecasts`` columns.
    :return: a DataFrame indexed by ``_id`` with one column per forecast
        position; each cell holds the list of error values for that forecast.
        Rows with fewer forecasts than the widest row keep their padding
        value in the surplus columns.
    '''
    # zip() steps through the three columns together, so the errors for each
    # instant are keyed by its _id in a single pass.
    errs_dict = {
        _id: errors(casts, obs)
        for _id, obs, casts in zip(df['_id'], df['weather'], df['forecasts'])
    }

    # One row per _id, one column per forecast position.
    dd = pd.DataFrame.from_dict(errs_dict, orient='index')

    # Replace each error dict with the list of its values. Cells that pandas
    # padded for shorter rows are not mappings; leave them untouched instead of
    # failing on `.values()`.
    for c in dd.columns:
        dd[c] = [list(d.values()) if hasattr(d, 'values') else d for d in dd[c]]
    return dd


In [48]:
# Load a small sample from the `legit_inst` collection of the `owmap` database.
# limit=10 caps the number of Mongo documents read; each document expands to
# one 'weather' row plus one row per forecast, so the frame has more rows.
collection = 'legit_inst'
db = 'owmap'
# df = read_mongo(config.uri, db, collection, no_id=False, limit=5)
df = read_mongo_a(config.uri, db, collection, limit=10)
# Preview the first rows of the flattened frame.
df.head()

The length of your df has been limited to 10.


Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,type,sunset_time,sunrise_time,clouds,snow,wind,humidity,pressure,temperature,status,detailed_status,weather_code,weather_icon_name,visibility_distance,dewpoint,humidex,heat_index,rain_3h,rain_1h
zipcode,instant,time_to_instant,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
27006,1592352000,7197,weather,1592354436,1592301882,100,{},"{'speed': 4.92, 'deg': 0, 'gust': 5.36}",89,"{'press': 1021, 'sea_level': None}","{'temp': 286.48, 'temp_kf': None, 'temp_max': 286.48, 'temp_min': 286.48}",Rain,moderate rain,501,10d,,,,,0.0,1.02
27006,1592352000,431035,forecast,0,0,15,{},"{'speed': 3.2, 'deg': 54}",89,"{'press': 1018, 'sea_level': 1018}","{'temp': 293.04, 'temp_kf': 0, 'temp_max': 293.04, 'temp_min': 293.04}",Rain,moderate rain,501,10d,,,,,4.94,1.65
27006,1592352000,395036,forecast,0,0,100,{},"{'speed': 3.61, 'deg': 38}",95,"{'press': 1021, 'sea_level': 1021}","{'temp': 288.48, 'temp_kf': 0, 'temp_max': 288.48, 'temp_min': 288.48}",Rain,light rain,500,10d,,,,,1.69,0.56
27006,1592352000,380634,forecast,0,0,100,{},"{'speed': 4.6, 'deg': 31}",86,"{'press': 1022, 'sea_level': 1022}","{'temp': 288.11, 'temp_kf': 0, 'temp_max': 288.11, 'temp_min': 288.11}",Clouds,overcast clouds,804,04d,,,,,0.0,0.0
27006,1592352000,377038,forecast,0,0,100,{},"{'speed': 4.6, 'deg': 31}",86,"{'press': 1022, 'sea_level': 1022}","{'temp': 288.11, 'temp_kf': 0, 'temp_max': 288.11, 'temp_min': 288.11}",Clouds,overcast clouds,804,04d,,,,,0.0,0.0


## Convert individual instants to dictionaries (or CSV)

In [None]:
from pprint import pprint

def my_learn_function(instant, weather):
    """ Placeholder learner that only echoes what it was given.

    :param instant: printed with plain ``print`` after the ``instant: `` label
    :param weather: pretty-printed after the ``weather: `` label
    """
    # Header and label in one call; no trailing newline, so the pretty-printed
    # value starts on the label's line.
    print('do a thing with:', 'weather: ', sep='\n', end='')
    pprint(weather)
    print('instant: ', instant)

# Smoke test: hand the first (zipcode, instant) group to the learner as plain
# Python data, then stop (the `break` limits this to one group).
# NOTE(review): the weather argument uses to_dict() without 'records', which
# yields {column: {row_index: value}} rather than a flat record — confirm
# that is the intended shape.
for index, instant in df.groupby(['zipcode', 'instant']):
    forecast_rows = instant[instant.type == 'forecast']
    weather_rows = instant[instant.type == 'weather']
    dropped = ['zipcode', 'instant', 'type']

    my_learn_function(
        forecast_rows.reset_index().drop(columns=dropped).to_dict('records'),
        weather_rows.reset_index().drop(columns=dropped).to_dict()
    )
    break

In [6]:
# Write the current frame (index included) to test.csv in the working directory.
df.to_csv('test.csv')

In [10]:
# Remove the sunset/sunrise timestamps from every observation and forecast
# dict, in place.
# dict.pop(key, default) takes ONE key plus a fallback value, so the earlier
# pop('sunset_time', 'sunrise_time') only removed 'sunset_time'. The bare
# pop('sunrise_time') that followed raised KeyError when the cell was run a
# second time. Popping each key with a None default makes the cell re-runnable.
# NOTE(review): this expects the nested frame from read_mongo(), which has
# 'weather' and 'forecasts' columns.
sun_keys = ('sunset_time', 'sunrise_time')
for w in df['weather']:
    for key in sun_keys:
        w.pop(key, None)
for l in df['forecasts']:
    for f in l:
        for key in sun_keys:
            f.pop(key, None)


In [17]:
# For every row, compare its forecasts against its observation and keep the
# resulting list of comparisons in a new 'errs' column.
errs = [
    errors(casts, obs)
    for casts, obs in zip(df['forecasts'], df['weather'])
]
df['errs'] = errs

In [18]:
def strip_keys(e):
    ''' Turn a sequence of lists of dictionaries into bare value lists.

    Every dictionary is flattened with ``overalls.flatten_dict`` and reduced
    to the list of its values, so the keys are dropped.

    :param e: a list-like whose items are lists of dicts
    :type e: At least in forecast-forecast the input type will be a pandas
    Series, but in general it can take any list-like object of lists of
    dictionaries.
    :return: one list per item of ``e``, each holding one list of values per
        dictionary
    '''

    return [
        [list(overalls.flatten_dict(dic).values()) for dic in group]
        for group in e
    ]

# Strip the keys from both the errors and the forecasts, then line the two up
# as the 'errors' and 'forecasts' columns of one frame (one row per instant).
err_vals, cast_vals = (strip_keys(df[column]) for column in ('errs', 'forecasts'))
dd = pd.DataFrame([err_vals, cast_vals], index=['errors', 'forecasts']).T


In [19]:
# Preview the first rows of the errors/forecasts value lists.
dd.head()

Unnamed: 0,errors,forecasts
0,"[[-1, 0.62, 116, 7, 1, -2.57, -4.55, -0.66, 0,...","[[0, 2.79, 182, 84, 1019, 1019, 289.71, 0, 289..."
1,"[[-1, 0.62, 116, 7, 1, -2.3, -4.55, -0.66, 0, ...","[[0, 2.79, 182, 84, 1019, 1019, 289.71, 0, 289..."
2,"[[-1, 0.62, 116, 7, 1, -2.3, -4.55, -0.66, 0, ...","[[0, 2.79, 182, 84, 1019, 1019, 289.71, 0, 289..."
3,"[[-1, 0.62, 116, 7, 1, -2.3, -4.55, -0.66, 0, ...","[[0, 2.79, 182, 84, 1019, 1019, 289.71, 0, 289..."
4,"[[-1, -0.14, 130, 6, 1, -2.49, -4.45, -1.12, 0...","[[0, 2.45, 184, 83, 1019, 1019, 289.81, 0, 289..."


In [21]:
# Persist the errors/forecasts frame as CSV; floats are written with three
# decimals. The target path is hard-coded under the user's home directory.
filename = '~/data/forecast-forecast/Learn/error_set3.csv'
dd.to_csv(filename, float_format='%.3f')

In [8]:
# Compare two observation dicts (the entries labelled 4 and 3 in df['weather'])
# with overalls.compare_dicts and wrap the result in Weather objects.
# NOTE(review): `errs` holds a single comparison result here, while other cells
# bind the same name to a list of per-row results — running cells out of order
# changes what `errs` means.
obs = df['weather'][4]
# print(df['weather'][4])
# overalls.flatten(obs)
errs = overalls.compare_dicts(obs, df['weather'][3])
# print(overalls.flatten_dict(errs), '\n')
# NOTE(review): W is built with zipcode '27606' and V with '27006' from the same
# data — confirm the differing zipcodes are intentional and not a typo.
W = weather.Weather('27606', 'observation', data=errs)
V = weather.Weather('27006', 'observation', data=errs)

{} {}
{} {}
{'speed': 2.59, 'deg': 54} {'speed': 2.17, 'deg': 66}
{'press': 1018, 'sea_level': None} {'press': 1018, 'sea_level': None}
{'temp': 292.3, 'temp_kf': None, 'temp_max': 294.26, 'temp_min': 290.93} {'temp': 292.01, 'temp_kf': None, 'temp_max': 294.26, 'temp_min': 290.37}


In [9]:
# Show the result of overalls.all_values for W.weather (the rendered output is
# a flat list of the values found in the nested dict).
overalls.all_values(W.weather)

['DEFAULT',
 0,
 0.42,
 -12,
 0,
 0,
 0.29,
 0.0,
 0.56,
 0,
 0,
 0,
 0,
 'DEFAULT',
 'DEFAULT',
 'DEFAULT',
 44,
 17,
 -2,
 0]

In [10]:
# Display the nested weather dict held by W for comparison with the flat list.
W.weather

{'_id': 'DEFAULT',
 'clouds': 0,
 'rain': {},
 'snow': {},
 'wind': {'speed': 0.42, 'deg': -12},
 'humidity': 0,
 'pressure': {'press': 0},
 'temperature': {'temp': 0.29, 'temp_max': 0.0, 'temp_min': 0.56},
 'status': 0,
 'detailed_status': 0,
 'weather_code': 0,
 'visibility_distance': 0,
 'dewpoint': 'DEFAULT',
 'humidex': 'DEFAULT',
 'heat_index': 'DEFAULT',
 'time_to_instant': 44,
 'sunset_time': 17,
 'sunrise_time': -2,
 'weather_icon_name': 0}

In [11]:
# One list of forecast-vs-observation comparisons per row of df.
errs = [errors(casts, obs) for obs, casts in zip(df['weather'], df['forecasts'])]


5

In [13]:
# errs[0]

In [15]:
# Attach the per-row error lists as a new 'errors' column (mutates df in place)
# and preview the result.
df['errors'] = errs
df.head()

Unnamed: 0,_id,instant,zipcode,forecasts,weather,errors
5eba42b80923c58fcff13738,5eba42b80923c58fcff13738,1589695200,27152,"[{'sunset_time': 0, 'sunrise_time': 0, 'clouds...","{'sunset_time': 1589674843, 'sunrise_time': 15...","[{'sunset_time': -1589674843, 'sunrise_time': ..."
5eba42b80923c58fcff13739,5eba42b80923c58fcff13739,1589695200,27155,"[{'sunset_time': 0, 'sunrise_time': 0, 'clouds...","{'sunset_time': 1589674843, 'sunrise_time': 15...","[{'sunset_time': -1589674843, 'sunrise_time': ..."
5eba42b80923c58fcff1373a,5eba42b80923c58fcff1373a,1589695200,27157,"[{'sunset_time': 0, 'sunrise_time': 0, 'clouds...","{'sunset_time': 1589674843, 'sunrise_time': 15...","[{'sunset_time': -1589674843, 'sunrise_time': ..."
5eba42b80923c58fcff1373b,5eba42b80923c58fcff1373b,1589695200,27198,"[{'sunset_time': 0, 'sunrise_time': 0, 'clouds...","{'sunset_time': 1589674843, 'sunrise_time': 15...","[{'sunset_time': -1589674843, 'sunrise_time': ..."
5eba42b80923c58fcff1373c,5eba42b80923c58fcff1373c,1589695200,27199,"[{'sunset_time': 0, 'sunrise_time': 0, 'clouds...","{'sunset_time': 1589674860, 'sunrise_time': 15...","[{'sunset_time': -1589674860, 'sunrise_time': ..."
