In [1]:
def discard_incomplete(record):
    """Filters out records that don't have geolocation information."""
    if 'geolocation' in record and 'year' in record:
        yield record

In [4]:
!head -n 15 meteor.json

[ {
  "fall" : "Fell",
  "year" : "1880-01-01T00:00:00",
  "nametype" : "Valid",
  "mass" : "21",
  "name" : "Aachen",
  "recclass" : "L5",
  "reclat" : "50.775000",
  "reclong" : "6.083330",
  "id" : "1",
  "geolocation" : {
    "latitude" : "50.775",
    "needs_recoding" : false,
    "longitude" : "6.08333"
  }


In [5]:
def convert_types(record):
    """Converts string values to their appropriate type."""
    # Only the year part of the datetime string is significant
    record['year'] = int(record['year'][:4])

    record['mass'] = float(record['mass']) if 'mass' in record else None

    geolocation = record['geolocation']
    geolocation['latitude'] = float(geolocation['latitude'])
    geolocation['longitude'] = float(geolocation['longitude'])

    return record

In [13]:
import ujson

In [39]:
file = '/Users/aspiela/notebooks/nasa/nasa.json'
data = []

with open(file) as f:
    data = f.readlines()

In [41]:
with open(file) as f:
    data = ujson.load(f)

In [45]:
tmp = data[0]
tmp

{'fall': 'Fell',
 'year': '1880-01-01T00:00:00',
 'nametype': 'Valid',
 'mass': '21',
 'name': 'Aachen',
 'recclass': 'L5',
 'reclat': '50.775000',
 'reclong': '6.083330',
 'id': '1',
 'geolocation': {'latitude': '50.775',
  'needs_recoding': False,
  'longitude': '6.08333'}}

In [49]:
import pandas as pd

In [152]:
df = pd.read_json(file)

In [88]:
df.head()

Unnamed: 0,fall,geolocation,id,mass,name,nametype,recclass,reclat,reclong,year
0,Fell,"{'latitude': '50.775', 'needs_recoding': False...",1,21.0,Aachen,Valid,L5,50.775,6.08333,1880-01-01T00:00:00
1,Fell,"{'latitude': '56.18333', 'needs_recoding': Fal...",2,720.0,Aarhus,Valid,H6,56.18333,10.23333,1951-01-01T00:00:00
2,Fell,"{'latitude': '54.21667', 'needs_recoding': Fal...",6,107000.0,Abee,Valid,EH4,54.21667,-113.0,1952-01-01T00:00:00
3,Fell,"{'latitude': '16.88333', 'needs_recoding': Fal...",10,1914.0,Acapulco,Valid,Acapulcoite,16.88333,-99.9,1976-01-01T00:00:00
4,Fell,"{'latitude': '-33.16667', 'needs_recoding': Fa...",370,780.0,Achiras,Valid,L6,-33.16667,-64.95,1902-01-01T00:00:00


In [153]:
df = df.drop(['geolocation'], axis=1)

In [120]:
df.describe(include='all')

Unnamed: 0,fall,id,mass,name,nametype,recclass,reclat,reclong,year
count,38226,38226.0,38116.0,38226,38226,38226,38226.0,38226.0,38226
unique,2,,,38226,2,424,,,268
top,Found,,,Allan Hills 88050,Valid,L6,,,1979-01-01T00:00:00
freq,37129,,,1,38157,7525,,,3046
mean,,25347.871135,15600.31,,,,-39.37879,61.196212,
std,,17391.728324,628673.5,,,,46.317605,80.756809,
min,,1.0,0.0,,,,-87.36667,-165.43333,
25%,,10846.25,6.63,,,,-76.71667,0.0,
50%,,21735.5,29.09,,,,-71.5,35.66667,
75%,,39905.75,187.41,,,,0.0,157.16667,


In [154]:
df.dropna(subset=['reclat', 'reclong', 'year'], how='any', axis=0, inplace=True)

In [161]:
df1 = df[['year']].sort_values(by='year').to_set_index('year')

In [181]:
type(df1.index)

pandas.core.indexes.base.Index

In [183]:
df

Unnamed: 0,fall,id,mass,name,nametype,recclass,reclat,reclong,year
0,Fell,1,21.0,Aachen,Valid,L5,50.77500,6.08333,1880-01-01T00:00:00
1,Fell,2,720.0,Aarhus,Valid,H6,56.18333,10.23333,1951-01-01T00:00:00
2,Fell,6,107000.0,Abee,Valid,EH4,54.21667,-113.00000,1952-01-01T00:00:00
3,Fell,10,1914.0,Acapulco,Valid,Acapulcoite,16.88333,-99.90000,1976-01-01T00:00:00
4,Fell,370,780.0,Achiras,Valid,L6,-33.16667,-64.95000,1902-01-01T00:00:00
5,Fell,379,4239.0,Adhi Kot,Valid,EH4,32.10000,71.80000,1919-01-01T00:00:00
6,Fell,390,910.0,Adzhi-Bogdo (stone),Valid,LL3-6,44.83333,95.16667,1949-01-01T00:00:00
7,Fell,392,30000.0,Agen,Valid,H5,44.21667,0.61667,1814-01-01T00:00:00
8,Fell,398,1620.0,Aguada,Valid,L6,-31.60000,-65.23333,1930-01-01T00:00:00
9,Fell,417,1440.0,Aguila Blanca,Valid,L,-30.86667,-64.55000,1920-01-01T00:00:00


### df['year'].sort_values().value_counts()

In [190]:
def remove_T(line):
    if type(line) is str:
        date, time = line.split('T')
        return " ".join([date, time])
    else:
        return line
df1 = df[['year']].applymap(lambda x: remove_T(x))


In [241]:
df2 = df1[df1.applymap(lambda x: (x.split('-'))[0] or '0') \
   .sort_values(by='year') \
   .astype(int)>1700]

In [242]:
df2 = df2.apply(pd.to_datetime)

In [228]:
df2.set_index('year', inplace=True)

In [246]:
from datetime import datetime

In [248]:
datetime(1981, 12, 23)

datetime.datetime(1981, 12, 23, 0, 0)

In [265]:
df3 = df2[df2['year']>datetime(1981, 12, 1) and df2['year']<datetime(1982, 12, 31)]

ValueError: The truth value of a Series is ambiguous. Use a.empty, a.bool(), a.item(), a.any() or a.all().

In [263]:
df3

Unnamed: 0,year
351,1982-01-01
816,1982-01-01
1064,1982-01-01
1107,1982-01-01
1660,1982-01-01
1661,1982-01-01
1662,1982-01-01
1663,1982-01-01
1664,1982-01-01
1665,1982-01-01


In [212]:
pd.D

<function pandas.core.indexes.datetimelike.DatetimeIndexOpsMixin.min(self, axis=None, *args, **kwargs)>

In [134]:
daty = pd.to_datetime(df['year'], errors='coerce')

In [137]:
from datetime import datetime

KeyError: datetime.datetime(1981, 12, 23, 0, 0)