In [73]:
import pymongo
import json
import pandas as pd
from pprint import pprint

## 1.Data Cleaning, Transformation, and Storage

In [32]:
hurricane_dataset = 'hurdat2-1851-2021-100522.txt'
raw_df = pd.read_csv(hurricane_dataset, header=None, sep="|")

#### (a) Use the code and instructions provided by the instructor during the in-class demo to convert the dataset to a proper DataFrame having the following columns: DATE, TIME_UTC, POINT_TYPE, STATUS, LATITUDE, LONGITUDE, MAX_WINDSPEED_KT, MIN_PRESURE_MB, NE_34KT, SE_34KT, NW_34_KT, SW_34_KT, NE_50KT, SE_50KT, NW_50_KT, SW_50_KT, NE_64KT, SE_64KT, NW_64_KT, SW_64_KT, RADIUS, BASIN, ATCF_CYCLONE_NUMBER, YEAR, NAME, NUM_BEST_TRACK_ENTRIES


In [33]:
storms = {}
header = None
for line in raw_df[0]:
    tokens = line.split(',')
    if len(tokens) == 4:
        header = line
        storms[header] = []
    elif len(tokens) == 21:
        row = [tokens.strip() for tokens in tokens]
        storms[header].append(row)

In [34]:
frames = []
for storm in storms:
    code, name, entries, blank = [record.strip() for record in storm.split(',')]
    basic_columns = {
        'BASIN': code[:2],
        'ATCF_CY_NUMBER': code[2:4],
        'YEAR' : code[4:9],
        'NAME' : name,
        'NUM_BEST_TRACK_ENTRIES' : entries
    }
    
    dataset = pd.DataFrame(storms[storm], 
                           columns=['DATE',
                                    'TIME_UTC',
                                    'POINT_TYPE',
                                    'STATUS',
                                    'LATITUDE',
                                    'LONGITUDE',
                                    'MAX_WINDSPEED_KT',
                                    'MIN_PRESURE_MB',
                                    'NE_34KT',
                                    'SE_34KT',
                                    'NW_34_KT',
                                    'SW_34_KT',
                                    'NE_50KT',
                                    'SE_50KT',
                                    'NW_50_KT',
                                    'SW_50_KT',
                                    'NE_64KT',
                                    'SE_64KT',
                                    'NW_64_KT',
                                    'SW_64_KT',
                                    'RADIUS']
                          )
    for column in basic_columns:
        dataset[column] = basic_columns[column]
    frames.append(dataset)
final_df = pd.concat(frames)

In [35]:
final_df

Unnamed: 0,DATE,TIME_UTC,POINT_TYPE,STATUS,LATITUDE,LONGITUDE,MAX_WINDSPEED_KT,MIN_PRESURE_MB,NE_34KT,SE_34KT,...,NE_64KT,SE_64KT,NW_64_KT,SW_64_KT,RADIUS,BASIN,ATCF_CY_NUMBER,YEAR,NAME,NUM_BEST_TRACK_ENTRIES
0,18510625,0000,,HU,28.0N,94.8W,80,-999,-999,-999,...,-999,-999,-999,-999,-999,AL,01,1851,UNNAMED,14
1,18510625,0600,,HU,28.0N,95.4W,80,-999,-999,-999,...,-999,-999,-999,-999,-999,AL,01,1851,UNNAMED,14
2,18510625,1200,,HU,28.0N,96.0W,80,-999,-999,-999,...,-999,-999,-999,-999,-999,AL,01,1851,UNNAMED,14
3,18510625,1800,,HU,28.1N,96.5W,80,-999,-999,-999,...,-999,-999,-999,-999,-999,AL,01,1851,UNNAMED,14
4,18510625,2100,L,HU,28.2N,96.8W,80,-999,-999,-999,...,-999,-999,-999,-999,-999,AL,01,1851,UNNAMED,14
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
49,20211107,0000,,TS,37.4N,37.4W,35,1003,0,60,...,0,0,0,0,40,AL,21,2021,WANDA,54
50,20211107,0600,,TS,38.1N,36.4W,35,1004,0,60,...,0,0,0,0,45,AL,21,2021,WANDA,54
51,20211107,1200,,LO,39.2N,34.9W,35,1006,0,90,...,0,0,0,0,50,AL,21,2021,WANDA,54
52,20211107,1800,,LO,40.9N,32.8W,40,1006,0,90,...,0,0,0,0,50,AL,21,2021,WANDA,54


#### (b) Convert the latitudes and longitudes from hemispheric values (NSEW) to float values (southern and western values should be negative). Display the head of your dataset to show the changes.


In [36]:
def HemisphericToFloat(value):
    # print(type(value))
    if(value[-1] == 'S' or value[-1] == 'W'):
        return(-float(value[:-1]))
    
    return(float(value[:-1]))

In [37]:
final_df['LATITUDE'] = final_df['LATITUDE'].apply(HemisphericToFloat)
final_df['LONGITUDE'] = final_df['LONGITUDE'].apply(HemisphericToFloat)

In [38]:
final_df.head()

Unnamed: 0,DATE,TIME_UTC,POINT_TYPE,STATUS,LATITUDE,LONGITUDE,MAX_WINDSPEED_KT,MIN_PRESURE_MB,NE_34KT,SE_34KT,...,NE_64KT,SE_64KT,NW_64_KT,SW_64_KT,RADIUS,BASIN,ATCF_CY_NUMBER,YEAR,NAME,NUM_BEST_TRACK_ENTRIES
0,18510625,0,,HU,28.0,-94.8,80,-999,-999,-999,...,-999,-999,-999,-999,-999,AL,1,1851,UNNAMED,14
1,18510625,600,,HU,28.0,-95.4,80,-999,-999,-999,...,-999,-999,-999,-999,-999,AL,1,1851,UNNAMED,14
2,18510625,1200,,HU,28.0,-96.0,80,-999,-999,-999,...,-999,-999,-999,-999,-999,AL,1,1851,UNNAMED,14
3,18510625,1800,,HU,28.1,-96.5,80,-999,-999,-999,...,-999,-999,-999,-999,-999,AL,1,1851,UNNAMED,14
4,18510625,2100,L,HU,28.2,-96.8,80,-999,-999,-999,...,-999,-999,-999,-999,-999,AL,1,1851,UNNAMED,14


#### (c) Convert the following fields to integers and display the data types of your dataset to show the changes. MAX_WINDSPEED_KT, MIN_PRESURE_MB, NUM_BEST_TRACK_ENTRIES, NE_34KT, SE_34KT, NW_34_KT, SW_34_KT, NE_50KT, SE_50KT, NW_50_KT, SW_50_KT, NE_64KT, SE_64KT, NW_64_KT, SW_64_KT, RADIUS


In [39]:
integer = ['MAX_WINDSPEED_KT',
'MIN_PRESURE_MB',
'NUM_BEST_TRACK_ENTRIES',
'NE_34KT',
'SE_34KT',
'NW_34_KT',
'SW_34_KT',
'NE_50KT',
'SE_50KT',
'NW_50_KT',
'SW_50_KT',
'NE_64KT',
'SE_64KT',
'NW_64_KT',
'SW_64_KT',
'RADIUS']
final_df[integer] = final_df[integer].astype(int)

In [40]:
final_df.dtypes

DATE                       object
TIME_UTC                   object
POINT_TYPE                 object
STATUS                     object
LATITUDE                  float64
LONGITUDE                 float64
MAX_WINDSPEED_KT            int64
MIN_PRESURE_MB              int64
NE_34KT                     int64
SE_34KT                     int64
NW_34_KT                    int64
SW_34_KT                    int64
NE_50KT                     int64
SE_50KT                     int64
NW_50_KT                    int64
SW_50_KT                    int64
NE_64KT                     int64
SE_64KT                     int64
NW_64_KT                    int64
SW_64_KT                    int64
RADIUS                      int64
BASIN                      object
ATCF_CY_NUMBER             object
YEAR                       object
NAME                       object
NUM_BEST_TRACK_ENTRIES      int64
dtype: object

#### (d) Export the dataset as a JSON file where each record is an element of a JSON list.

In [41]:
final_df.to_json('stormdata.json', orient ='records', lines=True)

#### (e) Create a table named storms in MongoDB. Choose appropriate fields/values
#### (f) (5 points) Write your dataset to your MongoDB storms table with all fields.

In [51]:
myclient = pymongo.MongoClient("mongodb://localhost:27017/")
db = myclient["StormDB"]


In [52]:
collection = db['Storms']
with open('stormdata.json') as json_file:
    # recipesobj = json.loads(json_file)
    # print(recipesobj)

    
    for line in json_file:
        storm_obj = json.loads(line)
        print('Inserting:',storm_obj['DATE'], storm_obj['TIME_UTC'])
        
        insert_result = collection.insert_one({
                    'DATE' : storm_obj['DATE'],
                    'TIME' : storm_obj['TIME_UTC'],
                    'BASIN': storm_obj.get('BASIN'),
                    'ATCF_CY_NUMBER': storm_obj.get('ATCF_CY_NUMBER'),
                    'YEAR' : storm_obj.get('YEAR'),
                    'NAME' : storm_obj.get('NAME'),
                    'NUM_BEST_TRACK_ENTRIES' :storm_obj.get('NUM_BEST_TRACK_ENTRIES'),
                    'DATE': storm_obj.get('DATE'),
                    'TIME_UTC': storm_obj.get('TIME_UTC'),
                    'POINT_TYPE': storm_obj.get('POINT_TYPE'),
                    'STATUS': storm_obj.get('STATUS'),
                    'LATITUDE': storm_obj.get('LATITUDE'),
                    'LONGITUDE': storm_obj.get('LONGITUDE'),
                    'MAX_WINDSPEED_KT': storm_obj.get('MAX_WINDSPEED_KT'),
                    'MIN_PRESURE_MB': storm_obj.get('MIN_PRESURE_MB'),
                    'NE_34KT': storm_obj.get('NE_34KT'),
                    'SE_34KT': storm_obj.get('SE_34KT'),
                    'NW_34_KT': storm_obj.get('NW_34_KT'),
                    'SW_34_KT': storm_obj.get('SW_34_KT'),
                    'NE_50KT': storm_obj.get('NE_50KT'),
                    'SE_50KT': storm_obj.get('SE_50KT'),
                    'NW_50_KT': storm_obj.get('NW_50_KT'),
                    'SW_50_KT': storm_obj.get('SW_50_KT'),
                    'NE_64KT': storm_obj.get('NE_64KT'),
                    'SE_64KT': storm_obj.get('SE_64KT'),
                    'NW_64_KT': storm_obj.get('NW_64_KT'),
                    'SW_64_KT': storm_obj.get('SW_64_KT'),
                    'RADIUS': storm_obj.get('RADIUS')
                    })
        
    

Inserting: 18510625 0000
Inserting: 18510625 0600
Inserting: 18510625 1200
Inserting: 18510625 1800
Inserting: 18510625 2100
Inserting: 18510626 0000
Inserting: 18510626 0600
Inserting: 18510626 1200
Inserting: 18510626 1800
Inserting: 18510627 0000
Inserting: 18510627 0600
Inserting: 18510627 1200
Inserting: 18510627 1800
Inserting: 18510628 0000
Inserting: 18510705 1200
Inserting: 18510710 1200
Inserting: 18510816 0000
Inserting: 18510816 0600
Inserting: 18510816 1200
Inserting: 18510816 1800
Inserting: 18510817 0000
Inserting: 18510817 0600
Inserting: 18510817 1200
Inserting: 18510817 1800
Inserting: 18510818 0000
Inserting: 18510818 0600
Inserting: 18510818 1200
Inserting: 18510818 1800
Inserting: 18510819 0000
Inserting: 18510819 0600
Inserting: 18510819 1200
Inserting: 18510819 1800
Inserting: 18510820 0000
Inserting: 18510820 0600
Inserting: 18510820 1200
Inserting: 18510820 1800
Inserting: 18510821 0000
Inserting: 18510821 0600
Inserting: 18510821 1200
Inserting: 18510821 1800


# 2. Query
### (a) Create a query that finds recipes for breakfast foods (the description contains "breakfast" or "Breakfast"). How many are there

In [53]:
db.Storms.count_documents({})

53501

#### (b) Given the following legend, which represents the status of a storm, return a cumulative count of records in 2020 and 2021 with the HU classification.
TD – Tropical cyclone of tropical depression intensity (< 34 knots)

TS – Tropical cyclone of tropical storm intensity (34-63 knots)

HU – Tropical cyclone of hurricane intensity (> 64 knots)

EX – Extratropical cyclone (of any intensity)

SD – Subtropical cyclone of subtropical depression intensity (< 34 knots)

SS – Subtropical cyclone of subtropical storm intensity (> 34 knots)

LO – A low that is neither a tropical cyclone, a subtropical cyclone, nor an extratropical cyclone (of any intensity)

WV – Tropical Wave (of any intensity)

DB – Disturbance (of any intensity)


In [58]:
result = collection.count_documents(
    {
        "$and":[{'STATUS': 'HU'}, {'YEAR': {'$in' : ['2020', '2021']}}]
        
    }
)
print('Count:',result)

Count: 273


#### (c) Given that a Category 4 hurricane has winds ranging from 130-156 mph, return and print out a tabulated list (without duplicates and sorted in ascending order by year) of all Category 4 hurricanes since the year 2000. Print out only the year and the name of the hurricane.

In [148]:
speed_min = 130 * 0.868976
speed_max = 156 * 0.868976
result = collection.find(
    {
        "$and":[{'MAX_WINDSPEED_KT': {'$lte' : speed_max}}, {'MAX_WINDSPEED_KT': {'$gte' : speed_min} },{'YEAR': {'$gte' : '2000'}}]
        
    }
)

filtered_dict = [(d['YEAR'], d['NAME']) for d in result]
set_dict = sorted(set(filtered_dict))
# print(filtered_dict.items())

for doc in set_dict:
    print('Year: {}, Name: {}'.format(doc[0], doc[1]))

Year: 2000, Name: ISAAC
Year: 2000, Name: KEITH
Year: 2001, Name: IRIS
Year: 2001, Name: MICHELLE
Year: 2002, Name: LILI
Year: 2003, Name: FABIAN
Year: 2003, Name: ISABEL
Year: 2004, Name: CHARLEY
Year: 2004, Name: FRANCES
Year: 2004, Name: IVAN
Year: 2004, Name: KARL
Year: 2005, Name: DENNIS
Year: 2005, Name: EMILY
Year: 2005, Name: KATRINA
Year: 2005, Name: RITA
Year: 2005, Name: WILMA
Year: 2007, Name: DEAN
Year: 2007, Name: FELIX
Year: 2008, Name: GUSTAV
Year: 2008, Name: IKE
Year: 2008, Name: OMAR
Year: 2008, Name: PALOMA
Year: 2009, Name: BILL
Year: 2010, Name: DANIELLE
Year: 2010, Name: EARL
Year: 2010, Name: IGOR
Year: 2010, Name: JULIA
Year: 2011, Name: KATIA
Year: 2011, Name: OPHELIA
Year: 2014, Name: GONZALO
Year: 2015, Name: JOAQUIN
Year: 2016, Name: MATTHEW
Year: 2016, Name: NICOLE
Year: 2017, Name: HARVEY
Year: 2017, Name: IRMA
Year: 2017, Name: JOSE
Year: 2017, Name: MARIA
Year: 2018, Name: FLORENCE
Year: 2018, Name: MICHAEL
Year: 2019, Name: DORIAN
Year: 2019, Name: LOR

#### (d) Return the year and maximum sustained winds of the hurricane named Gilbert.

In [118]:
result = collection.find(
    {
        'NAME': 'GILBERT'
    }
)

listResult = list(result)

maxWind = max(listResult, key=lambda x:x['MAX_WINDSPEED_KT'])
print('Maximum Windspeed: {}kt, Year: {}'.format(maxWind['MAX_WINDSPEED_KT'], maxWind['YEAR']))

Maximum Windspeed: 160kt, Year: 1988
