In [53]:
import pandas as pd
import os
import requests as re
import time
import json
import datetime

### Global Variables

Variables used for the World Weather Online historical-weather API (https://api.worldweatheronline.com).

In [2]:
#These are global vars for the WorldWeatherOnline API

# SECURITY: an API key should not be committed inside a notebook. Set the
# WWO_API_KEY environment variable to supply your own key; the literal below is
# kept only as a fallback so existing runs keep working, and should be rotated
# and removed.
client_id = os.environ.get("WWO_API_KEY", "b502a3cc6269461cb6f170139212409")
location = ""
date = ""
enddate = "" #enddate is only needed if you want to return a range
time_period = 1 #This is the weather hourly interval. Default 1 to return each hour. 
q_format = 'json'
url = 'https://api.worldweatheronline.com/premium/v1/past-weather.ashx'

Variables used for the Visual Crossing API (https://weather.visualcrossing.com) when forecasting future dates.

In [3]:
#These are global vars for the Visual Crossing weather API (forecasts)

# SECURITY: an API key should not be committed inside a notebook. Set the
# VISUAL_CROSSING_API_KEY environment variable to supply your own key; the
# literal below is kept only as a fallback so existing runs keep working, and
# should be rotated and removed.
app_key = os.environ.get("VISUAL_CROSSING_API_KEY", "CCZNE4QKYVPUF2HHTZE2BRBZB")
include = 'fcst'
f_url = 'https://weather.visualcrossing.com/VisualCrossingWebServices/rest/services/timeline/'
unit_group = 'us'

In [84]:
def _to_iso_date(value):
    '''Return value as a 'YYYY-MM-DD' string for the API's date parameter.

    Numbers are treated as epoch seconds and converted in the machine's local
    timezone. Objects exposing isoformat() (date, datetime, pandas Timestamp)
    and plain strings are cut down to their first 10 characters.'''
    if isinstance(value, (int, float)):
        return datetime.datetime.fromtimestamp(value).isoformat()[:10]
    if hasattr(value, 'isoformat'):
        return value.isoformat()[:10]
    return str(value)[:10]


def get_24h_weather(df, client_id, time_period=24, q_format='json', url='https://api.worldweatheronline.com/premium/v1/past-weather.ashx', city_origin_or_dep='', frcst_column_name='', frcst_column_rank=''):
    '''Collect historical weather for every row of df and store it in two new columns.

    One HTTP request is made per row, so df is limited to 500 rows. For each row
    the description stored is the first hourly entry of the first day returned
    by the API for that row's city and flight date (column 'fl_date').

    df is the flights dataframe; it is modified in place and also returned.

    client_id is the API key.

    time_period is the hourly interval requested from the API (the 'tp' parameter).

    q_format is the response format requested from the API.

    url is the API endpoint.

    city_origin_or_dep is string of the column name for city. dest_city_name or origin_city_name

    frcst_column_name is string of the name we give to the new column that holds description of weather

    frcst_column_rank is a string of the name we give to new column that holds rank of weather

    Returns df with the two new columns, or None (after printing a message) when
    df has more than 500 rows.

    API from https://www.worldweatheronline.com'''

    length = df.shape[0]

    if length > 500:
        print("Dataframe has more than 500 rows. The weather API has a daily limit of 500")
        return

    # Series.tolist() yields Python scalars (or Timestamps), which _to_iso_date
    # normalises; the old `.type == int` check raised AttributeError on floats.
    dates = [_to_iso_date(value) for value in df['fl_date'].tolist()]

    # 'City, ST' -> 'City'. The name is passed as-is: requests URL-encodes query
    # parameters itself, so pre-encoding spaces as '+' would be encoded twice.
    cities = [name.split(',')[0].strip() for name in df[city_origin_or_dep]]

    #Create empty list to hold weather data that will later be added as a column
    weather_data = []

    for city, day in zip(cities, dates):
        params = {
            'q': city,
            'date': day,
            'tp': time_period,
            'format': q_format,
            'key': client_id
        }

        # `re` is this notebook's alias for the requests library.
        resp = re.get(url=url, params=params, timeout=30)
        resp.raise_for_status() # fail loudly on HTTP errors instead of on a confusing KeyError below
        data = resp.json()

        weather_data.append(data['data']['weather'][0]['hourly'][0]['weatherDesc'][0]['value'])

    weather_rank = rank_weather(weather_data) #Call the ranking function to rank the weather in a new list
    print(len(weather_data), ' weather descriptions successfully gathered.')
    print(len(weather_rank), ' weather ranks successfully gathered.')

    # Exactly one description is appended per row (any failure raises), so both
    # lists always line up with df and can be assigned positionally.
    df[frcst_column_name] = weather_data
    df[frcst_column_rank] = weather_rank

    return df
    

In [7]:
def get_forecast_weather(df, app_key, include='fcst',unit_group='us',city_origin_or_dep='', frcst_column_name='', frcst_column_rank=''):
    '''Collect the weather forecast for every row of df and store it in two new columns.

    One HTTP request is made per row, so df is limited to 1000 rows. For each
    row the value stored is the 'conditions' text of the first day returned by
    the API for that row's city and flight date (column 'fl_date').

    df is the flights dataframe; it is modified in place and also returned.

    app_key is the API key.

    include and unit_group are passed through as the 'include' and 'unitGroup'
    query parameters.

    city_origin_or_dep is string of the column name for city. dest_city_name or origin_city_name

    frcst_column_name is string of the name we give to the new column that holds description of weather

    frcst_column_rank is a string of the name we give to new column that holds rank of weather

    Returns df with the two new columns, or None (after printing a message) when
    df has more than 1000 rows.

    API from https://www.visualcrossing.com'''
    from urllib.parse import quote # local import: only this function needs it

    length = df.shape[0]

    if length > 1000: #check the length to make sure we can use the API
        # NOTE(review): the guard and the docstring both say 1000; the old message
        # ended with "500" - confirm the real quota for your plan.
        print("Dataframe has more than 1000 rows. The weather API has a daily limit of 1000")
        return

    dates = df['fl_date'].values.tolist() #save the flight date

    # 'City, ST' -> 'City', percent-encoded so spaces and '/' are safe inside the
    # URL path (a bare '%' in place of a space is not valid encoding).
    cities = [quote(name.split(',')[0], safe='') for name in df[city_origin_or_dep]]

    base_url = "https://weather.visualcrossing.com/VisualCrossingWebServices/rest/services/timeline"
    query = {'unitGroup': unit_group, 'key': app_key, 'include': include}

    #Create empty list to hold weather data that will later be added as a column
    weather_data = []

    for city, day in zip(cities, dates):
        if isinstance(day, float) and day.is_integer():
            # Whole-second epoch values arrive as floats; send 1579680000 rather
            # than 1579680000.0 (presumably the API expects an integer - confirm).
            day = int(day)

        # `re` is this notebook's alias for the requests library.
        resp = re.get(url=f"{base_url}/{city}/{day}", params=query, timeout=30)
        resp.raise_for_status() # fail loudly on HTTP errors instead of on a confusing KeyError below
        data = resp.json()

        weather_data.append(data['days'][0]['conditions'])

    weather_rank = rank_weather(weather_data) #Call the ranking function to rank the weather in a new list

    # Exactly one description is appended per row (any failure raises), so both
    # lists always line up with df and can be assigned positionally.
    df[frcst_column_name] = weather_data
    df[frcst_column_rank] = weather_rank

    return df
    

In [8]:
def rank_weather(weather_list):
    '''Map each weather description to an integer severity rank (0-4).

    Matching is by case-insensitive substring. Every tier is checked in order
    from 4 down to 1 and each match overwrites the previous one, so when a
    description contains keywords from several tiers the LOWEST matching rank is
    kept (e.g. 'Partly cloudy' -> 1). Descriptions with no keyword get 0.

    NOTE(review): the original comment ("starting with the highest rank") may
    have intended the highest tier to win - confirm before changing precedence.

    Returns a list of ints the same length as weather_list.'''
    tiers = (
        (4, ('warning', 'danger', 'extreme', 'alert', 'blizzard')),
        (3, ('blowing', 'fog', 'ice', 'heavy', 'torrential', 'torrent', 'storm', 'thunder', 'thunderstorm')),
        (2, ('moderate', 'cloudy', 'mist', 'overcast')),
        (1, ('sunny', 'light', 'patchy', 'clear', 'drizzle', 'partly')),
    )

    lowered = [description.lower() for description in weather_list]

    ranks = []
    for text in lowered:
        rank = 0
        for tier_rank, keywords in tiers:
            # A later (lower) tier deliberately replaces an earlier match.
            if any(keyword in text for keyword in keywords):
                rank = tier_rank
        ranks.append(rank)

    return ranks

In [68]:
# Load the flights extract and draw a 100-row working sample (small enough for
# the per-row weather API calls). The fixed random_state makes the sample, and
# therefore every output below, reproducible across kernel restarts.
df_flights = pd.read_csv('Documents/GitHub/Midterm_Project/Data/Flights_test - Sample 50000 rand.csv')
df = df_flights.sample(100, random_state=42)

### Sample Function Call

In [85]:
#Get weather for origin city
# NOTE(review): time_period is the global set to 1, so the description stored is
# the first hourly entry of the day - confirm that is the intended granularity.
df = get_24h_weather(
    df,
    client_id,
    time_period=time_period,
    q_format=q_format,
    url=url,
    city_origin_or_dep='origin_city_name',
    frcst_column_name='dep_weather',
    frcst_column_rank='dep_weather_rank',
)

100  weather rank descriptions successfully gathered.
100  weather ranks successfully gathered.


In [86]:
#Get weather for dest city
# NOTE(review): time_period is the global set to 1, so the description stored is
# the first hourly entry of the day - confirm that is the intended granularity.
df = get_24h_weather(
    df,
    client_id,
    time_period=time_period,
    q_format=q_format,
    url=url,
    city_origin_or_dep='dest_city_name',
    frcst_column_name='arr_weather',
    frcst_column_rank='arr_weather_rank',
)

100  weather rank descriptions successfully gathered.
100  weather ranks successfully gathered.


In [92]:
# Adds a 'datetime' column holding the calendar date (YYYY-MM-DD) for frames whose
# fl_date column stores epoch-second timestamps. The conversion uses the
# machine's local timezone.
timestamps = df['fl_date'].values.tolist()
dates = [
    datetime.datetime.fromtimestamp(stamp).isoformat()[:10] # keep only the date part
    for stamp in timestamps
]
df['datetime'] = dates
df

Unnamed: 0,fl_date,mkt_unique_carrier,branded_code_share,mkt_carrier,mkt_carrier_fl_num,op_unique_carrier,tail_num,op_carrier_fl_num,origin_airport_id,origin,...,crs_arr_time,dup,crs_elapsed_time,flights,distance,dep_weather,dep_weather_rank,arr_weather,arr_weather_rank,datetime
30751,1.579680e+09,WN,WN,WN,60,WN,N238WN,60,12191,HOU,...,2200,N,60,1,239,Partly cloudy,1,Patchy rain possible,1,2020-01-22
15004,1.578557e+09,DL,DL,DL,1166,DL,N6715C,1166,10397,ATL,...,2205,N,90,1,404,Clear,1,Clear,1,2020-01-09
29012,1.580198e+09,DL,DL,DL,1590,DL,N860DN,1590,13198,MCI,...,1113,N,122,1,692,Overcast,2,Partly cloudy,1,2020-01-28
40044,1.579853e+09,DL,DL,DL,1994,DL,N387DN,1994,13204,MCO,...,2231,N,154,1,950,Cloudy,2,Overcast,2,2020-01-24
5841,1.579507e+09,AA,AA_CODESHARE,AA,3330,MQ,N858AE,3330,11298,DFW,...,2358,N,138,1,868,Partly cloudy,1,Partly cloudy,1,2020-01-20
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
36350,1.579248e+09,WN,WN,WN,258,WN,N7883A,258,11292,DEN,...,1925,N,180,1,1447,Partly cloudy,1,Patchy rain possible,1,2020-01-17
35010,1.577952e+09,UA,UA,UA,1204,UA,N475UA,1204,11618,EWR,...,2140,N,191,1,1085,Partly cloudy,1,Partly cloudy,1,2020-01-02
25316,1.578038e+09,UA,UA_CODESHARE,UA,4977,C5,N13161,4977,12264,IAD,...,1444,N,134,1,630,Moderate rain,2,Partly cloudy,1,2020-01-03
40798,1.580026e+09,NK,NK,NK,1012,NK,N603NK,1012,14635,RSW,...,1302,N,167,1,1068,Partly cloudy,1,Partly cloudy,1,2020-01-26


In [93]:
# Write the enriched sample to disk.
# NOTE(review): absolute, machine-specific path - adjust before running elsewhere.
output_csv = 'C:/Users/Peter/Desktop/df_test_table.csv'
df.to_csv(output_csv)