# Import Data

In [1]:
import os
import databricks.koalas as ks
from haversine import haversine



In [2]:
# Sort the directory listing BEFORE taking the first 50 entries.
# os.listdir() returns names in arbitrary order, so slicing first (as was done
# previously) could select a different subset of files on each machine or run.
fileList = sorted(os.listdir("./GlobalSurface/2019"))
csvYearly = fileList[:50]

In [3]:
# Read every selected file into its own Koalas frame, then stack the frames
# into a single one.
# Alternative path prefixes previously used for the same data:
#   "/user/hadoop/GlobalSurface/2019/"
#   "file:///home/hadoop/GlobalSurface/2019/"
dfList = []
for csvName in csvYearly:
    dfList.append(ks.read_csv("./GlobalSurface/2019/" + csvName))
globalSurfaceDf = ks.concat(dfList)

In [4]:
# Read the ICAO table.
# Alternative locations previously used for the same file:
#   "/user/hadoop/mini_icao/mini_ICAO.csv"
#   "file:///home/hadoop/mini_icao/mini_ICAO.csv"
icaoCsvPath = "./mini_icao/mini_ICAO.csv"
icaoDf = ks.read_csv(icaoCsvPath)

In [5]:
# Start the output table as a one-column frame of ICAO codes; later cells
# merge their results onto it.
icaoCodes = icaoDf["icao_code"]
outputDf = ks.DataFrame(icaoCodes)

# Distance between Airport and Station

In [6]:
# Airport side: identifier, coordinates and elevation from the ICAO table.
df1 = icaoDf.loc[:, ["ident", "latitude_deg", "longitude_deg", "elevation_ft"]]
# Rename first, then convert the values in place (1 ft = 0.3048 m).
df1 = df1.rename(columns = {"elevation_ft": "elevation_m"})
df1["elevation_m"] = df1["elevation_m"] * 0.3048

# Station side: drop rows containing any missing value, keep one row per
# distinct (station, latitude, longitude, elevation) combination.
df2 = (
    globalSurfaceDf
    .dropna(axis = 0)
    .loc[:, ["STATION", "LATITUDE", "LONGITUDE", "ELEVATION"]]
    .drop_duplicates()
)
# Make sure the location columns are floats.
for numericColumn in ("LATITUDE", "LONGITUDE", "ELEVATION"):
    df2[numericColumn] = df2[numericColumn].astype(float)

# Cartesian product of airports and stations: join both sides on a constant
# helper column, then discard that column.
df3 = (
    df1.assign(key = 1)
    .merge(df2.assign(key = 1), on = "key")
    .drop("key", axis = 1)
)

# distance between an airport and a station, given one combined row
def distance(row):
    """Return ``haversine(airport, station)`` for the two points held in ``row``.

    ``row`` must support item access for the keys "latitude_deg" and
    "longitude_deg" (airport) and "LATITUDE" and "LONGITUDE" (station).
    Each point is passed to ``haversine`` as a ``(latitude, longitude)`` tuple.
    NOTE(review): the unit of the result is whatever ``haversine`` returns by
    default (presumably kilometres) -- confirm against the library docs.
    """
    airport = (row["latitude_deg"], row["longitude_deg"])
    station = (row["LATITUDE"], row["LONGITUDE"])
    return haversine(airport, station)

# calculate distance by using haversine
# The option is enabled before assigning the apply() result back onto df3;
# presumably Koalas treats that assignment as an operation across different
# frames -- confirm against the Koalas options documentation.
ks.set_option("compute.ops_on_diff_frames", True)
df3["airport_station_distance"] = df3.apply(lambda x: distance(x), axis = 1)

# take the minimum distance per airport
# idxmin() returns index *labels*, so the rows must be selected with the
# label-based .loc indexer. The positional .iloc used previously is only
# correct when the labels happen to be exactly 0..n-1.
nearestLabels = df3.groupby("ident")["airport_station_distance"].idxmin().tolist()
df4 = df3.loc[nearestLabels, :]

# prepare to output: align the key column name with outputDf
df4 = df4.rename(columns={"ident": "icao_code"})

# integrate to output dataframe (left join keeps every ICAO code in outputDf)
outputDf = ks.merge(outputDf, df4, left_on = "icao_code", right_on = "icao_code", how = "left")

# delete temporary objects
del df1, df2, df3, df4, nearestLabels

# Station Average

In [7]:
def _station_mean(frame, column, missing):
    """Return the per-station mean of ``column`` as a Series indexed by STATION.

    Rows whose ``column`` value equals the sentinel ``missing`` are excluded
    before averaging. Only ``column`` is aggregated, so the other columns of
    ``frame`` are not averaged needlessly.
    """
    valid = frame.loc[frame[column] != missing]
    return valid.groupby(["STATION"])[column].mean()


# dropna: discard rows containing any missing value
df = globalSurfaceDf.dropna(axis = 0)

# average temperature (Fahrenheit to Celsius)
AVG_TEMP = (5 / 9 * (_station_mean(df, "TEMP", 9999.9) - 32)).rename("AVG_TEMP")

# average visibility (mile to km; 1 mile = 1.609344 km exactly)
AVG_VISIB = (_station_mean(df, "VISIB", 999.9) * 1.609344).rename("AVG_VISIB")

# average wind speed (knot, no conversion)
AVG_WDSP = _station_mean(df, "WDSP", 999.9).rename("AVG_WDSP")

# average maximum temperature (Fahrenheit to Celsius)
AVG_MAX = (5 / 9 * (_station_mean(df, "MAX", 9999.9) - 32)).rename("AVG_MAX")

# average minimum temperature (Fahrenheit to Celsius)
AVG_MIN = (5 / 9 * (_station_mean(df, "MIN", 9999.9) - 32)).rename("AVG_MIN")

# average precipitation (inch to mm)
AVG_PRCP = (_station_mean(df, "PRCP", 99.99) * 25.4).rename("AVG_PRCP")

# combine columns needed to new dataframe
# The option is enabled here as well so this cell does not rely on an earlier
# cell having set it; the Series being combined come from separate filters.
ks.set_option("compute.ops_on_diff_frames", True)
df_concat = ks.concat([AVG_TEMP, AVG_VISIB, AVG_WDSP, AVG_MAX, AVG_MIN, AVG_PRCP], axis = 1)

# integrate to output dataframe: match outputDf's STATION column against the
# STATION index of the averages (left join keeps every row of outputDf)
outputDf = ks.merge(outputDf, df_concat, left_on = "STATION", right_index = True, how = "left")

# delete temporary objects
del df, AVG_TEMP, AVG_VISIB, AVG_WDSP, AVG_MAX, AVG_MIN, AVG_PRCP, df_concat

# Export Data

In [8]:
# Write the assembled output table as CSV.
# Alternative destinations previously used:
#   "/user/hadoop/mini_2019_station_table"
#   "file:///home/hadoop/mini_2019_station_table"
outputPath = "./mini_2019_station_table"
outputDf.to_csv(outputPath, index = False)