In [2]:
# necessary libraries
import pandas as pd
import numpy as np
import datetime

In [3]:
# import data from csv file
df = pd.read_csv("Data_from_zindi/Train.csv")

In [4]:
# function to calculate the distances
def haversine(lat1, lon1, lat2, lon2):
    """
    Calculate the great circle distance between two points
    on the earth (specified in decimal degrees)
    """
    # Convert latitude and longitude from degrees to radians
    lat1, lon1, lat2, lon2 = map(np.radians, [lat1, lon1, lat2, lon2])

    # Haversine formula
    dlon = lon2 - lon1
    dlat = lat2 - lat1
    a = np.sin(dlat/2)**2 + np.cos(lat1) * np.cos(lat2) * np.sin(dlon/2)**2
    c = 2 * np.arcsin(np.sqrt(a))
    r = 6371  # Radius of earth in kilometers. Use 3956 for miles
    return c * r

In [None]:
# create new dataframe consisting of only data with flight status == ATA
df1 = df.loc[df['STATUS'] == "ATA"]

# set data type for start time and landing time columns
df1["STD"] = pd.to_datetime(df1["STD"], format='%Y-%m-%d %H:%M:%S')
df1["STA"] = pd.to_datetime(df1["STA"], format='%Y-%m-%d %H.%M.%S')

# calculate flight duration and set unit to minutes
df1['flight_duration'] = round((df1.STA - df1.STD) / pd.Timedelta(minutes=1))

# import iata data from csv file (source: https://github.com/ip2location/ip2location-iata-icao)
iata_data = pd.read_csv("iata-icao.csv")
iata_data = iata_data.drop(["country_code", "region_name", "icao", "airport"], axis=1)

# create two dataframes to make merging easier (one for departure airport, one for destination airport)
iata_data_dep = iata_data.rename(columns={"iata": "DEPSTN", "latitude": "dep_lat", "longitude": "dep_lon"})
iata_data_arr = iata_data.rename(columns={"iata": "ARRSTN", "latitude": "arr_lat", "longitude": "arr_lon"})

# merge iata data with "original" data frame
df1 = pd.merge(df1, iata_data_arr, on="ARRSTN", how="left")
df1 = pd.merge(df1, iata_data_dep, on="DEPSTN", how="left")

# calculation of flight distance for each flight
df1['flight_distance'] = df1.apply(lambda row: round(haversine(row['dep_lat'], row['dep_lon'], row['arr_lat'], row['arr_lon'])), axis=1)

# drop latitude and longitude columns to slim down dataframe
df1 = df1.drop(["arr_lat", "arr_lon", "dep_lat", "dep_lon"], axis=1)

# export dataframe as csv to make it available for further actions (plots, etc.)
df1.to_csv('flight_data_with_dist_and_dur.csv', index=False)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df1["STD"] = pd.to_datetime(df1["STD"], format='%Y-%m-%d %H:%M:%S')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df1["STA"] = pd.to_datetime(df1["STA"], format='%Y-%m-%d %H.%M.%S')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df1['flight_duration'] = round((df1.STA - df1.STD) / pd.Timedelta(min