In [85]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import re
from math import radians, sin, cos, sqrt, atan2

df = pd.read_csv("/kaggle/input/flights-data-in-atlanta/non_helicopter_7columns_2s_remove_1_30mins_flights_data.csv")
df

Unnamed: 0,time,icao24,lat,lon,velocity,callsign,geoaltitude
0,1478877250,a76f66,33.754944,-84.449102,232.091797,UAL272,11399.52
1,1478877252,a76f66,33.754944,-84.449102,232.142534,UAL272,11399.52
2,1478877254,a76f66,33.754944,-84.449102,232.091797,UAL272,11399.52
3,1478877256,a76f66,33.754944,-84.449102,232.091797,UAL272,11399.52
4,1478877258,a76f66,33.767830,-84.450645,232.091797,UAL272,11399.52
...,...,...,...,...,...,...,...
5364517,1585086304,a33e4a,33.770762,-84.437428,247.481787,AAY883,12260.58
5364518,1585086306,a33e4a,33.766525,-84.437331,247.481787,AAY883,12260.58
5364519,1585086308,a33e4a,33.761856,-84.437218,247.481787,AAY883,12260.58
5364520,1585086310,a33e4a,33.757416,-84.437162,247.481787,AAY883,12260.58


In [86]:
# Show the number of flight segments

df_1 = df.copy()
# Calculate the time difference between consecutive rows
df_1['time_diff'] = df_1['time'].diff()

# Set a threshold for the maximum duration between two consecutive data points within the same flight segment
threshold = 30   # 30 secs

# Identify points where the time difference exceeds the threshold
df_1['new_segment'] = (df_1['time_diff'] > threshold).cumsum()

# Split the dataframe into separate flight segments
flight_segments_time_based = {name: group for name, group in df_1.groupby('new_segment')}

# Display the information about the flight segments based on time threshold
flight_segment_time_info = pd.DataFrame({
    'Segment': list(flight_segments_time_based.keys()),
    'Start Time': [group['time'].min() for group in flight_segments_time_based.values()],
    'End Time': [group['time'].max() for group in flight_segments_time_based.values()],
    'Duration (mins)': [(group['time'].max() - group['time'].min()) / 60 for group in flight_segments_time_based.values()]
})

# Show the DataFrame with flight segment information
flight_segment_time_info

Unnamed: 0,Segment,Start Time,End Time,Duration (mins)
0,0,1478877250,1478877600,5.833333
1,1,1479127856,1479127976,2.000000
2,2,1479138502,1479138584,1.366667
3,3,1479145560,1479145898,5.633333
4,4,1479154456,1479154762,5.100000
...,...,...,...,...
52649,52649,1585082690,1585082782,1.533333
52650,52650,1585083354,1585083462,1.800000
52651,52651,1585084836,1585084914,1.300000
52652,52652,1585085840,1585086026,3.100000


In [87]:
# Show the flight segments sorted by duration

flight_segment_time_info_sorted = flight_segment_time_info.sort_values(by='Duration (mins)', ascending=False)

print(flight_segment_time_info_sorted)

       Segment  Start Time    End Time  Duration (mins)
32691    32691  1556305156  1556306928        29.533333
33452    33452  1557073596  1557075274        27.966667
33914    33914  1557511590  1557513266        27.933333
8686      8686  1510030190  1510031834        27.400000
48387    48387  1571936628  1571938232        26.733333
...        ...         ...         ...              ...
7799      7799  1507916964  1507917024         1.000000
11963    11963  1516620758  1516620818         1.000000
47357    47357  1571061066  1571061126         1.000000
27016    27016  1548259868  1548259928         1.000000
49776    49776  1573051616  1573051670         0.900000

[52654 rows x 4 columns]


In [88]:
# Plot the ADS-B detection boundaries

!pip install plotly
import plotly.express as px

# Convert 'lat' and 'lon' columns to numeric values, coercing errors to NaN
df['lat'] = pd.to_numeric(df['lat'], errors='coerce')
df['lon'] = pd.to_numeric(df['lon'], errors='coerce')

# Find the minimum and maximum values of 'lat' and 'lon' columns
min_lat = df['lat'].min()
max_lat = df['lat'].max()
min_lon = df['lon'].min()
max_lon = df['lon'].max()

# Print the results
print(f"Minimum latitude: {min_lat}")
print(f"Maximum latitude: {max_lat}")
print(f"Minimum longitude: {min_lon}")
print(f"Maximum longitude: {max_lon}")

data = {
    'lat': [min_lat, max_lat, max_lat, min_lat, min_lat],
    'lon': [min_lon, min_lon, max_lon, max_lon, min_lon],
}

# Create DataFrame for min and max values
df_2 = pd.DataFrame(data)

# Plot the min and max values on a map using Plotly
fig = px.line_mapbox(df_2, lat="lat", lon="lon", zoom=3, height=600, title="ADS-B detecting boundaries")
fig.update_layout(mapbox_style="open-street-map")
fig.show()

Minimum latitude: 33.75
Maximum latitude: 33.849994206832626
Minimum longitude: -84.47999954223633
Maximum longitude: -84.38003540039062


In [89]:

# Haversine formula to calculate the distance between two points on the Earth
def haversine(lat1, lon1, lat2, lon2):
    R = 6371  # Earth radius in kilometers

    dlat = radians(lat2 - lat1)
    dlon = radians(lon2 - lon1)
    a = sin(dlat / 2)**2 + cos(radians(lat1)) * cos(radians(lat2)) * sin(dlon / 2)**2
    c = 2 * atan2(sqrt(a), sqrt(1 - a))
    distance = R * c

    return distance

# Calculate the lengths of the sides
side_1 = haversine(df_2['lat'][0], df_2['lon'][0], df_2['lat'][1], df_2['lon'][1])
side_2 = haversine(df_2['lat'][1], df_2['lon'][1], df_2['lat'][2], df_2['lon'][2])
side_3 = haversine(df_2['lat'][2], df_2['lon'][2], df_2['lat'][3], df_2['lon'][3])
side_4 = haversine(df_2['lat'][3], df_2['lon'][3], df_2['lat'][4], df_2['lon'][4])

# Print the lengths of each side
print(f"Length of side 1: {side_1:.2f} km")
print(f"Length of side 2: {side_2:.2f} km")
print(f"Length of side 3: {side_3:.2f} km")
print(f"Length of side 4: {side_4:.2f} km")

Length of side 1: 11.12 km
Length of side 2: 9.23 km
Length of side 3: 11.12 km
Length of side 4: 9.24 km


In [90]:
# Find pairs of 'callsign' and 'icao24' with the most records

df_2 = df.groupby(['callsign', 'icao24']).size().reset_index(name='count')
df_2 = df_2.sort_values(by='count', ascending=False)
df_2.head(5)

Unnamed: 0,callsign,icao24,count
53831,N9298S,ace31a,41799
53382,N86HD,abcd31,30909
49459,N314GT,a35655,27838
52237,N707WB,a97085,27709
54039,N9693Y,ad806d,25869


In [91]:
# Seperate N86HD callsign from the dataset

df_n86hd = df[(df['icao24'] == 'abcd31') & (df['callsign'] == 'N86HD')]
df_n86hd

Unnamed: 0,time,icao24,lat,lon,velocity,callsign,geoaltitude
15841,1480615332,abcd31,33.795901,-84.464951,73.174121,N86HD,518.16
15842,1480615334,abcd31,33.795901,-84.464951,72.258783,N86HD,510.54
15843,1480615336,abcd31,33.795901,-84.464951,72.258783,N86HD,510.54
15844,1480615338,abcd31,33.792892,-84.467543,69.497557,N86HD,487.68
15845,1480615340,abcd31,33.792037,-84.468441,67.676276,N86HD,472.44
...,...,...,...,...,...,...,...
5265114,1584046368,abcd31,33.787674,-84.471130,65.714054,N86HD,373.38
5265115,1584046370,abcd31,33.787674,-84.471130,65.714054,N86HD,373.38
5265116,1584046372,abcd31,33.787674,-84.471130,65.714054,N86HD,373.38
5265117,1584046374,abcd31,33.787674,-84.471130,65.714054,N86HD,373.38


In [92]:
# Change the unix timestamp to human time, drop unnecessary columns

from datetime import datetime
df_n86hd['time'] = [datetime.fromtimestamp(x) for x in df_n86hd['time']]
df_n86hd = df_n86hd.drop(columns=['icao24', 'velocity', 'callsign'])
df_n86hd



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



Unnamed: 0,time,lat,lon,geoaltitude
15841,2016-12-01 18:02:12,33.795901,-84.464951,518.16
15842,2016-12-01 18:02:14,33.795901,-84.464951,510.54
15843,2016-12-01 18:02:16,33.795901,-84.464951,510.54
15844,2016-12-01 18:02:18,33.792892,-84.467543,487.68
15845,2016-12-01 18:02:20,33.792037,-84.468441,472.44
...,...,...,...,...
5265114,2020-03-12 20:52:48,33.787674,-84.471130,373.38
5265115,2020-03-12 20:52:50,33.787674,-84.471130,373.38
5265116,2020-03-12 20:52:52,33.787674,-84.471130,373.38
5265117,2020-03-12 20:52:54,33.787674,-84.471130,373.38


In [93]:
# Show the number of N86HD flight segments

# Calculate the time difference between consecutive rows
df_n86hd['time_diff'] = df_n86hd['time'].diff().dt.total_seconds()

# Set a threshold for the maximum duration between two consecutive data points within the same flight segment
threshold = 30  # 30 secs

# Identify points where the time difference exceeds the threshold
df_n86hd['flight_segment'] = (df_n86hd['time_diff'] > threshold).cumsum()

# Split the dataframe into separate flight segments
flight_segments_time_based_n86hd = {name: group for name, group in df_n86hd.groupby('flight_segment')}

# Display the information about the flight segments based on time threshold
flight_segment_time_info_n86hd = pd.DataFrame({
    'Segment': list(flight_segments_time_based_n86hd.keys()),
    'Start Time': [group['time'].min() for group in flight_segments_time_based_n86hd.values()],
    'End Time': [group['time'].max() for group in flight_segments_time_based_n86hd.values()],
    'Duration': [(group['time'].max() - group['time'].min()).total_seconds() / 60 for group in flight_segments_time_based_n86hd.values()]
})

# Show the DataFrame with flight segment information
flight_segment_time_info_n86hd

Unnamed: 0,Segment,Start Time,End Time,Duration
0,0,2016-12-01 18:02:12,2016-12-01 18:08:26,6.233333
1,1,2016-12-02 21:27:10,2016-12-02 21:32:32,5.366667
2,2,2017-01-12 17:35:40,2017-01-12 17:41:16,5.600000
3,3,2017-01-20 20:43:50,2017-01-20 20:49:34,5.733333
4,4,2017-02-23 21:56:44,2017-02-23 21:59:58,3.233333
...,...,...,...,...
206,206,2019-10-23 21:00:54,2019-10-23 21:02:18,1.400000
207,207,2020-02-28 02:23:54,2020-02-28 02:32:14,8.333333
208,208,2020-03-12 12:18:18,2020-03-12 12:18:50,0.533333
209,209,2020-03-12 12:20:56,2020-03-12 12:22:18,1.366667


In [94]:
# Drop 1 column and write to csv

df_n86hd = df_n86hd.drop(columns=['time_diff'])
df_n86hd
df_n86hd.to_csv('df_n86hd_abcd31_data.csv', index = False)

In [95]:
# Show the N86HD flight segments sorted by duration

sorted_flight_segments_n86hd = flight_segment_time_info_n86hd.sort_values(by='Duration', ascending=False)
sorted_flight_segments_n86hd.head(5)

Unnamed: 0,Segment,Start Time,End Time,Duration
98,98,2018-02-23 13:07:34,2018-02-23 13:17:56,10.366667
207,207,2020-02-28 02:23:54,2020-02-28 02:32:14,8.333333
184,184,2019-06-24 17:15:34,2019-06-24 17:23:18,7.733333
185,185,2019-06-26 04:03:58,2019-06-26 04:11:34,7.6
83,83,2017-12-07 20:34:04,2017-12-07 20:41:38,7.566667


In [96]:
# Show information of flight segment 207 of N86HD

segment_207_data = df_n86hd[df_n86hd['flight_segment'] == 207]
segment_207_data.head(5)

Unnamed: 0,time,lat,lon,geoaltitude,flight_segment
5137410,2020-02-28 02:23:54,33.834412,-84.380998,967.74,207
5137411,2020-02-28 02:23:56,33.833084,-84.381838,960.12,207
5137412,2020-02-28 02:23:58,33.831839,-84.38261,952.5,207
5137413,2020-02-28 02:24:00,33.830535,-84.383469,944.88,207
5137414,2020-02-28 02:24:02,33.829102,-84.384361,929.64,207


In [97]:
# Plot flight segment 207 of N86HD

latitudes = segment_207_data['lat']
longitudes = segment_207_data['lon']

# Create DataFrame from the data
data = {'lat': latitudes, 'lon': longitudes}
df_3 = pd.DataFrame(data)

# Draw flight trajectory using Plotly
fig = px.line_mapbox(df_3, lat="lat", lon="lon", zoom=3, height=600)
fig.update_layout(mapbox_style="open-street-map")
fig.update_layout(title="Flight N86HD - segment 207 Trajectory")
fig.show()

In [98]:
df_4 = df_n86hd.copy()
df_4

Unnamed: 0,time,lat,lon,geoaltitude,flight_segment
15841,2016-12-01 18:02:12,33.795901,-84.464951,518.16,0
15842,2016-12-01 18:02:14,33.795901,-84.464951,510.54,0
15843,2016-12-01 18:02:16,33.795901,-84.464951,510.54,0
15844,2016-12-01 18:02:18,33.792892,-84.467543,487.68,0
15845,2016-12-01 18:02:20,33.792037,-84.468441,472.44,0
...,...,...,...,...,...
5265114,2020-03-12 20:52:48,33.787674,-84.471130,373.38,210
5265115,2020-03-12 20:52:50,33.787674,-84.471130,373.38,210
5265116,2020-03-12 20:52:52,33.787674,-84.471130,373.38,210
5265117,2020-03-12 20:52:54,33.787674,-84.471130,373.38,210


In [99]:
# Split the N86HD data into df_train and df_test

from sklearn.model_selection import train_test_split

# Get the unique flight segments
unique_segments = df_4['flight_segment'].unique()

# Split the flight segments into two groups
train_segments, test_segments = train_test_split(unique_segments, test_size=63, random_state=42)

# Filter the dataset based on the split segments
train_df = df_4[df_4['flight_segment'].isin(train_segments)]
test_df = df_4[df_4['flight_segment'].isin(test_segments)]

# Display the number of segments in each dataset to verify
print(f"Number of unique flight segments in train_df: {train_df['flight_segment'].nunique()}")
print(f"Number of unique flight segments in test_df: {test_df['flight_segment'].nunique()}")

# Save the datasets if needed
train_df.to_csv('/kaggle/working/train_n86hd_abcd31.csv', index=False)
test_df.to_csv('/kaggle/working/test_n86hd_abcd31.csv', index=False)


Number of unique flight segments in train_df: 148
Number of unique flight segments in test_df: 63


In [100]:
train_df

Unnamed: 0,time,lat,lon,geoaltitude,flight_segment
15841,2016-12-01 18:02:12,33.795901,-84.464951,518.16,0
15842,2016-12-01 18:02:14,33.795901,-84.464951,510.54,0
15843,2016-12-01 18:02:16,33.795901,-84.464951,510.54,0
15844,2016-12-01 18:02:18,33.792892,-84.467543,487.68,0
15845,2016-12-01 18:02:20,33.792037,-84.468441,472.44,0
...,...,...,...,...,...
5265114,2020-03-12 20:52:48,33.787674,-84.471130,373.38,210
5265115,2020-03-12 20:52:50,33.787674,-84.471130,373.38,210
5265116,2020-03-12 20:52:52,33.787674,-84.471130,373.38,210
5265117,2020-03-12 20:52:54,33.787674,-84.471130,373.38,210


In [101]:
# Remove flight segments under 40 seconds

train_df['time'] = pd.to_datetime(train_df['time'])

# Group by 'flight_segment' and calculate the duration
flight_durations = train_df.groupby('flight_segment').agg(
    start_time=('time', 'min'),
    end_time=('time', 'max')
)

# Calculate the duration in seconds
flight_durations['duration'] = (flight_durations['end_time'] - flight_durations['start_time']).dt.total_seconds()

# Identify segments with duration smaller than 40 seconds
short_flight_segments = flight_durations[flight_durations['duration'] < 40].index

# Remove these segments from train_df
train_df_filtered = train_df[~train_df['flight_segment'].isin(short_flight_segments)]

# Display the filtered DataFrame
print(train_df_filtered)

                       time        lat        lon  geoaltitude  flight_segment
15841   2016-12-01 18:02:12  33.795901 -84.464951       518.16               0
15842   2016-12-01 18:02:14  33.795901 -84.464951       510.54               0
15843   2016-12-01 18:02:16  33.795901 -84.464951       510.54               0
15844   2016-12-01 18:02:18  33.792892 -84.467543       487.68               0
15845   2016-12-01 18:02:20  33.792037 -84.468441       472.44               0
...                     ...        ...        ...          ...             ...
5265114 2020-03-12 20:52:48  33.787674 -84.471130       373.38             210
5265115 2020-03-12 20:52:50  33.787674 -84.471130       373.38             210
5265116 2020-03-12 20:52:52  33.787674 -84.471130       373.38             210
5265117 2020-03-12 20:52:54  33.787674 -84.471130       373.38             210
5265118 2020-03-12 20:52:56  33.787674 -84.471130       373.38             210

[21707 rows x 5 columns]




A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



In [102]:
train_df_filtered.to_csv('/kaggle/working/train_n86hd_abcd31.csv', index=False)

In [103]:
# Show the duration of each segment

train_df_filtered['time'] = pd.to_datetime(train_df_filtered['time'])

# Group by 'flight_segment' and calculate the duration
flight_durations = train_df_filtered.groupby('flight_segment').agg(
    start_time=('time', 'min'),
    end_time=('time', 'max')
)

# Calculate the duration in seconds
flight_durations['duration'] = (flight_durations['end_time'] - flight_durations['start_time']).dt.total_seconds()

# Display the DataFrame with calculated durations
print(flight_durations)

                        start_time            end_time  duration
flight_segment                                                  
0              2016-12-01 18:02:12 2016-12-01 18:08:26     374.0
1              2016-12-02 21:27:10 2016-12-02 21:32:32     322.0
2              2017-01-12 17:35:40 2017-01-12 17:41:16     336.0
3              2017-01-20 20:43:50 2017-01-20 20:49:34     344.0
4              2017-02-23 21:56:44 2017-02-23 21:59:58     194.0
...                            ...                 ...       ...
202            2019-10-03 22:07:36 2019-10-03 22:13:44     368.0
203            2019-10-16 19:08:32 2019-10-16 19:09:34      62.0
205            2019-10-22 21:32:00 2019-10-22 21:38:10     370.0
207            2020-02-28 02:23:54 2020-02-28 02:32:14     500.0
210            2020-03-12 20:45:58 2020-03-12 20:52:56     418.0

[143 rows x 3 columns]




A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



In [104]:
# Remove flight segments under 40 seconds

test_df['time'] = pd.to_datetime(test_df['time'])

# Group by 'flight_segment' and calculate the duration
flight_durations = test_df.groupby('flight_segment').agg(
    start_time=('time', 'min'),
    end_time=('time', 'max')
)

# Calculate the duration in seconds
flight_durations['duration'] = (flight_durations['end_time'] - flight_durations['start_time']).dt.total_seconds()

# Identify segments with duration smaller than 40 seconds
short_flight_segments = flight_durations[flight_durations['duration'] < 40].index

# Remove these segments from train_df
test_df_filtered = test_df[~test_df['flight_segment'].isin(short_flight_segments)]

# Display the filtered DataFrame
print(test_df_filtered)



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



                       time        lat        lon  geoaltitude  flight_segment
138179  2017-02-27 21:07:18  33.787009 -84.478741       739.14               5
138182  2017-02-27 21:07:20  33.787614 -84.476223       746.76               5
138185  2017-02-27 21:07:22  33.788040 -84.474269       769.62               5
138188  2017-02-27 21:07:24  33.788592 -84.471874       777.24               5
138191  2017-02-27 21:07:26  33.789104 -84.469585       800.10               5
...                     ...        ...        ...          ...             ...
5260486 2020-03-12 12:22:10  33.762268 -84.405549      1836.42             209
5260487 2020-03-12 12:22:12  33.759659 -84.405156      1836.42             209
5260488 2020-03-12 12:22:14  33.757095 -84.404764      1836.42             209
5260489 2020-03-12 12:22:16  33.754990 -84.404484      1836.42             209
5260490 2020-03-12 12:22:18  33.752335 -84.404091      1836.42             209

[9125 rows x 5 columns]


In [105]:
# Show the duration of each segment

test_df_filtered['time'] = pd.to_datetime(test_df_filtered['time'])

# Group by 'flight_segment' and calculate the duration
flight_durations = test_df_filtered.groupby('flight_segment').agg(
    start_time=('time', 'min'),
    end_time=('time', 'max')
)

# Calculate the duration in seconds
flight_durations['duration'] = (flight_durations['end_time'] - flight_durations['start_time']).dt.total_seconds()

# Display the DataFrame with calculated durations
print(flight_durations)

                        start_time            end_time  duration
flight_segment                                                  
5              2017-02-27 21:07:18 2017-02-27 21:08:40      82.0
9              2017-03-07 19:14:48 2017-03-07 19:21:42     414.0
15             2017-04-11 22:01:42 2017-04-11 22:08:48     426.0
16             2017-04-12 21:30:24 2017-04-12 21:37:36     432.0
18             2017-04-19 12:14:50 2017-04-19 12:21:24     394.0
...                            ...                 ...       ...
195            2019-08-23 00:48:42 2019-08-23 00:55:46     424.0
200            2019-09-12 11:38:38 2019-09-12 11:39:52      74.0
204            2019-10-16 23:59:48 2019-10-17 00:05:56     368.0
206            2019-10-23 21:00:54 2019-10-23 21:02:18      84.0
209            2020-03-12 12:20:56 2020-03-12 12:22:18      82.0

[61 rows x 3 columns]




A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



In [106]:
test_df_filtered.to_csv('/kaggle/working/test_n86hd_abcd31.csv', index=False)