In [1]:
import os
import csv
import pandas as pd
from DatetimeExtractor import *

claims_folder = os.path.join("..", "..", "datasets", "flights", "clean_flight")
groundtruth_folder = os.path.join("..", "..","datasets", "flights", "flight_truth")

all_claims_filename = "clean_flight.csv"
all_truth_filename = "flight_truth.csv"

all_claims_file = os.path.join(claims_folder, all_claims_filename)
all_groundtruth_file = os.path.join(groundtruth_folder, all_truth_filename)

In [2]:
header_claims = [
    "source",
    "flight_number",
    "scheduled_departure",
    "actual_departure",
    "departure_gate", 
    "scheduled_arrival",
    "actual_arrival",
    "arrival_gate",
]

In [2]:
def import_gate(gate):
    gate = gate.strip()
    gate = gate.upper()
    if gate == "-" or gate == "--" or gate == "" or gate == "?" or gate == "$" or gate == "NOT PROVIDED BY AIRLINE" or gate is None:
        gate = "-"
    return gate

In [4]:
if os.path.isfile(all_claims_file):
    os.remove(all_claims_file)

In [7]:
clean_claims = list()

# claims input files, ignore any other hidden file in the folder...
claims_files = [f for f in os.listdir(claims_folder) if not f.startswith('.')]

for file_name in claims_files:
    
    file_path = os.path.join(claims_folder, file_name)

    print("Importing: " + file_path)

    with open(file_path, "r", encoding="iso-8859-1") as f:
       
        reader = csv.reader(f, delimiter='\t')
        year, month, day, c = file_name.split("-")
        dt_ex = DatetimeExtractor(int(year), int(month), int(day))

        for i, row in enumerate(reader):
            try:
                source = row[header_claims.index("source")].strip().upper()
                flight = row[header_claims.index("flight_number")].strip().upper()
                departure_gate = import_gate(row[header_claims.index("departure_gate")])  # one hot
                arrival_gate = import_gate(row[header_claims.index("arrival_gate")])  # one hot
                scheduled_departure = dt_ex.get_datetime(row[header_claims.index("scheduled_departure")])
                scheduled_arrival = dt_ex.get_datetime(row[header_claims.index("scheduled_arrival")])
                actual_departure = dt_ex.get_datetime(row[header_claims.index("actual_departure")])
                actual_arrival = dt_ex.get_datetime(row[header_claims.index("actual_arrival")])    
            except Exception as p:
                print(p)
            
            record = [source, flight, scheduled_departure, actual_departure, departure_gate, scheduled_arrival, actual_arrival, arrival_gate]
            clean_claims.append(record)


Importing: ../../datasets/flights/clean_flight/2011-12-16-data.txt
Importing: ../../datasets/flights/clean_flight/2012-01-01-data.txt
Importing: ../../datasets/flights/clean_flight/2011-12-22-data.txt
Importing: ../../datasets/flights/clean_flight/2012-01-03-data.txt
Importing: ../../datasets/flights/clean_flight/2011-12-09-data.txt
Importing: ../../datasets/flights/clean_flight/2011-12-10-data.txt
Importing: ../../datasets/flights/clean_flight/2011-12-14-data.txt
Importing: ../../datasets/flights/clean_flight/2011-12-04-data.txt
Importing: ../../datasets/flights/clean_flight/2011-12-24-data.txt
Importing: ../../datasets/flights/clean_flight/2011-12-11-data.txt
Importing: ../../datasets/flights/clean_flight/2011-12-28-data.txt
Importing: ../../datasets/flights/clean_flight/2011-12-29-data.txt
Importing: ../../datasets/flights/clean_flight/2011-12-01-data.txt
Importing: ../../datasets/flights/clean_flight/2011-12-30-data.txt
Importing: ../../datasets/flights/clean_flight/2012-01-02-data

In [8]:
claims = pd.DataFrame(clean_claims)
claims.columns = header_claims
claims.head(5)

Unnamed: 0,source,flight_number,scheduled_departure,actual_departure,departure_gate,scheduled_arrival,actual_arrival,arrival_gate
0,CO,CO-4237-EWR-DTW,2011-12-16 12:17:00,2011-12-16 12:11:00,-,2011-12-16 14:24:00,2011-12-16 14:17:00,-
1,AIRTRAVELCENTER,CO-4237-EWR-DTW,2011-12-16 12:17:00,NaT,-,2011-12-16 14:24:00,NaT,-
2,MYRATEPLAN,CO-4237-EWR-DTW,2011-12-16 12:17:00,NaT,-,2011-12-16 14:24:00,NaT,-
3,HELLOFLIGHT,CO-4237-EWR-DTW,2011-12-16 12:17:00,NaT,-,2011-12-16 14:24:00,NaT,-
4,FLYTECOMM,CO-4237-EWR-DTW,2011-12-16 12:17:00,NaT,-,2011-12-16 14:24:00,NaT,-


In [9]:
claims.to_csv(all_claims_file, sep=",")

In [5]:
header_truth = [
    "flight_number",
    "scheduled_departure",
    "actual_departure",
    "departure_gate", 
    "scheduled_arrival",
    "actual_arrival",
    "arrival_gate",
]

In [6]:
if os.path.isfile(all_groundtruth_file):
    os.remove(all_groundtruth_file)

In [7]:
clean_truth = list()

groundtruth_files = [f for f in os.listdir(groundtruth_folder) if not f.startswith('.')]

for file_name in groundtruth_files:
    
    file_path = os.path.join(groundtruth_folder, file_name)

    print("Importing: " + file_path)

    with open(file_path, "r", encoding="iso-8859-1") as f:
        
        reader = csv.reader(f, delimiter='\t')
        year, month, day, c = file_name.split("-")
        dt_ex = DatetimeExtractor(int(year), int(month), int(day))

        for i, row in enumerate(reader):
            try:
                flight = row[header_truth.index("flight_number")].strip().upper()
                departure_gate = import_gate(row[header_truth.index("departure_gate")])
                arrival_gate = import_gate(row[header_truth.index("arrival_gate")])
                scheduled_departure = dt_ex.get_datetime(row[header_truth.index("scheduled_departure")])
                scheduled_arrival = dt_ex.get_datetime(row[header_truth.index("scheduled_arrival")])
                actual_departure = dt_ex.get_datetime(row[header_truth.index("actual_departure")])
                actual_arrival = dt_ex.get_datetime(row[header_truth.index("actual_arrival")])    
            except Exception as p:
                print(p)
            
            record = [flight, scheduled_departure, actual_departure, departure_gate, scheduled_arrival, actual_arrival, arrival_gate]
            clean_truth.append(record)

Importing: ../../datasets/flights/flight_truth/2011-12-30-truth.txt
Importing: ../../datasets/flights/flight_truth/2011-12-27-truth.txt
Importing: ../../datasets/flights/flight_truth/2011-12-31-truth.txt
Importing: ../../datasets/flights/flight_truth/2011-12-25-truth.txt
Importing: ../../datasets/flights/flight_truth/2011-12-20-truth.txt
Importing: ../../datasets/flights/flight_truth/2011-12-22-truth.txt
Importing: ../../datasets/flights/flight_truth/2011-12-15-truth.txt
Importing: ../../datasets/flights/flight_truth/2012-01-03-truth.txt
Importing: ../../datasets/flights/flight_truth/2011-12-28-truth.txt
Importing: ../../datasets/flights/flight_truth/2011-12-11-truth.txt
Importing: ../../datasets/flights/flight_truth/2011-12-07-truth.txt
Importing: ../../datasets/flights/flight_truth/2011-12-02-truth.txt
Importing: ../../datasets/flights/flight_truth/2011-12-04-truth.txt
Importing: ../../datasets/flights/flight_truth/2011-12-16-truth.txt
Importing: ../../datasets/flights/flight_truth/2

In [9]:
groundtruth = pd.DataFrame(clean_truth)
groundtruth.columns = header_truth
groundtruth.head(5)

Unnamed: 0,flight_number,scheduled_departure,actual_departure,departure_gate,scheduled_arrival,actual_arrival,arrival_gate
0,AA-1221-MCO-ORD,2011-12-30 20:00:00,2011-12-30 20:21:00,15,2011-12-30 21:50:00,2011-12-30 22:00:00,H10
1,AA-4307-ORD-DTW,2011-12-30 18:45:00,2011-12-30 18:42:00,G20,2011-12-30 21:10:00,2011-12-30 20:58:00,D28
2,AA-616-DFW-DTW,2011-12-30 09:05:00,2011-12-30 09:01:00,C17,2011-12-30 12:35:00,2011-12-30 12:31:00,D32
3,AA-431-MIA-SFO,2011-12-30 08:35:00,2011-12-30 09:10:00,D40,2011-12-30 11:50:00,2011-12-30 00:00:00,58B
4,AA-3756-ORD-SLC,2011-12-30 12:15:00,2011-12-30 12:11:00,G19,2011-12-30 14:45:00,2011-12-30 14:45:00,A3


In [10]:
groundtruth.to_csv(all_groundtruth_file, sep=",")

In [15]:
# groundtruth -> pandas dataframe with the groundtruth
# claims -> pandas dataframe with all the claims
# claims = pd.read_csv(all_claims_file)
# groundtruth = pd.read_csv(all_groundtruth_file)

#     Source    Object  Property   Value   Categorical
# 0     A         o1      p1         b          1
# 1     B         o1      p1         a          1
# 2     C         o1      p1         a          1
# 3     A         o2      p2         2          0
# 4     B         o2      p2         1          0
# 5     C         o2      p2         1          0
# 6     A         o3      p3         a          1

In [11]:
transformed_claims = list()

In [12]:
header_trans_claims = ["Source", "Object", "Property", "Value", "Categorical"]

In [11]:
from datetime import datetime

def get_datetime_minutes(date_to_convert):
    if pd.isnull(date_to_convert):
        date_to_convert = datetime(year=2012, month=12, day=1)
    start_date = datetime(year=2011, month=12, day=1)
    delta = date_to_convert - start_date
    delta_minutes = round(delta.total_seconds() / 60, 3)
    return delta_minutes

In [19]:
# header_claims = [
#    "source",
#    "flight_number",
#    "scheduled_departure",
#    "actual_departure",
#    "departure_gate", 
#    "scheduled_arrival",
#    "actual_arrival",
#    "arrival_gate",
# ]

print("Claims to transform : %i " %len(claims))

for i, el in claims.iterrows():
    
    sd = [el["source"], el["flight_number"], "scheduled_departure", get_datetime_minutes(el["scheduled_departure"]), 0]
    ad = [el["source"], el["flight_number"], "actual_departure", get_datetime_minutes(el["actual_departure"]), 0]
    dg = [el["source"], el["flight_number"], "departure_gate", el["departure_gate"], 1]
    sa = [el["source"], el["flight_number"], "scheduled_arrival", get_datetime_minutes(el["scheduled_arrival"]), 0]
    aa = [el["source"], el["flight_number"], "actual_arrival", get_datetime_minutes(el["actual_arrival"]), 0]
    ag = [el["source"], el["flight_number"], "arrival_gate", el["arrival_gate"], 1]
    
    transformed_claims.extend([sd, ad, dg, sa, aa, ag])
    
    if int( (i)/len(claims) * 100 ) < int( (i+1)/len(claims) * 100 ) and int( (i+1)/len(claims) * 100 ) % 10 == 0:
        print(int( (i+1)/len(claims) * 100 ), "% done...")

Claims to transform : 776067 
10 % done...
20 % done...
30 % done...
40 % done...
50 % done...
60 % done...
70 % done...
80 % done...
90 % done...
100 % done...


In [21]:
header_transformed_claims = ["Source", "Object", "Property", "Value", "Categorical"]
pd_transformed_claims = pd.DataFrame(transformed_claims)
pd_transformed_claims.columns = header_transformed_claims

In [24]:
pd_transformed_claims.head(10)

Unnamed: 0,Source,Object,Property,Value,Categorical
0,CO,CO-4237-EWR-DTW,scheduled_departure,22337,0
1,CO,CO-4237-EWR-DTW,actual_departure,22331,0
2,CO,CO-4237-EWR-DTW,departure_gate,-,1
3,CO,CO-4237-EWR-DTW,scheduled_arrival,22464,0
4,CO,CO-4237-EWR-DTW,actual_arrival,22457,0
5,CO,CO-4237-EWR-DTW,arrival_gate,-,1
6,AIRTRAVELCENTER,CO-4237-EWR-DTW,scheduled_departure,22337,0
7,AIRTRAVELCENTER,CO-4237-EWR-DTW,actual_departure,527040,0
8,AIRTRAVELCENTER,CO-4237-EWR-DTW,departure_gate,-,1
9,AIRTRAVELCENTER,CO-4237-EWR-DTW,scheduled_arrival,22464,0


In [12]:
# header_truth = [
#    "flight_number",
#    "scheduled_departure",
#    "actual_departure",
#    "departure_gate", 
#    "scheduled_arrival",
#    "actual_arrival",
#    "arrival_gate",
# ]

transformed_truth = list()
header_transformed_truth = ["Object", "Property", "Value", "Categorical"]

print("Truth to transform : %i " %len(groundtruth))

for i, el in groundtruth.iterrows():
    
    sd = [el["flight_number"], "scheduled_departure", get_datetime_minutes(el["scheduled_departure"]), 0]
    ad = [el["flight_number"], "actual_departure", get_datetime_minutes(el["actual_departure"]), 0]
    dg = [el["flight_number"], "departure_gate", el["departure_gate"], 1]
    sa = [el["flight_number"], "scheduled_arrival", get_datetime_minutes(el["scheduled_arrival"]), 0]
    aa = [el["flight_number"], "actual_arrival", get_datetime_minutes(el["actual_arrival"]), 0]
    ag = [el["flight_number"], "arrival_gate", el["arrival_gate"], 1]
    
    transformed_truth.extend([sd, ad, dg, sa, aa, ag])
    
    if int( (i)/len(groundtruth) * 100 ) < int( (i+1)/len(groundtruth) * 100 ) and int( (i+1)/len(groundtruth) * 100 ) % 10 == 0:
        print(int( (i+1)/len(groundtruth) * 100 ), "% done...")

Truth to transform : 2986 
10 % done...
20 % done...
30 % done...
40 % done...
50 % done...
60 % done...
70 % done...
80 % done...
90 % done...
100 % done...


In [13]:
pd_transformed_truth = pd.DataFrame(transformed_truth)
pd_transformed_truth.columns = header_transformed_truth

In [15]:
pd_transformed_truth.head(10)

Unnamed: 0,Object,Property,Value,Categorical
0,AA-1221-MCO-ORD,scheduled_departure,42960,0
1,AA-1221-MCO-ORD,actual_departure,42981,0
2,AA-1221-MCO-ORD,departure_gate,15,1
3,AA-1221-MCO-ORD,scheduled_arrival,43070,0
4,AA-1221-MCO-ORD,actual_arrival,43080,0
5,AA-1221-MCO-ORD,arrival_gate,H10,1
6,AA-4307-ORD-DTW,scheduled_departure,42885,0
7,AA-4307-ORD-DTW,actual_departure,42882,0
8,AA-4307-ORD-DTW,departure_gate,G20,1
9,AA-4307-ORD-DTW,scheduled_arrival,43030,0


In [28]:
print("Total Claims: ", len(pd_transformed_claims))

Total Claims:  4656402


In [16]:
print("Ground Truth: ", len(pd_transformed_truth))

Ground Truth:  17916


In [17]:
pd_transformed_truth


Unnamed: 0,Object,Property,Value,Categorical
0,AA-1221-MCO-ORD,scheduled_departure,42960,0
1,AA-1221-MCO-ORD,actual_departure,42981,0
2,AA-1221-MCO-ORD,departure_gate,15,1
3,AA-1221-MCO-ORD,scheduled_arrival,43070,0
4,AA-1221-MCO-ORD,actual_arrival,43080,0
5,AA-1221-MCO-ORD,arrival_gate,H10,1
6,AA-4307-ORD-DTW,scheduled_departure,42885,0
7,AA-4307-ORD-DTW,actual_departure,42882,0
8,AA-4307-ORD-DTW,departure_gate,G20,1
9,AA-4307-ORD-DTW,scheduled_arrival,43030,0


In [31]:
pd_transformed_claims.head(0
                          )

Unnamed: 0,Source,Object,Property,Value,Categorical


In [14]:
OUTPUT_FILE = "flight_truth.csv"

In [15]:
if (os.path.exists(OUTPUT_FILE)):
    os.remove(OUTPUT_FILE)
    print("Removed: " + OUTPUT_FILE)
dataset = open(OUTPUT_FILE, "w")
print("Created empty file: " + OUTPUT_FILE)

Removed: flight_truth.csv
Created empty file: flight_truth.csv


In [18]:
pd_transformed_truth['Object'][i]

'AA-1221-MCO-ORD'

In [19]:
csv_writer=csv.writer(dataset)
for i in range(10):
    csv_writer.writerow(pd_transformed_truth['Source'][i])
    csv_writer.write(pd_transformed_truth['Object'][i])
    csv_writer.write(pd_transformed_truth['Property'][i])
    csv_writer.write(pd_transformed_truth['Value'][i])
    csv_writer.write(pd_transformed_truth['Categorical'][i])
    csv_writer.write("\n")
    
f.close()

AttributeError: '_csv.writer' object has no attribute 'write'

In [21]:
pd_transformed_truth.to_csv(OUTPUT_FILE)

In [32]:
OUTPUT_FILE = "flight.csv"

In [33]:
if (os.path.exists(OUTPUT_FILE)):
    os.remove(OUTPUT_FILE)
    print("Removed: " + OUTPUT_FILE)
dataset = open(OUTPUT_FILE, "w")
print("Created empty file: " + OUTPUT_FILE)

Created empty file: flight.csv


In [40]:
csv_writer=csv.writer(dataset)
for i in pd_transformed_claims.head(50):
    print (i)
    csv_writer.writerow(i)
f.close()
    

Source
Object
Property
Value
Categorical


In [41]:
pd_transformed_claims


Unnamed: 0,Source,Object,Property,Value,Categorical
0,CO,CO-4237-EWR-DTW,scheduled_departure,22337,0
1,CO,CO-4237-EWR-DTW,actual_departure,22331,0
2,CO,CO-4237-EWR-DTW,departure_gate,-,1
3,CO,CO-4237-EWR-DTW,scheduled_arrival,22464,0
4,CO,CO-4237-EWR-DTW,actual_arrival,22457,0
5,CO,CO-4237-EWR-DTW,arrival_gate,-,1
6,AIRTRAVELCENTER,CO-4237-EWR-DTW,scheduled_departure,22337,0
7,AIRTRAVELCENTER,CO-4237-EWR-DTW,actual_departure,527040,0
8,AIRTRAVELCENTER,CO-4237-EWR-DTW,departure_gate,-,1
9,AIRTRAVELCENTER,CO-4237-EWR-DTW,scheduled_arrival,22464,0


In [49]:
pd_transformed_claims.all()

Source          True
Object          True
Property        True
Value          False
Categorical    False
dtype: bool