In [None]:
import csv
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt

# relative path of the fares CSV that the notebook loads
path_to_fares = "./fares.csv"

# Further analyses that would be interesting to add:
# fares by quarter, fares by airline by quarter, price per unit of distance


In [None]:
# load the fares table from the CSV file configured above
fares_df = pd.read_csv(filepath_or_buffer=path_to_fares)

# summary statistics of the loaded table, shown as the cell's output
fares_df.describe()

In [None]:
# Planned analyses:
#   * average price by airline
#   * distance vs. price
#   * distance vs. price per airline
#   * average price by origin airport
#   * average price by destination airport
#   * average price by quarter -- not done yet

# Empty containers for the results; each starts out as its own empty dict.
avg_price_by_airline = dict()      # keyed by airline code
avg_dist_by_airline = dict()       # keyed by airline code
airline_dist_v_price = dict()      # keyed by airline code
avg_price_origin_airport = dict()
avg_price_dest_airport = dict()


In [None]:
# Distance vs. average price, airline by airline.
#
# Each column named "10", "20", ..., "2500" is treated as a ticket count that
# is weighted by the price given in the column name; every row also carries a
# "distance" value.
airline_codes = fares_df["airline_id"].unique()

across_amt = 4

# go through each of the airlines
for code in airline_codes:

    print(f"ANALYZING CODE: {code}")

    # get only the data for that airline
    airplane_code_df = fares_df[fares_df["airline_id"] == code]

    # scatter data: x is the row's distance, y is that row's average price
    x_dat_dist_price = []
    y_dat_dist_price = []

    # airline-wide totals
    total_cost = 0
    total_dist = 0
    total_amt = 0

    # go through the airline's rows
    for row_idx, row in airplane_code_df.iterrows():

        # per-row totals, reset for every row so that a row's average price
        # is not blended with the rows processed before it
        row_cost = 0
        row_amt = 0

        # go through the price buckets
        for i in range(10, 2509, 10):
            tickets = row[str(i)]
            row_cost += i * tickets
            row_amt += int(tickets)
            total_dist += row["distance"] * tickets

        total_cost += row_cost
        total_amt += row_amt

        # a row without any tickets has no average price to plot
        if row_amt > 0:
            x_dat_dist_price.append(row["distance"])
            y_dat_dist_price.append(row_cost / row_amt)

    # an airline without any tickets gets NaN instead of a division error
    avg_price = total_cost / total_amt if total_amt else float("nan")
    avg_dist = total_dist / total_amt if total_amt else float("nan")

    # store the results under the airline code
    airline_dist_v_price[code] = sorted(zip(x_dat_dist_price, y_dat_dist_price))
    avg_price_by_airline[code] = avg_price
    avg_dist_by_airline[code] = avg_dist
    print(f"AVG PRICE: {avg_price}\tAVG DIST: {avg_dist}")
    
 

In [None]:
# Per-airport summaries (average distance, average price, ticket count),
# computed once by origin airport and once by destination airport.

def _summarize_airports(fares, airport_column):
    """Summarize fares for every distinct value found in ``airport_column``.

    The columns named "10", "20", ..., "2500" are treated as ticket counts
    weighted by the price given in the column name.

    Parameters
    ----------
    fares : pandas.DataFrame
        Table holding ``airport_column``, a "distance" column and the price
        bucket columns.
    airport_column : str
        Name of the column whose distinct values define the groups.

    Returns
    -------
    pandas.DataFrame
        One row per airport with the columns 'Airport', 'avg_dist',
        'avg_price' and 'total_amt'. The averages are NaN for an airport
        without any tickets.
    """
    bucket_prices = list(range(10, 2509, 10))
    bucket_columns = [str(price) for price in bucket_prices]
    price_weights = np.array(bucket_prices)

    records = []
    for airport in fares[airport_column].unique():

        # get only the data for that airport
        airport_rows = fares[fares[airport_column] == airport]
        tickets = airport_rows[bucket_columns]

        # tickets per price bucket, summed over the airport's rows
        tickets_per_bucket = tickets.sum(axis=0).to_numpy()
        total_amt = tickets_per_bucket.sum()
        total_cost = (tickets_per_bucket * price_weights).sum()

        # distance weighted by the number of tickets on each row; only the
        # distance-times-tickets products contribute to this total
        total_dist = (airport_rows["distance"] * tickets.sum(axis=1)).sum()

        records.append({
            'Airport': airport,
            'avg_dist': (total_dist / total_amt) if total_amt else float("nan"),
            'avg_price': (total_cost / total_amt) if total_amt else float("nan"),
            "total_amt": total_amt,
        })

    return pd.DataFrame(records)


avg_price_origin_airport = {}
avg_price_dest_airport = {}
avg_dist_origin_airport = {}
avg_dist_dest_airport = {}

# getting the origin and destination codes for the airports
origin_codes = fares_df["origin_airport"].unique()
dest_codes = fares_df["destination_airport"].unique()

origin_airports_data = _summarize_airports(fares_df, "origin_airport")
dest_airports_data = _summarize_airports(fares_df, "destination_airport")
 

In [None]:
# Scatter plot of distance (x) against average price (y), one panel per airline.
across_amt = 3

# ceil(len / across_amt) rows, and at least one so the grid is never empty
n_rows = max(1, -(-len(airline_codes) // across_amt))

# squeeze=False keeps axs two-dimensional even when there is a single row,
# so the [row, column] indexing below always works
fig, axs = plt.subplots(n_rows, across_amt, figsize=(15, 90), squeeze=False)

print(airline_codes)

# plot the price vs. the distance per airline
for idx, code in enumerate(airline_codes):

    ax = axs[idx // across_amt, idx % across_amt]
    ax.set_title(str(code) + " -- Dist v. Price (Dist - x)")
    ax.set_xlabel("distance")
    ax.set_ylabel("average price")

    points = airline_dist_v_price[code]
    if not points:
        # nothing to unpack for an airline without data points
        continue

    sorted_x, sorted_y = zip(*points)
    ax.plot(sorted_x, sorted_y, 'o')

# hide the panels of the last row that have no airline assigned
for unused_ax in axs.ravel()[len(airline_codes):]:
    unused_ax.set_visible(False)

# Display the plots
plt.show()

In [None]:
# Print the per-airline averages, then every origin-airport summary row.

# airlines first
for code in airline_codes:

    print(f"{code}\ta_price: {avg_price_by_airline[code]}\ta_dist: {avg_dist_by_airline[code]}")

# origin airports: label each row with its own airport code rather than the
# airline code left over from the loop above
for i in range(len(origin_airports_data)):

    airport_row = origin_airports_data.loc[i]
    print(f"\n{airport_row['Airport']}\t: {airport_row}")
    
    

In [None]:
# Print every destination-airport summary row, labelled with the row's own
# airport code instead of a loop variable left over from an earlier cell.
for i in range(len(dest_airports_data)):

    airport_row = dest_airports_data.loc[i]
    print(f"{airport_row['Airport']}\t: {airport_row}")