In [1]:
import datalab.storage as storage
import pandas as pd
from io import BytesIO
import numpy as np
import random
import collections
from collections import defaultdict

In [2]:
bucket = storage.Bucket('cs221-flight-data')
flights_data = bucket.item('model-train.csv')
uri = flights_data.uri
%gcs read --object $uri --variable data
bayes_data = pd.read_csv(BytesIO(data))

In [3]:
# CONSTANTS
NUM_TIME_DELAY_BUCKETS = 5
NUM_AIRPLANE_DELAY_BUCKETS = 5

# map the variables to their parents
variables_to_parents = {}
variables_to_parents['MONTH'] = []
variables_to_parents['DAY_OF_WEEK'] = []
variables_to_parents['SCHEDULED_DEPARTURE'] = []
variables_to_parents['AIRLINE'] = []
variables_to_parents['DESTINATION_AIRPORT'] = []
# variables_to_parents['YEAR'] = [] # year is the year of the airplane make NOT the year that the flight departs
# variables_to_parents['MODEL'] = []
variables_to_parents['ARRIVAL_DELAY'] = ['MONTH', 'DAY_OF_WEEK', 'SCHEDULED_DEPARTURE', 'AIRLINE', 'DESTINATION_AIRPORT']

In [4]:
# initialize the counts dict
variable_to_counts = {}
for var in variables_to_parents:
    variable_to_counts[var] = defaultdict(int)

In [5]:
# count
for index, row in bayes_data.iterrows():
    for (variable, parent_list) in variables_to_parents.items():
      # need to make tuple of value with their parent(conditional) values
      value = str(row[variable])
      if parent_list != []:
        value = [value]
        for parent in parent_list:
          value.append(str(row[parent]))
        value = tuple(value)
      # increase the count
      variable_to_counts[variable][value] += 1

In [6]:
def normalize(variables_to_parents, variable_to_counts):
  variable_to_probability = {}
  for variable in variable_to_counts:
      variable_to_probability[variable] = {}
      if len(variables_to_parents[variable]) > 0:
          # then we need to do some fun tuple stuff because the conditionals are what is constant
          conditional_to_count = {}
          for value in variable_to_counts[variable]:
              conditional = value[1:]
              conditional_to_count[conditional] = conditional_to_count.get(conditional, 0) + variable_to_counts[variable][value]
          # now actually get the percentage
          for value in variable_to_counts[variable]:
              conditional = value[1:]
              variable_to_probability[variable][value] = float(variable_to_counts[variable][value])/conditional_to_count[conditional]
      else:
          # there are no parents
          for value in variable_to_counts[variable]:
              # add all of the counts for all of the values for the variable
              total = sum(variable_to_counts[variable].values())
              variable_to_probability[variable][value] = float(variable_to_counts[variable][value])/total
  return variable_to_probability

In [7]:
# normalize
variable_to_probability = normalize(variables_to_parents, variable_to_counts)

In [13]:
#test 1
test_data = pd.read_csv(BytesIO(data))
true_labels = test_data[['ARRIVAL_DELAY']].values
feature_variables = ['MONTH', 'DAY_OF_WEEK', 'SCHEDULED_DEPARTURE', 'AIRLINE', 'DESTINATION_AIRPORT']
guessed_labels = []
var_to_index = {'ARRIVAL_DELAY':0, 'MONTH':1, 'DAY_OF_WEEK':2, 'SCHEDULED_DEPARTURE':3, 'AIRLINE':4, 'DESTINATION_AIRPORT':5}
for index, row in bayes_data.iterrows():
  for arrival_delay_val in range(3): # arrival delay can be 0, 1 or 2
    max_prob = 0
    max_val = -1
    prob = 1
    for variable in feature_variables:
      value = str(row[variable])
      total = 0
      count = 0
      for assignment in variable_to_counts["ARRIVAL_DELAY"]:
        if assignment[var_to_index[variable]] == value:
          total += 1
          if assignment[var_to_index['ARRIVAL_DELAY']] == str(arrival_delay_val):
            count += 1
      prob *= float(count)/total
    if prob > max_prob:
      max_prob = prob
      max_val = arrival_delay_val
  guessed_labels.append(max_val)

KeyboardInterrupt: 

In [19]:
# test 2
test_data = pd.read_csv(BytesIO(data))
true_labels = test_data[['ARRIVAL_DELAY']].values
feature_variables = ['MONTH', 'DAY_OF_WEEK', 'SCHEDULED_DEPARTURE', 'AIRLINE', 'DESTINATION_AIRPORT']
guessed_labels = []
for index, row in bayes_data.iterrows():
  for arrival_delay_val in range(3): # arrival delay can be 0, 1 or 2
    prob = 1
    max_prob = 0
    max_val = -1
    parent_list = variables_to_parents['ARRIVAL_DELAY']
    value = str(float(arrival_delay_val))
    value = [value]
    for parent in parent_list:
        value.append(str(row[parent]))
    value = tuple(value)
    prob = variable_to_probability[variable][value]
    if prob > max_prob:
      max_prob = prob
      max_val = arrival_delay_val
  guessed_labels.append(max_val)

KeyError: ('0.0', '8', '5', '13', 'EV', 'ROA')

In [None]:
#test
test_data = pd.read_csv(BytesIO(data))
true_labels = test_data[['ARRIVAL_DELAY']].values
feature_variables = ['MONTH', 'DAY_OF_WEEK', 'SCHEDULED_DEPARTURE', 'AIRLINE', 'DESTINATION_AIRPORT']
guessed_labels = []
for index, row in bayes_data.iterrows():
  for arrival_delay_val in range(3): # arrival delay can be 0, 1 or 2
    prob = 1
    max_prob = 0
    max_val = -1
    for variable in feature_variables:
      parent_list = variables_to_parents[variable]
      if variable != 'ARRIVAL_DELAY':
        value = str(row[variable])
      else:
        value = str(float(arrival_delay_val))
      if parent_list != []:
        value = [value]
        for parent in parent_list:
          value.append(str(row[parent]))
        value = tuple(value)
      prob *= variable_to_probability[variable][value]
    if prob > max_prob:
      max_prob = prob
      max_val = arrival_delay_val
  guessed_labels.append(max_val)

In [None]:
# lr_train_acc = metrics.accuracy_score(train_y, lr.predict(train_x))
from sklearn import metrics
from sklearn.metrics import confusion_matrix
true_labels = [int(lst[0]) for lst in true_labels]
lr_test_acc = metrics.accuracy_score(true_labels, guessed_labels)
pred_y_test = [1 if guessed_labels[i] > 1 else 0 for i in range(len(guessed_labels))]
test_y_binarized = [1 if true_labels[i] > 1 else 0 for i in range(len(true_labels))]
print (len(test_y_binarized))
print ('baseline test acc: ' + str(lr_test_acc))
print ('baseline precision: ' + str(metrics.precision_score(test_y_binarized, pred_y_test)))

In [None]:
print(true_labels[:5])
print(guessed_labels[:5])

In [20]:
import cPickle as pickle

json = pickle.dumps(variable_to_probability)
bucket.item('flat-bayes.txt').write_to(json,'text/txt')