In [70]:
import datalab.storage as storage
import pandas as pd
from io import BytesIO
import numpy as np
import random
import json

In [16]:
bucket = storage.Bucket('cs221-flight-data')
flights_data = bucket.item('model-train.csv')
uri = flights_data.uri
%gcs read --object $uri --variable data
bayes_data = pd.read_csv(BytesIO(data))

In [17]:
# CONSTANTS
NUM_TIME_DELAY_BUCKETS = 5
NUM_AIRPLANE_DELAY_BUCKETS = 5
la_place_constant = 1

# map the variables to their parents
variables_to_parents = {}
# variables_to_parents['MONTH'] = []
# variables_to_parents['DAY_OF_WEEK'] = []
# variables_to_parents['SCHEDULED_DEPARTURE'] = []
# variables_to_parents['time_delay'] = ['MONTH', 'DAY_OF_WEEK', 'SCHEDULED_DEPARTURE']
variables_to_parents['AIRLINE'] = []
variables_to_parents['DESTINATION_AIRPORT'] = []
variables_to_parents['YEAR'] = [] # year is the year of the airplane make NOT the year that the flight departs
variables_to_parents['MODEL'] = []
variables_to_parents['airplane_delay'] = ['YEAR', 'MODEL']
# variables_to_parents['ARRIVAL_DELAY'] = ['time_delay', 'AIRLINE', 'DESTINATION_AIRPORT', 'airplane_delay']
variables_to_parents['ARRIVAL_DELAY'] = ['airplane_delay', 'AIRLINE', 'DESTINATION_AIRPORT']


In [18]:
# initialize the counts
variable_to_counts = {}
# these variables don't have parent variables
for var in variables_to_parents:
    if len(variables_to_parents[var]) == 0:
        variable_to_counts[var] = {key:la_place_constant for key in np.unique(bayes_data[var].astype(str).unique())}

variable_to_counts['airplane_delay'] = {(airplane_delay, year, model):random.randint(1,3)
                                       for airplane_delay in range(NUM_AIRPLANE_DELAY_BUCKETS)
                                       for year in bayes_data['YEAR'].astype(str).unique()
                                       for model in bayes_data['MODEL'].astype(str).unique()}

In [19]:
# add the variable with parent variables
# make sure that the order of the tuple is the same order as the parent list
#variable_to_counts['time_delay'] = {(time_delay, month, day_of_week, sch_departure):random.randint(1,3)
 #                                   for time_delay in range(NUM_TIME_DELAY_BUCKETS)
  #                                  for month in bayes_data['MONTH'].astype(str).unique()
   #                                 for day_of_week in bayes_data['DAY_OF_WEEK'].astype(str).unique()
    #                                for sch_departure in bayes_data['SCHEDULED_DEPARTURE'].astype(str).unique()}

In [20]:
# add the variable with parent variables
# make sure that the order of the tuple is the same order as the parent list
variable_to_counts['ARRIVAL_DELAY'] = {(arrival_delay, airplane_delay, airline, dest_airport):la_place_constant
                                      for arrival_delay in bayes_data['ARRIVAL_DELAY'].astype(str).unique()
                                      for airplane_delay in range(NUM_AIRPLANE_DELAY_BUCKETS)
                                      for airline in bayes_data['AIRLINE'].astype(str).unique()
                                      for dest_airport in bayes_data['DESTINATION_AIRPORT'].astype(str).unique()}
                                      # for airplane_delay in range(NUM_AIRPLANE_DELAY_BUCKETS)}

In [21]:
# count to get the probabilities for the quantities that we DO know
for index, row in bayes_data.iterrows():
    for (variable, parent_list) in variables_to_parents.items():
        if variable != "airplane_delay":
          # need to make tuple of value with their parent(conditional) values
          value = str(row[variable])
          if parent_list != []:
            value = [value]
            for parent in parent_list:
              if parent != "airplane_delay":
                value.append(str(row[parent]))
              elif parent == "airplane_delay":
                value.append(random.randint(0, NUM_AIRPLANE_DELAY_BUCKETS-1))
            value = tuple(value)
          # increase the count
          variable_to_counts[variable][value] += 1

In [22]:
# once the counts are initialized, need to normalize the probabilities 
def normalize(variables_to_parents, variable_to_counts):
  variable_to_probability = {}
  for variable in variable_to_counts:
      variable_to_probability[variable] = {}
      if len(variables_to_parents[variable]) > 0:
          # then we need to do some fun tuple stuff because the conditionals are what is constant
          conditional_to_count = {}
          for value in variable_to_counts[variable]:
              conditional = value[1:]
              conditional_to_count[conditional] = conditional_to_count.get(conditional, 0) + variable_to_counts[variable][value]
          # now actually get the percentage
          for value in variable_to_counts[variable]:
              conditional = value[1:]
              variable_to_probability[variable][value] = float(variable_to_counts[variable][value])/conditional_to_count[conditional]
      else:
          # there are no parents
          for value in variable_to_counts[variable]:
              # add all of the counts for all of the values for the variable
              total = sum(variable_to_counts[variable].values())
              variable_to_probability[variable][value] = float(variable_to_counts[variable][value])/total
  return variable_to_probability

In [23]:
variable_to_probability = normalize(variables_to_parents, variable_to_counts)

In [24]:
# E-step
def e_step(data, var_to_prob, variables_to_parents):
  assignment_to_curr_prob = {}
  known_var_to_total = {}
  variables_sorted = sorted(variables_to_parents.keys())
  airplane_delay_var_index = variables_sorted.index("airplane_delay")
  for index, row in data.iterrows():
    # consider all possible values of the hidden variable
    for airplane_delay_val in range(NUM_AIRPLANE_DELAY_BUCKETS):
      # determine joint probability
      prob = 1
      assignment = []
      # sort the (variable, parent_list) tuples by variable name, this ensures that assigments are always in the same order
      for (variable, parent_list) in sorted(variables_to_parents.items(), key=lambda tup: tup[0]):
        if variable != "airplane_delay":
          value = str(row[variable])
        elif variable == 'airplane_delay':
          value = airplane_delay_val
        assignment.append(value)
        if parent_list != []:
          value = [value]
          for parent in parent_list:
              if parent != "airplane_delay":
                value.append(str(row[parent]))
              elif parent == "airplane_delay":
                value.append(airplane_delay_val)
          value = tuple(value)
        prob *= var_to_prob[variable][value]
      # add the joint probability to the map of assigments
      known_vars = assignment[:airplane_delay_var_index]
      known_vars.extend(assignment[airplane_delay_var_index+1:])
      known_vars = tuple(known_vars)
      known_var_to_total[known_vars] = known_var_to_total.get(known_vars, 0) + prob
      assignment = tuple(assignment)
      assignment_to_curr_prob[assignment] = prob
  # now I need to normalize the probabilities
  assigment_to_norm_prob = {}
  for (assignment, prob) in assignment_to_curr_prob.items():
    known_vars = list(assignment[:airplane_delay_var_index])
    known_vars.extend(assignment[airplane_delay_var_index+1:])
    known_vars = tuple(known_vars)
    total = known_var_to_total[known_vars] 
    assigment_to_norm_prob[assignment] = float(prob) / total
  return assigment_to_norm_prob

In [25]:
def initialize_counts(variables_to_parents, smoothing_constant, bayes_data):
  variable_to_counts = {}
  for var in variables_to_parents:
    if len(variables_to_parents[var]) == 0:
        variable_to_counts[var] = {key:smoothing_constant for key in np.unique(bayes_data[var].astype(str).unique())}
 # variable_to_counts['time_delay'] = {(time_delay, month, day_of_week, sch_departure):smoothing_constant
  #                                  for time_delay in range(NUM_TIME_DELAY_BUCKETS)
   #                                 for month in bayes_data['MONTH'].astype(str).unique()
    #                                for day_of_week in bayes_data['DAY_OF_WEEK'].astype(str).unique()
     #                               for sch_departure in bayes_data['SCHEDULED_DEPARTURE'].astype(str).unique()}
  variable_to_counts['airplane_delay'] = {(airplane_delay, year, model):smoothing_constant
                                       for airplane_delay in range(NUM_AIRPLANE_DELAY_BUCKETS)
                                       for year in bayes_data['YEAR'].astype(str).unique()
                                       for model in bayes_data['MODEL'].astype(str).unique()}
  variable_to_counts['ARRIVAL_DELAY'] = {(arrival_delay, airplane_delay, airline, dest_airport):smoothing_constant
                                      for arrival_delay in bayes_data['ARRIVAL_DELAY'].astype(str).unique()
                                      for airplane_delay in range(NUM_AIRPLANE_DELAY_BUCKETS)
                                      for airline in bayes_data['AIRLINE'].astype(str).unique()
                                      for dest_airport in bayes_data['DESTINATION_AIRPORT'].astype(str).unique()}
                                      # for airplane_delay in range(NUM_AIRPLANE_DELAY_BUCKETS)}
  return variable_to_counts

In [26]:
def m_step(assignment_to_prob, variables_to_parents, smoothing_constant, bayes_data):
  variables_sorted = sorted(variables_to_parents.keys())
  # need way of translating assignment index to variable value
  variable_to_assigment_index = {variables_sorted[i]:i for i in range(len(variables_sorted))}
  # initialize the new counts map
  variable_to_counts = initialize_counts(variables_to_parents,smoothing_constant, bayes_data)
  # for every assignment, add the value of its variables to the counts
  for (assignment, weight) in assignment_to_prob.items():
    for (variable, parent_list) in variables_to_parents.items():
      value = assignment[variable_to_assigment_index[variable]]
      if parent_list != []:
        value = [value]
        for parent in parent_list:
           value.append(assignment[variable_to_assigment_index[parent]])
        value = tuple(value)
      variable_to_counts[variable][value] += weight
  # normalize
  variable_to_prob = normalize(variables_to_parents, variable_to_counts)
  return variable_to_prob

In [27]:
def EM_learning(bayes_data, variable_to_probability, variable_to_parents, epsilon):
  greaterThanEpsilon = True
  iteration = 0
  old_var_to_prob = variable_to_probability
  while greaterThanEpsilon and iteration < 50:
    assigment_to_norm_prob = e_step(bayes_data, old_var_to_prob, variable_to_parents)
    updated_var_to_prob = m_step(assigment_to_norm_prob, variable_to_parents, .0001, bayes_data)
    # determine if we've converged
    # for us, convergence happens if all of the unknown thetas change by less than epsilon
    # right now, unknown thetas are probabilities of airplane_delay
    greaterThanEpsilon = False
    for value in old_var_to_prob['airplane_delay']:
      diff = abs(old_var_to_prob['airplane_delay'][value] - updated_var_to_prob['airplane_delay'][value])
      if diff > epsilon:
        greaterThanEpsilon = True
        break
    iteration += 1
    print('iteration: {} diff: {}'.format(iteration, diff))
    # print(updated_var_to_prob['airplane_delay'][(1, '7', '1', '13')])
    old_var_to_prob = updated_var_to_prob
  return updated_var_to_prob

In [28]:
final_var_to_prob = EM_learning(bayes_data, variable_to_probability, variables_to_parents, 0.0015)

iteration: 1 diff: 0.0333333333333
iteration: 2 diff: 0.00688405302904
iteration: 3 diff: 0.0115857843951
iteration: 4 diff: 0.0154422039364
iteration: 5 diff: 0.0181108223232
iteration: 6 diff: 0.0191797935813
iteration: 7 diff: 0.0184238501325
iteration: 8 diff: 0.0160723201501
iteration: 9 diff: 0.0128311415751
iteration: 10 diff: 0.00954714367454
iteration: 11 diff: 0.00679893193202
iteration: 12 diff: 0.00476370496038
iteration: 13 diff: 0.00335416828089
iteration: 14 diff: 0.0024004242348
iteration: 15 diff: 0.00175058754016
iteration: 16 diff: 0.0267188721701
iteration: 17 diff: 0.025043117341
iteration: 18 diff: 0.0235134105788
iteration: 19 diff: 0.0221415098463
iteration: 20 diff: 0.0208970110396
iteration: 21 diff: 0.0197361339986
iteration: 22 diff: 0.0186191193018
iteration: 23 diff: 0.0175197245567
iteration: 24 diff: 0.0164276665367
iteration: 25 diff: 0.0153451099256
iteration: 26 diff: 0.0142813788297
iteration: 27 diff: 0.0132485420676
iteration: 28 diff: 0.0122573520

In [29]:
from sklearn import metrics
from sklearn.metrics import confusion_matrix

In [30]:
flights_data_test = bucket.item('model-test.csv')
uri = flights_data_test.uri
%gcs read --object $uri --variable data
test_data = pd.read_csv(BytesIO(data))
true_labels = test_data[['ARRIVAL_DELAY']].values

In [67]:
guessed_labels = []
for index, row in test_data.iterrows():
  prob = 1
  max_prob = 0
  max_val = -1
  for airplane_delay_val in range(NUM_AIRPLANE_DELAY_BUCKETS):
    parent_list = variables_to_parents['airplane_delay']
    value = airplane_delay_val
    value = [value]
    for parent in parent_list:
      value.append(str(row[parent]))
    value = tuple(value)
    prob = final_var_to_prob['airplane_delay'].get(value,.000000001)
    if prob > max_prob:
      max_prob = prob
      max_val = arrival_delay_val
  airplane_delay_val = max_val
  max_prob = 0
  max_val = -1
  for arrival_delay_val in range(3): # arrival delay can be 0, 1 or 2
    parent_list = variables_to_parents['ARRIVAL_DELAY']
    value = str(float(arrival_delay_val))
    value = [value]
    for parent in parent_list:
      if parent == 'airplane_delay':
        parent = value.append(airplane_delay_val)
      else:
        value.append(str(row[parent]))
    value = tuple(value)
    prob = final_var_to_prob['ARRIVAL_DELAY'].get(value,.000000001)
    if prob > max_prob:
      max_prob = prob
      max_val = arrival_delay_val
 #   print(prob)
 #   print(value)
  guessed_labels.append(max_val)
#  if len(guessed_labels) > 3:
 #   break

In [68]:
lr_test_acc = metrics.accuracy_score(true_labels, guessed_labels)
pred_y_test = [1 if guessed_labels[i] > 1 else 0 for i in range(len(guessed_labels))]
test_y_binarized = [1 if true_labels[i][0] > 1 else 0 for i in range(len(true_labels))]
print (len(test_y_binarized))
print ('baseline test acc: ' + str(lr_test_acc))
print ('baseline precision: ' + str(metrics.precision_score(test_y_binarized, pred_y_test)))

17700
baseline test acc: 0.3241242937853107
baseline precision: 0.3369404820462371


In [69]:
print(true_labels[:5])
print(guessed_labels[:5])

[[0.]
 [1.]
 [1.]
 [0.]
 [2.]]
[2, 0, 2, 2, 2]


In [73]:
import cPickle as pickle

json = pickle.dumps(final_var_to_prob)
bucket.item('hidden_airplane_var.txt').write_to(json,'text/txt')