In [1]:
import pandas as pd
import numpy as np

In [2]:
data = pd.read_csv('../Assets/Datasets/FlightTracker_Airplane/FA_NNumber_Scrape.csv')
data.drop('Unnamed: 0', axis=1, inplace=True)

In [3]:
# import match tables
mfr_match = pd.read_csv('../Assets/Datasets/FlightTracker_Airplane/MFR_Match.csv')
model_match = pd.read_csv('../Assets/Datasets/FlightTracker_Airplane/Model_Match.csv')

In [4]:
# merge match tables
data = pd.merge(data, mfr_match, left_on= 'mfr', right_on='MFR', how='left')
data = pd.merge(data, model_match, on='model', how='left')
data = data.drop_duplicates()

In [5]:
# remove useless columns and rename columns
data = data[['N-NUMBER', 'New_MFR', 'New_Model', 'mfr_year', 'seats']]
data.columns = ['N-NUMBER', 'mfr', 'Model', 'mfr_year', 'seats']
# Remove rows without mfr or model
data.dropna(subset=['mfr', 'Model', 'seats'], inplace=True)

In [6]:
data.isnull().sum()

N-NUMBER      0
mfr           0
Model         0
mfr_year    138
seats         0
dtype: int64

In [7]:
# Calculate average year for each mfr/model
data_full = data[data['mfr_year'].isnull() == False]
data_full['mfr_year'] = data_full['mfr_year'].astype(int)
avgs = data_full.groupby(['mfr', 'Model'])['mfr_year'].agg('mean')
avgs = pd.DataFrame(avgs).reset_index()
avgs['mfr_year'] = avgs['mfr_year'].astype(int)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  app.launch_new_instance()


In [10]:
# Fill missing years
needs_year = data[data['mfr_year'].isnull()]
needs_year = pd.merge(needs_year, avgs, on=['mfr', 'Model'], how='left')
needs_year = needs_year[['N-NUMBER', 'mfr', 'Model', 'mfr_year_y', 'seats']]
needs_year.columns = ['N-NUMBER', 'mfr', 'Model', 'mfr_year', 'seats']
still_needs_year = needs_year[needs_year['mfr_year'].isnull()]
has_avg_year = needs_year[needs_year['mfr_year'].isnull() == False]

In [11]:
# fill missing years with avg years. then drop missing year rows
data.dropna(inplace=True)
data = pd.concat([data, has_avg_year])

In [30]:
# if date still missing, adds year based on production dates for plane
# https://en.wikipedia.org/wiki/McDonnell_Douglas_MD-90
# https://en.wikipedia.org/wiki/Embraer_ERJ_145_family
needs_year_again = pd.DataFrame(
    still_needs_year.groupby(['mfr', 'Model'])['seats'].count()).reset_index().loc[:, ['mfr', 'Model']]

def fill_year(model):
    if model == 'MD-90-30':
        return 1997
    elif model == 'EMB-145LR':
        return 2003
    else:
        None

needs_year_again['mfr_year'] = 0
needs_year_again['mfr_year'] = needs_year_again['Model'].apply(lambda x: fill_year(x))

still_needs_year = pd.merge(still_needs_year, needs_year_again, on=['mfr', 'Model'], how='left')
still_needs_year = still_needs_year[['N-NUMBER', 'mfr', 'Model', 'mfr_year_y', 'seats']]
still_needs_year.columns = ['N-NUMBER', 'mfr', 'Model', 'mfr_year', 'seats']
finally_has_year = still_needs_year[still_needs_year['mfr_year'].isnull() == False]

In [34]:
# join years to data
data = pd.concat([data, finally_has_year])

In [42]:
data.to_csv('../Assets/Datasets/FlightTracker_Airplane/FA_Airplane_Info_Clean.csv')