In [1]:
import pandas as pd

from library.load.load import Load
from library.features.features import Polynomial, Standardize, One_Hot_Enc
from library.model.model import Model
from library.preprocessor.preprocessor import PreProcessor
from library.metrics.metrics import Metrics
from library.split.split import Split

In [2]:
# Loading data
# The source file is named once here; the Load helper object is kept in `df`
# and its `.df` attribute is what the cleaning step reads.
DATA_FILE = 'flights_data.csv'
df = Load(DATA_FILE)

In [42]:
# Cleaning data
# Rows with a missing value in any of these columns are dropped.
subset = ['flight', 'price']

# Keep the PreProcessor under its own name instead of rebinding `df`: the
# loader object stays intact, so this cell reads the same input (`df.df`)
# every time it is re-run rather than reading `.df` off a PreProcessor.
preprocessor = PreProcessor(df.df)
preprocessor.clean(subset)  # dropping any NAs in flight or price

# NOTE(review): `clean` is called as a method above and read as an attribute
# here. This only yields a DataFrame if PreProcessor.clean() stores its result
# on the instance under the same name — confirm against the library; otherwise
# the return value of clean(subset) should be used instead.
df_clean = preprocessor.clean

In [48]:
# Impute missing numeric values with the mean of the same column for that flight.
subset_means = ['duration', 'days_left']

# Per-flight table: index is the flight code, one column per entry in subset_means.
means = df_clean.groupby('flight')[subset_means].mean()

for col in subset_means:
    # `means[col]` is a Series indexed by flight code, so mapping the `flight`
    # column through it gives every row the mean of its own flight. The previous
    # `means[x.flight]` looked the flight code up among the *column* labels of
    # `means` and raised KeyError for any row that needed imputation.
    flight_mean = df_clean['flight'].map(means[col])
    # fillna only touches rows where `col` is missing. A flight whose values
    # are all missing has a NaN mean, so those rows stay missing.
    df_clean[col] = df_clean[col].fillna(flight_mean)

In [50]:
# Impute missing categorical values with the most frequent value for that flight.
subset_modes = ['airline','source_city', 'departure_time', 'stops', 'arrival_time', 'destination_city', 'class']


def _first_mode(values):
    """Return the most frequent non-null value of ``values``, or NaN if there is none.

    ``Series.mode`` returns a Series (several values on a tie, empty when every
    entry is null). Reducing it to one scalar keeps the per-flight table at
    exactly one value per cell; on a tie the first entry of the sorted mode
    result is used.
    """
    candidates = values.mode()
    return candidates.iat[0] if len(candidates) else float('nan')


# Per-flight table: index is the flight code, one column per entry in subset_modes.
modes = df_clean.groupby('flight')[subset_modes].agg(_first_mode)

for col in subset_modes:
    # `modes[col]` is a Series indexed by flight code. The previous
    # `modes[x.flight]` looked the flight code up among the *column* labels of
    # `modes` and raised KeyError for any row that needed imputation.
    flight_mode = df_clean['flight'].map(modes[col])
    # Only rows where `col` is missing are filled; rows of a flight with no
    # observed value for `col` stay missing.
    df_clean[col] = df_clean[col].fillna(flight_mode)

In [None]:
# Feature engineering

# Set the target aside first so none of the transforms below are applied to it.
y = df_clean['price']
df_feat = df_clean.drop(columns=['Unnamed: 0', 'price'])

# Polynomial expansion of the two numeric columns. transform(2) is presumably
# degree 2, given the squared and interaction names listed in col_names2; the
# '1' column of the expansion is discarded.
col_names = ['duration', 'days_left']
col_names2 = ['duration', 'days_left', 'duration^2','duration days_left', 'days_left^2']
poly = Polynomial(df_feat[col_names]).transform(2).drop(columns='1')
# NOTE(review): this assignment relies on `poly` having the same row index as
# df_feat and its columns being in col_names2 order — confirm against Polynomial.
df_feat[col_names2] = poly

# Standardize the numeric columns only ('price' was removed above).
# NOTE(review): the scaler sees all rows here, before the train/test split.
scaler = Standardize(df_feat[col_names2])
features = scaler.transform()
df_feat[col_names2] = features

# One-hot encode the categorical variables.
df_feat = One_Hot_Enc(df_feat).transform()

In [None]:
# Re-attach the target so that features and price are split together.
df_feat['price'] = y

# Split into test and train samples; both are read off the Split object.
df = Split(df_feat)
test, train = df.test, df.train

In [None]:
# For each sample, separate the feature matrix (every column except 'price')
# from the target column.
X_train, y_train = train.drop(columns='price'), train['price']
X_test, y_test = test.drop(columns='price'), test['price']

In [None]:
# Fit the model on the training sample.
# reg="l2" is presumably L2 regularization — confirm against Model.train.
model = Model(X_train, y_train)
model.train(reg="l2")

# Predict for both samples. The test predictions are also stored as a column
# on `test`, which is where the scoring step reads them from.
y_pred_train = model.predict(X_train)
prediction = model.predict(X_test)
test['predictions'] = prediction

In [None]:
# Scoring
print('Test Score')
# Mean squared error of the test-set predictions against the held-out target.
score = Metrics.mse(y_test, test['predictions'])

# Format with an f-string: `'Test Score is:' + score` raises TypeError unless
# `score` is already a str. The printed text is unchanged when it is a str.
print(f'Test Score is:{score}')