In [1]:
from src.library.load.load import Load
from src.library.features.features import Polynomial, Standardize, One_Hot_Enc
from src.library.model.model import Model
from src.library.preprocessor.preprocessor import PreProcessor
from src.library.metrics.metrics import Metrics
from src.library.split.split import Split

In [2]:
#Loading Data

df = Load('flights_data.csv')

In [3]:
subset = ['flight', 'price']

# Cleaning Data
df = PreProcessor(df.df)
df.clean(subset) # dropping any NAs in flight or price
df_clean = df.clean

In [4]:
subset_mean = ['duration','days_left']

# Fill NAs for numerical variables with mean by flight
cmeans = PreProcessor(df_clean)
cmeans.conditional_means(df_clean, subset_mean)
data_clean = cmeans.fill_means

In [5]:
subset_mode = ['airline','source_city', 'departure_time', 'stops', 'arrival_time', 'destination_city', 'class']

# Fill NAs for categorical variables with mode by flight
cmodes = PreProcessor(data_clean)
cmodes.conditional_modes(df_clean, subset_mode)
data_clean = cmodes.fill_modes

In [6]:
# Creating features

# We don't want to create features for 'price' so we save it before dropping it
y = df_clean['price']
df_feat = df_clean.drop(['Unnamed: 0', 'price'], axis = 1)

# Creating polynomial features
col_names = ['duration', 'days_left']
poly = df_feat[col_names]
poly = Polynomial(poly)
poly = poly.transform(2).drop('1', axis = 1)
col_names2 = ['duration', 'days_left', 'duration^2','duration days_left', 'days_left^2']
df_feat[col_names2] = poly

# We only want to standardize the numerical columns, excluding 'price'
features = df_feat[col_names2]
scaler = Standardize(features)
features = scaler.transform()
df_feat[col_names2] = features

In [7]:
# One-Hot Encoding our categorical variables
df_feat = One_Hot_Enc(df_feat).transform()

In [8]:
# Getting our target variable back to split the data
df_feat['price'] = y

# Splitting between test and train samples
df = Split(df_feat)
test = df.test
train = df.train

In [9]:
# Declaring our X matrices 
X_train = train.drop('price', axis = 1)
X_test = test.drop('price', axis = 1)

# Declaring our target variable
y_train = train['price']
y_test = test['price']

In [10]:
# Training the model
model = Model(X_train, y_train)
model.train_rf()
y_pred_train = model.predict(X_train)
prediction = model.predict(X_test)
test['predictions'] = prediction

In [11]:
# Scoring 

score = Metrics.mse(y_test, test['predictions'])

print('Test Score is: ' + str(score))

Test Score is: 25206.017109947647
