In [108]:
import pandas as pd
from sklearn.linear_model import LinearRegression
from sklearn.externals import joblib
#Please enter data to predict a ticket price. To check accuracy, you may take data from https://venta.renfe.com/
#Example: MADRID,SEVILLA,2019-05-29 06:20:00,2019-05-29 09:16:00,   AV City,38.55,Turista,Promo 
#NOTE: the algorithm is case-sensitive, so please pay close attention to the letter case of the data you enter
# Column order of the raw ticket record.
feature_names = ['origin', 'destination', 'start_date', 'end_date',
                 'train_type', 'price', 'train_class', 'fare']
# Values of the single ticket to evaluate, listed in the same order as feature_names.
ticket_values = ('MADRID', 'SEVILLA', '2019-09-17 08:00:00', '2019-09-17 10:32:00',
                 'AVE', 76.30, 'Turista', 'Flexible')
# One-row frame; passing `columns` explicitly pins the column order.
X = pd.DataFrame([dict(zip(feature_names, ticket_values))], columns=feature_names)
X

Unnamed: 0,origin,destination,start_date,end_date,train_type,price,train_class,fare
0,MADRID,SEVILLA,2019-09-17 08:00:00,2019-09-17 10:32:00,AVE,76.3,Turista,Flexible


In [109]:
from sklearn.preprocessing import OneHotEncoder

def preprocess_data(df, encoder_path='OH_encoder.sav'):
    """Turn raw ticket rows into the feature frame fed to the model.

    Rows whose ``price`` is missing are discarded and the ``price`` column is
    removed. ``start_date`` and ``end_date`` are parsed as datetimes and
    expanded into hour, minute, second, weekday name, day, month and year
    columns; a ``duration`` column (``end_date - start_date`` in seconds) is
    added and the two datetime columns are dropped. Every remaining
    object-dtype column is then transformed by the encoder loaded from
    ``encoder_path`` and the result is appended to the numeric columns.

    The input frame is not modified; all work happens on a copy.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``price``, ``start_date`` and ``end_date`` columns.
    encoder_path : str, optional
        File from which the fitted encoder is loaded with ``joblib.load``.

    Returns
    -------
    pandas.DataFrame
        Non-categorical columns followed by the encoder output columns,
        indexed like the input rows that had a price.
    """
    # dropna/drop without inplace return new objects, and the explicit copy
    # guarantees the column assignments below never touch the caller's frame.
    features = df.dropna(axis=0, subset=['price']).drop(['price'], axis=1).copy()

    for col in ['start_date', 'end_date']:
        stamps = pd.to_datetime(features[col])
        features[col] = stamps
        features[col + '_hour'] = stamps.dt.hour
        features[col + '_minute'] = stamps.dt.minute
        features[col + '_second'] = stamps.dt.second
        # Series.dt.weekday_name was removed in pandas 1.0; day_name() yields
        # the same English day names.
        features[col + '_weekday'] = stamps.dt.day_name()
        features[col + '_day'] = stamps.dt.day
        features[col + '_month'] = stamps.dt.month
        features[col + '_year'] = stamps.dt.year

    # Trip length in seconds, computed vectorized on the timedelta column.
    features['duration'] = (features['end_date'] - features['start_date']).dt.total_seconds()

    # The raw datetime columns are not model inputs once decomposed.
    features = features.drop(['start_date', 'end_date'], axis=1)

    # Encode categorical columns. Column order is preserved because the
    # encoder receives the columns positionally.
    categorical_cols = [col for col in features.columns if features[col].dtype == 'object']
    # NOTE(security): joblib.load unpickles the file, which can execute
    # arbitrary code. Only load encoder files from a trusted source.
    OH_encoder = joblib.load(encoder_path)
    OH_object = pd.DataFrame(OH_encoder.transform(features[categorical_cols]))

    # Re-align the encoded rows with the surviving input rows before concat.
    OH_object.index = features.index

    num_df = features.drop(categorical_cols, axis=1)

    # Add one-hot encoded columns to numerical features
    return pd.concat([num_df, OH_object], axis=1)

In [110]:
from sklearn.metrics import mean_absolute_error

linear_regression_path = 'LinearRegression.sav'
# NOTE(security): joblib.load unpickles the file, which can execute arbitrary
# code. Only load model files from a trusted source.
loaded_model = joblib.load(linear_regression_path)

# Capture the known prices BEFORE preprocessing: the frame returned by
# preprocess_data has no 'price' column and the function does not return the
# target, so without this line `y` is undefined on a fresh kernel (NameError).
# dropna() mirrors the row filter applied inside preprocess_data, keeping `y`
# the same length as the predictions.
y = X['price'].dropna()

# X is rebound to the encoded feature frame; re-run the input cell before
# re-running this one, since the encoded frame no longer has a 'price' column.
X = preprocess_data(X)

preds = loaded_model.predict(X)
print("MAE is {}".format(mean_absolute_error(y, preds)))

# Persist predictions (keyed by row index) and the encoded features.
output = pd.DataFrame({'Id': X.index,
                       'SalePrice': preds})
output.to_csv('predicted_prices.csv', index=False)
X.to_csv('input.csv')

print(preds)

MAE is 8.393322753906247
[67.90667725]
