In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split

# Load the dataset from ./input/renfe.csv.
X = pd.read_csv('./input/renfe.csv')

# 'price' is the prediction target, so rows without it cannot be used.
X.dropna(axis=0, subset=['price'], inplace=True)
y = X.price

# Remove the target from the feature frame, together with 'insert_date'.
X.drop(['price', 'insert_date'], axis=1, inplace=True)

# Expand each timestamp column into separate time-of-day / calendar features.
# The parsed datetime column itself is kept in X for now.
for col in ['start_date', 'end_date']:
    date_col = pd.to_datetime(X[col])
    X[col] = date_col
    X[col + '_hour'] = date_col.dt.hour
    X[col + '_minute'] = date_col.dt.minute
    X[col + '_second'] = date_col.dt.second
    # `.dt.weekday_name` was deprecated in pandas 0.23 and removed in 1.0;
    # `.dt.day_name()` returns the same English day names ('Monday', ...).
    X[col + '_weekday'] = date_col.dt.day_name()
    X[col + '_day'] = date_col.dt.day
    X[col + '_month'] = date_col.dt.month
    X[col + '_year'] = date_col.dt.year

In [2]:
# Elapsed time between start and end in seconds, computed with the vectorized
# timedelta accessor instead of a per-row Python lambda.
X['duration'] = (X['end_date'] - X['start_date']).dt.total_seconds()

# NOTE(review): 'duration' is dropped again right here, so the value computed
# above never reaches the models -- confirm whether that is intentional.
# The datetime columns are removed as well, along with 'Unnamed: 0'
# (presumably the index column saved into the CSV -- verify).
X.drop(columns=['start_date', 'end_date', 'Unnamed: 0', 'duration'], inplace=True)

In [3]:
# Hold out 20% of the rows for validation; the fixed random_state keeps the
# split reproducible between runs.
X_train, X_valid, y_train, y_valid = train_test_split(
    X,
    y,
    train_size=0.8,
    test_size=0.2,
    random_state=0,
)

In [4]:
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error
from sklearn.linear_model import LinearRegression
from sklearn.svm import LinearSVR
from lightgbm import LGBMRegressor
from sklearn.externals import joblib

def test(model, train, valid, train_y, valid_y):
    """Fit ``model`` on the training data and score it on the validation data.

    Parameters
    ----------
    model : estimator exposing ``fit(X, y)`` and ``predict(X)``
    train, train_y : features and targets used for fitting
    valid, valid_y : features and targets used for scoring

    Returns
    -------
    The mean absolute error between ``valid_y`` and the model's predictions
    for ``valid``.

    Note: ``model`` is fitted in place.
    """
    model.fit(train, train_y)
    return mean_absolute_error(valid_y, model.predict(valid))

def create_model(model, train, train_y, file_name):
    """Fit ``model`` on the given data and persist it to ``file_name``.

    The estimator is fitted in place and then serialized with ``joblib.dump``.
    Nothing is returned.
    """
    model.fit(train, train_y)
    joblib.dump(model, file_name)



In [5]:
from sklearn.preprocessing import OneHotEncoder

# Columns stored with the generic object dtype are treated as categorical.
categorical_cols = [name for name, dtype in X.dtypes.items() if dtype == 'object']

# The encoder is fitted on the training rows only and then reused for the
# validation rows; handle_unknown='ignore' keeps transform from failing on
# categories that were not present during fit.
OH_encoder = OneHotEncoder(handle_unknown='ignore', sparse=False)
OH_cols_train = pd.DataFrame(
    OH_encoder.fit_transform(X_train[categorical_cols]),
    index=X_train.index,  # realign with the source rows for the concat below
)
OH_cols_valid = pd.DataFrame(
    OH_encoder.transform(X_valid[categorical_cols]),
    index=X_valid.index,
)

# What remains after removing the categorical columns.
num_X_train = X_train.drop(columns=categorical_cols)
num_X_valid = X_valid.drop(columns=categorical_cols)

# Final design matrices: remaining columns followed by the one-hot columns.
OH_X_train = pd.concat([num_X_train, OH_cols_train], axis=1)
OH_X_valid = pd.concat([num_X_valid, OH_cols_valid], axis=1)


In [6]:
# Evaluate a LinearRegression model on the one-hot encoded features.
model_linear_regression = LinearRegression()
linear_regression_mae = test(
    model_linear_regression, OH_X_train, OH_X_valid, y_train, y_valid
)
print("MAE of a Linear Regression is {}".format(linear_regression_mae))

MAE of a Linear Regression is 7.469592075772801


In [7]:
# Evaluate an LGBMRegressor with default settings on the same features.
model_LGBMRegressor = LGBMRegressor()
lgbm_mae = test(model_LGBMRegressor, OH_X_train, OH_X_valid, y_train, y_valid)
print("MAE of a LGBMRegressor is {}".format(lgbm_mae))

MAE of a LGBMRegressor is 4.201617434414411
