In [130]:
# Raw input CSVs.
INPUT_TRAIN = './all/train.csv'
INPUT_TEST = './all/test.csv'

# Files read back by load_preprocessed_dfs(): processed features and the target.
TRAIN = 'train-processed.csv'
TEST = 'test-processed.csv'
Y = 'y.csv'

In [132]:
# Run the preprocessing pipeline (presumably defined in an earlier cell).
# Per its log output it loads the raw CSVs, drops repeated columns, generates
# date columns, encodes columns with pd.factorize() and splits the data back.
preprocess()

Loading ./all/train.csv
Loaded train.csv. Shape: (903653, 55)
Loading ./all/test.csv
Loaded test.csv. Shape: (804684, 53)
Processing dfs...
Dropping repeated columns...
Generating date columns...
Encoding columns with pd.factorize()
Splitting back...


In [139]:
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error

def rmse(y_true, y_pred):
    """Return the root-mean-squared error of `y_pred` against `y_true`, rounded to 5 decimals."""
    mse = mean_squared_error(y_true, y_pred)
    root = np.sqrt(mse)
    return round(root, 5)

def load_preprocessed_dfs(drop_full_visitor_id=True):
    """
    Read the files `TRAIN`, `TEST` and `Y` generated by preprocess().

    `fullVisitorId` is parsed as a string in both feature tables. When
    `drop_full_visitor_id` is true, that column is removed from both tables.

    Returns the tuple (X_train, y_train, X_test).
    """
    id_as_str = {'fullVisitorId': str}
    features_train = pd.read_csv(TRAIN, converters=id_as_str)
    features_test = pd.read_csv(TEST, converters=id_as_str)

    # Single-column target file -> transpose and squeeze it down to one dimension
    target_frame = pd.read_csv(Y, names=['LogRevenue'])
    target = target_frame.T.squeeze()

    if not drop_full_visitor_id:
        return features_train, target, features_test

    # fullVisitorId is the only `object` column; it is left out for train and evaluation
    return (
        features_train.drop(['fullVisitorId'], axis=1),
        target,
        features_test.drop(['fullVisitorId'], axis=1),
    )

# Load the preprocessed tables and hold out 15% of the training rows for validation.
X, y, X_test = load_preprocessed_dfs()
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    test_size=0.15,
    random_state=1,
)

print("Train shape: {}".format(X_train.shape))
print("Validation shape: {}".format(X_val.shape))
print("Test (submit) shape: {}".format(X_test.shape))

def run_xgb(X_train, y_train, X_val, y_val, X_test=None):
    """
    Train an XGBoost model with default parameters and print train/validation RMSE.

    Parameters
    ----------
    X_train, y_train : training features and targets.
    X_val, y_val : held-out features and targets, used only for scoring.
    X_test : optional features to predict on (e.g. the submission set).

    Returns
    -------
    (y_pred_submit, model) : predictions for `X_test` (None when `X_test` is
        not supplied) and the trained booster.
    """
    params = {}  # empty dict -> XGBoost's default parameters

    xgb_train_data = xgb.DMatrix(X_train, y_train)
    xgb_val_data = xgb.DMatrix(X_val, y_val)

    # Reuse the DMatrix built above (the bare name `DMatrix` was never imported).
    model = xgb.train(params, xgb_train_data)

    # No early stopping is configured, so prediction uses every boosted tree;
    # an explicit tree limit is unnecessary.
    y_pred_train = model.predict(xgb_train_data)
    y_pred_val = model.predict(xgb_val_data)

    print(f"XGB : RMSE val: {rmse(y_val, y_pred_val)}  - RMSE train: {rmse(y_train, y_pred_train)}")

    # Only predict for the submit set when the caller actually passed one in.
    y_pred_submit = None
    if X_test is not None:
        y_pred_submit = model.predict(xgb.DMatrix(X_test))
    return y_pred_submit, model

Train shape: (768105, 31)
Validation shape: (135548, 31)
Test (submit) shape: (804684, 31)


In [141]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error

def train_lr(X_train, y_train, X_val, y_val):
    """
    Fit a LinearRegression on the training split and report its validation error.

    Prints the mean squared error on (X_val, y_val), one line per fitted
    coefficient labelled with the matching training column, and the intercept.

    Parameters
    ----------
    X_train, y_train : features and targets used to fit the model.
    X_val, y_val : held-out features and targets used for the reported error.

    Returns
    -------
    The fitted LinearRegression model.
    """
    model = LinearRegression()
    print('Training model: {0}...'.format('lr'))
    model.fit(X_train, y_train)

    # The error is measured on the held-out split, so label it as validation.
    val_predictions = model.predict(X_val)
    print('Mean Squared Error (Validation):', mean_squared_error(y_val, val_predictions))
    print('\nCOEFFICIENTS:')
    # Take the names from the data the model was fitted on rather than from a
    # notebook-level global; fall back to positions for inputs without columns.
    feature_names = getattr(X_train, 'columns', range(len(model.coef_)))
    for name, coef in zip(feature_names, model.coef_):
        print('{0}: {1}'.format(name, coef))
    print('==============================')
    print('\nINTERCEPT:', model.intercept_)

    return model

In [143]:
# Reload the preprocessed data, redo the 85/15 split and fit the linear model.
X, y, X_test = load_preprocessed_dfs()
X_train, X_val, y_train, y_val = train_test_split(
    X, y, test_size=0.15, random_state=1
)
model = train_lr(X_train=X_train, y_train=y_train, X_val=X_val, y_val=y_val)

Training model: lr...
Mean Squared Error (Training): 3.367116452727918

COEFFICIENTS:
channelGrouping: 0.0026374585999358064
visitNumber: 0.00018729966395682098
device.browser: -0.004863842275405668
device.deviceCategory: -0.026196572440011327
device.isMobile: -0.21558286380397315
device.operatingSystem: 0.03469199900278587
geoNetwork.city: -0.000584190115560241
geoNetwork.continent: 0.02895565342753322
geoNetwork.country: 0.00014424673871882922
geoNetwork.metro: 0.01024111687525408
geoNetwork.networkDomain: -4.823996153354428e-06
geoNetwork.region: 0.0007695732759410102
geoNetwork.subContinent: 0.003254177583360256
totals.hits: -0.0014123616641848083
totals.pageviews: 0.11170290392526085
trafficSource.adContent: 0.012888581830382452
trafficSource.adwordsClickInfo.adNetworkType: -0.07504685217335817
trafficSource.adwordsClickInfo.gclId: -1.170306797879344e-05
trafficSource.adwordsClickInfo.page: -0.036369900000716625
trafficSource.adwordsClickInfo.slot: 0.1601493783560182
trafficSource