# Notebook for XGBoost testing

## Getting data

In [104]:
%store -r dm 

from sklearn.model_selection import train_test_split

dm = dm

X_A_train = dm.data_A.iloc[:, 2:-1]
y_A_train = dm.data_A.iloc[:,0]
X_B_train = dm.data_B.iloc[:, 2:-1]
y_B_train = dm.data_B.iloc[:,0]
X_C_train = dm.data_C.iloc[:, 2:-1]
y_C_train = dm.data_C.iloc[:,0]

X_train_A, X_test_A, y_train_A, y_test_A = train_test_split(X_A_train, y_A_train, shuffle=True)
X_train_B, X_test_B, y_train_B, y_test_B = train_test_split(X_B_train, y_B_train, shuffle=True)
X_train_C, X_test_C, y_train_C, y_test_C = train_test_split(X_C_train, y_C_train, shuffle=True)

X_A_submission = dm.X_test_estimated_a[dm.X_test_estimated_a.columns.intersection(X_A_train.columns)]
X_B_submission = dm.X_test_estimated_b[dm.X_test_estimated_b.columns.intersection(X_B_train.columns)]
X_C_submission = dm.X_test_estimated_c[dm.X_test_estimated_c.columns.intersection(X_C_train.columns)]

X_A_train.shape

(136242, 43)

## Using time series to split our data


In [105]:
from sklearn.model_selection import TimeSeriesSplit

ts_cv = TimeSeriesSplit(
    n_splits=10,
    gap=15,
    max_train_size=10000,
    test_size=1000,
)

all_splits_A = list(ts_cv.split(X_A_train, y_A_train))
all_splits_B = list(ts_cv.split(X_B_train, y_B_train))
all_splits_C = list(ts_cv.split(X_C_train, y_C_train))


## Splitting our data normally

In [106]:

X_train_A, X_test_A, y_train_A, y_test_A = train_test_split(X_A_train, y_A_train, shuffle=False, test_size=0.1)
X_train_B, X_test_B, y_train_B, y_test_B = train_test_split(X_B_train, y_B_train, shuffle=False, test_size=0.1)
X_train_C, X_test_C, y_train_C, y_test_C = train_test_split(X_C_train, y_C_train, shuffle=False, test_size=0.1)

X_A_submission = dm.X_test_estimated_a[dm.X_test_estimated_a.columns.intersection(X_A_train.columns)]
X_B_submission = dm.X_test_estimated_b[dm.X_test_estimated_b.columns.intersection(X_B_train.columns)]
X_C_submission = dm.X_test_estimated_c[dm.X_test_estimated_c.columns.intersection(X_C_train.columns)]

## Defining our model

In [110]:
from xgboost import XGBRegressor

xgb_A = XGBRegressor()
xgb_B = XGBRegressor()
xgb_C = XGBRegressor()


## Training our model on time series

In [90]:

from sklearn.metrics import mean_absolute_error
import pandas as pd

xgb_A.fit(X_A_train.iloc[all_splits_A[0][0]], y_A_train.iloc[all_splits_A[0][0]], eval_set=[(X_A_train.iloc[all_splits_A[0][1]], y_A_train.iloc[all_splits_A[0][1]])], verbose=0)
xgb_B.fit(X_B_train.iloc[all_splits_B[0][0]], y_B_train.iloc[all_splits_B[0][0]], eval_set=[(X_B_train.iloc[all_splits_B[0][1]], y_B_train.iloc[all_splits_B[0][1]])], verbose=0)
xgb_C.fit(X_C_train.iloc[all_splits_C[0][0]], y_C_train.iloc[all_splits_C[0][0]], eval_set=[(X_C_train.iloc[all_splits_C[0][1]], y_C_train.iloc[all_splits_C[0][1]])], verbose=0)


for train, test in (all_splits_A[1:]): 

    xgb_A.fit(X_A_train.iloc[train], y_A_train.iloc[train], eval_set=[(X_A_train.iloc[test], y_A_train.iloc[test])], xgb_model=xgb_A, verbose=0)

    preds_A = pd.DataFrame(xgb_A.predict(X_A_train.iloc[test]))
    actual = y_A_train.iloc[test]
    
    print("CURRENT SCORING A ----->   ", mean_absolute_error(actual, preds_A))

for train, test in (all_splits_B): 

    xgb_B.fit(X_B_train.iloc[train], y_B_train.iloc[train], eval_set=[(X_B_train.iloc[test], y_B_train.iloc[test])], xgb_model=xgb_B, verbose=0)

    preds_B = pd.DataFrame(xgb_B.predict(X_B_train.iloc[test]))
    actual = y_B_train.iloc[test]

    print("CURRENT SCORING B ----->   ", mean_absolute_error(actual, preds_B))

for train, test in (all_splits_C): 

    xgb_C.fit(X_C_train.iloc[train], y_C_train.iloc[train], eval_set=[(X_C_train.iloc[test], y_C_train.iloc[test])], xgb_model=xgb_C, verbose=0)

    preds_C = pd.DataFrame(xgb_C.predict(X_C_train.iloc[test]))
    actual = y_C_train.iloc[test]

    print("CURRENT SCORING C ----->   ", mean_absolute_error(actual, preds_C))



CURRENT SCORING A ----->    104.46614614170701
CURRENT SCORING A ----->    108.72018849105918
CURRENT SCORING A ----->    136.54284475366848
CURRENT SCORING A ----->    260.17068012964376
CURRENT SCORING A ----->    237.7066067611975
CURRENT SCORING A ----->    504.82350545372844
CURRENT SCORING A ----->    405.95321643406186
CURRENT SCORING A ----->    669.4257213162938
CURRENT SCORING A ----->    501.50392677930034
CURRENT SCORING B ----->    7.77049536560415
CURRENT SCORING B ----->    10.05257697996823
CURRENT SCORING B ----->    17.45583734098516
CURRENT SCORING B ----->    15.318454764519581
CURRENT SCORING B ----->    25.990018080952023
CURRENT SCORING B ----->    22.397877188516315
CURRENT SCORING B ----->    70.49825163632556
CURRENT SCORING B ----->    55.56585218903334
CURRENT SCORING B ----->    77.41648554553643
CURRENT SCORING B ----->    73.48194939061443
CURRENT SCORING C ----->    4.145165840937271
CURRENT SCORING C ----->    3.9330962650755557
CURRENT SCORING C ----->

## Training our model on normal train/test/split

In [108]:
xgb_A.fit(X_train_A, y_train_A, eval_set=[(X_test_A, y_test_A)], verbose=0)
xgb_B.fit(X_train_B, y_train_B, eval_set=[(X_test_B, y_test_B)], verbose=0)
xgb_C.fit(X_train_C, y_train_C, eval_set=[(X_test_C, y_test_C)], verbose=0)


## Evaluating using MAE

In [109]:
from sklearn.metrics import mean_absolute_error
import pandas as pd


preds_A = pd.DataFrame(xgb_A.predict(X_test_A))
actual = y_test_A

print(mean_absolute_error(actual, preds_A))

preds_B = pd.DataFrame(xgb_B.predict(X_test_B))
actual = y_test_B

print(mean_absolute_error(actual, preds_B))

preds_C = pd.DataFrame(xgb_C.predict(X_test_C))
actual = y_test_C

print(mean_absolute_error(actual, preds_C))



140.65942055361336
27.630429422420423
28.12795247065992


## Make preds

In [103]:
## Specify the model to be used
pred_A = xgb_A.predict(X_A_submission)
pred_B = xgb_B.predict(X_B_submission)
pred_C = xgb_C.predict(X_C_submission)
 
test = pd.read_csv('test.csv')

df_A = pd.DataFrame()

df_A["prediction"] = pred_A
df_A["location"] = "A"

df_B = pd.DataFrame()

df_B["prediction"] = pred_B
df_B["location"] = "B"

df_C = pd.DataFrame()

df_C["prediction"] = pred_C
df_C["location"] = "C"

df_mid = pd.concat([df_A, df_B], ignore_index=True)

df = pd.concat([df_mid, df_C], join="inner", ignore_index=True)



df = df.drop("location", axis=1)


# df["id"] = test["id"]

# df = df[["id", "prediction"]]

#df[df<0] = 0

# NAME THE FILE 
df.to_csv("sub33.csv")

df

Unnamed: 0,prediction
0,-0.898888
1,0.307747
2,5.864113
3,197.614258
4,429.093323
...,...
2155,30.773670
2156,7.342603
2157,-4.318516
2158,0.093121
