In [1]:
import pandas as pd
import numpy as np
from glob import glob
import xgboost as xgb
from tqdm import tqdm

In [2]:
data = pd.DataFrame()

In [3]:
csv_paths = list(glob("/media/xview/xview3_challenge/dataset/ensemble/*/*.csv"))

In [4]:
for csv in tqdm(csv_paths[:30]):
    data = pd.concat([data, pd.read_csv(csv)])

100%|███████████████████████████████████████████| 30/30 [00:52<00:00,  1.73s/it]


In [5]:
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

In [6]:
data = data.values

In [11]:
np.unique(data[:, -1], return_counts=True)

(array([0., 1., 2., 3.]), array([1446511,  282553,  506480,  242158]))

In [12]:
X, y = data[:, :-1], data[:, -1]
del data

In [13]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)

In [14]:
X_train.shape, X_test.shape, y_train.shape, y_test.shape

((1660060, 129), (817642, 129), (1660060,), (817642,))

In [25]:
scaler = StandardScaler()

scaler_fit = scaler.fit(X_train)

In [26]:
X_train = scaler_fit.transform(X_train)
X_test = scaler_fit.transform(X_test)

In [27]:
xgb_clf = xgb.XGBClassifier(objective="multi:softmax", early_stopping_rounds=10, use_label_encoder=False)

In [28]:
xgb_model = xgb_clf.fit(X_train, y_train.astype("int32"), eval_set=[(X_test, y_test.astype("int32"))])

Parameters: { "early_stopping_rounds" } might not be used.

  This could be a false alarm, with some parameters getting used by language bindings but
  then being mistakenly passed down to XGBoost core, or some parameter actually being used
  but getting flagged wrongly here. Please open an issue if you find any such cases.


[0]	validation_0-mlogloss:1.02055
[1]	validation_0-mlogloss:0.81844
[2]	validation_0-mlogloss:0.68382
[3]	validation_0-mlogloss:0.58829
[4]	validation_0-mlogloss:0.51860
[5]	validation_0-mlogloss:0.46758
[6]	validation_0-mlogloss:0.42842
[7]	validation_0-mlogloss:0.39619
[8]	validation_0-mlogloss:0.36974
[9]	validation_0-mlogloss:0.34989
[10]	validation_0-mlogloss:0.33287
[11]	validation_0-mlogloss:0.31884
[12]	validation_0-mlogloss:0.30679
[13]	validation_0-mlogloss:0.29664
[14]	validation_0-mlogloss:0.28785
[15]	validation_0-mlogloss:0.28046
[16]	validation_0-mlogloss:0.27389
[17]	validation_0-mlogloss:0.26788
[18]	validation_0-mlogloss:0.26302
[19]	validation_0

In [35]:
y_pred = xgb_model.predict(X_test)

In [36]:
np.mean(y_test == y_pred)

0.9750844501627852

In [39]:
for csv in csv_paths[30:60]:
    d = pd.read_csv(csv).values
    d_x, d_y = d[:, :-1], d[:, -1].astype("int32")
    d_x = scaler_fit.transform(d_x)
    print("Accuracy:", np.mean(xgb_model.predict(d_x) == d_y))
    
    

Accuracy: 0.7565780184644534
Accuracy: 0.8858217929322316
Accuracy: 0.783219101183666
Accuracy: 0.8701055609968441
Accuracy: 0.8857957102144893
Accuracy: 0.8669429546562653
Accuracy: 0.8860086076352913
Accuracy: 0.8727051130757773
Accuracy: 0.8938573927358782
Accuracy: 0.8069556198665083
Accuracy: 0.8815439791521532
Accuracy: 0.8295936600602895
Accuracy: 0.8798096089493458
Accuracy: 0.8793865058187771
Accuracy: 0.8563305403062699
Accuracy: 0.760644285525096
Accuracy: 0.870189578839823
Accuracy: 0.9016986911723753
Accuracy: 0.832598601836394
Accuracy: 0.884834088040977
Accuracy: 0.8042830286352615
Accuracy: 0.8772786152777517
Accuracy: 0.8535766402674865
Accuracy: 0.8077870167254818
Accuracy: 0.8748189863234112
Accuracy: 0.8798216379682775
Accuracy: 0.8934864927756976
Accuracy: 0.7538892722241587
Accuracy: 0.923423944476576
Accuracy: 0.7718216514873858


In [44]:
from joblib import dump

In [45]:
dump(scaler_fit, "scaler.joblib")

['scaler.joblib']

In [47]:
xgb_model.save_model("xgb_model.json")