In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
import xgboost as xgb
import operator
import matplotlib
matplotlib.use("Agg") #Needed to save figures
import matplotlib.pyplot as plt
from sklearn.externals import joblib


In [2]:
def create_feature_map(features):
    """Write an XGBoost feature-map file named ``xgb.fmap`` in the working directory.

    One tab-separated line is written per feature: its position in
    ``features``, its name, and the literal type flag ``q``.

    Parameters
    ----------
    features : iterable of str
        Feature names, in the order their indices should be assigned.
    """
    # The context manager closes the handle even if a write raises part-way.
    with open('xgb.fmap', 'w') as outfile:
        for i, feat in enumerate(features):
            outfile.write('{0}\t{1}\tq\n'.format(i, feat))

def rmspe(y, yhat):
    """Root mean square percentage error of ``yhat`` relative to ``y``.

    Evaluates ``sqrt(mean((yhat / y - 1) ** 2))``. There is no guard for
    zeros in ``y``; callers are expected to pass non-zero targets.
    """
    relative_error = yhat / y - 1
    mean_squared_error = np.mean(relative_error ** 2)
    return np.sqrt(mean_squared_error)

def rmspe_xg(yhat, y):
    """Evaluation callback returning the pair ``("rmspe", score)``.

    ``y`` must expose ``get_label()``. Both the labels and ``yhat`` are passed
    through ``np.expm1`` before scoring, so the error is measured on the
    original scale when targets were ``log1p``-transformed.
    """
    actual = np.expm1(y.get_label())
    predicted = np.expm1(yhat)
    score = rmspe(actual, predicted)
    return "rmspe", score

# Gather some features

In [3]:
def build_features(features, data):
    """Add engineered model features to ``data`` in place and record their names.

    Parameters
    ----------
    features : list
        Extended in place with the names of the columns intended as model
        inputs, in a fixed order.
    data : pandas.DataFrame
        Mutated in place. Must contain the columns ``Open``, ``Store``,
        ``CompetitionDistance``, ``Promo``, ``SchoolHoliday``, ``StoreType``,
        ``Assortment``, ``StateHoliday``, ``Date`` (datetime-like),
        ``CompetitionOpenSinceYear``, ``CompetitionOpenSinceMonth``,
        ``Promo2SinceYear``, ``Promo2SinceWeek`` and ``PromoInterval``.

    Returns
    -------
    pandas.DataFrame
        The same ``data`` object, now also carrying ``Year``, ``Month``,
        ``Day``, ``DayOfWeek``, ``WeekOfYear``, ``CompetitionOpen``,
        ``PromoOpen``, ``monthStr`` and ``IsPromoMonth``.
    """
    # Missing values. 'Open' has to be defaulted BEFORE the blanket fill:
    # once fillna(0) has run there are no NaNs left, so a later "assume open"
    # step could never fire and missing flags would silently become 0.
    data.loc[data['Open'].isnull(), 'Open'] = 1
    data.fillna(0, inplace=True)

    # Columns used directly as features.
    features.extend(['Store', 'CompetitionDistance', 'Promo', 'SchoolHoliday'])

    # Label-encode the letter-coded columns. The result is assigned back to the
    # frame instead of calling replace(inplace=True) on an attribute-accessed
    # column, which is not guaranteed to write through to ``data``.
    features.extend(['StoreType', 'Assortment', 'StateHoliday'])
    mappings = {'0': 0, 'a': 1, 'b': 2, 'c': 3, 'd': 4}
    for column in ('StoreType', 'Assortment', 'StateHoliday'):
        data[column] = data[column].replace(mappings)

    # Calendar parts of the date.
    features.extend(['DayOfWeek', 'Month', 'Day', 'Year', 'WeekOfYear'])
    dates = data['Date'].dt
    data['Year'] = dates.year
    data['Month'] = dates.month
    data['Day'] = dates.day
    data['DayOfWeek'] = dates.dayofweek
    if hasattr(dates, 'isocalendar'):
        # ``.dt.weekofyear`` was removed from newer pandas; isocalendar().week
        # is the same ISO week number. Cast away the nullable UInt32 dtype.
        data['WeekOfYear'] = dates.isocalendar().week.astype('int64')
    else:
        data['WeekOfYear'] = dates.weekofyear

    # Months elapsed since the competitor's recorded opening.
    # NOTE(review): unlike PromoOpen below, rows whose CompetitionOpenSinceYear
    # was filled with 0 are not reset here and therefore get very large values.
    features.append('CompetitionOpen')
    data['CompetitionOpen'] = (
        12 * (data['Year'] - data['CompetitionOpenSinceYear'])
        + (data['Month'] - data['CompetitionOpenSinceMonth'])
    )

    # Promo open time in months (weeks are converted with a factor of 1/4).
    features.append('PromoOpen')
    promo_open = (
        12 * (data['Year'] - data['Promo2SinceYear'])
        + (data['WeekOfYear'] - data['Promo2SinceWeek']) / 4.0
    )
    # Keep strictly positive values, everything else becomes 0.
    data['PromoOpen'] = promo_open.where(promo_open > 0, 0)
    data.loc[data['Promo2SinceYear'] == 0, 'PromoOpen'] = 0

    # Indicate that sales on that day are in promo interval.
    features.append('IsPromoMonth')
    # 'Sept' (four letters) is kept as-is; presumably it matches the spelling
    # used inside PromoInterval -- verify against the data.
    month2str = {1: 'Jan', 2: 'Feb', 3: 'Mar', 4: 'Apr', 5: 'May', 6: 'Jun',
                 7: 'Jul', 8: 'Aug', 9: 'Sept', 10: 'Oct', 11: 'Nov', 12: 'Dec'}
    data['monthStr'] = data['Month'].map(month2str)
    # Missing intervals were turned into 0 by the fill above; normalise to ''.
    data.loc[data['PromoInterval'] == 0, 'PromoInterval'] = ''
    data['IsPromoMonth'] = 0
    for interval in data['PromoInterval'].unique():
        if interval != '':
            same_interval = data['PromoInterval'] == interval
            in_listed_month = data['monthStr'].isin(interval.split(','))
            data.loc[same_interval & in_listed_month, 'IsPromoMonth'] = 1

    return data

In [4]:
print("Load the training, test and store data using pandas")
# Column dtypes handed to read_csv for the train and test files.
types = dict(
    CompetitionOpenSinceYear=np.dtype(int),
    CompetitionOpenSinceMonth=np.dtype(int),
    StateHoliday=np.dtype(str),
    Promo2SinceWeek=np.dtype(int),
    SchoolHoliday=np.dtype(float),
    PromoInterval=np.dtype(str),
)
# The date column is selected by position: index 2 in train.csv, index 3 in test.csv.
train = pd.read_csv("train.csv", dtype=types, parse_dates=[2])
test = pd.read_csv("test.csv", dtype=types, parse_dates=[3])
# store.csv is read with pandas' default dtype inference.
store = pd.read_csv("store.csv")

Load the training, test and store data using pandas


In [5]:
print("Assume store open, if not provided")
# Only the 'Open' flag is defaulted to 1. A blanket fillna(1) would also turn
# every other missing value in these frames into 1, which is not what the
# message above describes; remaining gaps are handled inside build_features.
train["Open"] = train["Open"].fillna(1)
test["Open"] = test["Open"].fillna(1)

print("Consider only open stores for training. Closed stores wont count into the score.")
train = train[train["Open"] != 0]
print("Use only Sales bigger then zero. Simplifies calculation of rmspe")
# rmspe divides by the true sales value, so zero-sales rows are dropped.
train = train[train["Sales"] > 0]

print("Join with store")
# pd.merge defaults to an inner join on the shared 'Store' key.
train = pd.merge(train, store, on='Store')
test = pd.merge(test, store, on='Store')

# Names of the model input columns; filled in by build_features.
features = []

print("augment features")
# build_features mutates the frame it is given. The feature names are only
# collected once (from the training call); the test call gets a throwaway list.
build_features(features, train)
build_features([], test)
print(features)

Assume store open, if not provided
Consider only open stores for training. Closed stores wont count into the score.
Use only Sales bigger then zero. Simplifies calculation of rmspe
Join with store
augment features
['Store', 'CompetitionDistance', 'Promo', 'SchoolHoliday', 'StoreType', 'Assortment', 'StateHoliday', 'DayOfWeek', 'Month', 'Day', 'Year', 'WeekOfYear', 'CompetitionOpen', 'PromoOpen', 'IsPromoMonth']


In [6]:
print('training data processed')

# Booster configuration handed to xgb.train.
params = dict(
    objective="reg:linear",
    booster="gbtree",
    eta=0.3,
    max_depth=10,
    subsample=0.9,
    colsample_bytree=0.7,
    silent=1,
    seed=1301,
)
# Upper bound on boosting rounds.
num_boost_round = 400

training data processed


In [7]:
print("Train a XGBoost model")
# Hold out 1.2% of the rows for validation; the split is seeded.
X_train, X_valid = train_test_split(train, random_state=10, test_size=0.012)
# The model is fitted on log1p(Sales); rmspe_xg inverts this with expm1.
y_train, y_valid = np.log1p(X_train.Sales), np.log1p(X_valid.Sales)
dtrain = xgb.DMatrix(X_train[features], label=y_train)
dvalid = xgb.DMatrix(X_valid[features], label=y_valid)

# 'eval' is listed last, so its metric is the one driving early stopping.
watchlist = [(dtrain, 'train'), (dvalid, 'eval')]
gbm = xgb.train(
    params,
    dtrain,
    num_boost_round,
    evals=watchlist,
    early_stopping_rounds=100,
    feval=rmspe_xg,
    verbose_eval=True,
)

Train a XGBoost model


  if getattr(data, 'base', None) is not None and \


[0]	train-rmse:5.79355	eval-rmse:5.79446	train-rmspe:0.996844	eval-rmspe:0.996845
Multiple eval metrics have been passed: 'eval-rmspe' will be used for early stopping.

Will train until eval-rmspe hasn't improved in 100 rounds.
[1]	train-rmse:4.0633	eval-rmse:4.06536	train-rmspe:0.981474	eval-rmspe:0.981497
[2]	train-rmse:2.8537	eval-rmse:2.85559	train-rmspe:0.937983	eval-rmspe:0.938041
[3]	train-rmse:2.01024	eval-rmse:2.01223	train-rmspe:0.856537	eval-rmspe:0.856573
[4]	train-rmse:1.42318	eval-rmse:1.42518	train-rmspe:0.74412	eval-rmspe:0.743763
[5]	train-rmse:1.01854	eval-rmse:1.02021	train-rmspe:0.619784	eval-rmspe:0.617985
[6]	train-rmse:0.742679	eval-rmse:0.744681	train-rmspe:0.505335	eval-rmspe:0.501458
[7]	train-rmse:0.557015	eval-rmse:0.55916	train-rmspe:0.415209	eval-rmspe:0.407433
[8]	train-rmse:0.436924	eval-rmse:0.439315	train-rmspe:0.354322	eval-rmspe:0.342618
[9]	train-rmse:0.354692	eval-rmse:0.357384	train-rmspe:0.316391	eval-rmspe:0.300593
[10]	train-rmse:0.310754	eval-

[96]	train-rmse:0.121255	eval-rmse:0.127638	train-rmspe:0.153014	eval-rmspe:0.137924
[97]	train-rmse:0.121186	eval-rmse:0.127589	train-rmspe:0.152974	eval-rmspe:0.137868
[98]	train-rmse:0.120941	eval-rmse:0.127477	train-rmspe:0.152702	eval-rmspe:0.137743
[99]	train-rmse:0.120562	eval-rmse:0.127117	train-rmspe:0.152345	eval-rmspe:0.137328
[100]	train-rmse:0.120498	eval-rmse:0.127073	train-rmspe:0.152284	eval-rmspe:0.137286
[101]	train-rmse:0.120138	eval-rmse:0.12685	train-rmspe:0.151914	eval-rmspe:0.137037
[102]	train-rmse:0.119931	eval-rmse:0.126738	train-rmspe:0.151665	eval-rmspe:0.136933
[103]	train-rmse:0.119766	eval-rmse:0.126654	train-rmspe:0.151511	eval-rmspe:0.136853
[104]	train-rmse:0.119452	eval-rmse:0.126271	train-rmspe:0.152852	eval-rmspe:0.136327
[105]	train-rmse:0.119157	eval-rmse:0.126095	train-rmspe:0.152517	eval-rmspe:0.136077
[106]	train-rmse:0.118816	eval-rmse:0.12578	train-rmspe:0.152169	eval-rmspe:0.135703
[107]	train-rmse:0.118586	eval-rmse:0.125647	train-rmspe:0.1

[192]	train-rmse:0.102616	eval-rmse:0.116529	train-rmspe:0.121199	eval-rmspe:0.124508
[193]	train-rmse:0.102481	eval-rmse:0.116448	train-rmspe:0.121039	eval-rmspe:0.124367
[194]	train-rmse:0.10232	eval-rmse:0.116361	train-rmspe:0.120846	eval-rmspe:0.124219
[195]	train-rmse:0.102192	eval-rmse:0.116311	train-rmspe:0.120629	eval-rmspe:0.124159
[196]	train-rmse:0.102057	eval-rmse:0.116244	train-rmspe:0.120522	eval-rmspe:0.124085
[197]	train-rmse:0.10187	eval-rmse:0.116113	train-rmspe:0.12029	eval-rmspe:0.123842
[198]	train-rmse:0.101721	eval-rmse:0.116051	train-rmspe:0.120133	eval-rmspe:0.123796
[199]	train-rmse:0.101579	eval-rmse:0.116034	train-rmspe:0.119944	eval-rmspe:0.123758
[200]	train-rmse:0.101478	eval-rmse:0.115984	train-rmspe:0.11984	eval-rmspe:0.123695
[201]	train-rmse:0.101339	eval-rmse:0.115964	train-rmspe:0.119671	eval-rmspe:0.123669
[202]	train-rmse:0.101247	eval-rmse:0.115968	train-rmspe:0.119562	eval-rmspe:0.123676
[203]	train-rmse:0.101176	eval-rmse:0.115949	train-rmspe:0

[288]	train-rmse:0.091914	eval-rmse:0.112312	train-rmspe:0.106865	eval-rmspe:0.11948
[289]	train-rmse:0.091806	eval-rmse:0.112299	train-rmspe:0.106758	eval-rmspe:0.119477
[290]	train-rmse:0.091722	eval-rmse:0.112322	train-rmspe:0.106653	eval-rmspe:0.119487
[291]	train-rmse:0.091573	eval-rmse:0.112287	train-rmspe:0.102885	eval-rmspe:0.11946
[292]	train-rmse:0.091515	eval-rmse:0.112271	train-rmspe:0.102815	eval-rmspe:0.119436
[293]	train-rmse:0.091424	eval-rmse:0.11222	train-rmspe:0.102681	eval-rmspe:0.119349
[294]	train-rmse:0.091336	eval-rmse:0.112189	train-rmspe:0.102598	eval-rmspe:0.119269
[295]	train-rmse:0.091292	eval-rmse:0.112187	train-rmspe:0.102556	eval-rmspe:0.119266
[296]	train-rmse:0.091127	eval-rmse:0.112125	train-rmspe:0.102381	eval-rmspe:0.119171
[297]	train-rmse:0.091048	eval-rmse:0.112107	train-rmspe:0.102306	eval-rmspe:0.119161
[298]	train-rmse:0.090958	eval-rmse:0.112064	train-rmspe:0.102214	eval-rmspe:0.119118
[299]	train-rmse:0.090903	eval-rmse:0.112053	train-rmspe:

[384]	train-rmse:0.084294	eval-rmse:0.110103	train-rmspe:0.092014	eval-rmspe:0.11699
[385]	train-rmse:0.084225	eval-rmse:0.110091	train-rmspe:0.091922	eval-rmspe:0.116971
[386]	train-rmse:0.084152	eval-rmse:0.110072	train-rmspe:0.091838	eval-rmspe:0.116954
[387]	train-rmse:0.084039	eval-rmse:0.110058	train-rmspe:0.0917	eval-rmspe:0.116916
[388]	train-rmse:0.083988	eval-rmse:0.110056	train-rmspe:0.091645	eval-rmspe:0.116918
[389]	train-rmse:0.083942	eval-rmse:0.110042	train-rmspe:0.091594	eval-rmspe:0.116903
[390]	train-rmse:0.083875	eval-rmse:0.110048	train-rmspe:0.091504	eval-rmspe:0.116908
[391]	train-rmse:0.083818	eval-rmse:0.110018	train-rmspe:0.091444	eval-rmspe:0.116871
[392]	train-rmse:0.08375	eval-rmse:0.11003	train-rmspe:0.091371	eval-rmspe:0.116897
[393]	train-rmse:0.083678	eval-rmse:0.110025	train-rmspe:0.090476	eval-rmspe:0.116892
[394]	train-rmse:0.083605	eval-rmse:0.110024	train-rmspe:0.090392	eval-rmspe:0.116895
[395]	train-rmse:0.083516	eval-rmse:0.109984	train-rmspe:0.

In [10]:
print("Validating")
# Predictions come back on the log1p scale and are mapped back with expm1
# before being scored against the raw hold-out sales.
yhat = gbm.predict(xgb.DMatrix(X_valid.loc[:, features]))
error = rmspe(X_valid['Sales'].values, np.expm1(yhat))
print('RMSPE: {:.6f}'.format(error))


Validating
RMSPE: 0.116878


In [7]:
# Persist the trained booster to model.pkl so a later cell can reload it.
# `gbm` is created by the training cell; running this cell before that one in
# a fresh kernel raises NameError.
joblib.dump(gbm, 'model.pkl')

NameError: name 'gbm' is not defined

In [None]:
# Show the feature columns selected from the test frame; all names in
# `features` must exist in `test`, otherwise this raises KeyError.
test[features].columns

In [None]:
#train[features].head().append(train[features].tail())
#a.to_csv("test_final.csv", index=False)

In [None]:
print("Make predictions on the test set")
dtest = xgb.DMatrix(test[features])
# Raw model output is on the log1p scale (see the training targets).
test_probs = gbm.predict(dtest)

# Build the submission frame: identifier, date and the back-transformed sales.
result = pd.DataFrame({
    "Id": test["Id"],
    "Date": test["Date"],
    'Sales': np.expm1(test_probs),
})
result.to_csv("xgboost_10_submission.csv", index=False)


In [26]:
# Write xgb.fmap, then read the per-feature split counts back through it.
create_feature_map(features)
importance = sorted(
    gbm.get_fscore(fmap='xgb.fmap').items(),
    key=lambda name_and_score: name_and_score[1],
)

# Normalise the scores so they sum to 1.
df = pd.DataFrame(importance, columns=['feature', 'fscore'])
df['fscore'] = df['fscore'].div(df['fscore'].sum())

# Horizontal bar chart, saved to disk (the Agg backend does not display).
featp = df.plot.barh(x='feature', y='fscore', legend=True, figsize=(6, 10))
featp.set_title('XGBoost Feature Importance')
featp.set_xlabel('relative importance')
fig_featp = featp.get_figure()
fig_featp.savefig('feature_importance_xgb.png', bbox_inches='tight', pad_inches=1)


In [66]:
# Visual separator in the notebook output; the cells below reload the saved
# model and predict for a single hand-built row.
print("#################################################")

#################################################


In [31]:
# Pull the single row at position 2 of the test feature matrix for inspection.
a = test[features]
b = pd.DataFrame(a.iloc[2:3])
b

Unnamed: 0,Store,CompetitionDistance,Promo,SchoolHoliday,StoreType,Assortment,StateHoliday,DayOfWeek,Month,Day,Year,WeekOfYear,CompetitionOpen,PromoOpen,IsPromoMonth
2,1,1270.0,1,0.0,3,1,0,6,9,15,2019,37,132.0,0.0,0


In [51]:
# One hand-written feature row, with values listed in the same order as the
# column names given below.
data = [
    [1, 1270, 1, 0, 3, 1, 0, 6, 9, 15, 2019, 37, 132, 0, 0],
]
df = pd.DataFrame(
    data=data,
    columns=[
        'Store', 'CompetitionDistance', 'Promo', 'SchoolHoliday',
        'StoreType', 'Assortment', 'StateHoliday',
        'DayOfWeek', 'Month', 'Day', 'Year', 'WeekOfYear',
        'CompetitionOpen', 'PromoOpen', 'IsPromoMonth',
    ],
)

In [54]:
# Reload the pickled booster. joblib.load unpickles arbitrary objects, so it
# should only be pointed at a file this notebook produced itself.
model = joblib.load("model.pkl")
print("Make predictions on the test set")

# Score the hand-built row held in `df`.
dtest = xgb.DMatrix(df)
test_probs = model.predict(dtest)

# Map the log1p-scale output back to sales and show it as the cell result.
result = test_probs
np.expm1(result)

Make predictions on the test set


array([4190.5186], dtype=float32)

In [53]:
# Display rows at positions 1 through 9 of the test feature matrix.
test[features].iloc[1:10,]

Unnamed: 0,Store,CompetitionDistance,Promo,SchoolHoliday,StoreType,Assortment,StateHoliday,DayOfWeek,Month,Day,Year,WeekOfYear,CompetitionOpen,PromoOpen,IsPromoMonth
1,1,1270.0,1,0.0,3,1,0,0,9,16,2019,38,132.0,0.0,0
2,1,1270.0,1,0.0,3,1,0,6,9,15,2019,37,132.0,0.0,0
3,1,1270.0,1,0.0,3,1,0,5,9,14,2019,37,132.0,0.0,0
4,1,1270.0,0,0.0,3,1,0,4,9,13,2019,37,132.0,0.0,0
5,1,1270.0,0,0.0,3,1,0,0,12,9,2019,50,135.0,0.0,0
6,1,1270.0,0,0.0,3,1,0,5,11,9,2019,45,134.0,0.0,0
7,1,1270.0,0,0.0,3,1,0,2,10,9,2019,41,133.0,0.0,0
8,1,1270.0,0,0.0,3,1,0,0,9,9,2019,37,132.0,0.0,0
9,1,1270.0,0,0.0,3,1,0,4,8,9,2019,32,131.0,0.0,0
