**Total time to run the solution file is around 80 min.**

In [1]:
import numpy as np 
import pandas as pd 
import os
import time
import Levenshtein
from tqdm import tqdm
import re
import gc

from scipy.stats import skew, kurtosis

import matplotlib.pyplot as plt
%matplotlib inline

import xgboost as xgb
from xgboost import XGBClassifier

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVC
from sklearn.preprocessing import StandardScaler

from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Dense, Embedding, LSTM
from keras.utils.np_utils import to_categorical
from keras.callbacks import ModelCheckpoint
from keras.models import load_model
from keras.optimizers import Adam


Using TensorFlow backend.


In [2]:
start = time.time()

**Please change the paths as per your local configuration**

In [3]:
input_path = "../input/edgeverve2/"
output_path = ""
kera_model_path = "../input/kera-model/"

**Get data**

In [4]:
sample_submission = pd.read_csv(input_path+"sample_submission.csv")
train_df = pd.read_csv(input_path+"Train.csv")
test_df = pd.read_csv(input_path+"Test.csv")

In [5]:
pd.set_option('display.max_colwidth', 1000)
pd.set_option('display.max_rows', 1000)

**Some EDA**

In [6]:
train_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5566 entries, 0 to 5565
Data columns (total 6 columns):
Inv_Id              5566 non-null int64
Vendor_Code         5566 non-null object
GL_Code             5566 non-null object
Inv_Amt             5566 non-null float64
Item_Description    5566 non-null object
Product_Category    5566 non-null object
dtypes: float64(1), int64(1), object(4)
memory usage: 261.0+ KB


In [7]:
train_df.head()

Unnamed: 0,Inv_Id,Vendor_Code,GL_Code,Inv_Amt,Item_Description,Product_Category
0,15001,VENDOR-1676,GL-6100410,83.24,Artworking/Typesetting Production Jun 2009 Champion Parts Inc SMAP Prototype and Comp Production/Packaging Design,CLASS-1963
1,15002,VENDOR-1883,GL-2182000,51.18,Auto Leasing Corporate Services Corning Inc /Ny 2013-Mar Auto Leasing and Maintenance Other Corporate Services,CLASS-1250
2,15004,VENDOR-1999,GL-6050100,79.02,Store Management Lease/Rent Deltona Corp Real Estate Base Rent Jul2018,CLASS-1274
3,15005,VENDOR-1771,GL-6101400,48.5,Store Construction General Requirements Colonial Trust Iii General Contractor Final Site Clean Up 2005-Dec,CLASS-1522
4,15006,VENDOR-1331,GL-2182000,63.35,Jul 2015 Aydin Corp Contingent Labor/Temp Labor Contingent Labor/Temp Labor Corporate Services Human Resources,CLASS-1376


In [8]:
test_df.head()

Unnamed: 0,Inv_Id,Vendor_Code,GL_Code,Inv_Amt,Item_Description
0,15003,VENDOR-2513,GL-6050310,56.13,Travel and Entertainment Miscellaneous Company Car (Field Only) Ground Transportation Miscellaneous Company Car (Field Only) Oct2011 Fortune National Corp
1,15008,VENDOR-1044,GL-6101400,96.56,Final Site Clean Up Store Construction Advanced Micro Devices Inc Oct2011 General Requirements General Contractor
2,15013,VENDOR-1254,GL-6101400,55.93,Arabian American Development Co Final Site Clean Up 2008-Oct General Requirements General Contractor Store Construction
3,15019,VENDOR-1331,GL-2182000,32.62,Corporate Services Contingent Labor/Temp Labor Human Resources Contingent Labor/Temp Labor Jun 2014 Aydin Corp
4,15020,VENDOR-2513,GL-6050310,25.81,Fortune National Corp Miscellaneous Company Car (Field Only) Jun-2015 Miscellaneous Company Car (Field Only) Ground Transportation Travel and Entertainment


In [9]:
sample_submission.head()

Unnamed: 0,Inv_Id,Product_Category
0,1,CLASS-784
1,2,CLASS-784
2,3,CLASS-784
3,4,CLASS-784
4,5,CLASS-784


In [10]:
print('train size ',train_df.shape)
print('test size ',test_df.shape)
print('test train ratio : ',(test_df.shape[0]/train_df.shape[0]))

train size  (5566, 6)
test size  (2446, 5)
test train ratio :  0.43945382680560546


**Preprocessing**

In [11]:
def unique_list(l):
    ulist = []
    [ulist.append(x) for x in l if x not in ulist]
    return ulist
def count_chars(x):
        n_l = 0 # count letters
        n_n = 0 # count numbers
        n_s = 0 # count symbols
        n_ul = 0 # count upper letters
        n_ll = 0 # count lower letters
        for i in range(0, len(x)):
            if x[i].isalpha():
                n_l += 1
                if x[i].isupper():
                    n_ul += 1
                elif x[i].islower():
                    n_ll += 1
            elif x[i].isdigit():
                n_n += 1
            else:
                n_s += 1

        return pd.Series([n_l, n_n, n_s, n_ul, n_ll])
def strstat(x):
    r = np.array([ord(c) for c in x])
    return pd.Series([
        np.sum(r), 
        np.mean(r), 
        np.std(r), 
        np.min(r), 
        np.max(r),
        skew(r), 
        kurtosis(r),
        ])

In [12]:
le = preprocessing.LabelEncoder()

def process_for_features(df):
    cols = ['n_l', 'n_n', 'n_s', 'n_ul', 'n_ll']
    for c in cols:
        df[c] = 0
    tqdm.pandas(desc='count_chars')
    df[cols] = df['Item_Description'].progress_apply(lambda x: count_chars(x))
    df['Vendor_gl-code'] = df[['Vendor_Code','GL_Code']].apply(lambda x : ''.join(x),axis = 1)
    le.fit(df['Vendor_gl-code'])
    df['Vendor_gl-code'] = le.transform(df['Vendor_gl-code'])
    le.fit(df['Vendor_Code'])
    df['Vendor_Code'] = le.transform(df['Vendor_Code'])
    le.fit(df['GL_Code'])
    df['GL_Code'] = le.transform(df['GL_Code'])
    #Making the product category as int
    df.Product_Category = df.Product_Category.str[6:]
    df["Product_Category"] = pd.to_numeric(df["Product_Category"],errors = "coerce")
    #Getting some new features
    gb = df.groupby(['Vendor_Code'],as_index = False).agg({'Inv_Amt':{'Vendor_code_sum':'sum'}})
    gb.columns = [col[0] if col[-1] == '' else col[-1] for col in gb.columns.values]
    df = pd.merge(df,gb,on ='Vendor_Code',how = 'left').fillna(0)
    gb = df.groupby(['GL_Code'],as_index = False).agg({'Inv_Amt':{'GL_Code_sum':'sum'}})
    gb.columns = [col[0] if col[-1] == '' else col[-1] for col in gb.columns.values]
    df = pd.merge(df,gb,on ='GL_Code',how = 'left').fillna(0)
    gb = df.groupby(['Vendor_Code'],as_index = False).agg({'Inv_Amt':{'Vendor_code_mean':'mean'}})
    gb.columns = [col[0] if col[-1] == '' else col[-1] for col in gb.columns.values]
    df = pd.merge(df,gb,on ='Vendor_Code',how = 'left').fillna(0)
    gb = df.groupby(['GL_Code'],as_index = False).agg({'Inv_Amt':{'GL_Code_mean':'mean'}})
    gb.columns = [col[0] if col[-1] == '' else col[-1] for col in gb.columns.values]
    df = pd.merge(df,gb,on ='GL_Code',how = 'left').fillna(0)
    gb = df.groupby(['Vendor_Code'],as_index = False).agg({'Product_Category':{'Vendor_code_pc_mean':'mean'}})
    gb.columns = [col[0] if col[-1] == '' else col[-1] for col in gb.columns.values]
    df = pd.merge(df,gb,on ='Vendor_Code',how = 'left').fillna(0)
    gb = df.groupby(['GL_Code'],as_index = False).agg({'Product_Category':{'GL_Code_pc_mean':'mean'}})
    gb.columns = [col[0] if col[-1] == '' else col[-1] for col in gb.columns.values]
    df = pd.merge(df,gb,on ='GL_Code',how = 'left').fillna(0)
    gb = df.groupby(['Vendor_Code'],as_index = False).agg({'Product_Category':{'Vendor_code_pc_sum':'sum'}})
    gb.columns = [col[0] if col[-1] == '' else col[-1] for col in gb.columns.values]
    df = pd.merge(df,gb,on ='Vendor_Code',how = 'left').fillna(0)
    gb = df.groupby(['GL_Code'],as_index = False).agg({'Product_Category':{'GL_Code_pc_sum':'sum'}})
    gb.columns = [col[0] if col[-1] == '' else col[-1] for col in gb.columns.values]
    df = pd.merge(df,gb,on ='GL_Code',how = 'left').fillna(0)
    cumsum = df.groupby(['GL_Code'])['Product_Category'].cumsum() - df['Product_Category']
    cumcnt = df.groupby(['GL_Code']).cumcount()
    df['GL_Code_mean_target'] = cumsum/cumcnt
    cumsum = df.groupby(['Vendor_Code'])['Product_Category'].cumsum() - df['Product_Category']
    cumcnt = df.groupby(['Vendor_Code']).cumcount()
    df['Vendor_Code_mean_target'] = cumsum/cumcnt
    df = df.fillna(0)
    #processing the item description field 
    df['Item_Description'] = df['Item_Description'].str.lower()
    temp = []
    for text in df['Item_Description']:
        # Removing special characters and punctuations
        text = re.sub(r'[?|!|\'|"|#]',r'',text)
        text = re.sub(r'[.|,|)|(|\|/]',r' ',text)
        # Removing digits
        text = re.sub(r'[0-9]',r' ',text) 
        # Removing months
        text = re.sub('(\s*)jan(\s*)|(\s*)feb(\s*)|(\s*)mar(\s*)|(\s*)apr(\s*)|(\s*)may(\s*)|(\s*)jun(\s*)|(\s*)jul(\s*)|(\s*)aug(\s*)|(\s*)sep(\s*)|(\s*)oct(\s*)|(\s*)nov(\s*)|(\s*)dec(\s*)',' ',
                      text)
        temp.append(text)
    df['Item_Description'] = temp
    temp = []
    for text in df['Item_Description']:
        # Getting unique words in each sentence and sorting them and removing white spaces
        text = str(sorted(' '.join(unique_list(text.split()))))
        text = text.strip()
        temp.append(text)
    df['Item_Description_sorted'] = temp
    cols = ['str_sum', 'str_mean', 'str_std', 'str_min', 'str_max', 'str_skew', 'str_kurtosis']
    for c in cols:
        df[c] = 0
    tqdm.pandas(desc='strstat')
    df[cols] = df['Item_Description'].progress_apply(lambda x: strstat(x))
    df['nunique'] = df['Item_Description'].apply(lambda x : len(np.unique(x)))
    tqdm.pandas(desc='distances')
    #Getting new fature with Levenshtein distance
    df['Levenshtein_distance'] = df['Item_Description'].progress_apply(lambda x: Levenshtein.distance(x, x[::-1]))
    df['Levenshtein_distance_sorted'] = df['Item_Description_sorted'].progress_apply(lambda x: Levenshtein.distance(x, x[::-1]))
    df['Levenshtein_ratio'] = df['Item_Description'].progress_apply(lambda x: Levenshtein.ratio(x, x[::-1]))
    df['Levenshtein_ratio_sorted'] = df['Item_Description_sorted'].progress_apply(lambda x: Levenshtein.ratio(x, x[::-1]))
    df['Levenshtein_jaro'] = df['Item_Description'].progress_apply(lambda x: Levenshtein.jaro(x, x[::-1]))
    df['Levenshtein_jaro_sorted'] = df['Item_Description_sorted'].progress_apply(lambda x: Levenshtein.jaro(x, x[::-1]))
    df['Levenshtein_hamming'] = df['Item_Description'].progress_apply(lambda x: Levenshtein.hamming(x, x[::-1]))
    df['Levenshtein_hamming_sorted'] = df['Item_Description_sorted'].progress_apply(lambda x: Levenshtein.hamming(x, x[::-1]))
    for m in range(1, 5):
        df['Levenshtein_distance_m{}'.format(m)] = df['Item_Description'].progress_apply(lambda x: Levenshtein.distance(x[:-m], x[m:]))
        df['Levenshtein_ratio_m{}'.format(m)] = df['Item_Description'].progress_apply(lambda x: Levenshtein.ratio(x[:-m], x[m:]))
        df['Levenshtein_jaro_m{}'.format(m)] = df['Item_Description'].progress_apply(lambda x: Levenshtein.jaro(x[:-m], x[m:]))
        df['Levenshtein_hamming_m{}'.format(m)] = df['Item_Description'].progress_apply(lambda x: Levenshtein.hamming(x[:-m], x[m:]))
        df['Levenshtein_distance_m{}_sorted'.format(m)] = df['Item_Description_sorted'].progress_apply(lambda x: Levenshtein.distance(x[:-m], x[m:]))
        df['Levenshtein_ratio_m{}_sorted'.format(m)] = df['Item_Description_sorted'].progress_apply(lambda x: Levenshtein.ratio(x[:-m], x[m:]))
        df['Levenshtein_jaro_m{}_sorted'.format(m)] = df['Item_Description_sorted'].progress_apply(lambda x: Levenshtein.jaro(x[:-m], x[m:]))
        df['Levenshtein_hamming_m{}_sorted'.format(m)] = df['Item_Description_sorted'].progress_apply(lambda x: Levenshtein.hamming(x[:-m], x[m:]))
    #Getting new feature with TF IDF
    tf_idf = TfidfVectorizer()
    text_fitted = tf_idf.fit_transform(df['Item_Description'])
    n = min(25,text_fitted.shape[1]-1)
    svd = TruncatedSVD(n_components= n, n_iter=25, random_state=12)
    truncated_tfidf = svd.fit_transform(text_fitted)
    f_cols = ['f'+str(c) for c in range(n)]
    all_columns = np.append(df.columns.values,f_cols)
    int_cols = [c for c in df if df[c].dtype == 'int64']
    float_cols = [c for c in df if df[c].dtype == 'float64']
    df = np.hstack([df,truncated_tfidf])
    df = pd.DataFrame(df,index = range(len(df)),columns = all_columns)
    #Downcasting to reduce memory usage
    df[int_cols] = df[int_cols].astype(np.int32)
    df[float_cols] = df[float_cols].astype(np.float32)
    df[f_cols] = df[f_cols].astype(np.float32)
    x_columns = list(df.columns.values)
    x_columns.remove('Inv_Id')
    x_columns.remove('Item_Description')
    x_columns.remove('Product_Category')
    x_columns.remove('Item_Description_sorted')
    y_column = ['Product_Category']
    train = df[x_columns][:train_df.shape[0]:]
    target = df[y_column][:train_df.shape[0]]
    test = df[x_columns][train_df.shape[0]:]
    x_train, x_val, y_train, y_val = train_test_split(train,target, test_size=0.4, random_state=42)
    return x_train,x_val,y_train,y_val,test,train,target



In [13]:
all_data = pd.concat([train_df,test_df])

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  """Entry point for launching an IPython kernel.


**Train Validation split**

In [14]:
x_train, x_val, y_train, y_val,test,train,target = process_for_features(all_data)

count_chars: 100%|██████████| 8012/8012 [00:02<00:00, 2811.69it/s]
  return super(DataFrameGroupBy, self).aggregate(arg, *args, **kwargs)
strstat: 100%|██████████| 8012/8012 [00:11<00:00, 728.16it/s]
distances: 100%|██████████| 8012/8012 [00:00<00:00, 31862.52it/s]
distances: 100%|██████████| 8012/8012 [00:03<00:00, 2409.94it/s]
distances: 100%|██████████| 8012/8012 [00:00<00:00, 34796.69it/s]
distances: 100%|██████████| 8012/8012 [00:03<00:00, 2612.19it/s]
distances: 100%|██████████| 8012/8012 [00:00<00:00, 83937.62it/s]
distances: 100%|██████████| 8012/8012 [00:01<00:00, 4634.07it/s]
distances: 100%|██████████| 8012/8012 [00:00<00:00, 370186.21it/s]
distances: 100%|██████████| 8012/8012 [00:00<00:00, 329193.82it/s]
distances: 100%|██████████| 8012/8012 [00:00<00:00, 31993.35it/s]
distances: 100%|██████████| 8012/8012 [00:00<00:00, 34978.15it/s]
distances: 100%|██████████| 8012/8012 [00:00<00:00, 86510.10it/s]
distances: 100%|██████████| 8012/8012 [00:00<00:00, 364540.09it/s]
distance

In [15]:
best_score = 0
best_model = None
best_pred = None
bags = 10
seed = 1

**Random Forest**

In [16]:
model_rf = RandomForestClassifier(n_estimators=100,max_depth = 20, random_state=42)
model_rf.fit(x_train,y_train)
pred_val_rf = model_rf.predict(x_val)
acc_score = accuracy_score(y_val, pred_val_rf)
print("accuracy score %f "%acc_score)

  


accuracy score 0.992366 


In [17]:
feature_importances = pd.DataFrame(model_rf.feature_importances_,index = 
                                   x_train.columns,columns=['importance']).sort_values('importance',ascending=False)

In [18]:
feature_importances

Unnamed: 0,importance
f3,0.060416
f2,0.048496
f1,0.047413
Levenshtein_hamming_m4,0.045913
Levenshtein_hamming_m2,0.040942
GL_Code_pc_mean,0.036268
GL_Code_mean_target,0.035888
str_sum,0.034317
f0,0.033891
GL_Code_sum,0.033805


**XG Boost**

**Tuning parameters of xgboost**

In [19]:
# model_xgb = XGBClassifier(n_estimator = 100,seed = 42)
# params = {
#     'max_depth' : range(3,10,2),
#     'min_child_weight' : range(1,6,2)
    
# }
# gsearch = GridSearchCV(estimator = model_xgb, 
# param_grid = params, scoring='accuracy',n_jobs=4,iid=False, cv=5)
# gsearch.fit(x_train,y_train)
# gsearch.best_params_, gsearch.best_score_

In [20]:
#Parameters obtained from tuning above
model_xgb = XGBClassifier(max_depth = 7, min_child_weight = 1,n_estimator = 100,seed = 42)
model_xgb.fit(x_train,y_train)
pred_val_xgb = model_xgb.predict(x_val)
acc_score = accuracy_score(y_val, pred_val_xgb)
print("accuracy score %f "%acc_score)

  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


accuracy score 0.993714 


**Scaling for linear models**

In [21]:
scaler = StandardScaler()
x_train_linear = scaler.fit_transform(x_train)
x_val_linear = scaler.fit_transform(x_val)
test_linear = scaler.fit_transform(test)

**Logistic regression**

In [22]:
model_log = LogisticRegression(max_iter = 500,random_state=42, solver='lbfgs',multi_class='multinomial')
model_log.fit(x_train_linear,y_train)
pred_val_log = model_log.predict(x_val_linear)
acc_score = accuracy_score(y_val, pred_val_log)
print("accuracy score %f "%acc_score)

  y = column_or_1d(y, warn=True)


accuracy score 0.987876 


**SVM**

In [23]:
# params = {'C' : [1,10,100,1000],
#           'gamma' : [1e-3,1e-4]}
# model_svc = GridSearchCV(SVC(random_state = 42),params,cv = 5)
# print('Best C : ',model_svc.best_estimator_.C)
# print('Best gamma : ',model_svc.best_estimator_.gamma)
model_svc = SVC(C = 1000,gamma = 0.001,random_state = 42)
model_svc.fit(x_train_linear,y_train)
pred_val_svc = model_svc.predict(x_val_linear)
acc_score = accuracy_score(y_val, pred_val_svc)
print("accuracy score %f "%acc_score)

  y = column_or_1d(y, warn=True)


accuracy score 0.988774 


**Ensembling SVM and XGB**

In [24]:
# alphas_to_try = np.linspace(0, 1, 1001)

# best_alpha = 0
# best_acc = 0
# combined = []
# combined.append(np.c_[pred_val_svc, pred_val_xgb])
# combined = pd.DataFrame(np.vstack(combined))
# combined = combined.astype(np.int32)
# y_val = y_val.astype(np.int32)
# #print(combined)
# for alpha in alphas_to_try:
#     #print(alpha)
#     mix = alpha * combined[0] + (1-alpha)*combined[1]
#     mix = mix.astype(np.int32)
#     acc = accuracy_score(y_val, mix)
#     #print(acc)
#     if best_acc < acc:
#         best_acc = acc
#         best_alpha = alpha


# print('Best alpha: %f; Corresponding accuracy score on val: %f' % (best_alpha, best_acc))

**LSTM**

In [25]:
# num_words = 2000
# tokenizer = Tokenizer(num_words = num_words)
# tokenizer.fit_on_texts(all_data['Item_Description'].values)
# train_test = tokenizer.texts_to_sequences(all_data['Item_Description'].values)
# train_test = pad_sequences(train_test, maxlen=2000)
# train_test = np.column_stack((train_test,pd.get_dummies(all_data['GL_Code']).values))
# train_test = np.column_stack((train_test,pd.get_dummies(all_data['Vendor_Code']).values))
# X = train_test[:train_df.shape[0]]
# test = train_test[train_df.shape[0]:]
# # Build out our simple LSTM
# embed_dim = 128
# lstm_out = 196
# num_class = len(train_df['Product_Category'].unique())
# print(num_class)
# # Model saving callback
# ckpt_callback = ModelCheckpoint('keras_model', 
#                                  monitor='val_loss', 
#                                  verbose=1, 
#                                  save_best_only=True, 
#                                  mode='auto')

# model = Sequential()
# model.add(Embedding(num_words, embed_dim, input_length = X.shape[1]))
# model.add(LSTM(lstm_out, recurrent_dropout=0.2, dropout=0.2))
# model.add(Dense(num_class,activation='softmax'))
# model.compile(loss = 'categorical_crossentropy', optimizer='adam', metrics = ['categorical_crossentropy'])
# print(model.summary())
# Y = pd.get_dummies(train_df['Product_Category']).values
# X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size = 0.4, random_state = 42)
# print(X_train.shape, Y_train.shape)
# print(X_test.shape, Y_test.shape)
# batch_size = 32
# # model.fit(X_train, Y_train, epochs=4, batch_size=batch_size, validation_split=0.2, callbacks=[ckpt_callback])


In [26]:
# model = load_model('keras_model')
# print(model.summary())
# probas = model.predict(X_test)



In [27]:
# print(probas.shape)
# print(X_test.shape)

In [28]:
# pred_indices = np.argmax(probas,axis = 1)
# #print(pred_indices)
# #print(Y_test.shape)
# #print(pd.get_dummies(train_df['Product_Category']).dtypes)
# classes = np.array(sorted(train_df['Product_Category'].unique()))
# #print(classes)
# pred_lstm = classes[pred_indices]
# #print(pred_lstm.shape)
# y_val = classes[np.argmax(Y_test,axis = 1)]
# acc_score = accuracy_score(y_val, pred_lstm)
# print("accuracy score %f "%acc_score)

**Stacking**

In [29]:
# model = model_xgb
# pred_val_bag = np.zeros(x_val.shape[0])
# pred_test_bag = np.zeros(test.shape[0])
# for n in range(0,bags):
#     model.set_params(seed = seed + n)
#     model.fit(train,target)
#     pred_val = model.predict(x_val)
#     pred_test = model.predict(test)
#     pred_val_bag += pred_val
#     pred_test_bag += pred_test
# pred_val_bag = pred_val_bag/bags
# pred_test_bag= pred_test_bag/bags
# pred1 = pred_val_bag.astype(np.int32)
# test_pred1 = pred_test_bag.astype(np.int32)

# model = model_rf
# pred_val_bag = np.zeros(x_val.shape[0])
# pred_test_bag = np.zeros(test.shape[0])
# for n in range(0,bags):
#     model.set_params(random_state = seed + n)
#     model.fit(train,target)
#     pred_val = model.predict(x_val)
#     pred_test = model.predict(test)
#     pred_val_bag += pred_val
#     pred_test_bag += pred_test
# pred_val_bag = pred_val_bag/bags
# pred_test_bag= pred_test_bag/bags
# pred2 = pred_val_bag.astype(np.int32)
# test_pred2 = pred_test_bag.astype(np.int32)


In [30]:
# stacked_predictions = np.column_stack((pred1,pred2))
# stacked_test_predictions = np.column_stack((test_pred1,test_pred2))
# model_svc.fit(stacked_predictions,y_val)
# final_prediction = model_svc.predict(stacked_test_predictions)
# best_pred = final_prediction
# best_pred

**Xgboost with bagging is giving me the best leaderboard score so I will keep this for final submission**

In [31]:
model = model_xgb
pred_test_bag = np.zeros(test.shape[0])
for n in range(0,bags):
    model.set_params(seed = seed + n)
    model.fit(train,target)
    pred_test = model.predict(test)
    pred_test_bag += pred_test
pred_test_bag= pred_test_bag/bags
pred_test_bag = pred_test_bag.astype(np.int32)
best_pred = pred_test_bag

  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


In [32]:
pred = best_pred

In [33]:
pred = pred.astype(np.int32)

In [34]:
pred = ['CLASS-' + str(c) for c in pred]

In [35]:
sub = pd.DataFrame({'Inv_Id':test_df['Inv_Id'],'Product_Category':pred})

In [36]:
sub.head()

Unnamed: 0,Inv_Id,Product_Category
0,15003,CLASS-1758
1,15008,CLASS-1522
2,15013,CLASS-1522
3,15019,CLASS-1376
4,15020,CLASS-1758


In [37]:
sub.to_csv(output_path + "submission.csv",index = False)

In [38]:
end = time.time()
print('Total time : %f seconds'%(end-start))

Total time : 525.961371 seconds
