In [4]:
import pandas as pd
import numpy as np
import re 
import matplotlib.pyplot as plt
import seaborn
%matplotlib inline

In [5]:
data_original = pd.read_csv("train.csv")

In [6]:
w_test_data = pd.read_csv("test.csv")

In [7]:
def transform_my_data(data):
    
    dummies = pd.get_dummies(data.DepartmentDescription)
    data[dummies.columns] = dummies 
    data['Weekday'] = data['Weekday'].map({"Monday": 1, "Tuesday": 2, "Wednesday": 3, "Thursday": 4, "Friday": 5, 
                                           "Saturday": 6, "Sunday": 7})
    data_dummies = data.iloc[:,7:]
    data_dummies = data_dummies.apply(lambda x: x*data["ScanCount"])
    data_dummies = data_dummies.replace(-0,0)

    data.loc[data.ScanCount < 0, 'Return'] = 1
    data.loc[data.Return != 1, 'Return'] = 0
    data = data[["TripType", "VisitNumber", "Weekday", "ScanCount", "Return"]]
    data = data.rename(columns={"ScanCount":"NumItems"})
    data = pd.concat([data, data_dummies], axis=1)
    grouped = data.groupby("VisitNumber")
    grouped = grouped.agg({'Weekday': np.max, "TripType": np.max, 'NumItems': np.sum, 'Return': np.max, 
              '1-HR PHOTO': np.sum, 'ACCESSORIES': np.sum,
       'AUTOMOTIVE': np.sum, 'BAKERY': np.sum, 'BATH AND SHOWER': np.sum, 'BEAUTY': np.sum, 'BEDDING': np.sum,
       'BOOKS AND MAGAZINES': np.sum, 'BOYS WEAR': np.sum, 'BRAS & SHAPEWEAR': np.sum,
       'CAMERAS AND SUPPLIES': np.sum, 'CANDY, TOBACCO, COOKIES': np.sum, 'CELEBRATION': np.sum,
       'COMM BREAD': np.sum, 'CONCEPT STORES': np.sum, 'COOK AND DINE': np.sum, 'DAIRY': np.sum, 'DSD GROCERY': np.sum,
       'ELECTRONICS': np.sum, 'FABRICS AND CRAFTS': np.sum, 'FINANCIAL SERVICES': np.sum,
       'FROZEN FOODS': np.sum, 'FURNITURE': np.sum, 'GIRLS WEAR, 4-6X  AND 7-14': np.sum,
       'GROCERY DRY GOODS': np.sum, 'HARDWARE': np.sum, 'HEALTH AND BEAUTY AIDS': np.sum, 'HOME DECOR': np.sum,
       'HOME MANAGEMENT': np.sum, 'HORTICULTURE AND ACCESS': np.sum,
       'HOUSEHOLD CHEMICALS/SUPP': np.sum, 'HOUSEHOLD PAPER GOODS': np.sum,
       'IMPULSE MERCHANDISE': np.sum, 'INFANT APPAREL': np.sum, 'INFANT CONSUMABLE HARDLINES': np.sum,
       'JEWELRY AND SUNGLASSES': np.sum, 'LADIES SOCKS': np.sum, 'LADIESWEAR': np.sum,
       'LARGE HOUSEHOLD GOODS': np.sum, 'LAWN AND GARDEN': np.sum, 'LIQUOR,WINE,BEER': np.sum,
       'MEAT - FRESH & FROZEN': np.sum, 'MEDIA AND GAMING': np.sum, 'MENS WEAR': np.sum, 'MENSWEAR': np.sum,
       'OFFICE SUPPLIES': np.sum, 'OPTICAL - FRAMES': np.sum, 'OPTICAL - LENSES': np.sum,
       'OTHER DEPARTMENTS': np.sum, 'PAINT AND ACCESSORIES': np.sum, 'PERSONAL CARE': np.sum,
       'PETS AND SUPPLIES': np.sum, 'PHARMACY OTC': np.sum, 'PHARMACY RX': np.sum,
       'PLAYERS AND ELECTRONICS': np.sum, 'PLUS AND MATERNITY': np.sum, 'PRE PACKED DELI': np.sum,
       'PRODUCE': np.sum, 'SEAFOOD': np.sum, 'SEASONAL': np.sum, 'SERVICE DELI': np.sum, 'SHEER HOSIERY': np.sum,
       'SHOES': np.sum, 'SLEEPWEAR/FOUNDATIONS': np.sum, 'SPORTING GOODS': np.sum,
       'SWIMWEAR/OUTERWEAR': np.sum, 'TOYS': np.sum, 'WIRELESS': np.sum})
    data = grouped[["TripType", "Weekday", "NumItems", "Return",'1-HR PHOTO', 'ACCESSORIES',
           'AUTOMOTIVE', 'BAKERY', 'BATH AND SHOWER', 'BEAUTY', 'BEDDING',
           'BOOKS AND MAGAZINES', 'BOYS WEAR', 'BRAS & SHAPEWEAR',
           'CAMERAS AND SUPPLIES', 'CANDY, TOBACCO, COOKIES', 'CELEBRATION',
           'COMM BREAD', 'CONCEPT STORES', 'COOK AND DINE', 'DAIRY', 'DSD GROCERY',
           'ELECTRONICS', 'FABRICS AND CRAFTS', 'FINANCIAL SERVICES',
           'FROZEN FOODS', 'FURNITURE', 'GIRLS WEAR, 4-6X  AND 7-14',
           'GROCERY DRY GOODS', 'HARDWARE', 'HEALTH AND BEAUTY AIDS', 'HOME DECOR',
           'HOME MANAGEMENT', 'HORTICULTURE AND ACCESS',
           'HOUSEHOLD CHEMICALS/SUPP', 'HOUSEHOLD PAPER GOODS',
           'IMPULSE MERCHANDISE', 'INFANT APPAREL', 'INFANT CONSUMABLE HARDLINES',
           'JEWELRY AND SUNGLASSES', 'LADIES SOCKS', 'LADIESWEAR',
           'LARGE HOUSEHOLD GOODS', 'LAWN AND GARDEN', 'LIQUOR,WINE,BEER',
           'MEAT - FRESH & FROZEN', 'MEDIA AND GAMING', 'MENS WEAR', 'MENSWEAR',
           'OFFICE SUPPLIES', 'OPTICAL - FRAMES', 'OPTICAL - LENSES',
           'OTHER DEPARTMENTS', 'PAINT AND ACCESSORIES', 'PERSONAL CARE',
           'PETS AND SUPPLIES', 'PHARMACY OTC', 'PHARMACY RX',
           'PLAYERS AND ELECTRONICS', 'PLUS AND MATERNITY', 'PRE PACKED DELI',
           'PRODUCE', 'SEAFOOD', 'SEASONAL', 'SERVICE DELI', 'SHEER HOSIERY',
           'SHOES', 'SLEEPWEAR/FOUNDATIONS', 'SPORTING GOODS',
           'SWIMWEAR/OUTERWEAR', 'TOYS', 'WIRELESS']]
    features = ["Weekday", "NumItems", "Return", 'ACCESSORIES',
       'AUTOMOTIVE', 'BAKERY', 'BATH AND SHOWER', 'BEAUTY', 'BEDDING',
       'BOOKS AND MAGAZINES', 'BOYS WEAR', 'BRAS & SHAPEWEAR',
       'CAMERAS AND SUPPLIES', 'CANDY, TOBACCO, COOKIES', 'CELEBRATION',
       'COMM BREAD', 'CONCEPT STORES', 'COOK AND DINE', 'DAIRY', 'DSD GROCERY',
       'ELECTRONICS', 'FABRICS AND CRAFTS', 'FINANCIAL SERVICES',
       'FROZEN FOODS', 'FURNITURE', 'GIRLS WEAR, 4-6X  AND 7-14',
       'GROCERY DRY GOODS', 'HARDWARE', 'HOME DECOR',
       'HOME MANAGEMENT', 'HORTICULTURE AND ACCESS',
       'HOUSEHOLD CHEMICALS/SUPP', 'HOUSEHOLD PAPER GOODS',
       'IMPULSE MERCHANDISE', 'INFANT APPAREL', 'INFANT CONSUMABLE HARDLINES',
       'JEWELRY AND SUNGLASSES', 'LADIES SOCKS', 'LADIESWEAR',
       'LARGE HOUSEHOLD GOODS', 'LAWN AND GARDEN', 'LIQUOR,WINE,BEER',
       'MEAT - FRESH & FROZEN', 'MEDIA AND GAMING', 'MENS WEAR', 'MENSWEAR',
       'OFFICE SUPPLIES', 'OPTICAL - FRAMES', 'OPTICAL - LENSES',
       'OTHER DEPARTMENTS', 'PAINT AND ACCESSORIES', 'PERSONAL CARE',
       'PETS AND SUPPLIES', 'PHARMACY OTC', 'PHARMACY RX',
       'PLAYERS AND ELECTRONICS', 'PLUS AND MATERNITY', 'PRE PACKED DELI',
       'PRODUCE', 'SEAFOOD', 'SEASONAL', 'SERVICE DELI', 'SHEER HOSIERY',
       'SHOES', 'SLEEPWEAR/FOUNDATIONS', 'SPORTING GOODS',
       'SWIMWEAR/OUTERWEAR', 'TOYS', 'WIRELESS']
    data['TripType'] = data['TripType'].map({3:0, 4:1, 5:2, 6:3, 7:4, 8:5, 9:6, 12:7, 14:8, 15:9,  
                                         18:10, 19:11, 20:12, 21:13, 22:14, 23:15, 24:16, 25:17, 26:18,  
                                         27:19, 28:20, 29:21, 30:22, 31:23, 32:24, 33:25, 34:26, 35:27, 
                                         36:28, 37:29, 38:30, 39:31, 40:32, 41:33, 42:34, 43:35, 44:36, 999:37})
    return data

In [8]:
def transform_w_data(data):
    dummies = pd.get_dummies(data.DepartmentDescription)
    data[dummies.columns] = dummies 
    data['Weekday'] = data['Weekday'].map({"Monday": 1, "Tuesday": 2, "Wednesday": 3, "Thursday": 4, "Friday": 5,
                                           "Saturday": 6, "Sunday": 7})
    data_dummies = data.iloc[:,7:]
    data_dummies = data_dummies.apply(lambda x: x*data["ScanCount"])
    data_dummies = data_dummies.replace(-0,0)
    data.loc[data.ScanCount < 0, 'Return'] = 1
    data.loc[data.Return != 1, 'Return'] = 0
    data = data[["VisitNumber", "Weekday", "ScanCount", "Return"]]
    data = data.rename(columns={"ScanCount":"NumItems"})
    data = pd.concat([data, data_dummies], axis=1)
    grouped = data.groupby("VisitNumber")
    grouped = grouped.agg({'Weekday': np.max, 'NumItems': np.sum, 'Return': np.max, 
                'ACCESSORIES': np.sum,
           'AUTOMOTIVE': np.sum, 'BAKERY': np.sum, 'BATH AND SHOWER': np.sum, 'BEAUTY': np.sum, 'BEDDING': np.sum,
           'BOOKS AND MAGAZINES': np.sum, 'BOYS WEAR': np.sum, 'BRAS & SHAPEWEAR': np.sum,
           'CAMERAS AND SUPPLIES': np.sum, 'CANDY, TOBACCO, COOKIES': np.sum, 'CELEBRATION': np.sum,
           'COMM BREAD': np.sum, 'CONCEPT STORES': np.sum, 'COOK AND DINE': np.sum, 'DAIRY': np.sum, 'DSD GROCERY': np.sum,
           'ELECTRONICS': np.sum, 'FABRICS AND CRAFTS': np.sum, 'FINANCIAL SERVICES': np.sum,
           'FROZEN FOODS': np.sum, 'FURNITURE': np.sum, 'GIRLS WEAR, 4-6X  AND 7-14': np.sum,
           'GROCERY DRY GOODS': np.sum, 'HARDWARE': np.sum, 'HOME DECOR': np.sum,
           'HOME MANAGEMENT': np.sum, 'HORTICULTURE AND ACCESS': np.sum,
           'HOUSEHOLD CHEMICALS/SUPP': np.sum, 'HOUSEHOLD PAPER GOODS': np.sum,
           'IMPULSE MERCHANDISE': np.sum, 'INFANT APPAREL': np.sum, 'INFANT CONSUMABLE HARDLINES': np.sum,
           'JEWELRY AND SUNGLASSES': np.sum, 'LADIES SOCKS': np.sum, 'LADIESWEAR': np.sum,
           'LARGE HOUSEHOLD GOODS': np.sum, 'LAWN AND GARDEN': np.sum, 'LIQUOR,WINE,BEER': np.sum,
           'MEAT - FRESH & FROZEN': np.sum, 'MEDIA AND GAMING': np.sum, 'MENS WEAR': np.sum, 'MENSWEAR': np.sum,
           'OFFICE SUPPLIES': np.sum, 'OPTICAL - FRAMES': np.sum, 'OPTICAL - LENSES': np.sum,
           'OTHER DEPARTMENTS': np.sum, 'PAINT AND ACCESSORIES': np.sum, 'PERSONAL CARE': np.sum,
           'PETS AND SUPPLIES': np.sum, 'PHARMACY OTC': np.sum, 'PHARMACY RX': np.sum,
           'PLAYERS AND ELECTRONICS': np.sum, 'PLUS AND MATERNITY': np.sum, 'PRE PACKED DELI': np.sum,
           'PRODUCE': np.sum, 'SEAFOOD': np.sum, 'SEASONAL': np.sum, 'SERVICE DELI': np.sum, 'SHEER HOSIERY': np.sum,
           'SHOES': np.sum, 'SLEEPWEAR/FOUNDATIONS': np.sum, 'SPORTING GOODS': np.sum,
           'SWIMWEAR/OUTERWEAR': np.sum, 'TOYS': np.sum, 'WIRELESS': np.sum})
    data = grouped[["Weekday", "NumItems", "Return", 'ACCESSORIES',
               'AUTOMOTIVE', 'BAKERY', 'BATH AND SHOWER', 'BEAUTY', 'BEDDING',
               'BOOKS AND MAGAZINES', 'BOYS WEAR', 'BRAS & SHAPEWEAR',
               'CAMERAS AND SUPPLIES', 'CANDY, TOBACCO, COOKIES', 'CELEBRATION',
               'COMM BREAD', 'CONCEPT STORES', 'COOK AND DINE', 'DAIRY', 'DSD GROCERY',
               'ELECTRONICS', 'FABRICS AND CRAFTS', 'FINANCIAL SERVICES',
               'FROZEN FOODS', 'FURNITURE', 'GIRLS WEAR, 4-6X  AND 7-14',
               'GROCERY DRY GOODS', 'HARDWARE', 'HOME DECOR',
               'HOME MANAGEMENT', 'HORTICULTURE AND ACCESS',
               'HOUSEHOLD CHEMICALS/SUPP', 'HOUSEHOLD PAPER GOODS',
               'IMPULSE MERCHANDISE', 'INFANT APPAREL', 'INFANT CONSUMABLE HARDLINES',
               'JEWELRY AND SUNGLASSES', 'LADIES SOCKS', 'LADIESWEAR',
               'LARGE HOUSEHOLD GOODS', 'LAWN AND GARDEN', 'LIQUOR,WINE,BEER',
               'MEAT - FRESH & FROZEN', 'MEDIA AND GAMING', 'MENS WEAR', 'MENSWEAR',
               'OFFICE SUPPLIES', 'OPTICAL - FRAMES', 'OPTICAL - LENSES',
               'OTHER DEPARTMENTS', 'PAINT AND ACCESSORIES', 'PERSONAL CARE',
               'PETS AND SUPPLIES', 'PHARMACY OTC', 'PHARMACY RX',
               'PLAYERS AND ELECTRONICS', 'PLUS AND MATERNITY', 'PRE PACKED DELI',
               'PRODUCE', 'SEAFOOD', 'SEASONAL', 'SERVICE DELI', 'SHEER HOSIERY',
               'SHOES', 'SLEEPWEAR/FOUNDATIONS', 'SPORTING GOODS',
               'SWIMWEAR/OUTERWEAR', 'TOYS', 'WIRELESS']]
    #data['TripType'] = data['TripType'].map({3:0, 4:1, 5:2, 6:3, 7:4, 8:5, 9:6, 12:7, 14:8, 15:9,  
                                         #18:10, 19:11, 20:12, 21:13, 22:14, 23:15, 24:16, 25:17, 26:18,  
                                         #27:19, 28:20, 29:21, 30:22, 31:23, 32:24, 33:25, 34:26, 35:27, 
                                         #36:28, 37:29, 38:30, 39:31, 40:32, 41:33, 42:34, 43:35, 44:36, 999:37})
    return data

In [9]:
data = transform_my_data(data_original)

In [10]:
def create_sparse_matrix(original_data):
    dummies = pd.get_dummies(original_data.FinelineNumber)
    dummies = dummies[data_original[data_original.FinelineNumber < 200].FinelineNumber.unique()]
    data_visit = original_data[["VisitNumber"]]
    dummy_data = pd.concat([dummies, data_visit], axis=1)
    dummy_data = dummy_data.groupby("VisitNumber")
    total_dummy_data = dummy_data.aggregate(np.sum)

    dummies = pd.get_dummies(original_data.FinelineNumber)
    for x in range(200,9000,100):
        dummies1 = dummies[data_original[(data_original.FinelineNumber >= x) & (data_original.FinelineNumber < x + 100)].FinelineNumber.unique()]
        data_visit = original_data[["VisitNumber"]]
        dummy_data = pd.concat([dummies1, data_visit], axis=1)
        dummy_data = dummy_data.groupby("VisitNumber")
        dummy_data = dummy_data.aggregate(np.sum)
        total_dummy_data = hstack((total_dummy_data, dummy_data))
    return total_dummy_data

In [11]:
from scipy import sparse

In [12]:
from scipy.sparse import hstack

In [13]:
sparse_matrix_10000 = create_sparse_matrix(data_original)

In [14]:
sparse_matrix_10000

<95674x4933 sparse matrix of type '<class 'numpy.float64'>'
	with 525471 stored elements in COOrdinate format>

In [15]:
data_sparse = sparse.bsr_matrix(data)

In [16]:
data_total = hstack((data_sparse, sparse_matrix_10000))

In [17]:
data_total

<95674x5005 sparse matrix of type '<class 'numpy.float64'>'
	with 1136314 stored elements in COOrdinate format>

In [18]:
import xgboost as xgb
from sklearn.cross_validation import train_test_split
from sklearn.metrics import log_loss

  if 'order' in inspect.getargspec(np.copy)[0]:


In [19]:
from numpy import ndarray

In [20]:
import scipy

In [21]:
mytrain, mytest = train_test_split(data_total, test_size = .4)

In [22]:
dtrain = xgb.DMatrix(np.asarray(scipy.sparse.bsr_matrix.todense(mytrain[:,1:])), 
                     label = np.asarray(scipy.sparse.bsr_matrix.todense(mytrain[:,0:1])))
dtest = xgb.DMatrix(np.asarray(scipy.sparse.bsr_matrix.todense(mytest[:,1:])), 
                     label = np.asarray(scipy.sparse.bsr_matrix.todense(mytest[:,0:1])))

In [23]:
num_round = 200
param = {'objective': 'multi:softprob', 'num_class':38, 
         'eval_metric': 'mlogloss', "max_delta_step": 5}
param1 = {'objective': 'multi:softprob', 'num_class':38, 
          "max_delta_step": 5}
watchlist = [(dtrain,'train'), (dtest, 'eval')]

In [24]:
bst = xgb.train(param, dtrain, num_round, watchlist, 
                early_stopping_rounds=3)

Will train until eval error hasn't decreased in 3 rounds.
[0]	train-mlogloss:2.418286	eval-mlogloss:2.451951
[1]	train-mlogloss:1.827536	eval-mlogloss:1.887070
[2]	train-mlogloss:1.557440	eval-mlogloss:1.633707
[3]	train-mlogloss:1.375843	eval-mlogloss:1.463932
[4]	train-mlogloss:1.245261	eval-mlogloss:1.343406
[5]	train-mlogloss:1.145715	eval-mlogloss:1.252771
[6]	train-mlogloss:1.067957	eval-mlogloss:1.183301
[7]	train-mlogloss:1.005965	eval-mlogloss:1.128171
[8]	train-mlogloss:0.954837	eval-mlogloss:1.083696
[9]	train-mlogloss:0.912663	eval-mlogloss:1.047915
[10]	train-mlogloss:0.876510	eval-mlogloss:1.017767
[11]	train-mlogloss:0.844441	eval-mlogloss:0.991749
[12]	train-mlogloss:0.816946	eval-mlogloss:0.969792
[13]	train-mlogloss:0.794031	eval-mlogloss:0.951881
[14]	train-mlogloss:0.772917	eval-mlogloss:0.935496
[15]	train-mlogloss:0.752720	eval-mlogloss:0.920314
[16]	train-mlogloss:0.736034	eval-mlogloss:0.908151
[17]	train-mlogloss:0.720725	eval-mlogloss:0.897533
[18]	train-mlogl

In [25]:
my_test = xgb.DMatrix(np.asarray(scipy.sparse.bsr_matrix.todense(mytest)[:,1:]))
test_predictions = bst.predict(my_test)

In [26]:
log_loss(scipy.sparse.bsr_matrix.todense(mytest[:,0:1]), test_predictions).round(5)

0.73895

In [27]:
bst1 = xgb.train(param1, dtrain, num_round, watchlist, 
                early_stopping_rounds=3)

Will train until eval error hasn't decreased in 3 rounds.
[0]	train-merror:0.427113	eval-merror:0.440397
[1]	train-merror:0.331040	eval-merror:0.356624
[2]	train-merror:0.298254	eval-merror:0.329240
[3]	train-merror:0.282280	eval-merror:0.316749
[4]	train-merror:0.274563	eval-merror:0.311706
[5]	train-merror:0.269180	eval-merror:0.309093
[6]	train-merror:0.260766	eval-merror:0.303554
[7]	train-merror:0.257439	eval-merror:0.301280
[8]	train-merror:0.253937	eval-merror:0.299582
[9]	train-merror:0.249181	eval-merror:0.297779
[10]	train-merror:0.245558	eval-merror:0.295924
[11]	train-merror:0.240175	eval-merror:0.292292
[12]	train-merror:0.236029	eval-merror:0.290123
[13]	train-merror:0.232196	eval-merror:0.287510
[14]	train-merror:0.227754	eval-merror:0.285446
[15]	train-merror:0.224375	eval-merror:0.283825
[16]	train-merror:0.221518	eval-merror:0.282493
[17]	train-merror:0.219148	eval-merror:0.281552
[18]	train-merror:0.216832	eval-merror:0.280873
[19]	train-merror:0.213800	eval-merror:0

74% accuracy - improving by 2%

In [28]:
w_test_data_original = pd.read_csv("test 2 (1).csv")
w_test_data = pd.read_csv("test 2 (1).csv")


In [29]:
test = transform_w_data(w_test_data)

In [30]:
def add_category_counts_test(data):
    alist = []
    for array in np.asarray(data.iloc[:,3:]):
        count = 0
        for item in array:
            if item > 0:
                count += 1
        alist.append(count)
    cat_counts = pd.DataFrame(alist)
    cat_counts = cat_counts.rename(columns={0:"CategoryCount"})
    cat_counts = cat_counts.set_index(data.index)
    data.insert(3, 'CategoryCounts', cat_counts)
    return data

In [31]:
test = add_category_counts_test(test)

In [32]:
test_data_sparse = sparse.bsr_matrix(test)

In [33]:
#original_test_data_sparse = create_sparse_matrix(w_test_data_original)

In [34]:
walmart_test = xgb.DMatrix(np.asarray(test))
predictions = bst.predict(walmart_test)

In [35]:
def predictions_to_csv(test_predictions):
    test_predictions = pd.DataFrame(test_predictions)
    test_indexes = test.index
    test_predictions = test_predictions.rename(columns = {0:"TripType_3", 1: "TripType_4", 2: "TripType_5", 3: "TripType_6", 
                4: "TripType_7", 5: "TripType_8", 6: "TripType_9", 7: "TripType_12", 8: "TripType_14", 
                9: "TripType_15", 10: "TripType_18", 11: "TripType_19", 12: "TripType_20", 13: "TripType_21",
                14: "TripType_22", 15: "TripType_23", 16: "TripType_24", 17: "TripType_25", 18: "TripType_26", 
                19: "TripType_27", 20: "TripType_28", 21: "TripType_29", 22: "TripType_30", 23: "TripType_31", 
                24: "TripType_32", 25: "TripType_33", 26: "TripType_34", 27: "TripType_35", 28: "TripType_36", 
                29: "TripType_37", 30: "TripType_38", 31: "TripType_39", 32: "TripType_40", 33: "TripType_41", 
                34: "TripType_42", 35: "TripType_43", 36: "TripType_44", 37: "TripType_999"})
    test_predictions.insert(0, 'VisitNumber', test_indexes)
    return test_predictions.to_csv("submissions/fifth_fineline_xgb.csv", index=False)

In [36]:
def create_sparse_matrix(original_data):
    dummies = pd.get_dummies(original_data.FinelineNumber)
    dummies = dummies[data_original[data_original.FinelineNumber < 200].FinelineNumber.unique()]
    data_visit = original_data[["VisitNumber"]]
    dummy_data = pd.concat([dummies, data_visit], axis=1)
    dummy_data = dummy_data.groupby("VisitNumber")
    total_dummy_data = dummy_data.aggregate(np.sum)

    dummies = pd.get_dummies(original_data.FinelineNumber)
    for x in range(200,400,100):
        dummies1 = dummies[data_original[(data_original.FinelineNumber >= x) & (data_original.FinelineNumber < x + 100)].FinelineNumber.unique()]
        data_visit = original_data[["VisitNumber"]]
        dummy_data = pd.concat([dummies1, data_visit], axis=1)
        dummy_data = dummy_data.groupby("VisitNumber")
        dummy_data = dummy_data.aggregate(np.sum)
        total_dummy_data = hstack((total_dummy_data, dummy_data))
    return total_dummy_data

In [48]:
    #dummies = pd.get_dummies(w_test_data_original.FinelineNumber)
    for x in range(300,400,100):
        dummies1 = dummies[w_test_data_original[(w_test_data_original.FinelineNumber >= x) & (w_test_data_original.FinelineNumber < x + 100)].FinelineNumber.unique()]
        data_visit = w_test_data_original[["VisitNumber"]]
        dummy_data = pd.concat([dummies1, data_visit], axis=1)
        dummy_data = dummy_data.groupby("VisitNumber")
        dummy_data = dummy_data.aggregate(np.sum)
        total_dummy_data = hstack((total_dummy_data, dummy_data))