In [1]:
import pandas as pd
import numpy as np
from pprint import pprint
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.metrics import precision_recall_fscore_support, log_loss

<h3>Random Forest</h3>

In [95]:
df_train = pd.read_csv('data/processed_data/processed_train.csv')
df_test = pd.read_csv('data/processed_data/processed_test.csv')

In [96]:
cols = df_train.columns
cols

Index(['VisitNumber', 'TripType', 'Weekday', 'Upc', 'ScanCount',
       'FinelineNumber', 'FLCount', 'VisitFLCount', 'NumPurchases',
       'NumReturns',
       ...
       'UPC_4900005010.0', 'UPC_60538862097.0', 'UPC_60538871457.0',
       'UPC_60538871461.0', 'UPC_60538896309.0', 'UPC_68113107862.0',
       'UPC_68113163351.0', 'UPC_7874235186.0', 'UPC_7874235187.0',
       'UPC_7874235188.0'],
      dtype='object', length=305)

In [97]:
df_train.isnull().sum()

VisitNumber                    0
TripType                       0
Weekday                        0
Upc                            0
ScanCount                      0
FinelineNumber                 0
FLCount                        0
VisitFLCount                   0
NumPurchases                   0
NumReturns                     0
VisitNumPurchases              0
VisitNumReturns                0
MaxD                           0
MeanD                          0
MinD                           0
MaxV                           0
MeanV                          0
MinV                           0
1-HR PHOTO                     0
ACCESSORIES                    0
AUTOMOTIVE                     0
BAKERY                         0
BATH AND SHOWER                0
BEAUTY                         0
BEDDING                        0
BOOKS AND MAGAZINES            0
BOYS WEAR                      0
BRAS & SHAPEWEAR               0
CAMERAS AND SUPPLIES           0
CANDY, TOBACCO, COOKIES        0
          

In [76]:
df_train = df_train.fillna(0)

In [77]:
X_train = df_train.filter(features, axis=1)
y_train = df_train.filter(['TripType'], axis=1)

In [134]:
clf = RandomForestClassifier()
pprint(clf.get_params())

{'bootstrap': True,
 'class_weight': None,
 'criterion': 'gini',
 'max_depth': None,
 'max_features': 'auto',
 'max_leaf_nodes': None,
 'min_impurity_decrease': 0.0,
 'min_impurity_split': None,
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'min_weight_fraction_leaf': 0.0,
 'n_estimators': 10,
 'n_jobs': 1,
 'oob_score': False,
 'random_state': None,
 'verbose': 0,
 'warm_start': False}


In [135]:
# Number of trees in random forest
n_estimators = [int(x) for x in np.linspace(start = 200, stop = 2000, num = 10)]
# Number of features to consider at every split
max_features = ['auto', 'sqrt']
# Maximum number of levels in tree
max_depth = [int(x) for x in np.linspace(10, 110, num = 11)]
max_depth.append(None)
# Minimum number of samples required to split a node
min_samples_split = [2, 5, 10]
# Minimum number of samples required at each leaf node
min_samples_leaf = [1, 2, 4]
# Method of selecting samples for training each tree
bootstrap = [True, False]
# Create the random grid
random_grid = {'n_estimators': n_estimators,
               'max_features': max_features,
               'max_depth': max_depth,
               'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf,
               'bootstrap': bootstrap}
pprint(random_grid)

{'bootstrap': [True, False],
 'max_depth': [10, 20, 30, 40, 50, 60, 70, 80, 90, 100, 110, None],
 'max_features': ['auto', 'sqrt'],
 'min_samples_leaf': [1, 2, 4],
 'min_samples_split': [2, 5, 10],
 'n_estimators': [200, 400, 600, 800, 1000, 1200, 1400, 1600, 1800, 2000]}


In [None]:
# Use the random grid to search for best hyperparameters
# First create the base model to tune
rf = RandomForestClassifier()
# Random search of parameters, using 3 fold cross validation, 
# search across 100 different combinations, and use all available cores
rf_random = RandomizedSearchCV(estimator = rf, param_distributions = random_grid, n_iter = 100, cv = 3, verbose=2, random_state=42, n_jobs = -1)
# Fit the random search model
rf_random.fit(X_train, y_train.values.ravel())

In [None]:
rf_random.best_params_

In [None]:
best_random = rf_random.best_estimator_

In [None]:
# Create the parameter grid based on the results of random search 
param_grid = {
    'bootstrap': [True],
    'max_depth': [80, 90, 100, 110],
    'max_features': [2, 3],
    'min_samples_leaf': [3, 4, 5],
    'min_samples_split': [8, 10, 12],
    'n_estimators': [100, 200, 300, 1000]
}
# Create a based model
rf = RandomForestClassifier()
# Instantiate the grid search model
grid_search = GridSearchCV(estimator = rf, param_grid = param_grid, 
                          cv = 3, n_jobs = -1, verbose = 2)

In [None]:
# Fit the grid search to the data
grid_search.fit(train_features, train_labels)
grid_search.best_params_
{'bootstrap': True,
 'max_depth': 80,
 'max_features': 3,
 'min_samples_leaf': 5,
 'min_samples_split': 12,
 'n_estimators': 100}
best_grid = grid_search.best_estimator_

Index(['TripType', 'Weekday', 'Upc', 'ScanCount', 'FinelineNumber', 'FLCount',
       'VisitFLCount', 'NumPurchases', 'NumReturns', 'VisitNumPurchases',
       ...
       'UPC_4900005010.0', 'UPC_60538862097.0', 'UPC_60538871457.0',
       'UPC_60538871461.0', 'UPC_60538896309.0', 'UPC_68113107862.0',
       'UPC_68113163351.0', 'UPC_7874235186.0', 'UPC_7874235187.0',
       'UPC_7874235188.0'],
      dtype='object', length=304)

In [80]:
df_test.shape

(95674, 303)

In [81]:
df_train.head()

Unnamed: 0_level_0,TripType,Weekday,Upc,ScanCount,FinelineNumber,FLCount,VisitFLCount,NumPurchases,NumReturns,VisitNumPurchases,...,UPC_4900005010.0,UPC_60538862097.0,UPC_60538871457.0,UPC_60538871461.0,UPC_60538896309.0,UPC_68113107862.0,UPC_68113163351.0,UPC_7874235186.0,UPC_7874235187.0,UPC_7874235188.0
VisitNumber,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
5,999.0,4.0,68113150000.0,-1.0,1000.0,26.0,1.0,10206.0,1205.0,0.0,...,,,,,,,,,,
7,30.0,4.0,33974810000.0,1.0,6717.5,390.5,2.0,25848.0,529.0,2.0,...,,,,,,,,,,
8,26.0,4.0,18660600000.0,1.217391,2594.347826,151.391304,16.0,10458.347826,180.347826,30.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9,8.0,4.0,356695900.0,1.0,1906.333333,373.0,3.0,46602.666667,712.666667,3.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
10,8.0,4.0,5669755000.0,1.0,3357.666667,263.333333,3.0,57757.333333,584.333333,3.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [48]:
df_try = pd.read_csv('data/test.csv', index_col=0)
df_try = df_try.groupby("VisitNumber")
len(df_try.groups)

95674

In [7]:
cols = df_train.columns
print(cols)

Index(['TripType', 'Weekday', 'Upc', 'ScanCount', 'FinelineNumber', 'FLCount',
       'VisitFLCount', 'NumPurchases', 'NumReturns', 'VisitNumPurchases',
       ...
       'UPC_4900005010.0', 'UPC_60538862097.0', 'UPC_60538871457.0',
       'UPC_60538871461.0', 'UPC_60538896309.0', 'UPC_68113107862.0',
       'UPC_68113163351.0', 'UPC_7874235186.0', 'UPC_7874235187.0',
       'UPC_7874235188.0'],
      dtype='object', length=304)


In [75]:
features=[]
#features = [i for i in cols[7:-1]]
#features.append()
features.extend(list(cols[2:]))
print(features)

['Weekday', 'Upc', 'ScanCount', 'FinelineNumber', 'FLCount', 'VisitFLCount', 'NumPurchases', 'NumReturns', 'VisitNumPurchases', 'VisitNumReturns', 'MaxD', 'MeanD', 'MinD', 'MaxV', 'MeanV', 'MinV', '1-HR PHOTO', 'ACCESSORIES', 'AUTOMOTIVE', 'BAKERY', 'BATH AND SHOWER', 'BEAUTY', 'BEDDING', 'BOOKS AND MAGAZINES', 'BOYS WEAR', 'BRAS & SHAPEWEAR', 'CAMERAS AND SUPPLIES', 'CANDY, TOBACCO, COOKIES', 'CELEBRATION', 'COMM BREAD', 'CONCEPT STORES', 'COOK AND DINE', 'DAIRY', 'DSD GROCERY', 'ELECTRONICS', 'FABRICS AND CRAFTS', 'FINANCIAL SERVICES', 'FROZEN FOODS', 'FURNITURE', 'GIRLS WEAR, 4-6X  AND 7-14', 'GROCERY DRY GOODS', 'HARDWARE', 'HEALTH AND BEAUTY AIDS', 'HOME DECOR', 'HOME MANAGEMENT', 'HORTICULTURE AND ACCESS', 'HOUSEHOLD CHEMICALS/SUPP', 'HOUSEHOLD PAPER GOODS', 'IMPULSE MERCHANDISE', 'INFANT APPAREL', 'INFANT CONSUMABLE HARDLINES', 'JEWELRY AND SUNGLASSES', 'LADIES SOCKS', 'LADIESWEAR', 'LARGE HOUSEHOLD GOODS', 'LAWN AND GARDEN', 'LIQUOR,WINE,BEER', 'MEAT - FRESH & FROZEN', 'MEDIA A

In [9]:
#X = df.iloc[:, 2]
X_train = df_train.filter(features, axis=1)
y_train = df_train.filter(['TripType'], axis=1)
#X_train.columns

In [10]:
X_test = df_test.filter(features, axis=1)
len(X_test.columns)
#df_test.columns

303

In [25]:
df_train['TripType'].unique()

array([ 999.,   30.,   26.,    8.,   35.,   41.,   21.,    6.,   42.,
          7.,    9.,   39.,   25.,   38.,   15.,   36.,   20.,   37.,
         32.,   40.,    5.,    3.,    4.,   24.,   33.,   43.,   31.,
         27.,   34.,   18.,   29.,   44.,   19.,   23.,   22.,   28.,
         14.,   12.])

In [78]:
df = df_train
df.head()

Unnamed: 0,VisitNumber,TripType,Weekday,Upc,ScanCount,FinelineNumber,FLCount,VisitFLCount,NumPurchases,NumReturns,...,UPC_4900005010.0,UPC_60538862097.0,UPC_60538871457.0,UPC_60538871461.0,UPC_60538896309.0,UPC_68113107862.0,UPC_68113163351.0,UPC_7874235186.0,UPC_7874235187.0,UPC_7874235188.0
0,5,999.0,4.0,68113150000.0,-1.0,1000.0,26.0,1.0,10206.0,1205.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,7,30.0,4.0,33974810000.0,1.0,6717.5,390.5,2.0,25848.0,529.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,8,26.0,4.0,18660600000.0,1.217391,2594.347826,151.391304,16.0,10458.347826,180.347826,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,9,8.0,4.0,356695900.0,1.0,1906.333333,373.0,3.0,46602.666667,712.666667,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,10,8.0,4.0,5669755000.0,1.0,3357.666667,263.333333,3.0,57757.333333,584.333333,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [79]:
X = df.filter(features, axis=1)
y = df.filter(['TripType'], axis=1)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)

In [80]:
X.shape, y.shape

((95674, 303), (95674, 1))

In [81]:
X_train.isnull().sum()

Weekday                        0
Upc                            0
ScanCount                      0
FinelineNumber                 0
FLCount                        0
VisitFLCount                   0
NumPurchases                   0
NumReturns                     0
VisitNumPurchases              0
VisitNumReturns                0
MaxD                           0
MeanD                          0
MinD                           0
MaxV                           0
MeanV                          0
MinV                           0
1-HR PHOTO                     0
ACCESSORIES                    0
AUTOMOTIVE                     0
BAKERY                         0
BATH AND SHOWER                0
BEAUTY                         0
BEDDING                        0
BOOKS AND MAGAZINES            0
BOYS WEAR                      0
BRAS & SHAPEWEAR               0
CAMERAS AND SUPPLIES           0
CANDY, TOBACCO, COOKIES        0
CELEBRATION                    0
COMM BREAD                     0
          

In [91]:
#clf = RandomForestClassifier(max_depth=25, max_features=200, class_weight="balanced")
#clf = RandomForestClassifier(max_depth=15, n_estimators=300)
clf = RandomForestClassifier(max_depth=20)
clf.fit(X_train, y_train.values.ravel())
preds = clf.predict(X_test)
predictions = [round(value) for value in preds]
precision_recall_fscore_support(y_test, predictions, average='micro')

(0.6581889589205967, 0.6581889589205967, 0.6581889589205967, None)

In [92]:
pred = clf.predict_proba(X_test)
eval = log_loss(y_test,pred) 
eval

1.2973308689419405

In [93]:
clf.fit(X, y.values.ravel())
preds = clf.predict(df_test)
predictions = [round(value) for value in preds]
p = clf.predict_proba(df_test)

ValueError: Number of features of the model must match the input. Model n_features is 303 and input n_features is 304 

In [61]:
df_test.columns

Index(['Weekday', 'Upc', 'ScanCount', 'FinelineNumber', 'FLCount',
       'VisitFLCount', 'NumPurchases', 'NumReturns', 'VisitNumPurchases',
       'VisitNumReturns',
       ...
       'UPC_4900005010.0', 'UPC_60538862097.0', 'UPC_60538871457.0',
       'UPC_60538871461.0', 'UPC_60538896309.0', 'UPC_68113107862.0',
       'UPC_68113163351.0', 'UPC_7874235186.0', 'UPC_7874235187.0',
       'UPC_7874235188.0'],
      dtype='object', length=303)

In [52]:
targets = list(np.unique(y_train))
visits = df_test.VisitNumber.values #data2 is test dataframe
targets = ["TripType_"+str(int(i)) for i in targets]
out = pd.DataFrame(p,columns = targets,index=None) #p is predicted probs
out.insert(0,'VisitNumber',visits)
out.to_csv("output_rf.csv",index=False)

AttributeError: 'DataFrame' object has no attribute 'VisitNumber'

In [81]:
from sklearn.naive_bayes import GaussianNB
clf = GaussianNB()
clf.fit(X_train, y_train.values.ravel())
preds = clf.predict(X_test)
predictions = [round(value) for value in preds]
precision_recall_fscore_support(y_test, predictions, average='micro')

(0.20136550260644923, 0.20136550260644923, 0.20136550260644923, None)

In [82]:
pred = clf.predict_proba(X_test)
eval = log_loss(y_test,pred) 
eval

3.1272752529910313

In [54]:
from xgboost import XGBClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

In [58]:
model = XGBClassifier()
model.fit(X_train, y_train.values.ravel())
# make predictions for test data
y_pred = model.predict(X_test)
predictions = [round(value) for value in y_pred]

In [59]:
accuracy = accuracy_score(y_test, predictions)
print("Accuracy: %.2f%%" % (accuracy * 100.0))
pred = clf.predict_proba(X_test)
eval = log_loss(y_test,pred) 
eval

Accuracy: 70.63%


3.1272752529910517

In [139]:
vals = list(zip(features, clf.feature_importances_))
#print(vals)
print(sorted(vals, reverse=True))

[('Weekday', 0.04580812797403936), ('VisitNumReturns', 0.022180573917955349), ('VisitNumPurchases', 0.12557592799238396), ('VisitFLCount', 0.097764446706796099), ('TOYS', 0.0092241222733867329), ('SWIMWEAR/OUTERWEAR', 0.0017032293879948406), ('SPORTING GOODS', 0.0076634358560197817), ('SLEEPWEAR/FOUNDATIONS', 0.0041689378376874725), ('SHOES', 0.009525330115179264), ('SHEER HOSIERY', 0.0010433489298120162), ('SERVICE DELI', 0.011764841418872759), ('SEASONAL', 0.00010598555115685646), ('SEAFOOD', 0.0037283903393144058), ('PRODUCE', 0.042135312608029815), ('PRE PACKED DELI', 0.017999823856351727), ('PLUS AND MATERNITY', 0.0009546819340758763), ('PLAYERS AND ELECTRONICS', 0.0011833903401876776), ('PHARMACY RX', 0.00019449737712353655), ('PHARMACY OTC', 0.021694788348997414), ('PETS AND SUPPLIES', 0.010262749888948226), ('PERSONAL CARE', 0.028096296360971783), ('PAINT AND ACCESSORIES', 0.0029437348788393693), ('OTHER DEPARTMENTS', 9.0730426893990501e-05), ('OPTICAL - LENSES', 0.000158750522

In [3]:
df = pd.read_csv("data/train.csv")

In [20]:
df

Unnamed: 0,TripType,VisitNumber,Weekday,Upc,ScanCount,DepartmentDescription,FinelineNumber
0,999,5,Friday,6.811315e+10,-1,FINANCIAL SERVICES,1000.0
1,30,7,Friday,6.053882e+10,1,SHOES,8931.0
2,30,7,Friday,7.410811e+09,1,PERSONAL CARE,4504.0
3,26,8,Friday,2.238404e+09,2,PAINT AND ACCESSORIES,3565.0
4,26,8,Friday,2.006614e+09,2,PAINT AND ACCESSORIES,1017.0
5,26,8,Friday,2.006619e+09,2,PAINT AND ACCESSORIES,1017.0
6,26,8,Friday,2.006614e+09,1,PAINT AND ACCESSORIES,1017.0
7,26,8,Friday,7.004803e+09,1,PAINT AND ACCESSORIES,2802.0
8,26,8,Friday,2.238495e+09,1,PAINT AND ACCESSORIES,4501.0
9,26,8,Friday,2.238400e+09,-1,PAINT AND ACCESSORIES,3565.0


In [21]:
freq_upc = df['Upc'].value_counts().keys()[:20]
freq_upc_features = ["UPC_"+str(upc) for upc in freq_upc]
freq_upc_features.extend(["INDICATOR_UPC_"+str(upc) for upc in freq_upc])
freq_upc_features

['UPC_4011.0',
 'UPC_60538862097.0',
 'UPC_7874235186.0',
 'UPC_7874235187.0',
 'UPC_4046.0',
 'UPC_68113107862.0',
 'UPC_60538871457.0',
 'UPC_3338320027.0',
 'UPC_4087.0',
 'UPC_60538871461.0',
 'UPC_4900000044.0',
 'UPC_4062.0',
 'UPC_4065.0',
 'UPC_4900003165.0',
 'UPC_3338365020.0',
 'UPC_7874235188.0',
 'UPC_4900005010.0',
 'UPC_68113163351.0',
 'UPC_60538896309.0',
 'UPC_4078.0',
 'INDICATOR_UPC_4011.0',
 'INDICATOR_UPC_60538862097.0',
 'INDICATOR_UPC_7874235186.0',
 'INDICATOR_UPC_7874235187.0',
 'INDICATOR_UPC_4046.0',
 'INDICATOR_UPC_68113107862.0',
 'INDICATOR_UPC_60538871457.0',
 'INDICATOR_UPC_3338320027.0',
 'INDICATOR_UPC_4087.0',
 'INDICATOR_UPC_60538871461.0',
 'INDICATOR_UPC_4900000044.0',
 'INDICATOR_UPC_4062.0',
 'INDICATOR_UPC_4065.0',
 'INDICATOR_UPC_4900003165.0',
 'INDICATOR_UPC_3338365020.0',
 'INDICATOR_UPC_7874235188.0',
 'INDICATOR_UPC_4900005010.0',
 'INDICATOR_UPC_68113163351.0',
 'INDICATOR_UPC_60538896309.0',
 'INDICATOR_UPC_4078.0']

In [17]:
freq_fineline = df['FinelineNumber'].value_counts().keys()[:20]
freq_fineline_features = ["FL_"+str(fl) for fl in freq_fineline]
freq_fineline_features.extend(["INDICATOR_FL_"+str(fl) for fl in freq_fineline]) 
freq_fineline_features;

In [16]:
if 5502 in freq_fineline:
    print("Y")

In [25]:
df_groups = df.groupby("VisitNumber").groups
df_groups

{5: Int64Index([0], dtype='int64'),
 7: Int64Index([1, 2], dtype='int64'),
 8: Int64Index([ 3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19,
             20, 21, 22, 23, 24, 25],
            dtype='int64'),
 9: Int64Index([26, 27, 28], dtype='int64'),
 10: Int64Index([29, 30, 31], dtype='int64'),
 11: Int64Index([32, 33, 34, 35], dtype='int64'),
 12: Int64Index([36, 37, 38, 39, 40, 41, 42], dtype='int64'),
 15: Int64Index([43, 44, 45, 46, 47, 48, 49, 50], dtype='int64'),
 17: Int64Index([51, 52, 53, 54], dtype='int64'),
 19: Int64Index([55, 56, 57, 58, 59, 60, 61, 62, 63], dtype='int64'),
 20: Int64Index([64, 65], dtype='int64'),
 23: Int64Index([66, 67], dtype='int64'),
 25: Int64Index([68, 69, 70], dtype='int64'),
 26: Int64Index([71, 72, 73, 74, 75, 76, 77, 78, 79], dtype='int64'),
 28: Int64Index([80, 81, 82, 83, 84, 85, 86, 87], dtype='int64'),
 29: Int64Index([88], dtype='int64'),
 30: Int64Index([89], dtype='int64'),
 31: Int64Index([90, 91], dtype='int64'),
 3

In [26]:
data = []
for visit, ids in df_groups.items():
    #print(dept_desc)
    #indicator_upc = 0
    #indicator_fl = 0
    for idx in ids:
        upc_count = 0
        fl_count = 0
        if df.loc[idx]['Upc'] in freq_upc:
            #indicator_upc = 1
            upc_count = 1
            feat1 = "UPC_"+str(df.loc[idx]['Upc'])
            feat2 = "INDICATOR_UPC_"+str(df.loc[idx]['Upc'])
            data.append({"VisitNumber":visit, feat1:upc_count, feat2:1})
        if df.loc[idx]['FinelineNumber'] in freq_fineline:
            #indicator_upc = 1
            fl_count = 1
            feat1 = "FL_"+str(df.loc[idx]['FinelineNumber'])
            feat2 = "INDICATOR_FL_"+str(df.loc[idx]['FinelineNumber'])
            data.append({"VisitNumber":visit, feat1:fl_count, feat2:1})

In [66]:
data

[{'FL_4606.0': 1, 'INDICATOR_FL_4606.0': 1, 'VisitNumber': 8},
 {'FL_115.0': 1, 'INDICATOR_FL_115.0': 1, 'VisitNumber': 9},
 {'INDICATOR_UPC_4011.0': 1, 'UPC_4011.0': 1, 'VisitNumber': 9},
 {'FL_5501.0': 1, 'INDICATOR_FL_5501.0': 1, 'VisitNumber': 9},
 {'FL_115.0': 1, 'INDICATOR_FL_115.0': 1, 'VisitNumber': 10},
 {'FL_808.0': 1, 'INDICATOR_FL_808.0': 1, 'VisitNumber': 11},
 {'FL_110.0': 1, 'INDICATOR_FL_110.0': 1, 'VisitNumber': 11},
 {'FL_203.0': 1, 'INDICATOR_FL_203.0': 1, 'VisitNumber': 12},
 {'FL_4010.0': 1, 'INDICATOR_FL_4010.0': 1, 'VisitNumber': 20},
 {'INDICATOR_UPC_4011.0': 1, 'UPC_4011.0': 1, 'VisitNumber': 23},
 {'FL_5501.0': 1, 'INDICATOR_FL_5501.0': 1, 'VisitNumber': 23},
 {'INDICATOR_UPC_4011.0': 1, 'UPC_4011.0': 1, 'VisitNumber': 26},
 {'FL_5501.0': 1, 'INDICATOR_FL_5501.0': 1, 'VisitNumber': 26},
 {'FL_100.0': 1, 'INDICATOR_FL_100.0': 1, 'VisitNumber': 32},
 {'FL_9546.0': 1, 'INDICATOR_FL_9546.0': 1, 'VisitNumber': 40},
 {'FL_115.0': 1, 'INDICATOR_FL_115.0': 1, 'VisitNu

In [70]:
df2 = pd.DataFrame(data).fillna(0).set_index("VisitNumber")
df2

Unnamed: 0_level_0,FL_0.0,FL_100.0,FL_110.0,FL_115.0,FL_135.0,FL_1407.0,FL_1508.0,FL_203.0,FL_3004.0,FL_3120.0,...,UPC_4900005010.0,UPC_60538862097.0,UPC_60538871457.0,UPC_60538871461.0,UPC_60538896309.0,UPC_68113107862.0,UPC_68113163351.0,UPC_7874235186.0,UPC_7874235187.0,UPC_7874235188.0
VisitNumber,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
8,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
10,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
11,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
11,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
12,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
20,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
23,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [37]:
import numpy as np

In [41]:
indicators = ["INDICATOR_FL_"+str(fl) for fl in freq_fineline]
indicators.extend(["INDICATOR_UPC_"+str(upc) for upc in freq_upc])
nonindicators = ["UPC_"+str(upc) for upc in freq_upc]
nonindicators.extend(["FL_"+str(fl) for fl in freq_fineline])
print([ind  for ind in indicators])
aggregatios = {
    [ind ":" np.mean for ind in indicators],
    [nonind ":" np.sum  for nonind in nonindicators]
}

SyntaxError: invalid syntax (<ipython-input-41-745375f94412>, line 7)

In [74]:
df3 = df2.groupby("VisitNumber").sum()
df3[indicators] = df3[indicators].astype(bool).astype(int)
df3.head()

Unnamed: 0_level_0,FL_0.0,FL_100.0,FL_110.0,FL_115.0,FL_135.0,FL_1407.0,FL_1508.0,FL_203.0,FL_3004.0,FL_3120.0,...,UPC_4900005010.0,UPC_60538862097.0,UPC_60538871457.0,UPC_60538871461.0,UPC_60538896309.0,UPC_68113107862.0,UPC_68113163351.0,UPC_7874235186.0,UPC_7874235187.0,UPC_7874235188.0
VisitNumber,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
8,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
10,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
11,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
12,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [76]:
df = df.join(df3, on='VisitNumber', how='left')
df.head()

Unnamed: 0,TripType,VisitNumber,Weekday,Upc,ScanCount,DepartmentDescription,FinelineNumber,FL_0.0,FL_100.0,FL_110.0,...,UPC_4900005010.0,UPC_60538862097.0,UPC_60538871457.0,UPC_60538871461.0,UPC_60538896309.0,UPC_68113107862.0,UPC_68113163351.0,UPC_7874235186.0,UPC_7874235187.0,UPC_7874235188.0
0,999,5,Friday,68113150000.0,-1,FINANCIAL SERVICES,1000.0,,,,...,,,,,,,,,,
1,30,7,Friday,60538820000.0,1,SHOES,8931.0,,,,...,,,,,,,,,,
2,30,7,Friday,7410811000.0,1,PERSONAL CARE,4504.0,,,,...,,,,,,,,,,
3,26,8,Friday,2238404000.0,2,PAINT AND ACCESSORIES,3565.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,26,8,Friday,2006614000.0,2,PAINT AND ACCESSORIES,1017.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [46]:
df2 = df2.agg({df2[indicators] : np.mean, 
               df2[nonindicators] : np.sum

In [75]:
df3.to_csv("Check2.csv")