In [1]:
import pandas as pd
import numpy as np
import numpy.random as nr
import matplotlib.pyplot as plt
import seaborn as sns
import matplotlib.pyplot as plt
import seaborn as sns
import sklearn
from sklearn.ensemble import RandomForestRegressor, ExtraTreesRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn import preprocessing
import sklearn.model_selection as ms
from sklearn.model_selection import GridSearchCV, cross_val_score, StratifiedKFold, train_test_split, KFold
from sklearn.metrics import mean_squared_error, r2_score
import xgboost as xgb
import lightgbm as lgb
import keras
from keras.models import Sequential
from keras.layers import Dense
from keras.optimizers import Adam

In [2]:
#importing files
train = pd.read_csv('Train.csv')
test = pd.read_csv('Test.csv')
sample = pd.read_csv('Sample_Submission.csv')

#renaming column title
train = train.rename(columns={'Supermarket _Size': 'Supermarket_Size'})
test = test.rename(columns={'Supermarket _Size': 'Supermarket_Size'})

#filling missing values
train['Product_Weight'] = train['Product_Weight'].fillna(train['Product_Weight'].mean())
train['Supermarket_Size'].fillna(train['Supermarket_Size'].mode()[0],inplace=True)
test['Product_Weight'] = test['Product_Weight'].fillna(test['Product_Weight'].mean())
test['Supermarket_Size'].fillna(test['Supermarket_Size'].mode()[0],inplace=True)

#dropping unwanted features
train = train.drop(['Product_Identifier', 'Product_Supermarket_Identifier'], axis=1)
test = test.drop(['Product_Identifier', 'Product_Supermarket_Identifier'], axis=1)

#converting the year to age of supermarket
train['Supermarket_Opening_Year'] = 2019 - train['Supermarket_Opening_Year']
test['Supermarket_Opening_Year'] = 2019 - test['Supermarket_Opening_Year']

#creating labels
labels = np.array(train['Product_Supermarket_Sales'])
train = train.drop('Product_Supermarket_Sales', axis=1)

# Creating more features

In [3]:
train['sqrt_Product_Price'] = np.sqrt(train['Product_Price'])
test['sqrt_Product_Price'] = np.sqrt(test['Product_Price'])

In [4]:
train['cross_Price_weight'] = train['Product_Price'] * train['Product_Weight']
test['cross_Price_weight'] = test['Product_Price'] * test['Product_Weight']

# Data Preparation

In [5]:
#Numerical train data preparation
num_features = pd.DataFrame()

num_features['Product_Weight'] = train['Product_Weight']
num_features['Product_Shelf_Visibility'] = train['Product_Shelf_Visibility']
num_features['Product_Price'] = train['Product_Price']
num_features['Supermarket_Opening_Year'] = train['Supermarket_Opening_Year']
num_features['Average_Price_per_ProductType'] = train['Average_Price_per_ProductType']
num_features['sqrt_Product_Price'] = train['sqrt_Product_Price']
num_features['cross_Price_weight'] = train['cross_Price_weight']

#Numerical test data preparation
num_features1 = pd.DataFrame()

num_features1['Product_Weight'] = test['Product_Weight']
num_features1['Product_Shelf_Visibility'] = test['Product_Shelf_Visibility']
num_features1['Product_Price'] = test['Product_Price']
num_features1['Supermarket_Opening_Year'] = test['Supermarket_Opening_Year']
num_features1['Average_Price_per_ProductType'] = test['Average_Price_per_ProductType']
num_features1['sqrt_Product_Price'] = test['sqrt_Product_Price']
num_features1['cross_Price_weight'] = test['cross_Price_weight']

In [6]:
#Categorical train data preparation
cat_features = pd.DataFrame()

cat_features['Product_Fat_Content'] = train['Product_Fat_Content']
cat_features['Product_Type'] = train['Product_Type']
cat_features['Supermarket_Identifier'] = train['Supermarket_Identifier']
cat_features['Supermarket_Location_Type'] = train['Supermarket_Location_Type']
cat_features['Supermarket_Type'] = train['Supermarket_Type']
cat_features['Supermarket_Size'] = train['Supermarket_Size']

#Categorical test data preparation
cat_features1 = pd.DataFrame()

cat_features1['Product_Fat_Content'] = test['Product_Fat_Content']
cat_features1['Product_Type'] = test['Product_Type']
cat_features1['Supermarket_Identifier'] = test['Supermarket_Identifier']
cat_features1['Supermarket_Location_Type'] = test['Supermarket_Location_Type']
cat_features1['Supermarket_Type'] = test['Supermarket_Type']
cat_features1['Supermarket_Size'] = test['Supermarket_Size']

In [7]:
#Encoding categorical variables

def encode_string(data):
    enc = preprocessing.LabelEncoder()
    enc.fit(data)
    enc_features = enc.transform(data)
    ohe = preprocessing.OneHotEncoder()
    encoded = ohe.fit(enc_features.reshape(-1,1))
    return encoded.transform(enc_features.reshape(-1,1)).toarray()

#categorical columns to be encoded
cat_columns = ['Product_Fat_Content', 'Product_Type', 'Supermarket_Location_Type', 
              'Supermarket_Type', 'Supermarket_Size']

#encoding supermaker_Identifier column 
train_enc = encode_string(train['Supermarket_Identifier'])

#concatenating the One Hot Encoded features for the train data
for col in cat_columns:
    temp1 = encode_string(train[col])
    train_enc = np.concatenate([train_enc, temp1], axis = 1)
    
print(train_enc.shape)


test_enc = encode_string(test['Supermarket_Identifier'])
for col in cat_columns:
    temps = encode_string(test[col])
    test_enc = np.concatenate([test_enc, temps], axis = 1)
    
print(test_enc.shape)

(2994, 39)
(1996, 39)


In [8]:
#concatenating the numerical variables with the categorical variables

train_enc = np.concatenate([train_enc, np.array(num_features[['Product_Weight', 'Product_Shelf_Visibility', 'Product_Price',
                                                                'Supermarket_Opening_Year', 'Average_Price_per_ProductType', 
                                                             'sqrt_Product_Price', 'cross_Price_weight']])], axis = 1)

print(train_enc.shape)

test_enc = np.concatenate([test_enc, np.array(num_features1[['Product_Weight', 'Product_Shelf_Visibility', 'Product_Price',
                                                                'Supermarket_Opening_Year', 'Average_Price_per_ProductType', 
                                                            'sqrt_Product_Price', 'cross_Price_weight']])], axis = 1)

print(test_enc.shape)

(2994, 46)
(1996, 46)


In [9]:
## Randomly splitting into train and test data
nr.seed(9988)
indx = range(train_enc.shape[0])
indx = ms.train_test_split(indx, test_size = 0.3)
x_train = train_enc[indx[0],:]
y_train = np.ravel(labels[indx[0]])
x_test = train_enc[indx[1],:]
y_test = np.ravel(labels[indx[1]])

In [10]:
'''
    Randomly splitting the test data into two parts; one for validation during training
    and the other for testing the model
'''

np.random.seed(9988)
indx1 = range(x_test.shape[0])
indx1 = ms.train_test_split(indx1, test_size = 0.5)
x_test1 = x_test[indx1[0],:]
y_test1 = np.ravel(y_test[indx1[0]])
x_test2 = x_test[indx1[1],:]
y_test2 = np.ravel(y_test[indx1[1]])

# WINNING MODEL

In [12]:
'''
    After preparing your data from the above steps, model training can now happen.
    
    Using the sequential method imported from keras.model,
    a sequential Artificial Neural Network will be built
'''
#creating an instance of the sequential class
model = Sequential()

'''
    using model.add, we add layers to the model.
    
    Different types of layers exist, you find them all on the TensorFlow Keras documentations page
    The Dense layer is used for all layers in the model and it takes in a number of arguments.
    
    The first layer is the input layer
    and it is the only layer that takes the input_dim argument, the value should be the number of
    features/columns of data being passed into the model
    
    The model contains a total of four layers( 3 hidden layers and 1 output). The input layer is
    usually not counted as a layer
    
    args: The first parameter (number) refers to the number of neurons in that layer,
          it is also referred to as the number of outputs from that layer
          
    activation: Activation refers to the mathematical function/operation you want to 
                perform on the inputs coming into the layer before outputing it to the next layer
'''
#adding the layers to the models
model.add(Dense(46, input_dim=46, kernel_initializer='normal', activation='relu'))
model.add(Dense(23, activation='relu'))
model.add(Dense(6, activation='relu'))
model.add(Dense(1, kernel_initializer='normal'))

'''
    model.compile is used to 'compile' the model for training, by cobining all layers together,
    with the selected optimizer and loss metric to be used for training
    
    The Adam optimizer is used, other optimizer like SGD (Stochastic Gradient Descent) 
    occurs too but quite situable for specific tasks
'''

model.compile(loss='mean_squared_error', optimizer='adam')

In [13]:
#Call backs are usually used to monitor training process, EarlySTopping is just one of these callbacks
#to stop model training after 40 epochs of which the validation has stopped improving

callback = keras.callbacks.EarlyStopping(monitor='loss', patience=40)

valid_set = (x_test1, y_test1)   #validation set for training 

#fitting the neural network model
'''
model.fit args;
                epochs: number of times you want input to go through the neural net for training
                        Similar to number of estimators in tree based models
                batch_Size: refers to the number of input sample to go through the NN for each epoch
                callbacks: as defined above
'''
model.fit(x_train, y_train, batch_size=10, validation_data = valid_set, epochs=1500, callbacks=callback, shuffle=True, verbose=2)

Epoch 1/1500
240/240 - 3s - loss: 43765024.0000 - val_loss: 16732206.0000
Epoch 2/1500
240/240 - 2s - loss: 18095862.0000 - val_loss: 16501116.0000
Epoch 3/1500
240/240 - 2s - loss: 17946506.0000 - val_loss: 16314337.0000
Epoch 4/1500
240/240 - 2s - loss: 17726142.0000 - val_loss: 16045154.0000
Epoch 5/1500
240/240 - 3s - loss: 17554782.0000 - val_loss: 15829477.0000
Epoch 6/1500
240/240 - 2s - loss: 17365266.0000 - val_loss: 15693608.0000
Epoch 7/1500
240/240 - 2s - loss: 17185524.0000 - val_loss: 15447889.0000
Epoch 8/1500
240/240 - 2s - loss: 17115420.0000 - val_loss: 15253732.0000
Epoch 9/1500
240/240 - 2s - loss: 16850726.0000 - val_loss: 15065937.0000
Epoch 10/1500
240/240 - 2s - loss: 16687993.0000 - val_loss: 14897690.0000
Epoch 11/1500
240/240 - 2s - loss: 16529640.0000 - val_loss: 14813468.0000
Epoch 12/1500
240/240 - 2s - loss: 16334263.0000 - val_loss: 14534654.0000
Epoch 13/1500
240/240 - 2s - loss: 16247813.0000 - val_loss: 14423876.0000
Epoch 14/1500
240/240 - 2s - loss:

Epoch 111/1500
240/240 - 2s - loss: 9915237.0000 - val_loss: 8306446.5000
Epoch 112/1500
240/240 - 2s - loss: 9706699.0000 - val_loss: 9063001.0000
Epoch 113/1500
240/240 - 2s - loss: 9823859.0000 - val_loss: 8267605.5000
Epoch 114/1500
240/240 - 2s - loss: 9696012.0000 - val_loss: 7864105.0000
Epoch 115/1500
240/240 - 2s - loss: 9780166.0000 - val_loss: 7773978.5000
Epoch 116/1500
240/240 - 2s - loss: 9998790.0000 - val_loss: 8887669.0000
Epoch 117/1500
240/240 - 2s - loss: 9981682.0000 - val_loss: 8207461.0000
Epoch 118/1500
240/240 - 2s - loss: 9858853.0000 - val_loss: 7894776.5000
Epoch 119/1500
240/240 - 2s - loss: 9754742.0000 - val_loss: 8008300.5000
Epoch 120/1500
240/240 - 2s - loss: 9727012.0000 - val_loss: 7787039.0000
Epoch 121/1500
240/240 - 2s - loss: 9781165.0000 - val_loss: 7715383.5000
Epoch 122/1500
240/240 - 2s - loss: 9735793.0000 - val_loss: 8249333.0000
Epoch 123/1500
240/240 - 2s - loss: 9899020.0000 - val_loss: 7779008.5000
Epoch 124/1500
240/240 - 2s - loss: 95

Epoch 222/1500
240/240 - 2s - loss: 9352076.0000 - val_loss: 7371952.5000
Epoch 223/1500
240/240 - 2s - loss: 9154808.0000 - val_loss: 8159027.5000
Epoch 224/1500
240/240 - 2s - loss: 9227230.0000 - val_loss: 7448458.0000
Epoch 225/1500
240/240 - 2s - loss: 9279699.0000 - val_loss: 7473716.0000
Epoch 226/1500
240/240 - 2s - loss: 9281054.0000 - val_loss: 7410984.0000
Epoch 227/1500
240/240 - 2s - loss: 9261893.0000 - val_loss: 8125751.0000


<tensorflow.python.keras.callbacks.History at 0x7f6283b7fa50>

In [14]:
#making prediction on the other part of the test data

pred3 = model.predict(x_test2)
mean_squared_error(pred3, y_test2) ** 0.5

2881.8781345473467

In [15]:
#preparing the predictions make into the sample file for submission

preds1 = model.predict(test_enc)
sample1 = pd.read_csv('Sample_Submission.csv')
test = pd.read_csv('Test.csv')
sample1.Product_Supermarket_Identifier = test.Product_Supermarket_Identifier
sample1.Product_Supermarket_Sales = preds1
sample1.to_csv('NN1.csv', index = False)

In [13]:
#pred4 = (pred2 + pred3) / 2

In [17]:
#pred3 = model.predict(x_test2)
#mean_squared_error(pred4, y_test2) ** 0.5