In [519]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

In [520]:
train = pd.read_csv("train.csv")
test = pd.read_csv("test.csv")

In [521]:
#Combine test and train into one file
train['source']='train'
test['source']='test'
data = pd.concat([train, test],ignore_index=True)
print (train.shape, test.shape, data.shape)

(4990, 14) (3532, 13) (8522, 14)


In [522]:
data.isnull().sum()

Product_Fat_Content                  0
Product_Identifier                   0
Product_Price                        0
Product_Shelf_Visibility             0
Product_Supermarket_Identifier       0
Product_Supermarket_Sales         3532
Product_Type                         0
Product_Weight                    1463
Supermarket _Size                 2409
Supermarket_Identifier               0
Supermarket_Location_Type            0
Supermarket_Opening_Year             0
Supermarket_Type                     0
source                               0
dtype: int64

In [523]:
product_avg_weight = data.pivot_table(values='Product_Weight', index='Product_Identifier') #aggfunc is mean by default!
def impute_weight(cols):
    Weight = cols[0]
    Identifier = cols[1]
    
    if pd.isnull(Weight):
        return product_avg_weight['Product_Weight'][product_avg_weight.index == Identifier]
    else:
        return Weight

print ('Orignal missing: %d'% sum(data['Product_Weight'].isnull()))
data['Product_Weight'] = data[['Product_Weight','Product_Identifier']].apply(impute_weight, axis=1).astype(float)
print ('Final missing: %d'% sum(data['Product_Weight'].isnull()))

data['Product_Weight'] = data['Product_Weight'].fillna(12)
sum(data['Product_Weight'].isnull())

Orignal missing: 1463
Final missing: 4


0

In [524]:
supermarket_size_mode = data.pivot_table(values='Supermarket _Size', columns = 'Supermarket_Type',aggfunc=lambda x: x.mode())

def impute_size_mode(cols):
    Size = cols[0]
    Type = cols[1]
    if pd.isnull(Size):
        return supermarket_size_mode.loc['Supermarket _Size'][supermarket_size_mode.columns == Type][0]
    else:
        return Size

print ('Orignal #missing: %d'%sum(data['Supermarket _Size'].isnull()))
data['Supermarket _Size'] = data[['Supermarket _Size','Supermarket_Type']].apply(impute_size_mode,axis=1)
print ('Final #missing: %d'%sum(data['Supermarket _Size'].isnull()))

Orignal #missing: 2409
Final #missing: 0


In [525]:
data['Supermarket_Size'] = data['Supermarket _Size']

In [526]:
#Item type combine:
data['Product_Identifier'].value_counts()
data['Product_Type_Combined'] = data['Product_Identifier'].apply(lambda x: x[0:2])
data['Product_Type_Combined'] = data['Product_Type_Combined'].map({'FD':'Food',
                                                             'NC':'Non-Consumable',
                                                             'DR':'Drinks'})
data['Product_Type_Combined'].value_counts()

Food              6125
Non-Consumable    1598
Drinks             799
Name: Product_Type_Combined, dtype: int64

In [527]:
data['Supermarket_Opening_Year'] = 2016 - data['Supermarket_Opening_Year']

In [528]:
print('Original Categories:')
print(data['Product_Fat_Content'].value_counts())
print('\n')

data['Product_Fat_Content'] = data['Product_Fat_Content'].replace({'Ultra Low fat':'Low Fat'})
print('Modified Categories:')
print(data['Product_Fat_Content'].value_counts())

print('\n')

#Mark non-consumables as separate category in low_fat:
data.loc[data['Product_Type_Combined'] == "Non-Consumable",'Product_Fat_Content'] = "Non-Edible"
data['Product_Fat_Content'].value_counts()

Original Categories:
Low Fat          5200
Normal Fat       3006
Ultra Low fat     316
Name: Product_Fat_Content, dtype: int64


Modified Categories:
Low Fat       5516
Normal Fat    3006
Name: Product_Fat_Content, dtype: int64




Low Fat       3918
Normal Fat    3006
Non-Edible    1598
Name: Product_Fat_Content, dtype: int64

In [529]:
data.head()

Unnamed: 0,Product_Fat_Content,Product_Identifier,Product_Price,Product_Shelf_Visibility,Product_Supermarket_Identifier,Product_Supermarket_Sales,Product_Type,Product_Weight,Supermarket _Size,Supermarket_Identifier,Supermarket_Location_Type,Supermarket_Opening_Year,Supermarket_Type,source,Supermarket_Size,Product_Type_Combined
0,Low Fat,DRA12,357.54,0.068535,DRA12_CHUKWUDI010,709.08,Soft Drinks,11.6,Small,CHUKWUDI010,Cluster 3,11,Grocery Store,train,Small,Drinks
1,Low Fat,DRA12,355.79,0.040912,DRA12_CHUKWUDI013,6381.69,Soft Drinks,11.6,High,CHUKWUDI013,Cluster 3,22,Supermarket Type1,train,High,Drinks
2,Low Fat,DRA12,350.79,0.041178,DRA12_CHUKWUDI017,6381.69,Soft Drinks,11.6,Small,CHUKWUDI017,Cluster 2,2,Supermarket Type1,train,Small,Drinks
3,Low Fat,DRA12,355.04,0.041113,DRA12_CHUKWUDI018,2127.23,Soft Drinks,11.6,Medium,CHUKWUDI018,Cluster 3,0,Supermarket Type2,train,Medium,Drinks
4,Low Fat,DRA12,354.79,0.0,DRA12_CHUKWUDI035,2481.77,Soft Drinks,11.6,Small,CHUKWUDI035,Cluster 2,5,Supermarket Type1,train,Small,Drinks


In [530]:
#Drop the columns which have been converted to different types:
data.drop(['Product_Supermarket_Identifier','Supermarket _Size','Product_Type'],axis=1,inplace=True)

In [531]:
cols = ['Product_Price','Product_Shelf_Visibility','Product_Weight','Supermarket_Opening_Year']
data[cols] = data[cols].apply(lambda x: (x - x.min()) / (x.max() - x.min()))

In [532]:
data.head()

Unnamed: 0,Product_Fat_Content,Product_Identifier,Product_Price,Product_Shelf_Visibility,Product_Supermarket_Sales,Product_Weight,Supermarket_Identifier,Supermarket_Location_Type,Supermarket_Opening_Year,Supermarket_Type,source,Supermarket_Size,Product_Type_Combined
0,Low Fat,DRA12,0.474219,0.2087,709.08,0.41947,CHUKWUDI010,Cluster 3,0.458333,Grocery Store,train,Small,Drinks
1,Low Fat,DRA12,0.471247,0.124583,6381.69,0.41947,CHUKWUDI013,Cluster 3,0.916667,Supermarket Type1,train,High,Drinks
2,Low Fat,DRA12,0.462758,0.125392,6381.69,0.41947,CHUKWUDI017,Cluster 2,0.083333,Supermarket Type1,train,Small,Drinks
3,Low Fat,DRA12,0.469974,0.125194,2127.23,0.41947,CHUKWUDI018,Cluster 3,0.0,Supermarket Type2,train,Medium,Drinks
4,Low Fat,DRA12,0.46955,0.0,2481.77,0.41947,CHUKWUDI035,Cluster 2,0.208333,Supermarket Type1,train,Small,Drinks


In [533]:
#Divide into test and train:
train = data.loc[data['source']=="train"]
test = data.loc[data['source']=="test"]

In [534]:
test.drop(['Product_Supermarket_Sales','source'],axis=1,inplace=True)
train.drop(['source'],axis=1,inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


In [643]:
from sklearn.model_selection import train_test_split

y = train['Product_Supermarket_Sales']
X = train.drop(['Product_Supermarket_Sales'],axis=1)

X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.001, random_state=101)

#X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.3, random_state=101)

In [644]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8522 entries, 0 to 8521
Data columns (total 13 columns):
Product_Fat_Content          8522 non-null object
Product_Identifier           8522 non-null object
Product_Price                8522 non-null float64
Product_Shelf_Visibility     8522 non-null float64
Product_Supermarket_Sales    4990 non-null float64
Product_Weight               8522 non-null float64
Supermarket_Identifier       8522 non-null object
Supermarket_Location_Type    8522 non-null object
Supermarket_Opening_Year     8522 non-null float64
Supermarket_Type             8522 non-null object
source                       8522 non-null object
Supermarket_Size             8522 non-null object
Product_Type_Combined        8522 non-null object
dtypes: float64(5), object(8)
memory usage: 865.6+ KB


# Tensorflow

In [645]:
import tensorflow as tf

In [646]:
# continuous feature_columns for the continuous values using numeric_column

price = tf.feature_column.numeric_column('Product_Price')
visibility = tf.feature_column.numeric_column('Product_Shelf_Visibility')
weight = tf.feature_column.numeric_column('Product_Weight')
age = tf.feature_column.numeric_column('Supermarket_Opening_Year')

In [647]:
# tf.feature_columns for the categorical values. Using vocabulary lists or just use hash buckets.

assigned_group_FC = tf.feature_column.categorical_column_with_hash_bucket('Product_Fat_Content', hash_bucket_size=10)
assigned_group_PI = tf.feature_column.categorical_column_with_hash_bucket('Product_Identifier', hash_bucket_size=10)
assigned_group_PT = tf.feature_column.categorical_column_with_hash_bucket('Product_Type_Combined', hash_bucket_size=10)
assigned_group_SS = tf.feature_column.categorical_column_with_hash_bucket('Supermarket_Size', hash_bucket_size=10)
assigned_group_SI = tf.feature_column.categorical_column_with_hash_bucket('Supermarket_Identifier', hash_bucket_size=10)
assigned_group_LT = tf.feature_column.categorical_column_with_hash_bucket('Supermarket_Location_Type', hash_bucket_size=10)
assigned_group_ST = tf.feature_column.categorical_column_with_hash_bucket('Supermarket_Type', hash_bucket_size=10)

In [648]:
embedded_group_FC = tf.feature_column.embedding_column(assigned_group_FC, dimension=3)
embedded_group_PI = tf.feature_column.embedding_column(assigned_group_PI, dimension=1451)
embedded_group_PT = tf.feature_column.embedding_column(assigned_group_PT, dimension=3)
embedded_group_SS = tf.feature_column.embedding_column(assigned_group_SS, dimension=3)
embedded_group_SI = tf.feature_column.embedding_column(assigned_group_SI, dimension=10)
embedded_group_LT = tf.feature_column.embedding_column(assigned_group_LT, dimension=3)
embedded_group_ST = tf.feature_column.embedding_column(assigned_group_ST, dimension=4)

In [649]:
feat_cols = [price, visibility, weight, age,  embedded_group_SS, #embedded_group_SI, embedded_group_FC,
              embedded_group_ST] #, embedded_group_PIembedded_group_LT,embedded_group_PT,

In [650]:
# Batch_size is up to you. But do make sure to shuffle!

input_func = tf.estimator.inputs.pandas_input_fn(x=X_train,y=y_train,batch_size=100,num_epochs=10000,shuffle=True)

In [651]:
# Create model with tf.estimator
# Create a LinearClassifier.(If you want to use a DNNClassifier, keep in mind you'll need to create 
# embedded columns out of the cateogrical feature that use strings, check out the previous lecture on this for more info.)


model = tf.estimator.DNNRegressor(hidden_units=[14,12,10,8,10,12,14],feature_columns=feat_cols)

#model = tf.estimator.LinearRegressor(feature_columns=feat_cols)

INFO:tensorflow:Using default config.
INFO:tensorflow:Using config: {'_model_dir': 'C:\\Users\\MOB005~1.YOS\\AppData\\Local\\Temp\\tmpgcrbp4_p', '_tf_random_seed': None, '_save_summary_steps': 100, '_save_checkpoints_steps': None, '_save_checkpoints_secs': 600, '_session_config': None, '_keep_checkpoint_max': 5, '_keep_checkpoint_every_n_hours': 10000, '_log_step_count_steps': 100, '_train_distribute': None, '_service': None, '_cluster_spec': <tensorflow.python.training.server_lib.ClusterSpec object at 0x000001BEAEDB1EB8>, '_task_type': 'worker', '_task_id': 0, '_global_id_in_cluster': 0, '_master': '', '_evaluation_master': '', '_is_chief': True, '_num_ps_replicas': 0, '_num_worker_replicas': 1}


In [652]:
# Train your model on the data, for at least 5000 steps.

model.train(input_fn=input_func,steps=1000)

INFO:tensorflow:Calling model_fn.
INFO:tensorflow:Done calling model_fn.
INFO:tensorflow:Create CheckpointSaverHook.
INFO:tensorflow:Graph was finalized.
INFO:tensorflow:Running local_init_op.
INFO:tensorflow:Done running local_init_op.
INFO:tensorflow:Saving checkpoints for 1 into C:\Users\MOB005~1.YOS\AppData\Local\Temp\tmpgcrbp4_p\model.ckpt.
INFO:tensorflow:loss = 4232444700.0, step = 1
INFO:tensorflow:global_step/sec: 68.7624
INFO:tensorflow:loss = 1004514700.0, step = 101 (1.476 sec)
INFO:tensorflow:global_step/sec: 82.3837
INFO:tensorflow:loss = 940802750.0, step = 201 (1.205 sec)
INFO:tensorflow:global_step/sec: 86.376
INFO:tensorflow:loss = 868087360.0, step = 301 (1.157 sec)
INFO:tensorflow:global_step/sec: 91.9381
INFO:tensorflow:loss = 745267900.0, step = 401 (1.139 sec)
INFO:tensorflow:global_step/sec: 88.4045
INFO:tensorflow:loss = 692606850.0, step = 501 (1.075 sec)
INFO:tensorflow:global_step/sec: 101.839
INFO:tensorflow:loss = 905878400.0, step = 601 (0.984 sec)
INFO:t

<tensorflow.python.estimator.canned.dnn.DNNRegressor at 0x1bead88d240>

In [653]:
# Create a prediction input function. Remember to only supprt X_test data and keep shuffle=False.

predict_input_func = tf.estimator.inputs.pandas_input_fn(x=X_test,batch_size=10,num_epochs=1,shuffle=False)


# Use model.predict() and pass in your input function. This will produce a generator of predictions, 
# which you can then transform into a list, with list()

pred_gen = model.predict(predict_input_func)

predictions = list(pred_gen)

INFO:tensorflow:Calling model_fn.
INFO:tensorflow:Done calling model_fn.
INFO:tensorflow:Graph was finalized.
INFO:tensorflow:Restoring parameters from C:\Users\MOB005~1.YOS\AppData\Local\Temp\tmpgcrbp4_p\model.ckpt-1000
INFO:tensorflow:Running local_init_op.
INFO:tensorflow:Done running local_init_op.


In [654]:
final_preds = []
for pred in predictions:
    final_preds.append(pred['predictions'])

from sklearn.metrics import mean_squared_error

mean_squared_error(y_test,final_preds)**0.5

2594.4090912899205

In [655]:
predict_new_data_func = tf.estimator.inputs.pandas_input_fn(x=test,batch_size=10,num_epochs=1,shuffle=False)

pred = model.predict(predict_new_data_func)

new_data_predictions = list(pred)

INFO:tensorflow:Calling model_fn.
INFO:tensorflow:Done calling model_fn.
INFO:tensorflow:Graph was finalized.
INFO:tensorflow:Restoring parameters from C:\Users\MOB005~1.YOS\AppData\Local\Temp\tmpgcrbp4_p\model.ckpt-1000
INFO:tensorflow:Running local_init_op.
INFO:tensorflow:Done running local_init_op.


In [656]:
new_preds = []
for pred in new_data_predictions:
    new_preds.append(pred['predictions'])

In [657]:
sample = pd.read_csv('SampleSubmission.csv')

submission = pd.DataFrame({'Product_Supermarket_Identifier':sample['Product_Supermarket_Identifier'],
                                  'Product_Supermarket_Sales': new_preds},
                                 columns=['Product_Supermarket_Identifier','Product_Supermarket_Sales'])

#to csv
submission.to_csv("003TF.csv",index=False)