Preprocess data

In [1]:
import pandas as pd
from sklearn.preprocessing import MinMaxScaler

In [2]:
# Load training data set from CSV file
training_data_df =pd.read_csv("sales_data_training.csv")

# Load testing data set from CSV file
test_data_df =pd.read_csv("sales_data_test.csv")

In [4]:
training_data_df.head(3)

Unnamed: 0,critic_rating,is_action,is_exclusive_to_us,is_portable,is_role_playing,is_sequel,is_sports,suitable_for_kids,total_earnings,unit_price
0,3.5,1,0,1,0,1,0,0,132717,59.99
1,4.5,0,0,0,0,1,1,0,83407,49.99
2,3.0,0,0,0,0,1,1,0,62423,49.99


In [5]:
test_data_df.head(3)

Unnamed: 0,critic_rating,is_action,is_exclusive_to_us,is_portable,is_role_playing,is_sequel,is_sports,suitable_for_kids,total_earnings,unit_price
0,3.5,1,1,1,0,1,0,1,247537,59.99
1,2.5,0,0,0,1,1,0,0,73960,59.99
2,3.5,0,0,0,0,1,1,0,82671,59.99


In [6]:
# Data needs to be scaled to a small range like 0 to 1 for the neural
# network to work well.
scaler = MinMaxScaler(feature_range=(0,1))

In [7]:
# Scale both the training inputs and outputs
scaled_training = scaler.fit_transform(training_data_df)
scaled_testing = scaler.transform(test_data_df)

# Print out the adjustment that the scaler applied to the total_earnings column of data
print("Note: total_earnings values were scaled by multiplying by {:.10f} and adding {:.6f}".format(scaler.scale_[8], scaler.min_[8]))

Note: total_earnings values were scaled by multiplying by 0.0000036968 and adding -0.115913


In [9]:
type(scaled_testing)

numpy.ndarray

In [10]:
# Create new pandas DataFrame objects from the scaled data
scaled_training_df = pd.DataFrame(scaled_training, columns=training_data_df.columns.values)
scaled_testing_df = pd.DataFrame(scaled_testing, columns=test_data_df.columns.values)

In [12]:
scaled_testing_df.head(3)

Unnamed: 0,critic_rating,is_action,is_exclusive_to_us,is_portable,is_role_playing,is_sequel,is_sports,suitable_for_kids,total_earnings,unit_price
0,0.5,1.0,1.0,1.0,0.0,1.0,0.0,1.0,0.799179,1.0
1,0.166667,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.157502,1.0
2,0.5,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.189704,1.0


In [11]:
# Save scaled data dataframes to new CSV files
scaled_training_df.to_csv("sales_data_training_scaled.csv", index=False)
scaled_testing_df.to_csv("sales_data_test_scaled.csv", index=False)

In [13]:
training_data_df = pd.read_csv("sales_data_training_scaled.csv")

In [15]:
training_data_df.head(3)

Unnamed: 0,critic_rating,is_action,is_exclusive_to_us,is_portable,is_role_playing,is_sequel,is_sports,suitable_for_kids,total_earnings,unit_price
0,0.5,1.0,0.0,1.0,0.0,1.0,0.0,0.0,0.374714,1.0
1,0.833333,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.192425,0.5
2,0.333333,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.114852,0.5


In [22]:
X = training_data_df.drop('total_earnings', axis=1).values
Y = training_data_df[['total_earnings']].values

In [24]:
# Define the model
from keras.models import Sequential
from keras.layers import *
model = Sequential()
model.add(Dense(50,input_dim = 9,activation='relu'))
model.add(Dense(100,activation='relu'))
model.add(Dense(50,activation='relu'))
model.add(Dense(1,activation='linear'))
model.compile(loss="mean_squared_error",optimizer="adam")

In [26]:
#Train the model
model.fit(X,Y,
          epochs=50,
          shuffle=True,
          verbose=2)

Epoch 1/50
32/32 - 0s - loss: 1.7840e-05
Epoch 2/50
32/32 - 0s - loss: 2.5212e-05
Epoch 3/50
32/32 - 0s - loss: 1.7033e-05
Epoch 4/50
32/32 - 0s - loss: 1.6258e-05
Epoch 5/50
32/32 - 0s - loss: 2.2085e-05
Epoch 6/50
32/32 - 0s - loss: 1.9872e-05
Epoch 7/50
32/32 - 0s - loss: 1.8374e-05
Epoch 8/50
32/32 - 0s - loss: 2.5830e-05
Epoch 9/50
32/32 - 0s - loss: 3.3400e-05
Epoch 10/50
32/32 - 0s - loss: 2.4125e-05
Epoch 11/50
32/32 - 0s - loss: 2.1272e-05
Epoch 12/50
32/32 - 0s - loss: 2.0870e-05
Epoch 13/50
32/32 - 0s - loss: 3.9911e-05
Epoch 14/50
32/32 - 0s - loss: 4.3284e-05
Epoch 15/50
32/32 - 0s - loss: 2.3923e-05
Epoch 16/50
32/32 - 0s - loss: 2.2724e-05
Epoch 17/50
32/32 - 0s - loss: 2.2219e-05
Epoch 18/50
32/32 - 0s - loss: 2.4366e-05
Epoch 19/50
32/32 - 0s - loss: 3.2197e-05
Epoch 20/50
32/32 - 0s - loss: 3.2908e-05
Epoch 21/50
32/32 - 0s - loss: 2.0300e-05
Epoch 22/50
32/32 - 0s - loss: 1.8905e-05
Epoch 23/50
32/32 - 0s - loss: 2.1425e-05
Epoch 24/50
32/32 - 0s - loss: 3.2572e-05
E

<keras.callbacks.History at 0x7fbd9beae090>

In [27]:
# Load the separate test data set
test_data_df = pd.read_csv("sales_data_test_scaled.csv")

X_test = test_data_df.drop('total_earnings', axis=1).values
Y_test = test_data_df[['total_earnings']].values

test_error_rate =model.evaluate(X_test,Y_test,verbose=0)
print("The mean squared error (MSE) for the test data set is: {}".format(test_error_rate))

The mean squared error (MSE) for the test data set is: 0.00013444724027067423


In [28]:
# Load the data we make to use to make a prediction
X = pd.read_csv("proposed_new_product.csv").values

# Make a prediction with the neural network
prediction =model.predict(X)

# Grab just the first element of the first prediction (since that's the only have one)
prediction = prediction[0][0]

# Re-scale the data from the 0-to-1 range back to dollars
# These constants are from when the data was originally scaled down to the 0-to-1 range
prediction = prediction + 0.1159
prediction = prediction / 0.0000036968

print("Earnings Prediction for Proposed Product - ${}".format(prediction))

Earnings Prediction for Proposed Product - $261516.35174213667


In [29]:
model.save("trained_model.h5")
print("Model saved to disk.")

Model saved to disk.


In [30]:
from keras.models import load_model

In [31]:
model=load_model("trained_model.h5")

In [32]:
X = pd.read_csv("proposed_new_product.csv").values
prediction = model.predict(X)

# Grab just the first element of the first prediction (since we only have one)
prediction = prediction[0][0]

# Re-scale the data from the 0-to-1 range back to dollars
# These constants are from when the data was originally scaled down to the 0-to-1 range
prediction = prediction + 0.1159
prediction = prediction / 0.0000036968

print("Earnings Prediction for Proposed Product - ${}".format(prediction))

Earnings Prediction for Proposed Product - $261516.35174213667
