In [1]:

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [2]:
data_path = 'rct.csv'

df = pd.read_csv(data_path)

In [3]:
# drops weird moisture values
df['cull'] = df['cull'].astype(float)
df = df[df.moisture.str.contains('#DIV/0!') == False]
df = df[df.tsi.str.contains('#DIV/0!') == False]
# drops empty and 0 value rows
df = df[np.isfinite(df['stfi'])]
df = df.loc[(df != 0).all(axis=1), :]
#only get usp 120
#df = df[df.grade.str.contains('NL050550')]
df = df.loc[df['cull'] == 89.5]
df = df.reset_index(drop=True)

In [4]:
df[0:200]

Unnamed: 0,grade,label,rct,tsi,stfi,caliper,moisture,basisweight,cull
0,NL050420,3/19/2017 9:41,87.0,4.70,22.18,10.92,6.07,38.57,89.5
1,NL050420,3/19/2017 8:42,97.0,4.97,20.09,11.74,4.48,40.13,89.5
2,NL050420,2/22/2017 18:59,92.0,5.08,23.34,11.23,6.25,40.51,89.5
3,NL050420,1/14/2017 1:15,96.0,4.43,21.56,12.17,5.72,40.61,89.5
4,NL050420,3/9/2017 22:06,89.0,5.08,22.61,11.30,6.56,40.80,89.5
5,NL050420,3/10/2017 7:17,93.0,5.03,25.03,11.40,6.32,40.82,89.5
6,NL050420,2/22/2017 18:16,94.0,5.18,23.50,11.26,6.76,40.97,89.5
7,NL050420,2/18/2017 13:38,96.0,5.19,24.24,11.41,5.94,40.98,89.5
8,NL050420,3/9/2017 19:59,98.0,5.02,24.54,11.03,6.50,40.98,89.5
9,NL050420,2/22/2017 13:59,97.0,5.42,26.11,11.27,6.71,41.09,89.5


In [5]:
df['tsi'] = df['tsi'].astype(float)
df['moisture'] = df['moisture'].astype(float)
df['cull'] = df['cull'].astype(float)

In [6]:
# make binary numbers for grades
'''
dummy_fields = ['grade']
for each in dummy_fields:
    dummies = pd.get_dummies(df[each], prefix=each, drop_first=False)
    df = pd.concat([df, dummies], axis=1) '''
    
fields_to_drop = ['label', 'grade', 'cull']
data = df.drop(fields_to_drop, axis=1)

In [7]:
quant_features = ['rct','tsi', 'stfi', 'caliper','moisture','basisweight']
# Store scalings in a dictionary so we can convert back later
scaled_features = {}
for each in quant_features:
    mean, std = (data[each]).mean(), data[each].std()
    scaled_features[each] = [mean, std]
    data.loc[:, each] = (data[each] - mean)/std
data.head()

Unnamed: 0,rct,tsi,stfi,caliper,moisture,basisweight
0,-1.267785,-0.762781,-0.696583,-1.264918,-1.121601,-1.536229
1,-0.042661,0.291929,-1.605228,-0.587739,-5.137175,-1.032106
2,-0.655223,0.721625,-0.192264,-1.008911,-0.667007,-0.909307
3,-0.165173,-1.817491,-0.966133,-0.232633,-2.005532,-0.876992
4,-1.02276,0.721625,-0.509637,-0.951103,0.115903,-0.815592


In [8]:

test_data = data[999:]

data = data[:999]

target_fields = ['rct']
features, targets = data.drop(target_fields, axis=1), data[target_fields]
test_features, test_targets = test_data.drop(target_fields, axis=1), test_data[target_fields]

In [9]:

train_features, train_targets = features[:899], targets[:899]
val_features, val_targets = features[899:], targets[899:]
#features.head()

In [10]:
class NeuralNetwork(object):
    def __init__(self, input_nodes, hidden_nodes, output_nodes, learning_rate):
        # Set number of nodes in input, hidden and output layers.
        self.input_nodes = input_nodes
        self.hidden_nodes = hidden_nodes
        self.output_nodes = output_nodes

        # Initialize weights
        self.weights_input_to_hidden = np.random.normal(0.0, self.input_nodes**-0.5, 
                                       (self.input_nodes, self.hidden_nodes))

        self.weights_hidden_to_output = np.random.normal(0.0, self.hidden_nodes**-0.5, 
                                       (self.hidden_nodes, self.output_nodes))
        self.lr = learning_rate
        
        self.activation_function = lambda x : 1/(1 + np.exp(-x))   # Replace 0 with your sigmoid calculation.
        
                    
    
    def train(self, features, targets):

        n_records = features.shape[0]
        delta_weights_i_h = np.zeros(self.weights_input_to_hidden.shape)
        delta_weights_h_o = np.zeros(self.weights_hidden_to_output.shape)
        for X, y in zip(features, targets):
            hidden_inputs = np.dot(X,self.weights_input_to_hidden) # signals into hidden layer
            hidden_outputs = self.activation_function(hidden_inputs) # signals from hidden layer
            
            final_inputs = np.dot(hidden_outputs,self.weights_hidden_to_output) # signals into final output layer
            final_outputs = final_inputs # signals from final output layer'this
            
            #### Implement the backward pass here ####
            ### Backward pass ###

            error = y - final_outputs # Output layer error is the difference between desired target and actual output.

            
            output_error_term = error * 1

            hidden_error = np.dot(self.weights_hidden_to_output, error)
            hidden_error_term = hidden_error * hidden_outputs * (1- hidden_outputs)

            delta_weights_i_h += hidden_error_term * X[:,None]
         
            # Weight step (hidden to output)
            hidden_outputs = hidden_outputs[:,None]
            delta_weights_h_o += output_error_term * hidden_outputs
            #print('delta hidden to out: ' + str(delta_weights_h_o))
        self.weights_hidden_to_output += self.lr * delta_weights_h_o/n_records # update hidden-to-output weights with gradient descent step
        self.weights_input_to_hidden += self.lr * delta_weights_i_h/n_records # update input-to-hidden weights with gradient descent step
 
    def run(self, features):

        hidden_inputs = np.dot(features,self.weights_input_to_hidden) # signals into hidden layer
        hidden_outputs = self.activation_function(hidden_inputs) # signals from hidden layer
        
        final_inputs = np.dot(hidden_outputs,self.weights_hidden_to_output) # signals into final output layer
        final_outputs = (final_inputs) # signals from final output layer 
        
        return final_outputs

In [11]:
def MSE(y, Y):
    return np.mean((y-Y)**2)

In [21]:
import sys

### Set the hyperparameters here ###
iterations = 2000
learning_rate = 0.001
hidden_nodes = 3200
output_nodes = 1

N_i = train_features.shape[1]
network = NeuralNetwork(N_i, hidden_nodes, output_nodes, learning_rate)

losses = {'train':[], 'validation':[]}
for ii in range(iterations):
    # Go through a random batch of 128 records from the training data set
    batch = np.random.choice(train_features.index, size=128)
    X, y = train_features.ix[batch].values, train_targets.ix[batch]['rct']
                             
    network.train(X, y)
    
    # Printing out the training progress
    train_loss = MSE(network.run(train_features).T, train_targets['rct'].values)
    val_loss = MSE(network.run(val_features).T, val_targets['rct'].values)
    sys.stdout.write("\rProgress: {:2.1f}".format(100 * ii/float(iterations)) \
                     + "% ... Training loss: " + str(train_loss)[:5] \
                     + " ... Validation loss: " + str(val_loss)[:5])
    sys.stdout.flush()
    
    losses['train'].append(train_loss)
    losses['validation'].append(val_loss)

Progress: 0.2% ... Training loss: 0.554 ... Validation loss: nan

  out=out, **kwargs)
  ret = ret.dtype.type(ret / rcount)


Progress: 100.0% ... Training loss: 0.219 ... Validation loss: nan

In [22]:
plt.plot(losses['train'], label='Training loss')
plt.plot(losses['validation'], label='Validation loss')
plt.legend()
_ = plt.ylim()

In [23]:
weights_in = []
for each in network.weights_input_to_hidden:
    for weight in each:
        weights_in.append(weight)
    
weights_out = []
for each in network.weights_hidden_to_output:
    for weight in each:
        weights_out.append(weight)

In [24]:
testFeatures = features.reset_index(drop=True)
testTargets = targets.reset_index(drop=True)
mean, std = scaled_features['rct']
columns = ['tsi','stfi','caliper','moisture','basisweight']
newdata = pd.DataFrame(columns=columns)
xs = []
ys = []
tsi = []
stfi = []
caliper = []
moisture = []
basisweight = []
#cull = []
basismean, basisstd = scaled_features['basisweight']
calipermean, caliperstd = scaled_features['caliper']
#cullmean, cullstd = scaled_features['cull']
moisturemean, moisturestd = scaled_features['moisture']
stfimean, stfistd = scaled_features['stfi']
tsimean, tsistd = scaled_features['tsi']
for index, row in features.iterrows():
    prediction = network.run(row).T*std+mean
    xs.append(prediction)
    tsi.append(row['tsi']*tsistd+tsimean)
    stfi.append(row['stfi']*stfistd+stfimean)
    caliper.append(row['caliper']*caliperstd+calipermean)
    moisture.append(row['moisture']*moisturestd+moisturemean)
    basisweight.append(row['basisweight']*basisstd+basismean)
    #cull.append(row['cull']*cullstd+cullmean)

ys = testTargets['rct'].T*std+mean

In [25]:
data.head()

Unnamed: 0,rct,tsi,stfi,caliper,moisture,basisweight
0,-1.267785,-0.762781,-0.696583,-1.264918,-1.121601,-1.536229
1,-0.042661,0.291929,-1.605228,-0.587739,-5.137175,-1.032106
2,-0.655223,0.721625,-0.192264,-1.008911,-0.667007,-0.909307
3,-0.165173,-1.817491,-0.966133,-0.232633,-2.005532,-0.876992
4,-1.02276,0.721625,-0.509637,-0.951103,0.115903,-0.815592


In [26]:
from sklearn import linear_model
#train model on data
body_reg = linear_model.LinearRegression()
body_reg.fit(xs, ys)

#visualize results
plt.scatter(xs, ys)
plt.plot(xs, body_reg.predict(xs))
plt.show()
print('r squared value:')
print(body_reg.score(xs,ys))

ModuleNotFoundError: No module named 'sklearn'

In [27]:


bs=[]
for x in xs:
    bs.append(x[0])
newdata['tsi'] = tsi
newdata['stfi'] = stfi
newdata['caliper'] = caliper
newdata['moisture'] = moisture
#newdata['cull'] = cull
newdata['basisweight'] = basisweight
newdata['rct'] = ys
newdata['prediction'] = bs
newdata.to_csv('t1_usp_90_comparison.csv')



PermissionError: [Errno 13] Permission denied: 't1_usp_90_comparison.csv'

In [20]:
import pickle
with open('weight_in_no_grades_usp90', 'wb') as f:
    pickle.dump(network.weights_input_to_hidden, f)
with open('weight_out_no_grades_usp90', 'wb') as f:
    pickle.dump(network.weights_hidden_to_output, f)

import json
# save to file:
with open('variables_usp90.json', 'w') as f:
    json.dump(scaled_features, f)