In [1]:
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
import random
pd.set_option('display.max_rows', None)

sheet_url = "https://docs.google.com/spreadsheets/d/1GFnLv36kMLcE9T5LxaCnMStQ3NVg_YLcXWRYr95NzRE/edit#gid=2035550178"
url = sheet_url.replace("/edit#gid=", "/export?format=csv&gid=")

In [2]:
data = pd.read_csv(url)
data

Unnamed: 0,vol%,Type of Graphene,Mixing,Ball Milling,Electrostatic Self-Assmbly,Hot Pressing,Sonication,In Situ Growth,Stirring,Rolling,SPS,Tensile Strength (MPa)
0,0.0,,0,1,0,0,0,0,0,1,0,365
1,0.0,,0,0,0,0,0,1,0,0,1,208
2,0.3,RGO,1,0,0,0,0,0,0,0,1,170
3,0.8,graphene,1,0,0,0,0,0,0,0,1,170
4,0.5,graphene,0,0,0,0,0,1,0,0,1,335
5,0.0,,0,1,0,0,0,0,0,0,1,47
6,0.5,graphene,0,1,0,0,0,0,0,1,0,401
7,0.0,,0,1,0,0,0,0,0,1,0,384
8,0.5,graphene,1,0,0,0,0,0,0,0,1,179
9,1.0,graphene,0,1,0,0,0,0,0,1,0,378


### OneHotEncoder for Type of Graphene and Manufacturing Process

In [3]:
from sklearn.preprocessing import OneHotEncoder

ohe = OneHotEncoder(sparse = False)
transformed_types = ohe.fit_transform(data[['Type of Graphene']])
transformed_types_df = pd.DataFrame(transformed_types)
data = data.drop(['Type of Graphene'], axis=1)
data = pd.concat([data, transformed_types_df], axis=1)

In [4]:
X = data
X = X.to_numpy()
X = pd.DataFrame(X)
y = X.pop(10).values
X = X.to_numpy()

### StandardScaler

In [11]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler


scalerX = StandardScaler()
scalerY = StandardScaler()
X = scalerX.fit_transform(X)
y = scalerY.fit_transform(pd.DataFrame(y))
    
y = np.ravel(y)

### GridSearch/Feature Importance for Optimization of Random Forest

In [12]:
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_error

x_train, x_test, y_train, y_test = train_test_split(X, y, random_state=4)

In [13]:
forest = RandomForestRegressor()
forest.fit(X, y)
importance = forest.feature_importances_
std = np.std([tree.feature_importances_ for tree in forest.estimators_],
             axis=0)


In [14]:
param_grid = {
    'bootstrap': [True],
    'max_depth': [None],
    'max_features': [1,3],
    'n_estimators': [25,50, 100, 150, 200, 300, 400, 500, 600]
}


grid_search = GridSearchCV(estimator = forest, param_grid = param_grid, 
                          cv = 3, n_jobs = -1, verbose = 2)

grid_search.fit(X, y)


Fitting 3 folds for each of 18 candidates, totalling 54 fits


GridSearchCV(cv=3, estimator=RandomForestRegressor(), n_jobs=-1,
             param_grid={'bootstrap': [True], 'max_depth': [None],
                         'max_features': [1, 3],
                         'n_estimators': [25, 50, 100, 150, 200, 300, 400, 500,
                                          600]},
             verbose=2)

In [15]:
grid_search.best_params_

{'bootstrap': True, 'max_depth': None, 'max_features': 1, 'n_estimators': 25}

In [16]:
best_grid = grid_search.best_estimator_

In [17]:
best_grid.fit(X, y)


predictions = best_grid.predict(X)

print( "R2 score = ", r2_score(y, predictions))
print( "MSE = ", mean_squared_error(y, predictions))
print( "MAE = ", mean_absolute_error(y, predictions))

R2 score =  0.8804945498323739
MSE =  0.11950545016762609
MAE =  0.23709661359395204


In [18]:
importance = best_grid.feature_importances_
for i, v in enumerate(importance):
    print('Feature: %0d, Score: %.5f' % (i,v))

Feature: 0, Score: 0.17382
Feature: 1, Score: 0.08669
Feature: 2, Score: 0.04860
Feature: 3, Score: 0.00686
Feature: 4, Score: 0.06012
Feature: 5, Score: 0.01677
Feature: 6, Score: 0.03994
Feature: 7, Score: 0.00892
Feature: 8, Score: 0.41567
Feature: 9, Score: 0.07697
Feature: 10, Score: 0.01536
Feature: 11, Score: 0.02200
Feature: 12, Score: 0.02829


### GBM

In [25]:
from sklearn.ensemble import GradientBoostingRegressor
gbm = GradientBoostingRegressor()

In [26]:
param_grid = {
    'loss': ['squared_error', 'absolute_error', 'huber', 'quantile'],
    'learning_rate': [.1, .5, 1,],
    'n_estimators': [100, 200, 300, 400]
}


grid_search = GridSearchCV(estimator = gbm, param_grid = param_grid, 
                          cv = 3, n_jobs = -1, verbose = 2)

grid_search.fit(x_train, y_train)
    

Fitting 3 folds for each of 48 candidates, totalling 144 fits


GridSearchCV(cv=3, estimator=GradientBoostingRegressor(), n_jobs=-1,
             param_grid={'learning_rate': [0.1, 0.5, 1],
                         'loss': ['squared_error', 'absolute_error', 'huber',
                                  'quantile'],
                         'n_estimators': [100, 200, 300, 400]},
             verbose=2)

In [27]:
grid_search.best_params_

{'learning_rate': 0.1, 'loss': 'absolute_error', 'n_estimators': 100}

In [28]:
best_grid = gbm

best_grid.fit(X, y)


predictions = best_grid.predict(X)


print( "R2 score = ", r2_score(y, predictions))
print( "MSE = ", mean_squared_error(y, predictions))
print( "MAE = ", mean_absolute_error(y, predictions))

R2 score =  0.8932236239543736
MSE =  0.1067763760456264
MAE =  0.21241313225957706


### KNN

In [29]:
from sklearn.neighbors import KNeighborsRegressor
knn= KNeighborsRegressor(n_neighbors = 2, weights = 'uniform')

In [30]:
best_grid = knn

best_grid.fit(X, y)


predictions = best_grid.predict(X)

print( "R2 score = ", r2_score(y, predictions))
print( "MSE = ", mean_squared_error(y, predictions))
print( "MAE = ", mean_absolute_error(y, predictions))

R2 score =  0.8237579476400134
MSE =  0.17624205235998644
MAE =  0.27551396465541855


### ANN

In [31]:
from sklearn.neural_network import MLPRegressor
mlp = MLPRegressor(activation = 'relu', hidden_layer_sizes = (25, 25, 25), solver = 'adam', alpha = .001)

In [32]:
best_grid = mlp

best_grid.fit(X, y)


predictions = best_grid.predict(X)


print( "R2 score = ", r2_score(y, predictions))
print( "MSE = ", mean_squared_error(y, predictions))
print( "MAE = ", mean_absolute_error(y, predictions))

R2 score =  0.8949769843547337
MSE =  0.10502301564526619
MAE =  0.20437374597760813
