In [1]:
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
pd.set_option('display.max_rows', None)

sheet_url = "https://docs.google.com/spreadsheets/d/1GFnLv36kMLcE9T5LxaCnMStQ3NVg_YLcXWRYr95NzRE/edit#gid=918477569"
url = sheet_url.replace("/edit#gid=", "/export?format=csv&gid=")

In [2]:
data = pd.read_csv(url)
data

Unnamed: 0,vol%,Type of Graphene,Electrostatic Self-Assembly,In-Situ Growth,Ball Mill Speed,Ball Mill Time,Hot Pressing,Molecular Level Mixing,Stirring,Sintering,Sonication,Hot Pressing.1,Equal Speed Rolling,HRDS Rolling,Yield Strength (MPa)
0,0.0,,0,0,0,0,0,0,0,0,0,0,1,0,316.2
1,0.0,,0,0,0,0,0,0,1,1,0,0,0,0,95.0
2,0.0,,0,0,400,5,0,0,0,1,0,0,0,0,168.0
3,0.6,RGO,0,0,0,0,0,1,0,1,0,0,0,0,175.0
4,3.0,graphene,0,0,1200,3,1,0,0,0,0,0,0,0,181.0
5,0.5,graphene,0,0,400,4,0,0,0,0,0,0,0,1,323.4
6,5.0,graphene,0,0,1200,3,1,0,0,0,0,0,0,0,247.0
7,1.2,RGO,0,0,0,0,0,1,0,1,0,0,0,0,462.5
8,0.0,,0,0,0,0,0,0,1,1,0,0,0,0,95.0
9,4.9,graphene,0,0,0,0,0,1,0,1,0,0,0,0,363.0


### OneHotEncoder and for Type of Graphene and Manufacturing Process

In [3]:
from sklearn.preprocessing import OneHotEncoder

ohe = OneHotEncoder(sparse = False)
transformed_types = ohe.fit_transform(data[['Type of Graphene']])
transformed_types_df = pd.DataFrame(transformed_types)
data = data.drop(['Type of Graphene'], axis=1)
data = pd.concat([data, transformed_types_df], axis=1)

In [4]:
X = data
X = X.to_numpy()
X = pd.DataFrame(X)
y = X.pop(13).values
X = X.to_numpy()

### StandardScaler

In [5]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler


scalerX = StandardScaler()
scalerY = StandardScaler()
X = scalerX.fit_transform(X)
y = scalerY.fit_transform(pd.DataFrame(y))
    
y = np.ravel(y)

### GridSearch/Feature Importance for Optimization of Random Forest

In [6]:
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import r2_score
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_error

### ANN

In [7]:
from sklearn.neural_network import MLPRegressor
mlp = MLPRegressor(activation = 'relu', hidden_layer_sizes = (15, 15, 15), solver = 'adam', alpha = .0001)

In [8]:
best_grid = mlp

best_grid.fit(X, y)


predictions = best_grid.predict(X)

print( "R2 score = ", r2_score(y, predictions))
print( "MSE = ", mean_squared_error(y, predictions))
print( "MAE = ", mean_absolute_error(y, predictions))

R2 score =  0.85624813548221
MSE =  0.14375186451779
MAE =  0.2560121782520483




### GBM

In [9]:
from sklearn.ensemble import GradientBoostingRegressor
gbm = GradientBoostingRegressor()

In [10]:
param_grid = {
    'loss': ['squared_error', 'absolute_error', 'huber', 'quantile'],
    'learning_rate': [.1, .5, 1,],
    'n_estimators': [100, 200, 300, 400]
}


grid_search = GridSearchCV(estimator = gbm, param_grid = param_grid, 
                          cv = 3, n_jobs = -1, verbose = 2)

grid_search.fit(X, y)
    

Fitting 3 folds for each of 48 candidates, totalling 144 fits


GridSearchCV(cv=3, estimator=GradientBoostingRegressor(), n_jobs=-1,
             param_grid={'learning_rate': [0.1, 0.5, 1],
                         'loss': ['squared_error', 'absolute_error', 'huber',
                                  'quantile'],
                         'n_estimators': [100, 200, 300, 400]},
             verbose=2)

In [11]:
grid_search.best_params_

{'learning_rate': 0.1, 'loss': 'huber', 'n_estimators': 100}

In [12]:
best_grid = gbm

best_grid.fit(X, y)


predictions = best_grid.predict(X)

print( "R2 score = ", r2_score(y, predictions))
print( "MSE = ", mean_squared_error(y, predictions))
print( "MAE = ", mean_absolute_error(y, predictions))

R2 score =  0.907261604141534
MSE =  0.09273839585846598
MAE =  0.2102885354430467


### KNN

In [13]:
from sklearn.neighbors import KNeighborsRegressor
knn= KNeighborsRegressor(n_neighbors = 2, weights = 'uniform')

In [14]:
best_grid = knn

best_grid.fit(X, y)


predictions = best_grid.predict(X)

print( "R2 score = ", r2_score(y, predictions))
print( "MSE = ", mean_squared_error(y, predictions))
print( "MAE = ", mean_absolute_error(y, predictions))

R2 score =  0.8571338045732053
MSE =  0.1428661954267946
MAE =  0.24227682653587046


### RF

In [15]:
forest = RandomForestRegressor()
forest.fit(X, y)
importance = forest.feature_importances_
std = np.std([tree.feature_importances_ for tree in forest.estimators_],
             axis=0)

In [16]:
param_grid = {
    'bootstrap': [True],
    'max_depth': [None],
    'max_features': [1,3],
    'n_estimators': [25,50,100,150,200,300,400,500,600]
}


grid_search = GridSearchCV(estimator = forest, param_grid = param_grid, 
                          cv = 3, n_jobs = -1, verbose = 2)

grid_search.fit(X, y)


Fitting 3 folds for each of 18 candidates, totalling 54 fits


GridSearchCV(cv=3, estimator=RandomForestRegressor(), n_jobs=-1,
             param_grid={'bootstrap': [True], 'max_depth': [None],
                         'max_features': [1, 3],
                         'n_estimators': [25, 50, 100, 150, 200, 300, 400, 500,
                                          600]},
             verbose=2)

In [17]:
grid_search.best_params_

{'bootstrap': True, 'max_depth': None, 'max_features': 3, 'n_estimators': 200}

In [18]:
best_grid = grid_search.best_estimator_

In [19]:
best_grid.fit(X, y)


predictions = best_grid.predict(X)

print( "R2 score = ", r2_score(y, predictions))
print( "MSE = ", mean_squared_error(y, predictions))
print( "MAE = ", mean_absolute_error(y, predictions))

R2 score =  0.917277597277567
MSE =  0.08272240272243299
MAE =  0.1971361223087164


### Feature Importance

In [20]:
importance = best_grid.feature_importances_
for i, v in enumerate(importance):
    print('Feature: %0d, Score: %.5f' % (i,v))

Feature: 0, Score: 0.27487
Feature: 1, Score: 0.00448
Feature: 2, Score: 0.01867
Feature: 3, Score: 0.15716
Feature: 4, Score: 0.19244
Feature: 5, Score: 0.03758
Feature: 6, Score: 0.03269
Feature: 7, Score: 0.03196
Feature: 8, Score: 0.05436
Feature: 9, Score: 0.05360
Feature: 10, Score: 0.02775
Feature: 11, Score: 0.01858
Feature: 12, Score: 0.01510
Feature: 13, Score: 0.03036
Feature: 14, Score: 0.02821
Feature: 15, Score: 0.02220
