In [2]:
import numpy as np
import pandas as pd
import os
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.preprocessing import MinMaxScaler
import time

Loading the data

In [3]:
data=pd.read_csv('./roboBohr.csv')

Printing the first few lines of the data to see what we are dealing with

In [4]:
data.head()

Unnamed: 0.1,Unnamed: 0,0,1,2,3,4,5,6,7,8,...,1267,1268,1269,1270,1271,1272,1273,1274,pubchem_id,Eat
0,0,73.516695,17.817765,12.469551,12.45813,12.454607,12.447345,12.433065,12.426926,12.387474,...,0.0,0.0,0.5,0.0,0.0,0.0,0.0,0.0,25004,-19.013763
1,1,73.516695,20.649126,18.527789,17.891535,17.887995,17.871731,17.852586,17.729842,15.86427,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,25005,-10.161019
2,2,73.516695,17.830377,12.512263,12.404775,12.394493,12.391564,12.324461,12.238106,10.423249,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,25006,-9.376619
3,3,73.516695,17.87581,17.871259,17.862402,17.85092,17.85044,12.558105,12.557645,12.517583,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,25009,-13.776438
4,4,73.516695,17.883818,17.868256,17.864221,17.81854,12.508657,12.490519,12.450098,10.597068,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,25011,-8.53714


Removing two columns: 'Unnamed:0' and 'pubchem_id' as they serve no purpose in prediction

In [5]:
data.pop('pubchem_id')
data.pop('Unnamed: 0')

0            0
1            1
2            2
3            3
4            4
         ...  
16237    16268
16238    16269
16239    16270
16240    16271
16241    16272
Name: Unnamed: 0, Length: 16242, dtype: int64

Printing the information about the dataset. This will tell us whether there are any missing or nan values(Which there aren't in this case)

In [6]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 16242 entries, 0 to 16241
Columns: 1276 entries, 0 to Eat
dtypes: float64(1276)
memory usage: 158.1 MB


Seeing the size of the dataset

Initializing the scaler to scale values using (MinMaxScaler used here), it will reduce the computation load on our algorithm. The range has been set from -1 to 1 instead of 0 to 1 because sign is extremely important when studying atomization energy in chemistry and this preserves that information.

In [7]:
scaler=MinMaxScaler(feature_range=(-1,1))

Using the scaler to transform the columns one at a time as different columns have different range of values and have to be scaled in a different manner. Therefore we cannot directly apply one scaler to the entire dataset all at once

In [8]:
for column in data.columns:
    scaler.fit(data[column].values.reshape(-1,1))
    data[column]=scaler.transform(data[column].values.reshape(-1,1))

Printing the dataset to make sure the transformations have been applied

In [9]:
data.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,1266,1267,1268,1269,1270,1271,1272,1273,1274,Eat
0,-0.791217,-0.577917,-0.62367,-0.624013,-0.624118,-0.624334,-0.624742,-0.624896,-0.626085,-0.590298,...,-1.0,-1.0,-1.0,1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-0.623118
1,-0.791217,-0.497774,-0.440832,-0.460032,-0.460138,-0.460624,-0.461169,-0.464828,-0.521138,-0.495487,...,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,0.16534
2,-0.791217,-0.57756,-0.622381,-0.625623,-0.625933,-0.626018,-0.62802,-0.630595,-0.685375,-0.711796,...,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,0.235201
3,-0.791217,-0.576274,-0.460646,-0.460911,-0.461257,-0.461267,-0.620968,-0.62095,-0.622157,-0.587709,...,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-0.156662
4,-0.791217,-0.576048,-0.460737,-0.460856,-0.462234,-0.622484,-0.623008,-0.624196,-0.680128,-0.648943,...,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,0.309968


Randomizing our dataset(Fraction=1 meaning the entire dataset will be shuffled)

In [10]:
data=data.sample(frac=1)

Printing our randomized dataset

In [11]:
data.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,1266,1267,1268,1269,1270,1271,1272,1273,1274,Eat
6284,-1.0,-0.695796,-0.587947,-0.587952,-0.587968,-0.587964,-0.587945,-0.614566,-0.623413,-0.738833,...,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,0.360438
6353,-0.791217,-0.579328,-0.585382,-0.585396,-0.587895,-0.587947,-0.587947,-0.587912,-0.619067,-0.587704,...,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,0.001394
13382,-0.791217,-0.403664,-0.379656,-0.475385,-0.51918,-0.52229,-0.535174,-0.539838,-0.608857,-0.573141,...,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-0.319892
7606,-0.791217,-0.580239,-0.572353,-0.616897,-0.617698,-0.618657,-0.62336,-0.623355,-0.624158,-0.588574,...,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,0.012995
8030,-0.791217,-0.496054,-0.515946,-0.538512,-0.538579,-0.618405,-0.620527,-0.620516,-0.72662,-0.699964,...,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,0.418809


Making a separate output variable

In [12]:
y=data.pop('Eat')

Dividing our data into train and test batch

In [13]:
X_train,X_test,y_train,y_test=train_test_split(data,y,test_size=0.25)

Start timer for Random Forest

In [14]:
start_rf = time.time()

Making the Random Forest model and training it using our training data

In [15]:
model_rf = RandomForestRegressor(n_estimators=25)
model_rf.fit(X_train, y_train)

End timer and calculate elapsed time

In [16]:
end_rf = time.time()
rf_train_time = end_rf - start_rf

Start timer for Decision Tree

In [17]:
start_dt = time.time()

Train Decision Tree model

In [18]:
model_dt = DecisionTreeRegressor()
model_dt.fit(X_train, y_train)

End timer and calculate elapsed time

In [19]:
end_dt = time.time()
dt_train_time = end_dt - start_dt

Evaluate Random Forest model

In [20]:
rf_acc = model_rf.score(X_test, y_test)

Evaluate Decision Tree model

In [21]:
dt_acc = model_dt.score(X_test, y_test)

Print accuracies and training times

In [22]:
print(f"Random Forest accuracy on test set: {rf_acc * 100:.2f}%")
print(f"Random Forest training time: {rf_train_time:.2f} seconds")
print(f"Decision Tree accuracy on test set: {dt_acc * 100:.2f}%")
print(f"Decision Tree training time: {dt_train_time:.2f} seconds")

Random Forest accuracy on test set: 99.81%
Random Forest training time: 258.60 seconds
Decision Tree accuracy on test set: 99.61%
Decision Tree training time: 31.45 seconds


Comparison

In [23]:
if rf_acc > dt_acc:
    print("Random Forest outperforms Decision Tree.")
elif rf_acc < dt_acc:
    print("Decision Tree outperforms Random Forest.")
else:
    print("Both models have the same test accuracy.")

Random Forest outperforms Decision Tree.
