In [0]:
from random import seed
from random import randrange
from csv import reader
from math import sqrt

In [0]:
from sklearn.metrics import classification_report
def accuracy_metric_1(actual, predicted):
  """Print a per-class classification report for the given labels.

  Args:
      actual: sequence of true class labels.
      predicted: sequence of predicted class labels, aligned with ``actual``.

  The report text comes from ``classification_report`` and is labelled with
  the three fixed display names below. The report is printed; nothing is
  returned.
  """
  class_labels = ['class 0/neutral', 'class 1/positive' , 'class 2/negative']
  report = classification_report(actual, predicted, target_names=class_labels)
  print(report)

In [0]:
# Convert string column to integer
def str_column_to_int(dataset, column):
  """Replace the values of one column with integer codes, in place.

  Args:
      dataset: list of rows (mutable sequences); each row's ``column`` entry
          is overwritten with its integer code.
      column: index of the column to encode.

  Returns:
      dict mapping every distinct original value to the integer code that
      replaced it.

  Codes are assigned in sorted order of the distinct values. Iterating a
  plain ``set`` gives an arbitrary order (which for strings can change from
  one interpreter run to the next), so sorting keeps the mapping -- and
  everything derived from the encoded labels -- reproducible.
  """
  class_values = [row[column] for row in dataset]
  try:
      ordered = sorted(set(class_values))
  except TypeError:
      # Mixed values that cannot be compared with each other: fall back to
      # first-seen order, which is still deterministic for a given dataset.
      ordered = list(dict.fromkeys(class_values))
  lookup = {value: code for code, value in enumerate(ordered)}
  for row in dataset:
      row[column] = lookup[row[column]]
  return lookup

In [0]:
# Calculate accuracy percentage
def accuracy_metric(actual, predicted):
  """Return the percentage (0-100) of positions where the labels agree.

  Args:
      actual: sequence of true labels; its length defines how many
          positions are compared.
      predicted: sequence of predicted labels, indexed at the same
          positions as ``actual``.

  Returns:
      float percentage of positions ``i`` with ``actual[i] == predicted[i]``.
  """
  total = len(actual)
  hits = sum(1 for position in range(total) if actual[position] == predicted[position])
  return hits / float(total) * 100.0

In [0]:
# Evaluate an algorithm using a cross validation split
def evaluate_algorithm(dataset, algorithm, n_folds, *args):
  """Score an algorithm with k-fold cross validation.

  Args:
      dataset: list of rows whose last element is the class label.
      algorithm: callable invoked as ``algorithm(train_set, test_set, *args)``
          that returns one prediction per test row.
      n_folds: number of folds passed to ``cross_validation_split``.
      *args: extra positional arguments forwarded to ``algorithm``.

  Returns:
      Tuple ``(scores, actual_acc, pred_acc)``: the per-fold accuracy
      percentages, and the true and predicted labels concatenated over all
      folds in fold order.
  """
  folds = cross_validation_split(dataset, n_folds)
  scores, actual_acc, pred_acc = [], [], []
  for fold in folds:
      # Training data is every fold except the held-out one, flattened.
      remaining = list(folds)
      remaining.remove(fold)
      train_set = [row for other in remaining for row in other]
      # Test rows are copies with the label blanked out so the algorithm
      # cannot read the answer.
      test_set = []
      for row in fold:
          masked = list(row)
          test_set.append(masked)
          masked[-1] = None
      predicted = algorithm(train_set, test_set, *args)
      actual = [row[-1] for row in fold]
      scores.append(accuracy_metric(actual, predicted))
      actual_acc.extend(actual)
      pred_acc.extend(predicted)
  return scores,actual_acc,pred_acc

In [0]:
# Split a dataset based on an attribute and an attribute value
def test_split(index, value, dataset):
  left, right = list(), list()
  for row in dataset:
      if row[index] < value:
          left.append(row)
      else:
          right.append(row)
  return left, right

# Calculate the Gini index for a split dataset
def gini_index(groups, class_values):
  """Return the size-weighted Gini impurity of a candidate split.

  Args:
      groups: iterable of row lists (the sides of a split); each row's last
          element is its class label.
      class_values: the class labels to account for.

  Returns:
      float: sum over groups of the group's impurity
      ``sum(p * (1 - p) for each class)`` multiplied by the group's share of
      all rows. 0.0 means every non-empty group is pure.

  Each group is weighted by its relative size so that a tiny pure group
  cannot make an otherwise poor split look good; an unweighted sum would
  treat a 1-row group and a 999-row group as equally important.
  """
  n_instances = float(sum(len(group) for group in groups))
  gini = 0.0
  for group in groups:
      size = len(group)
      if size == 0:
          # An empty side contributes nothing (and would divide by zero).
          continue
      labels = [row[-1] for row in group]
      impurity = 0.0
      for class_value in class_values:
          proportion = labels.count(class_value) / float(size)
          impurity += proportion * (1.0 - proportion)
      gini += impurity * (size / n_instances)
  return gini

In [0]:
# Select the best split point for a dataset
def get_split(dataset, n_features):
  """Pick the best split over a random subset of feature columns.

  Args:
      dataset: non-empty list of rows; the last element of each row is the
          class label, all earlier elements are features.
      n_features: how many distinct feature columns to consider. Values
          larger than the number of feature columns are clamped to it.

  Returns:
      dict with keys ``'index'`` (chosen column), ``'value'`` (threshold) and
      ``'groups'`` (the ``(left, right)`` row lists from ``test_split``). If
      no candidate was evaluated (``n_features <= 0``) the placeholders
      ``999``/``999``/``None`` are returned.

  Raises:
      ValueError: if feature columns are requested but the rows contain
          only the label column.
  """
  class_values = list(set(row[-1] for row in dataset))
  b_index, b_value, b_score, b_groups = 999, 999, 999, None
  n_available = len(dataset[0]) - 1
  if n_features > 0 and n_available <= 0:
      raise ValueError('dataset has no feature columns to split on')
  # Never ask for more distinct columns than exist; otherwise the rejection
  # loop below could never finish.
  n_wanted = min(n_features, n_available)
  features = list()
  while len(features) < n_wanted:
      index = randrange(n_available)
      if index not in features:
          features.append(index)
  # Try every observed value of every chosen column as a threshold and keep
  # the first one with the lowest Gini score.
  for index in features:
      for row in dataset:
          groups = test_split(index, row[index], dataset)
          gini = gini_index(groups, class_values)
          if gini < b_score:
              b_index, b_value, b_score, b_groups = index, row[index], gini, groups
  return {'index':b_index, 'value':b_value, 'groups':b_groups}

In [0]:
# Create a terminal node value
def to_terminal(group):
  """Return the most frequent class label (last column) among the rows.

  Args:
      group: non-empty list of rows whose last element is the class label.
  """
  labels = [row[-1] for row in group]
  return max(set(labels), key=lambda label: labels.count(label))

In [0]:
# Create child splits for a node or make terminal
def split(node, max_depth, min_size, n_features, depth):
  """Recursively grow the subtree below ``node``, in place.

  Args:
      node: dict produced by ``get_split``; its ``'groups'`` entry is
          consumed and replaced by ``'left'`` and ``'right'`` entries, each
          either a class label (leaf) or a nested node dict.
      max_depth: depth at which both children are forced to be leaves.
      min_size: a side with at most this many rows becomes a leaf.
      n_features: number of feature columns sampled for each new split.
      depth: depth of ``node`` itself.
  """
  groups = node['groups']
  left, right = groups
  del node['groups']
  # One side is empty: nothing was separated, so both children share the
  # majority label of all the rows.
  if not left or not right:
      leaf = to_terminal(left + right)
      node['left'] = node['right'] = leaf
      return
  # Depth limit reached: stop growing and label each side.
  if depth >= max_depth:
      node['left'], node['right'] = to_terminal(left), to_terminal(right)
      return
  # Left side is handled completely (including recursion) before the right.
  for side, rows in (('left', left), ('right', right)):
      if len(rows) <= min_size:
          node[side] = to_terminal(rows)
          continue
      node[side] = get_split(rows, n_features)
      split(node[side], max_depth, min_size, n_features, depth+1)

# Build a decision tree
def build_tree(train, max_depth, min_size, n_features):
  """Build a decision tree from the given training rows.

  Args:
      train: list of training rows (label in the last column).
      max_depth: maximum tree depth passed to ``split``.
      min_size: minimum rows per node passed to ``split``.
      n_features: number of feature columns sampled at each split.

  Returns:
      The root node dict of the grown tree.
  """
  # The root must be fitted on `train` (the sample handed in by the caller),
  # not on the module-level `dataset`; using the global would train every
  # tree on the full data, including the held-out rows being predicted.
  root = get_split(train, n_features)
  split(root, max_depth, min_size, n_features, 1)
  return root

# Make a prediction with a decision tree
def predict(node, row):
  """Walk a decision tree and return the leaf label reached by ``row``.

  Args:
      node: root node dict with ``'index'``, ``'value'``, ``'left'`` and
          ``'right'`` entries; a child is either another node dict or a
          leaf label.
      row: sequence of feature values indexed by each node's ``'index'``.

  Returns:
      The first non-dict child encountered: the left child is followed when
      ``row[index] < value``, otherwise the right child.
  """
  current = node
  while True:
      branch = 'left' if row[current['index']] < current['value'] else 'right'
      child = current[branch]
      if not isinstance(child, dict):
          return child
      current = child

# Create a random subsample from the dataset with replacement
def subsample(dataset, ratio):
  """Draw a random sample of rows from the dataset, with replacement.

  Args:
      dataset: list of rows to draw from.
      ratio: sample size as a fraction of ``len(dataset)``; the product is
          rounded to the nearest integer.

  Returns:
      list of rows (the same row objects, possibly repeated).
  """
  n_sample = round(len(dataset) * ratio)
  return [dataset[randrange(len(dataset))] for _ in range(n_sample)]

# Make a prediction with a list of bagged trees
def bagging_predict(trees, row):
  """Return the majority vote of all trees' predictions for one row.

  Args:
      trees: non-empty list of tree root nodes accepted by ``predict``.
      row: the row to classify.
  """
  votes = []
  for tree in trees:
      votes.append(predict(tree, row))
  return max(set(votes), key=votes.count)

# Random Forest Algorithm
def random_forest(train, test, max_depth, min_size, sample_size, n_trees, n_features):
  """Train a forest of bagged trees and predict a label for each test row.

  Args:
      train: list of training rows (label in the last column).
      test: list of rows to classify.
      max_depth: maximum depth of each tree.
      min_size: minimum rows per node before it becomes a leaf.
      sample_size: fraction of ``train`` drawn (with replacement) per tree.
      n_trees: number of trees to build.
      n_features: number of feature columns sampled at each split.

  Returns:
      list with one majority-vote prediction per row of ``test``.
  """
  # Each tree gets its own random subsample, drawn just before it is built.
  trees = [
      build_tree(subsample(train, sample_size), max_depth, min_size, n_features)
      for _ in range(n_trees)
  ]
  return [bagging_predict(trees, row) for row in test]

In [0]:
# Split a dataset into k folds
def cross_validation_split(dataset, n_folds):
  """Randomly partition the dataset into ``n_folds`` folds.

  Args:
      dataset: list of rows; it is copied, not modified.
      n_folds: number of folds to produce.

  Returns:
      list of ``n_folds`` lists of rows. Every row appears in exactly one
      fold. Fold sizes differ by at most one: the first
      ``len(dataset) % n_folds`` folds receive one extra row. Folds can only
      be empty when there are fewer rows than folds.

  Sizing the folds with a fractional ``len(dataset) / n_folds`` would make
  the early folds larger and could leave the last fold empty (e.g. 5 rows in
  4 folds -> 2, 2, 1, 0), which breaks per-fold scoring; integer division
  with the remainder spread over the first folds avoids that.
  """
  dataset_copy = list(dataset)
  base_size, extra = divmod(len(dataset_copy), n_folds)
  dataset_split = list()
  for i in range(n_folds):
      fold_size = base_size + (1 if i < extra else 0)
      fold = list()
      while len(fold) < fold_size:
          # Draw without replacement by popping from the working copy.
          index = randrange(len(dataset_copy))
          fold.append(dataset_copy.pop(index))
      dataset_split.append(fold)
  return dataset_split

In [0]:
# Test the random forest algorithm
# Fix the `random` module's seed so fold assignment, bootstrap samples and
# feature sampling are repeatable between runs.
seed(1)
import pandas as pd
# NOTE(review): absolute path inside the Colab sample_data directory; the
# file must exist there for this cell to run.
filename = '/content/sample_data/input_to_rf_dmm.csv'
df = pd.read_csv(filename)
# `.loc[0:999]` is a label-based slice that includes label 999; the length
# printed below is the number of rows actually kept.
dataset = df.loc[0:999].values.tolist()
print(len(dataset))

# Encode the last column (the class label) as integer codes, in place.
str_column_to_int(dataset, len(dataset[0])-1)
# Hyperparameters for the cross-validated random forest evaluation.
n_folds = 2
max_depth = 20
min_size = 1
sample_size = 1.0
# Consider sqrt(number of feature columns) features at each split.
n_features = int(sqrt(len(dataset[0])-1))


# Sanity check: show the first row and the encoded label of every row.
print(dataset[0:1])
print([dataset[i][-1] for i in range(len(dataset))])

1000
[[-0.11889293044805528, 0.013231215998530388, -0.10595610737800598, 0.027026152238249782, -0.17433972656726834, -0.16811133921146393, -0.026528632268309597, 0.03839939832687378, -0.16921395063400269, 0.15977764129638672, -0.08318113535642624, 0.08099965751171112, -0.14149728417396545, 0.07562804222106934, -0.022622657939791683, -0.07053302228450775, -0.09800801426172256, 0.037071261554956436, -0.09942761063575743, -0.2110423445701599, 0.1078796088695526, -0.0015891707735136151, -0.13122117519378662, -0.1054023951292038, -0.11274103820323945, -0.06639648973941803, -0.18663763999938965, -0.2928256094455719, 0.05111819878220558, -0.031895995140075684, -0.008194200694561005, 0.17972390353679654, -0.24655677378177646, 0.07704958319664001, -0.2207634598016739, 0.14440061151981354, -0.1138717383146286, -0.09898284077644348, -0.11844801902770995, 0.05110904946923256, 0.13577492535114288, 0.2125098407268524, -0.1361551433801651, -0.035230103880167014, -0.1175428107380867, 0.128366291522979

In [0]:
# Evaluate the forest for each candidate tree count and report its scores.
for n_trees in [10]:
  scores, actual_acc, pred_acc = evaluate_algorithm(dataset, random_forest, n_folds, max_depth, min_size, sample_size, n_trees, n_features)
  print('Trees: %d' % n_trees)
  print('Scores: %s' % scores)
  # accuracy_metric_1 prints the report itself and returns None, so it is
  # called directly; wrapping it in print() emitted a stray "None" line.
  accuracy_metric_1(actual_acc, pred_acc)
  # Reported inside the loop so every tree count gets its own mean rather
  # than only the last one.
  print('Mean Accuracy: %.3f%%' % (sum(scores)/float(len(scores))))