!! Data Class is edited


The goal here is to apply RFC on statical data after filtering the data

### Importing

In [1]:
import pandas as pd
import numpy as np

from sklearn.preprocessing import PolynomialFeatures
from sklearn import decomposition
from sklearn.model_selection import train_test_split
from sklearn import decomposition
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score,precision_score, recall_score, f1_score, auc, roc_curve
from sklearn.preprocessing import StandardScaler

import os

from tqdm.auto import tqdm

import shutil

from itertools import groupby

import matplotlib.pyplot as plt

from statistics import mean

import scipy.stats as stats
from scipy.stats import skew
from scipy.stats import kurtosis
from scipy.fftpack import fft
from scipy import signal

import tensorflow as tf
from tensorflow import keras

import pickle

from keras.models import Sequential, Model
from keras.layers import LSTM, Dense, Input, Flatten, Conv1D, MaxPooling1D, Concatenate, Dropout
from keras.optimizers import Adam
from keras.callbacks import EarlyStopping
from keras.utils.vis_utils import plot_model


In [2]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
# path of original data folder
data_path = "/content/drive/MyDrive/ITMO-Master's/Thesis/3rd_semester/Data"
# path of csv files, each file is a 2mins walk for a subject
csvD_path = "/content/drive/MyDrive/ITMO-Master's/Thesis/3rd_semester/Data/csv_files"
# In this file, the Null values are replaved with 0 in the HY scale - Null values were given to healthy control
demographics = "/content/drive/MyDrive/ITMO-Master's/Thesis/3rd_semester/Data/Processed_data/demographics_HYprocessed.csv" 
dem_df = pd.read_csv(demographics)
# parequet folder path
parquet_path = "/content/drive/MyDrive/ITMO-Master's/Thesis/3rd_semester/Data/parquet_files/"

## Data

In [17]:
class Data:
  def __init__(self, prepare_or_get, data_folder, demographics_file, parquet_path):
    '''
    :param prepare_or_get:  1: prepare data , 0: load preloaded datas (npy)
    :param data_folder: path for a folder containing csv files, file for each 2 mins walk (for indivdiual subject)
    :param demographics_file: path for csv demographics file
    :save_path: path where to save/read parquet files after preparing them
    :
    '''

    self.data_path = data_folder
    self.parquet_path= parquet_path
    self.dem_path= demographics_file
    self.scale="HoehnYahr"            # fixed for now
    self.seconds_to_discard = 20      # fixed for now
    self.rows_to_discard = 100 * self.seconds_to_discard
    
    # data in arrays
    self.signals_data = []
    self.cycles_data=[]
    self.statical_data=[]
    self.raw_statics=[]
  

    if (prepare_or_get==1):
      self.prepare_data()
    elif (prepare_or_get==0):
      self.get_data()
  
  def get_data(self):
    self.signals_data_df= pd.read_parquet(self.parquet_path+'signals.gzip')
    self.cycles_data_df=pd.read_parquet(self.parquet_path+'cyclels.gzip')
    self.statical_data_df=pd.read_parquet(self.parquet_path+'statical.gzip')
    self.left_stances_df=pd.read_parquet(self.parquet_path+'left_stances.gzip')
    self.right_stances_df=pd.read_parquet(self.parquet_path+'right_stances.gzip')
    self.raw_statics_df=pd.read_parquet(self.parquet_path+'raw_statics.gzip')

  def prepare_data(self):
    # loop over files, each file is a 2 min walk for a single individual
    # files are expected to be in csv format
    self.dem_df = pd.read_csv(self.dem_path)
    for name in tqdm((os.listdir(self.data_path))):
       
      # id of subject
      id = name.split('_')[0]

      # disease level according to 'self.scale'
      level = self.get_pd_level( id)

      # parkinon's patient OR healthy control ?
      if 'Co' in name:  
        parkinson = 0 # Healthy control
      elif 'Pt' in name:
        parkinson = 1 # Parkinson's Patient

      # read and filter one file
      one_walk=self.read_filter(name)

      # sum of values from all the left sensors
      one_walk['Total_Force_Left'] = one_walk[list(one_walk.columns[0:8])].sum(axis=1)
     
      # sum of values from all the right sensors
      one_walk['Total_Force_Right'] = one_walk[list(one_walk.columns[8:16])].sum(axis=1)
      
      # convert to lists
      one_walk_lists=[]*18
      for column in one_walk.columns.values.tolist():
        one_walk_lists.append(one_walk[column].tolist()) 
      
      # Raw statics
      raw_statics_1 = self.get_statics_raw_siganls(one_walk)

      ## Left foot related ##
      l_stances, l_strides_time , l_swings_time, l_stances_time, l_indices = self.segment_signal(one_walk_lists[16])
      # Find maximum VGRF at heel strike for each gait cycle 
      l_peaks_heel = self.get_max_per_gait_cycle(one_walk_lists[0],l_indices) # sensor 0 = left heal sensor
      # Find maximum VGRF at toe off for each gait cycle
      l_peaks_toe = self.get_max_per_gait_cycle(one_walk_lists[7],l_indices) # sensor 7 = left toe sensor

      ## right foot related ##
      r_stances, r_strides_time , r_swings_time, r_stances_time, r_indices = self.segment_signal(one_walk_lists[17])
      # Find maximum VGRF at heel strike for each gait cycle 
      r_peaks_heel = self.get_max_per_gait_cycle(one_walk_lists[8],l_indices) # sensor 8 = right heal sensor
      # Find maximum VGRF at toe off for each gait cycle
      r_peaks_toe = self.get_max_per_gait_cycle(one_walk_lists[15],l_indices) # sensor 15 = right toe sensor


      ## Saving data ##

      # Raw Signals
      self.signals_data.append([id,level,parkinson] +one_walk_lists)

      # Raw Statics
      self.raw_statics.append([id,level,parkinson] +raw_statics_1)

      # Statical
      self.statical_data.append( [ id, level, parkinson 
                                , mean (l_swings_time) , np.std(l_swings_time), (mean (l_swings_time) / np.std(l_swings_time) *100)
                                , mean (l_strides_time) , np.std(l_strides_time) , (mean (l_strides_time) / np.std(l_strides_time)*100)
                                , kurtosis(l_strides_time), skew(l_strides_time)
                                , mean (l_stances_time) , np.std(l_stances_time) , ( mean (l_stances_time) / np.std(l_stances_time) *100) 

                                , mean (r_swings_time) , np.std(r_swings_time), (mean (r_swings_time) / np.std(r_swings_time) *100)
                                , mean (r_strides_time) , np.std(r_strides_time) , (mean (r_strides_time) / np.std(r_strides_time)*100)
                                , kurtosis(r_strides_time), skew(r_strides_time)
                                , mean (r_stances_time) , np.std(r_stances_time) , ( mean (r_stances_time) / np.std(r_stances_time) *100)
                                
                                , mean (l_peaks_heel) , np.std(l_peaks_heel) ,mean (l_peaks_toe) , np.std(l_peaks_toe)
                                , mean (r_peaks_heel) , np.std(r_peaks_heel) ,mean (r_peaks_toe) , np.std(r_peaks_toe)
                                  ] )
      
      
      # Segmented Signals
      
      # NEED TO BE MOVED TO SEPERATE FUNCTION
      # only for segmented data, for a trial we need each stance phase with the following 
      # swing phases's time, so here we are adjusting the phases to be equal

      dif_l =  len(l_stances)-len(l_swings_time)
      if (dif_l > 0):
        l_stances=l_stances[:-dif_l]
      elif(dif_l < 0):
        l_swings_time= l_swings_time[:dif_l]
      
      #print([len(r_peaks_toe), len(r_peaks_heel),len(r_strides_time),len(r_swings_time),len(r_stances)])
      
      min_r = min([len(r_peaks_toe), len(r_peaks_heel),len(r_strides_time),len(r_swings_time),len(r_stances)])
      if (min_r -len(r_stances) !=0): r_stances=r_stances[:(min_r -len(r_stances) )]
      if (min_r -len(r_swings_time) !=0): r_swings_time = r_swings_time[:(min_r -len(r_swings_time) )]
      if (min_r -len(r_peaks_toe) !=0): r_peaks_toe = r_peaks_toe[:(min_r -len(r_peaks_toe) )]
      if (min_r -len(r_peaks_heel) !=0): r_peaks_heel = r_peaks_heel[:(min_r -len(r_peaks_heel) )]
      if (min_r -len(r_strides_time) !=0): r_strides_time = r_strides_time[:(min_r -len(r_strides_time) )]
      
      #print([len(r_peaks_toe), len(r_peaks_heel),len(r_strides_time),len(r_swings_time),len(r_stances)])
     

      self.cycles_data.append( [ id, level, parkinson ] 
                        + [l_stances] + [l_strides_time] + [l_swings_time] + [l_stances_time]
                        + [r_stances] + [r_strides_time] + [r_swings_time] + [r_stances_time]
                        + [l_peaks_heel] + [l_peaks_toe]
                        + [r_peaks_heel] + [r_peaks_toe]
                        )

    # save to df
    self.raw_statics_data_to_df()
    self.statical_data_to_df()
    self.cycles_data_to_df()
    self.signal_data_to_df()
    ## Get individual stances data frames
    self.ind_stances_to_df()
    ## writing files ##
    self.save_files() 


  def get_pd_level(self, subject_id):
    level = self.dem_df[self.scale][dem_df['ID'] == subject_id ].values[0]
    return level

  def read_filter(self,name):
      # Reading each file, skipping 3 columns (time,total right forces & total left forces)
      # For gait cycle segmentation, to eliminate the effect of gait initiation and termenation,
      # the first and last N=20 seconds of VGRF data was discarded.

      one_walk = pd.read_csv(self.data_path + '/' + name,
                             skiprows=list(range(1,self.rows_to_discard+1)) ,
                             skipfooter=self.rows_to_discard,
                             usecols = np.arange(1,17),
                             engine = 'python')  
      
      # Usually, Vertical Ground React Force VGRF values less than 20N are generated from noise
      one_walk.where(one_walk > 20, 0, inplace=True)

      return one_walk
  
  def get_statics_raw_siganls(self, individual_df):
    static_co_df = []
    df = individual_df
    for col in df.columns:
        for x in ["Min", "Max", "Std", "Med", "Avg", "Skewness", "Kurtosis"]:
            colname = col +'_' +  x
            if x == 'Min':
                #static_co_df.loc[0, colname] = df[col].min(axis=0)
                static_co_df.append(df[col].min(axis=0))
                
            if x == 'Max':
                #static_co_df.loc[0, colname] = df[col].max(axis=0)
                static_co_df.append(df[col].max(axis=0))
                
            if x == 'Std':
                #static_co_df.loc[0, colname] = df[col].std(axis=0)
                static_co_df.append(df[col].std(axis=0))

            if x == 'Med':
                #static_co_df.loc[0, colname] = df[col].median(axis=0)
                static_co_df.append(df[col].median(axis=0))
            if x == 'Avg':
                #static_co_df.loc[0, colname] = df[col].mean(axis=0)
                static_co_df.append(df[col].mean(axis=0))
            if x == 'Skewness':
                #static_co_df.loc[0, colname] = df[col].skew(axis=0)
                static_co_df.append(df[col].skew(axis=0))
                
            if x == 'Kurtosis':
                #static_co_df.loc[0, colname] = df[col].kurtosis(axis=0)
                static_co_df.append(df[col].kurtosis(axis=0))
    return static_co_df    

  def segment_signal(self, signal):
      # swing - stance phases repeatly
      phases = [list(g) for k, g in groupby((signal), lambda x:x>0)]
      
      # Deleting the first and last phase, since the first and last 20 seconds were deleted,
      # there's no garanty that the first and last phase are complete
      # Hence, they are being deleted here just for safety

      phases = phases[1:-1]

      # filtering phases that is shorter than 20 becuase must propably it's noise
      phases = [l for l in phases if len(l)>20]

      # indices of the gait cycles
      indices=[0]
      for i in range(1, len(phases),2):
        indices.append(indices[-1]+len(phases[i]) + len(phases[i-1]))

      # stances phases
      stances = [l for l in phases if any(l)]
      
      # strides times 
      strides_time =  [((len(phases[i]) + len(phases[i-1]) -1)*0.01 ) for i in range(1,len(phases),2)]

      # swings time
      swings_time = [(len(l)-1)*0.01 for l in phases if not all (l)]

      # stance time 
      stances_time = [(len(l)-1)*0.01 for l in stances]

      return stances, strides_time , swings_time, stances_time, indices

  def get_max_per_gait_cycle(self, signal,indices):
    gait_cycles = self.get_gait_cycles(signal, indices)
    peaks = [max(cycle) for cycle in gait_cycles]
    return peaks

  def get_gait_cycles(self, signal, indices):
    # get gait cycles
    gait_cycles= [signal[indices[i]:indices[i+1]] for i in range(len(indices)-1)]
    return gait_cycles

  def signal_data_to_df(self):
    self.signals_data_df = pd.DataFrame(self.signals_data, columns =["ID", "level", "y", "L1", "L2","L3", "L4","L5","L6","L7","L8",
                                       "R1", "R2","R3", "R4","R5","R6","R7","R8","Total_Force_Right","Total_Force_Left" ])
     
    
  def cycles_data_to_df(self):
    self.cycles_data_df = pd.DataFrame(self.cycles_data, columns =["ID", "level", "y", 
                                                                   "l_stances", "l_strides_time", "l_swings_time" , "l_stances_time"
                                                                  , "r_stances" , "r_strides_time" , "r_swings_time" , "r_stances_time"
                                                                  , "l_peaks_heel" , "l_peaks_toe"
                                                                  , "r_peaks_heel" , "r_peaks_toe" ])
    

  def statical_data_to_df(self):
    self.statical_data_df = pd.DataFrame(self.statical_data, columns =["ID", "level", "y"
                                                                      , "mean_left_swings_time" , "std_left_swings_time", "cv_left_swings_time"
                                                                      , "mean_left_stride_time" , "std_left_stride_time" ,"cv_left_stride_time"
                                                                      , "kurtosis_left_strides_time", "skew_left_strides_time"
                                                                      , "mean_left_stance_time" , "std_left_stance_time", "cv_left_stance_time"
                                                                      , "mean_right_swings_time" , "std_right_swings_time", "cv_right_swings_time"
                                                                      , "mean_right_stride_time" , "std_right_stride_time", "cv_right_stride_time"
                                                                      , "kurtosis_right_strides_time", "skew_right_strides_time"
                                                                      , "mean_right_stance_time" , "std_right_stance_time", "cv_right_stance_time"
                                                                      , "mean_left_peaks_heel" , "std_left_peaks_heel"
                                                                      , "mean_left_peaks_toe" , "std_left_peaks_toe"
                                                                      , "mean_right_peaks_heel" , "std_right_peaks_heel"
                                                                      , "mean_right_peaks_toe" , "std_right_peaks_toe"
                                                                       ])
  def raw_statics_data_to_df(self):
    colnames=["ID", "level", "y"]
    for col in ["Time"	,"L1",	"L2",	"L3"	,"L4"	"L5",	"L6",	"L7"	,"L8",	"R1",	"R2"	,"R3",	"R4",	"R5",	"R6",	"R7",	"R8",	"Total_Force_Left",	"Total_Force_Right"]:
        for x in ["Min", "Max", "Std", "Med", "Avg", "Skewness", "Kurtosis"]:
            colnames.append( col +'_' +  x)
    self.raw_statics_df = pd.DataFrame(self.raw_statics, columns =colnames)

  def ind_stances_to_df(self):
    #droping NNOT WORKING HERE
    self.left_stances_df = self.cycles_data_df.explode(["l_stances","l_swings_time"])
    self.left_stances_df = self.left_stances_df.drop(columns=["r_stances" ,	"r_strides_time" ,	"r_swings_time" ,	"r_stances_time", "r_peaks_heel" ,	"r_peaks_toe"], axis=1)

    self.right_stances_df = self.cycles_data_df.explode(["r_stances","r_swings_time","r_peaks_toe","r_peaks_heel","r_strides_time"]) 
    self.right_stances_df = self.right_stances_df.drop(columns=["l_stances" ,	"l_strides_time" ,	"l_swings_time" ,	"l_stances_time", "l_peaks_heel" ,	"l_peaks_toe"], axis=1)

  def save_files(self):
    self.signals_data_df.to_parquet(self.parquet_path+'signals.gzip')
    self.cycles_data_df.to_parquet(self.parquet_path+'cyclels.gzip')
    self.statical_data_df.to_parquet(self.parquet_path+'statical.gzip')
    self.left_stances_df.to_parquet(self.parquet_path+'left_stances.gzip')
    self.right_stances_df.to_parquet(self.parquet_path+'right_stances.gzip')
    self.raw_statics_df.to_parquet(self.parquet_path+'raw_statics.gzip')
    

In [18]:
# read prepare data
data = Data( 0, "", "",parquet_path)

In [19]:
# Prepare the data
data = Data( 1, csvD_path, demographics,parquet_path)

  0%|          | 0/304 [00:00<?, ?it/s]

## RFC on statical on features

In [8]:
data.statical_data_df.head()

Unnamed: 0,ID,level,y,mean_left_swings_time,std_left_swings_time,cv_left_swings_time,mean_left_stride_time,std_left_stride_time,cv_left_stride_time,kurtosis_left_strides_time,...,std_right_stance_time,cv_right_stance_time,mean_left_peaks_heel,std_left_peaks_heel,mean_left_peaks_toe,std_left_peaks_toe,mean_right_peaks_heel,std_right_peaks_heel,mean_right_peaks_toe,std_right_peaks_toe
0,GaPt03,3.0,1,0.448571,0.083739,535.677087,1.572,0.218367,719.890102,16.574155,...,0.121305,903.950417,70.2218,17.194543,152.3522,43.493919,106.4448,17.77783,95.8078,30.390518
1,SiPt02,2.5,1,0.331268,0.029973,1105.212394,1.119155,0.085543,1308.295978,29.451513,...,0.036636,2090.995326,185.81169,42.1811,197.03169,25.260014,179.369718,36.238324,282.351408,27.798614
2,SiPt04,2.0,1,0.455507,0.035976,1266.124714,1.157101,0.070054,1651.731664,15.11369,...,0.078629,900.765604,138.855072,32.976581,226.016522,51.476278,190.405217,32.220226,227.854638,49.962947
3,GaPt04,2.5,1,0.394,0.032052,1229.250892,1.325333,0.340888,388.788062,49.578405,...,0.195412,477.594621,253.696667,23.614942,121.377667,13.603482,218.412333,31.152661,178.979167,12.964552
4,SiPt05,2.5,1,0.400175,0.031091,1287.119803,1.350169,1.38842,97.245032,25.169938,...,0.871567,97.966733,234.232881,57.262434,161.804407,32.012626,83.11339,44.229563,271.906949,36.726919


In [9]:
y=data.statical_data_df['y']

In [11]:
X =data.statical_data_df[["mean_left_swings_time" , "std_left_swings_time", "cv_left_swings_time"
                                                                      , "mean_left_stride_time" , "std_left_stride_time" ,"cv_left_stride_time"
                                                                      , "kurtosis_left_strides_time", "skew_left_strides_time"
                                                                      , "mean_left_stance_time" , "std_left_stance_time", "cv_left_stance_time"
                                                                      , "mean_right_swings_time" , "std_right_swings_time", "cv_right_swings_time"
                                                                      , "mean_right_stride_time" , "std_right_stride_time", "cv_right_stride_time"
                                                                      , "kurtosis_right_strides_time", "skew_right_strides_time"
                                                                      , "mean_right_stance_time" , "std_right_stance_time", "cv_right_stance_time"
                                                                      , "mean_left_peaks_heel" , "std_left_peaks_heel"
                                                                      , "mean_left_peaks_toe" , "std_left_peaks_toe"
                                                                      , "mean_right_peaks_heel" , "std_right_peaks_heel"
                                                                      , "mean_right_peaks_toe" , "std_right_peaks_toe"]]

In [12]:
test_score_RFC=[]

X_train, X_test, y_train1, y_test1 = train_test_split(X, np.ravel(y),random_state=64)
y_train = pd.DataFrame(y_train1)
y_test = pd.DataFrame(y_test1)

for n in [5, 10, 20, 40, 100, 200]:
    clf = RandomForestClassifier(n_estimators = int(n), n_jobs=2)
    clf.fit(X_train, np.ravel(y_train))
    acc = clf.score(X_test, np.ravel(y_test))
    test_score_RFC.append(acc)
     


In [13]:
for neighbor, tr_sc in zip(([5, 10, 20, 40, 100, 200]),test_score_RFC): 
    print(f"Estimator = {neighbor}")
    print('Accuracy of Random Forest Classifier on training set: {:.2f}'.format(tr_sc))

Estimator = 5
Accuracy of Random Forest Classifier on training set: 0.83
Estimator = 10
Accuracy of Random Forest Classifier on training set: 0.79
Estimator = 20
Accuracy of Random Forest Classifier on training set: 0.83
Estimator = 40
Accuracy of Random Forest Classifier on training set: 0.80
Estimator = 100
Accuracy of Random Forest Classifier on training set: 0.82
Estimator = 200
Accuracy of Random Forest Classifier on training set: 0.82


In [14]:
prediction_clf = clf.predict(X_test)
prec = precision_score(y_test1, prediction_clf)
rec = recall_score(y_test1, prediction_clf)
f1 = f1_score(y_test1, prediction_clf)
print("Precision: {}".format(prec))
print("Recall: {}".format(rec))
print("F1: {}".format(f1))

Precision: 0.8620689655172413
Recall: 0.8928571428571429
F1: 0.8771929824561403


## RF Classifier on statics of the raw sensors data

In [20]:
data.raw_statics_df.head()

Unnamed: 0,ID,level,y,Time_Min,Time_Max,Time_Std,Time_Med,Time_Avg,Time_Skewness,Time_Kurtosis,...,Total_Force_Left_Avg,Total_Force_Left_Skewness,Total_Force_Left_Kurtosis,Total_Force_Right_Min,Total_Force_Right_Max,Total_Force_Right_Std,Total_Force_Right_Med,Total_Force_Right_Avg,Total_Force_Right_Skewness,Total_Force_Right_Kurtosis
0,GaCo01,0.0,0,0.0,480.92,142.772146,20.9,118.283573,0.715433,-1.048771,...,517.728542,-0.062798,-1.823584,0.0,1257.3,490.075715,717.31,552.158582,-0.097883,-1.850094
1,GaPt03,3.0,1,0.0,106.92,28.711902,0.0,24.715777,0.746198,-0.681584,...,334.245344,-0.011145,-1.690073,0.0,784.85,299.651983,377.63,356.892798,-0.000753,-1.693112
2,SiPt02,2.5,1,0.0,246.95,34.643793,0.0,17.166232,2.926951,10.058267,...,462.035754,-0.101435,-1.832651,0.0,979.88,410.276345,539.77,474.159044,-0.066059,-1.815159
3,SiPt04,2.0,1,0.0,188.32,33.281562,0.0,17.040191,2.296395,5.177897,...,353.097941,0.033986,-1.708469,0.0,901.78,354.901744,576.84,397.830886,-0.094729,-1.832501
4,GaPt04,2.5,1,0.0,304.7,100.625905,26.84,89.612593,0.501348,-1.412808,...,460.916353,-0.041533,-1.723508,0.0,1008.7,395.354583,550.33,481.004502,-0.086407,-1.69934


In [21]:
data.raw_statics_df.shape

(304, 129)

In [23]:
y=data.raw_statics_df['y']

In [24]:
colnames=[]
for col in ["Time"	,"L1",	"L2",	"L3"	,"L4"	"L5",	"L6",	"L7"	,"L8",	"R1",	"R2"	,"R3",	"R4",	"R5",	"R6",	"R7",	"R8",	"Total_Force_Left",	"Total_Force_Right"]:
    for x in ["Min", "Max", "Std", "Med", "Avg", "Skewness", "Kurtosis"]:
        colnames.append( col +'_' +  x)

In [26]:
X =data.raw_statics_df[colnames]

In [58]:
test_score_RFC=[]

X_train, X_test, y_train1, y_test1 = train_test_split(X, np.ravel(y),random_state=64)
y_train = pd.DataFrame(y_train1)
y_test = pd.DataFrame(y_test1)

for n in [5, 10, 20, 40, 100, 200]:
    clf = RandomForestClassifier(n_estimators = int(n), n_jobs=2, random_state = 42)
    clf.fit(X_train, np.ravel(y_train))
    acc = clf.score(X_test, np.ravel(y_test))
    test_score_RFC.append(acc)

In [59]:
for neighbor, tr_sc in zip(([5, 10, 20, 40, 100, 200]),test_score_RFC): 
    print(f"Estimator = {neighbor}")
    print('Accuracy of Random Forest Classifier on training set: {:.2f}'.format(tr_sc))

Estimator = 5
Accuracy of Random Forest Classifier on training set: 0.86
Estimator = 10
Accuracy of Random Forest Classifier on training set: 0.86
Estimator = 20
Accuracy of Random Forest Classifier on training set: 0.83
Estimator = 40
Accuracy of Random Forest Classifier on training set: 0.88
Estimator = 100
Accuracy of Random Forest Classifier on training set: 0.92
Estimator = 200
Accuracy of Random Forest Classifier on training set: 0.91


In [43]:
clf = RandomForestClassifier(n_estimators = 100, n_jobs=2,  random_state = 42)
clf.fit(X_train, np.ravel(y_train))    

prediction_clf = clf.predict(X_test)

prec = precision_score(y_test1, prediction_clf)
rec = recall_score(y_test1, prediction_clf)
f1 = f1_score(y_test1, prediction_clf)
print("Precision: {}".format(prec))
print("Recall: {}".format(rec))
print("F1: {}".format(f1))   

Precision: 0.9166666666666666
Recall: 0.9821428571428571
F1: 0.9482758620689654


In [44]:
accuracy = accuracy_score(y_test, prediction_clf)
print(f'Accuracy: {accuracy}')

Accuracy: 0.9210526315789473


In [45]:
acc = clf.score(X_test, np.ravel(y_test))
print(acc)

0.9210526315789473


In [79]:
# save the model
with open("/content/drive/MyDrive/ITMO-Master's/Thesis/3rd_semester/Models/rf_classifier_raw_filterd_statics", 'wb') as f:
    pickle.dump(clf, f)

In [80]:
# Load the model                                                                                                                                                                                                       

with open("/content/drive/MyDrive/ITMO-Master's/Thesis/3rd_semester/Models/rf_classifier_raw_filterd_statics", 'rb') as f:
    rf = pickle.load(f)