# In [4]:
import os

# TensorFlow's native (C++) logger picks this variable up when the library is
# loaded, so it has to be in the environment *before* the first
# `import tensorflow`; setting it afterwards does not silence the messages
# emitted during import. '2' hides INFO and WARNING output.
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2'

import glob
import random

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import sklearn.metrics as sk
import tensorflow as tf
import tensorflow_datasets as tfds
import tensorflow_decision_forests as tfdf
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from tensorflow.keras.layers import Dense, Dropout, LSTM
from tensorflow.keras.models import Sequential

# In [None]:
# Folders holding the processed per-clip CSV files of the trial dataset.
lie_trial_path = '/Users/frank/Downloads/dataSets/Trial-dataset/Clips/processed_lie/' #802 entries
truth_trial_path = '/Users/frank/Downloads/dataSets/Trial-dataset/Clips/processed_truth/' #695 entries

# MU3D sub-folders. Each one is derived from the folder carrying the same
# label; the two were previously crossed (truth built from the lie folder and
# vice versa), which would flip the label of every MU3D clip.
# NOTE(review): confirm the MU3D folders really sit under the matching
# processed_truth/ and processed_lie/ directories.
truth_3d_path = truth_trial_path + 'MU3D/'
lie_3d_path = lie_trial_path + 'MU3D/'

# Columns retained by column_and_confidence / keepColumn. "confidence" is
# needed for filtering and "Result" is the label column.
lstOfFeatures = [
  "gaze_0_x", "gaze_0_y", "gaze_0_z", "gaze_angle_x", "gaze_angle_y",
  "dgaze_0_x", "dgaze_0_y", "dgaze_angle_y",
  "AU01_r", "AU04_r", "AU10_r", "AU12_r", "AU45_r",
  "pose_Tx", "pose_Ty", "pose_Tz", "pose_Ry",
  "Result", "confidence",
]

def displayHeatmap(df):
  """Plot the pairwise column correlations of ``df`` as an annotated heatmap.

  Opens a new 16x6 inch matplotlib figure and draws ``df.corr()`` with
  seaborn on a fixed colour scale from -1 to 1 ('BrBG' colormap). Nothing is
  returned.
  """
  plt.figure(figsize=(16, 6))
  correlations = df.corr()
  sns.heatmap(correlations, annot=True, cmap='BrBG', vmin=-1, vmax=1)

def displayConfusion(actual, predicted):
  """Plot the confusion matrix of ``predicted`` against ``actual`` and print accuracy.

  The accuracy is printed as a percentage rounded to two decimals. Nothing is
  returned.
  """
  matrix = sk.confusion_matrix(actual, predicted)
  sk.ConfusionMatrixDisplay(matrix).plot()

  accuracyPercent = round(sk.accuracy_score(actual, predicted) * 100, 2)
  print("Accuracy is ", accuracyPercent, "%")

def column_and_confidence(df, colList = lstOfFeatures):
  """Reduce ``df`` to the wanted features of its high-confidence rows.

  Steps, in order:
    1. drop every column whose name is not in ``colList``;
    2. keep only rows with ``confidence >= 0.9``;
    3. drop the ``confidence`` column itself;
    4. drop rows that still contain a missing value.

  ``colList`` has to contain "confidence", otherwise that column is removed
  in step 1 and the row filter in step 2 fails. ``df`` is not modified; a new
  DataFrame is returned.
  """
  unwanted = [str(name) for name in df.columns if str(name) not in colList]
  trimmed = df.drop(columns = unwanted)

  confident = trimmed.query("confidence >= 0.9")
  return confident.drop(columns = ["confidence"]).dropna()

def keepColumn(df, colList = lstOfFeatures):
  """Return ``df`` restricted to the columns whose names appear in ``colList``.

  When every column is already wanted, ``df`` itself is returned unchanged;
  otherwise a new DataFrame without the unwanted columns is returned.
  """
  unwanted = [str(name) for name in df.columns if str(name) not in colList]
  if not unwanted:
    return df

  return df.drop(columns = unwanted)

def addGazeDelta(currCSV, numOfFrames = 10, minConfidence = 0.8):
  """Add absolute gaze-change columns to ``currCSV`` in place.

  For each gaze column ``c`` (gaze_0_x, gaze_0_y, gaze_0_z, gaze_angle_x,
  gaze_angle_y) a column ``"d" + c`` receives ``abs(c[j - numOfFrames] - c[j])``
  for every row ``j`` whose *earlier* row ``j - numOfFrames`` has
  ``confidence >= minConfidence``. Note that it is the confidence of the
  earlier row, not of row ``j`` itself, that is tested.

  Rows that do not qualify (including the first ``numOfFrames`` rows) keep
  their existing value if the delta column already exists, and are NaN if the
  column is newly created. When no row qualifies, no column is added.

  The computation is vectorised with ``shift`` instead of a per-row loop.
  It assumes ``currCSV`` has the default 0..n-1 index, so that row labels and
  positions coincide (true for a freshly read CSV or after ``reset_index``).

  Args:
    currCSV: DataFrame with a "confidence" column and the five gaze columns.
    numOfFrames: how many rows back the reference frame lies (default 10).
    minConfidence: minimum confidence required of the reference frame
      (default 0.8).

  Returns:
    The same ``currCSV`` object, modified in place.
  """
  gazeColumns = ["gaze_0_x", "gaze_0_y", "gaze_0_z", "gaze_angle_x", "gaze_angle_y"]

  # Confidence of the row numOfFrames earlier; the leading rows have no such
  # row, shift() yields NaN there and the comparison is False.
  earlierOk = currCSV["confidence"].shift(numOfFrames) >= minConfidence
  if not earlierOk.any():
    return currCSV

  for col in gazeColumns:
    deltaName = "d" + col
    delta = (currCSV[col].shift(numOfFrames) - currCSV[col]).abs()
    if deltaName in currCSV.columns:
      # keep whatever was stored before on rows that do not qualify
      currCSV[deltaName] = delta.where(earlierOk, currCSV[deltaName])
    else:
      currCSV[deltaName] = delta.where(earlierOk)

  return currCSV

def mergeTwoDF(df1, df2, shuffle = False):
  """Stack ``df1`` on top of ``df2`` and optionally shuffle the rows.

  The combined frame gets a fresh 0..n-1 index. The previous index values are
  kept as a regular column named "index" (``reset_index`` is called without
  ``drop=True``).
  NOTE(review): that "index" column travels along with the real features;
  confirm callers want it or drop it before training.

  With ``shuffle=True`` the rows are returned in random order (the row labels
  are shuffled along with the rows).
  """
  stacked = pd.concat([df1, df2])
  merged = stacked.reset_index()

  return merged.sample(frac=1) if shuffle else merged

def addLabel(df, TrueOrFalse):
  """Write a constant "Result" label column into ``df`` in place.

  Every row gets 1 when ``TrueOrFalse`` is truthy and 0 otherwise. Nothing is
  returned.
  """
  df["Result"] = 1 if TrueOrFalse else 0

def createDatasetSingle(path, truth):
  """Build one labelled dataset from all CSV files matching ``path + "*.csv"``.

  ``path`` is used as a plain prefix of the glob pattern, so a directory has
  to be given with its trailing separator. All files under ``path`` must
  belong to the same class (all truth or all lie).

  Gaze deltas are computed per file *before* the files are concatenated, so
  a frame is never compared with a frame of a different recording. Files are
  processed in sorted order to make the row order reproducible.

  Args:
    path: glob prefix of the CSV files.
    truth: truthy to label every row 1, falsy to label every row 0.

  Returns:
    DataFrame with a 0..n-1 index, filtered by ``column_and_confidence`` and
    carrying the "Result" label column.

  Raises:
    ValueError: if no CSV file matches the pattern.
  """
  csvPaths = sorted(glob.glob(path + "*.csv"))
  if not csvPaths:
    raise ValueError("No CSV files found matching " + path + "*.csv")

  frames = []
  for csvPath in csvPaths:
    frame = pd.read_csv(csvPath)
    addGazeDelta(frame)  # modifies frame in place
    frames.append(frame)

  # reset_index() leaves an "index" column; it is not in the feature list and
  # is therefore removed by column_and_confidence below.
  df = pd.concat(frames).reset_index()
  addLabel(df, truth)
  df = column_and_confidence(df).reset_index().drop(columns = ["index"])

  return df

def createDatasetDual(truthPath, liePath):
  """Build a shuffled truth+lie dataset and split it into features and label.

  The truth files are labelled 1 and the lie files 0 (via
  ``createDatasetSingle``); both sets are merged and shuffled with
  ``mergeTwoDF``.

  Returns:
    A tuple ``(dfTotal, X, Y)``: the full shuffled frame, the frame without
    the "Result" column, and the "Result" column.

  NOTE(review): as mergeTwoDF is written in this file, the merged frame also
  carries an "index" column, which therefore ends up inside X.
  """
  truthFrames = createDatasetSingle(truthPath, True)
  lieFrames = createDatasetSingle(liePath, False)

  combined = mergeTwoDF(truthFrames, lieFrames, shuffle=True)

  features = combined.drop(columns = ["Result"])
  labels = combined["Result"]

  return combined, features, labels

def createDatasetGeneral(truthPath, liePath, testRatio):
  """Build a shuffled truth+lie dataset and split it into train and test parts.

  The rows are shuffled once when the two classes are merged; the subsequent
  ``train_test_split`` is called with ``shuffle=False`` and simply cuts the
  last ``testRatio`` share off as the test set.

  Returns:
    A tuple ``(Train, Xtrain, Ytrain, Xtest, Ytest)`` where ``Train`` is the
    full training frame and the X/Y pairs are the frames without / the
    "Result" column.

  NOTE(review): as mergeTwoDF is written in this file, the merged frame also
  carries an "index" column, which therefore ends up inside Xtrain and Xtest.
  """
  def splitFeaturesAndLabel(frame):
    # separate the "Result" label column from the remaining columns
    return frame.drop(columns = ["Result"]), frame["Result"]

  truthFrames = createDatasetSingle(truthPath, True)
  lieFrames = createDatasetSingle(liePath, False)
  combined = mergeTwoDF(truthFrames, lieFrames, shuffle=True)

  trainPart, testPart = train_test_split(combined, test_size=testRatio, shuffle=False)

  Xtrain, Ytrain = splitFeaturesAndLabel(trainPart)
  Xtest, Ytest = splitFeaturesAndLabel(testPart)

  return trainPart, Xtrain, Ytrain, Xtest, Ytest

def perdictSingleVideo(path, modelName, modelObj, keepList = lstOfFeatures):
  """Classify every usable frame of a recording and print the vote shares.

  All CSV files matching ``path + "*.csv"`` are read (``path`` is a plain
  prefix of the glob pattern), gaze deltas are added per file, and the frames
  are filtered with ``column_and_confidence`` using ``keepList``. Each
  remaining frame is predicted; a prediction above 0.5 counts as truth,
  anything else as lie. The two shares are printed as percentages.

  Args:
    path: glob prefix of the CSV file(s) of the recording.
    modelName: "tf" to predict through a TensorFlow Decision Forests
      dataset, "sk" to call ``modelObj.predict`` on the DataFrame directly.
    modelObj: the trained model object.
    keepList: columns to keep as model input; must include "confidence",
      which is needed for the row filter and removed afterwards.

  Raises:
    ValueError: if ``modelName`` is neither "tf" nor "sk", if no CSV file
      matches, or if no frame survives the confidence filter.
  """
  if modelName not in ("tf", "sk"):
    raise ValueError("modelName must be 'tf' or 'sk', got " + repr(modelName))

  csvPaths = sorted(glob.glob(path + "*.csv"))
  if not csvPaths:
    raise ValueError("No CSV files found matching " + path + "*.csv")

  # Compute gaze deltas file by file so frames of different files are never
  # compared with each other.
  frames = []
  for csvPath in csvPaths:
    frame = pd.read_csv(csvPath)
    addGazeDelta(frame)  # modifies frame in place
    frames.append(frame)

  df = pd.concat(frames).reset_index()
  df = column_and_confidence(df, keepList).reset_index().drop(columns = ["index"])
  if df.shape[0] == 0:
    raise ValueError("No frame passed the confidence filter for " + path)

  if modelName == "tf":
    res = pd.DataFrame(modelObj.predict(tfdf.keras.pd_dataframe_to_tf_dataset(df)))
  else:
    res = modelObj.predict(df)
    temp = res.shape[0]
    res = pd.DataFrame(np.reshape(res, (temp, 1)))

  # Column 0 holds the per-frame prediction; NaN compares False and is
  # therefore counted as a lie, as in a per-row `> 0.5` test.
  counterTrue = int((res[0] > 0.5).sum())
  counterLie = res.shape[0] - counterTrue

  print("Lie Possibility: ", round(counterLie/res.shape[0] * 100, 2), "%")
  print("Truth Possibility: ", round(counterTrue/res.shape[0]* 100, 2), "%")

def preprocessing(folderPath, trueOrFalse, minConfidence = 0.9, numOfFrames = 10):
  """Turn the CSV files of one class into normalised fixed-length windows.

  Every ``*.csv`` file in ``folderPath`` is read once and reduced to the
  wanted columns with ``keepColumn``. Processing then has two stages:

  1. Normalisation: the per-column maximum over all frames with
     ``confidence > minConfidence`` (from every file) is computed and
     printed; each column is divided by its maximum. Columns whose maximum
     is 0 or undefined are left unscaled.
  2. Windowing: for every row position ``i >= numOfFrames`` the
     ``numOfFrames`` rows *before* ``i`` form one sample, provided none of
     them has ``confidence <= minConfidence``. The test uses the raw
     (un-normalised) confidence so ``minConfidence`` keeps its meaning.
     Because a window ends just before row ``i``, the last row of a file is
     never part of a window.

  Args:
    folderPath: directory containing the CSV files.
    trueOrFalse: truthy to label every window 1, falsy to label it 0.
    minConfidence: frames at or below this confidence are unusable.
    numOfFrames: window length in frames.

  Returns:
    ``(data, label)``: a list of DataFrames (one per window, all kept
    columns, normalised) and the list of matching integer labels. Both lists
    are empty when the folder contains no CSV file.
  """
  csv_files = sorted(glob.glob(os.path.join(folderPath, "*.csv")))
  data = []
  label = []
  if not csv_files:
    return data, label

  # read and trim every file exactly once
  frames = [keepColumn(pd.read_csv(file)) for file in csv_files]

  # per-column maxima over the confident frames of all files
  confident = pd.concat([frame[frame["confidence"] > minConfidence] for frame in frames])
  max_total = confident.max(axis = 0)
  print(max_total.to_numpy())

  # divide by 1 where the maximum is 0 or NaN, i.e. leave the column as it is
  scale = max_total.where(max_total.notna() & (max_total != 0), 1.0)

  for frame in frames:
    lowConfidence = (frame["confidence"] <= minConfidence).to_numpy()
    # lowBefore[k] = number of low-confidence frames among the first k rows,
    # so a window [a, b) is clean iff lowBefore[b] == lowBefore[a]
    lowBefore = np.concatenate(([0], np.cumsum(lowConfidence)))

    normalized = frame / scale[frame.columns]

    for i in range(numOfFrames, len(frame)):
      # skip windows containing any frame at or below the threshold
      if lowBefore[i] != lowBefore[i - numOfFrames]:
        continue

      data.append(normalized.iloc[i - numOfFrames:i])
      label.append(1 if trueOrFalse else 0)

  return data, label


def path_preprocessing(truthFolderPath, lieFolderPath, minConfidence = 0.9, numOfFrames = 10):
  """Build a shuffled window dataset from a truth folder and a lie folder.

  Both folders are run through ``preprocessing`` (truth windows labelled 1,
  lie windows labelled 0); the results are concatenated and shuffled.

  Samples and labels are reordered with one shared random permutation so
  that every window keeps its own label. Shuffling the two lists separately
  would pair windows with arbitrary labels.

  Returns:
    ``(X, Y)`` as NumPy arrays built from the shuffled window list and the
    shuffled label list.
  """
  truth_data, truth_label = preprocessing(truthFolderPath, True, minConfidence, numOfFrames)
  lie_data, lie_label = preprocessing(lieFolderPath, False, minConfidence, numOfFrames)

  total_X = truth_data + lie_data
  total_Y = truth_label + lie_label

  # one permutation applied to both lists keeps sample and label aligned
  order = list(range(len(total_X)))
  random.shuffle(order)
  total_X = [total_X[k] for k in order]
  total_Y = [total_Y[k] for k in order]

  return np.array(total_X), np.array(total_Y)