In [2]:
def get_head_pose(landmarks, image_shape):
    """Estimate head pose by matching six face landmarks to a 3D face model.

    Args:
        landmarks: indexable collection whose items expose ``.x`` and ``.y``.
            Those values are multiplied by the image width/height, so they
            are presumably normalized coordinates -- confirm against caller.
        image_shape: sequence whose first two entries are (height, width).

    Returns:
        ``(rotation_vector, translation_vector)`` exactly as produced by
        ``cv2.solvePnP``. The solver's success flag is discarded.
    """
    height, width = image_shape[0], image_shape[1]

    # Reference 3D positions of the facial features, listed in the same
    # order as `landmark_ids` below.
    face_model_3d = np.array([
        (0.0, 0.0, 0.0),             # nose tip
        (0.0, -330.0, -65.0),        # chin
        (-225.0, 170.0, -135.0),     # left eye left corner
        (225.0, 170.0, -135.0),      # right eye right corner
        (-150.0, -150.0, -125.0),    # left mouth corner
        (150.0, -150.0, -125.0)      # right mouth corner
    ])

    # Face-mesh indices for: nose tip, chin, left eye left corner,
    # right eye right corner, left mouth corner, right mouth corner.
    landmark_ids = (1, 152, 226, 446, 57, 287)
    pixel_points = np.array(
        [(landmarks[idx].x * width, landmarks[idx].y * height) for idx in landmark_ids],
        dtype="double",
    )

    # Pinhole camera centred on the image; the focal length is derived from
    # the half-width and a fixed 60-degree angle.
    center_x, center_y = width / 2, height / 2
    focal = center_x / np.tan(60 / 2 * np.pi / 180)
    intrinsics = np.array(
        [[focal, 0, center_x],
         [0, focal, center_y],
         [0, 0, 1]],
        dtype="double",
    )

    # Lens distortion is assumed to be absent.
    no_distortion = np.zeros((4, 1))

    _success, rvec, tvec = cv2.solvePnP(
        face_model_3d,
        pixel_points,
        intrinsics,
        no_distortion,
        flags=cv2.SOLVEPNP_ITERATIVE,
    )
    return rvec, tvec

In [3]:
def rotation_vector_to_euler_angles(rotation_vector):
    """Turn a rotation vector into (pitch, yaw, roll) Euler angles.

    The vector is expanded to a 3x3 matrix with ``cv2.Rodrigues``, padded
    with a zero column to form a 3x4 matrix, and handed to
    ``cv2.decomposeProjectionMatrix``; element 6 of that result holds the
    Euler angles (in degrees, per OpenCV's documentation).

    Returns:
        The three entries obtained by iterating over the Euler-angle output,
        in the order (pitch, yaw, roll). With OpenCV's 3x1 output each entry
        is presumably a one-element array rather than a plain float.
    """
    rot_mat, _jacobian = cv2.Rodrigues(rotation_vector)

    # decomposeProjectionMatrix expects 3x4 input, so add a zero
    # translation column.
    zero_translation = np.zeros((3, 1))
    proj = np.hstack((rot_mat, zero_translation))

    decomposition = cv2.decomposeProjectionMatrix(proj)
    pitch, yaw, roll = decomposition[6]

    return pitch, yaw, roll

In [4]:
def is_facing_camera(pitch, yaw, pitch_threshold=160, yaw_threshold=25):
    """Report whether the head pose counts as facing the camera.

    True only when ``abs(pitch)`` is strictly greater than
    ``pitch_threshold`` and ``abs(yaw)`` is strictly less than
    ``yaw_threshold``; False otherwise.
    """
    # bool() keeps the return type a plain bool even when the inputs are
    # one-element arrays whose comparisons yield arrays.
    return bool(abs(pitch) > pitch_threshold and abs(yaw) < yaw_threshold)

In [5]:
# Rolling buffers shared by update_history() and analyze_interest();
# the most recent observation is last.
emotion_history = []
engagement_history = []

def update_history(emotion, engaged, history_length=10):
    """Record one observation and keep both history buffers bounded.

    Args:
        emotion: value appended to ``emotion_history``.
        engaged: value appended to ``engagement_history``.
        history_length: maximum number of entries kept in each buffer.
            A value of 0 or less leaves both buffers empty.

    Each buffer is trimmed to ``history_length`` on its own, dropping the
    oldest entries first. This keeps the bound valid even if the window is
    smaller than on a previous call or the two lists have drifted apart in
    length (removing a single element per call would not).
    """
    emotion_history.append(emotion)
    engagement_history.append(engaged)

    for history in (emotion_history, engagement_history):
        overflow = len(history) - history_length
        if overflow > 0:
            # In-place deletion so other references to the list stay valid.
            del history[:overflow]

def analyze_interest():
    """Return a prompt when the recent history suggests the user lost interest.

    Looks at the last three entries of ``engagement_history`` and
    ``emotion_history``. The prompt is returned only when none of the
    recent engagement values is truthy and 'sad' appears among the recent
    emotions; otherwise None is returned.
    """
    window = 3

    if any(engagement_history[-window:]):
        return None
    if 'sad' not in emotion_history[-window:]:
        return None
    return "It seems like you're not very interested. Would you like to talk about something else?"

In [8]:
def ask_chatgpt(question):
    """Ask the gpt-3.5-turbo chat model a question and return its reply text.

    Args:
        question: text sent as the single user message, preceded by a fixed
            system message.

    Returns:
        The message content of the first choice in the completion response.
    """
    conversation = [
        {"role": "system", "content": "You are a helpful assistant."},
        {"role": "user", "content": question},
    ]
    completion = openai.chat.completions.create(
        model="gpt-3.5-turbo",
        messages=conversation,
    )
    first_choice = completion.choices[0]
    return first_choice.message.content

In [9]:
from tensorflow.keras.models import load_model
# Load the Keras model saved as 'model_all.keras' (path is relative to the
# working directory). The webcam loop further down calls `model.predict` on
# 48x48 grayscale face crops and takes the argmax as the emotion label.
model = load_model('model_all.keras')

In [10]:
def process_speech(frames, sample_rate=16000, sample_width=2):
    """Transcribe a buffer of raw audio with the Google speech recognizer.

    The audio parameters are explicit arguments rather than module globals,
    so the function is callable even while the audio-capture configuration
    in the notebook is commented out.

    Args:
        frames: raw audio bytes to transcribe.
        sample_rate: sampling rate in Hz. The default of 16000 matches the
            ``RATE`` value in the notebook's (disabled) audio config.
        sample_width: bytes per sample. The default of 2 corresponds to
            16-bit samples, i.e. the ``pyaudio.paInt16`` format named in
            that config.

    Returns:
        The recognized text, or None when the buffer is empty, the speech
        could not be understood, or the recognition service failed.
    """
    if not frames:
        # Nothing was captured, so there is nothing to send to the service.
        return None

    recognizer = sr.Recognizer()
    audio_data = sr.AudioData(frames, sample_rate, sample_width)

    try:
        text = recognizer.recognize_google(audio_data)
    except sr.UnknownValueError:
        # Unintelligible audio is deliberately ignored without logging
        # (best effort).
        return None
    except sr.RequestError as e:
        print(f"Error from speech recognition service: {e}")
        return None

    print(f"Recognized text: {text}")
    return text

In [12]:
import cv2
import mediapipe as mp
import numpy as np
import openai
import sounddevice as sd
import soundfile as sf
import speech_recognition as sr
import openai
import pyaudio
import webrtcvad
import speech_recognition as sr

import os  # needed only to read the API key from the environment below

# --- Audio capture configuration (currently disabled) ------------------------
# FORMAT = pyaudio.paInt16
# CHANNELS = 1
# RATE = 16000
# CHUNK = 320  # This corresponds to 20 ms of audio at 16000 Hz

# vad = webrtcvad.Vad(1)  # Moderate aggressiveness
# audio_interface = pyaudio.PyAudio()

# Credentials must never live in the notebook: read the key from the
# OPENAI_API_KEY environment variable instead. Any key that was previously
# committed in this cell has to be treated as leaked and revoked.
openai.api_key = os.environ.get("OPENAI_API_KEY")

# --- Tunables ----------------------------------------------------------------
CAMERA_INDEX = 1            # index passed to cv2.VideoCapture
QUIT_KEY = ord('q')         # pressing this key in the preview window stops the loop
MODEL_INPUT_SIZE = (48, 48) # face crops are resized to this before prediction

# Initialize MediaPipe Face Detection.
mp_face_detection = mp.solutions.face_detection
mp_drawing = mp.solutions.drawing_utils
face_detection = mp_face_detection.FaceDetection(min_detection_confidence=0.5)

# Maps the argmax of the model's prediction to a display label.
emotions_dict = {
    0: "angry",
    1: "disgust",
    2: "fear",
    3: "happy",
    4: "neutral",
    5: "sad",
    6: "surprised"
}

mp_face_mesh = mp.solutions.face_mesh
drawing_spec = mp_drawing.DrawingSpec(thickness=1, circle_radius=1)
face_mesh = mp_face_mesh.FaceMesh(min_detection_confidence=0.5, min_tracking_confidence=0.5)

# Initialize Video Capture and Video Writer.
cap = cv2.VideoCapture(CAMERA_INDEX)
# fourcc = cv2.VideoWriter_fourcc(*'XVID')
# out = cv2.VideoWriter('output.avi', fourcc, 20.0, (640, 480))

# stream = audio_interface.open(format=FORMAT, channels=CHANNELS,
#                               rate=RATE, input=True,
#                               frames_per_buffer=CHUNK)
# print("Listening...")

speech_frames = bytes()  # buffer to collect frames identified as speech

try:
    # Checked inside the try block so the finally clause still releases the
    # capture handle and the MediaPipe graphs when the camera is unavailable.
    if not cap.isOpened():
        raise IOError("Cannot open webcam")

    while True:
        ret, frame = cap.read()
        if not ret:
            break

        # MediaPipe expects RGB input; OpenCV delivers BGR.
        frame_rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
        detection_results = face_detection.process(frame_rgb)
        mesh_results = face_mesh.process(frame_rgb)

        ih, iw = frame.shape[:2]

        if detection_results.detections:
            for detection in detection_results.detections:
                # Convert the relative bounding box to pixels and clamp it
                # to the frame.
                bboxC = detection.location_data.relative_bounding_box
                x, y, w, h = int(bboxC.xmin * iw), int(bboxC.ymin * ih), int(bboxC.width * iw), int(bboxC.height * ih)

                x, y = max(0, x), max(0, y)
                w, h = min(iw - x, w), min(ih - y, h)

                # A box that lies outside the frame clamps to zero/negative
                # size; skip it instead of feeding an empty crop to cvtColor
                # and resize, which would raise.
                if w <= 0 or h <= 0:
                    continue

                # Extract the face ROI.
                face_roi = frame[y:y+h, x:x+w]
                if face_roi.size == 0:
                    continue

                # In case you want to save or display the face ROI:
                # cv2.imwrite('face.jpg', face_roi)
                # cv2.imshow('Face ROI', face_roi)

                gray_frame = cv2.cvtColor(face_roi, cv2.COLOR_BGR2GRAY)

                # Draw the rectangle around the face on the frame.
                cv2.rectangle(frame, (x, y), (x+w, y+h), (0, 255, 0), 2)

                # Optionally: Use `mp_drawing.draw_detection` to draw the detection.
                # This will draw the full detection box, landmarks, and classification score.
                # mp_drawing.draw_detection(frame, detection)

                # Resize the crop to the model input size (e.g., 48x48 for FER-2013).
                resized_frame = cv2.resize(gray_frame, MODEL_INPUT_SIZE)

                # Scale pixel values to [0, 1].
                normalized_frame = resized_frame.astype('float32') / 255.0

                # Add batch and channel axes -> (1, height, width, 1).
                # Assuming the model expects a 4D input.
                expanded_frame = np.expand_dims(normalized_frame, axis=0)
                expanded_frame = np.expand_dims(expanded_frame, axis=-1)

                # verbose=0 keeps Keras from printing a progress bar for
                # every single frame.
                prediction = model.predict(expanded_frame, verbose=0)
                emotion_label = int(np.argmax(prediction))

                # Overlay the detected emotion on the frame.
                cv2.putText(frame, f'Emotion: {emotions_dict[emotion_label]}', (50, 50), cv2.FONT_HERSHEY_SIMPLEX, 1, (255, 255, 255), 2, cv2.LINE_AA)

        # Draw the face mesh onto the frame and report the facing state.
        if mesh_results.multi_face_landmarks:
            for face_landmarks in mesh_results.multi_face_landmarks:
                mp_drawing.draw_landmarks(
                    image=frame,
                    landmark_list=face_landmarks,
                    connections=mp_face_mesh.FACEMESH_CONTOURS,
                    landmark_drawing_spec=drawing_spec,
                    connection_drawing_spec=drawing_spec)

                # Head pose -> Euler angles -> facing-camera decision.
                rotation_vector, translation_vector = get_head_pose(face_landmarks.landmark, frame.shape)
                pitch, yaw, _ = rotation_vector_to_euler_angles(rotation_vector)
                facing_label = "true" if is_facing_camera(pitch, yaw) else "false"
                cv2.putText(frame, facing_label, (10, 30), cv2.FONT_HERSHEY_SIMPLEX, 1, (0, 255, 0), 2)

        # audio_frame = stream.read(CHUNK, exception_on_overflow=False)  # Read a frame from the stream
        # is_speech = vad.is_speech(audio_frame, RATE)  # Check if the frame contains speech

        # if is_speech:
        #     speech_frames += audio_frame  # Accumulate speech frames

        # elif speech_frames:
        #     # When non-speech detected and there are accumulated speech frames
        #     process_speech(speech_frames)
        #     speech_frames = bytes()  # Reset speech frame buffer

        cv2.imshow('Frame', frame)

        # imshow only repaints while waitKey services the GUI event queue;
        # without this call the window never updates. It also provides a
        # clean way to leave the loop.
        if cv2.waitKey(1) & 0xFF == QUIT_KEY:
            break

        # # Write the frame to the output file.
        # out.write(frame)


except KeyboardInterrupt:
    print("Stream stopped")

finally:
    # Release resources.
    cap.release()
    # out.release()
    # Free the MediaPipe graphs as well as the camera handle.
    face_detection.close()
    face_mesh.close()
    cv2.destroyAllWindows()
    # stream.stop_stream()
    # stream.close()
    # audio_interface.terminate()

Stream stopped
