Converting Video to Audio files

In [1]:
from preprocessing import extract_audio_features
import subprocess
import os

def convert_video(filename, directory):
    input_path = os.path.join(os.getcwd(), directory, filename)
    
    filename = filename.removesuffix('.mp4') + '.wav'
    output_path = os.path.join(os.getcwd(), 'audio', filename)

    command = ["ffmpeg", "-i", input_path, "-q:a", "0", "-map", "a", output_path]
    result = subprocess.run(command, capture_output=True, text=True)
    
    if result.returncode != 0:
        print("no", end = '---')

# directory = "videos"
# for i, filename in enumerate(os.listdir(directory)):
#     print(i, end = '-')
#     convert_video(filename, directory)

Extracting Audio features

In [None]:
import os
import pandas as pd
import pickle

directory = "audio"
output_pickle = "features.pkl"

# Initialize the DataFrame with filenames
filenames = sorted(os.listdir(directory)) 
df = pd.DataFrame({"filename": filenames, "audio_features": [None] * len(filenames)})

output_csv = "audio_features.csv"

for i, row in df.iterrows():
    print(i, end = '-')
    filename = row['filename']
    
    path = os.path.join(directory, filename)
    audio_features = extract_audio_features(path)

    df.at[i, "audio_features"] = audio_features
    if i % 10== 0:
        with open(output_pickle, "wb") as f:
            pickle.dump(df, f)
            
with open(output_pickle, "wb") as f:
    pickle.dump(df, f)

Extracting video features (frames)

In [None]:
import os
import pandas as pd
import pickle
from preprocessing import preprocess_video

directory = "videos"
output_pickle = "video_features.pkl"

filenames = sorted(os.listdir(directory)) 
df = pd.DataFrame({"filename": filenames, "video_features": [None] * len(filenames)})

for i, row in df.iterrows():
    filename = row['filename']
    
    path = os.path.join(directory, filename)
    video_features = preprocess_video(path)

    df.at[i, "video_features"] = video_features
    if i % 10== 0:
        print(i, end = '-')
        with open(output_pickle, "wb") as f:
            pickle.dump(df, f)
            
with open(output_pickle, "wb") as f:
    pickle.dump(df, f)

0-10-20-30-40-50-

Text Preprocessing and prediction

In [None]:
import pandas as pd
import fasttext
from preprocessing import preprocess_text

model = fasttext.load_model('text_model.bin')


trans = pd.read_csv('transcriptions.csv')
trans['transcription'].fillna('', inplace=True)
trans['text'] = trans['transcription'].apply(lambda x: " ".join(preprocess_text(x)))
trans['text_sentiment'] = trans['text'].apply(lambda x: model.predict(x)[0][0].removeprefix('__label__'))
trans.to_csv('transcriptions.csv', index=False)

Gender Prediction

In [None]:
import numpy as np
from collections import Counter
import joblib

model = joblib.load('models/gender_model.joblib')

df['gender_predictions'] = None
df['gender'] = None

for idx, row in df.iterrows():
    print(idx, end = '-')
    audio_features = np.array(row['audio_features'])
    
    # Predict for each fragment and store predictions
    predictions = []
    for fragment in audio_features: 
        fragment = fragment.reshape(1, -1) 
        fragment = pd.DataFrame(fragment, columns=model.feature_names_in_) # Fetch feature names from the trained model
        prediction = model.predict(fragment)
        predictions.append(prediction[0]) 
        
    df.at[idx, 'gender_predictions'] = predictions
    try:
        df.at[idx, 'female'] = int(Counter(predictions).most_common(1)[0][0])
    except:
        print("fail", end='...')

General prediction function

In [8]:
import torch 
    
def predict(model, input):
    input_tensor = torch.tensor(input, dtype=torch.float32)
    with torch.no_grad():
        output = model(input_tensor)
    predicted_class = torch.argmax(output, dim=1)
    return predicted_class

Audio prediction

In [None]:
import torch
from models import AudioNet

audio_model = AudioNet()  
audio_model.load_state_dict(torch.load("models/audio_model.pth", map_location=torch.device('cpu')))

audio_model.eval()

  audio_model.load_state_dict(torch.load("models/audio_model.pth", map_location=torch.device('cpu')))


AudioNet(
  (conv_block1): Sequential(
    (0): Conv1d(196, 64, kernel_size=(1,), stride=(1,))
    (1): BatchNorm1d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (2): ReLU()
    (3): Conv1d(64, 128, kernel_size=(1,), stride=(1,))
    (4): BatchNorm1d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (5): ReLU()
    (6): Dropout(p=0.4, inplace=False)
  )
  (conv_block2): Sequential(
    (0): Conv1d(128, 128, kernel_size=(1,), stride=(1,))
    (1): BatchNorm1d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (2): ReLU()
    (3): Dropout(p=0.4, inplace=False)
  )
  (fc_layers): Sequential(
    (0): Flatten(start_dim=1, end_dim=-1)
    (1): Linear(in_features=128, out_features=128, bias=True)
    (2): ReLU()
    (3): Dropout(p=0.4, inplace=False)
    (4): Linear(in_features=128, out_features=8, bias=True)
  )
  (softmax): Softmax(dim=1)
)

In [35]:
import numpy as np
import torch
from collections import Counter

df['audio_predictions'] = None
df['audio_sentiment'] = None

for idx, row in df.iterrows():
    print(idx, end = '-')
    
    audio_features = np.array(row['audio_features'], dtype=np.float32)
    female = np.array(row['female'], dtype=np.float32).reshape(1, -1)
    
    # Predict for each fragment and store predictions
    predictions = []
    for fragment in audio_features: 
        fragment = fragment.reshape(1, -1) 
        fragment = np.hstack([fragment, female]).reshape(1, 196, 1)
        prediction = predict(audio_model, fragment)
        predictions.append(prediction[0]) 
        
    df.at[idx, 'audio_predictions'] = predictions
    try:
        df.at[idx, 'audio_sentiment'] = int(Counter(predictions).most_common(1)[0][0])
    except:
        print("fail", end='...')
        


0-1-2-3-4-5-6-7-8-9-10-11-12-13-14-15-16-17-18-19-20-21-22-23-24-25-26-27-28-29-30-31-32-33-34-35-36-37-38-39-40-41-42-43-44-45-46-47-48-49-50-51-52-53-54-55-56-57-58-59-60-61-62-63-64-65-66-67-68-69-70-71-72-73-74-75-76-77-78-79-80-81-82-83-84-85-86-87-88-89-90-91-92-93-94-95-96-97-98-99-100-101-102-103-104-105-106-107-108-109-110-111-112-113-114-115-116-117-118-119-120-121-122-123-124-125-126-127-128-129-130-131-132-133-134-135-136-137-138-139-140-141-142-143-144-145-146-147-148-149-150-151-152-153-154-155-156-157-158-159-160-161-162-163-164-165-166-167-168-169-170-171-172-173-174-175-176-177-178-179-180-181-182-183-184-185-186-187-fail...188-189-190-191-192-193-194-195-196-197-198-199-200-201-202-203-204-205-206-207-208-209-210-211-212-213-214-215-216-217-218-219-220-221-222-223-224-225-226-227-228-229-230-231-232-233-234-235-236-237-238-239-240-241-242-243-244-245-246-247-248-249-250-251-252-253-254-255-256-257-258-259-260-261-262-263-264-265-266-267-268-269-270-271-272-273-274-275

Video sentiment prediction

In [9]:
import torch
from models import ImageNet

video_model = ImageNet()  
video_model.load_state_dict(torch.load("models/video_model.pth", map_location=torch.device('cpu')))

video_model.eval()

  video_model.load_state_dict(torch.load("models/video_model.pth", map_location=torch.device('cpu')))


ImageNet(
  (conv_block1): Sequential(
    (0): Conv2d(1, 32, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (1): BatchNorm2d(32, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (2): Conv2d(32, 32, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (3): BatchNorm2d(32, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (4): Dropout2d(p=0.25, inplace=False)
    (5): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
  )
  (conv_block2): Sequential(
    (0): Conv2d(32, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (2): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (3): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (4): Dropout2d(p=0.25, inplace=False)
    (5): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
  )
  (conv_block3): 

In [6]:

import pickle 

import pickle
with open('hatemm-features\hate_video_features.pkl', "rb") as f:
    df = pickle.load(f)

# with open('data\\nonhate_video_features_1.pkl', "rb") as f:
#     df1 = pickle.load(f)
    
# with open('data\\nonhate_video_features_2.pkl', "rb") as f:
#     df2 = pickle.load(f)

In [10]:
import numpy as np
from collections import Counter

df['video_predictions'] = None
df['video_sentiment'] = None

for idx, row in df.iterrows():
    print(idx, end = '-')
    
    video_features = np.array(row['video_features'], dtype=np.float32)
    
    # Predict for each fragment and store predictions
    predictions = []
    for frame in video_features: 
        frame = frame.reshape(1, 1, 48, 48) 
        prediction = predict(video_model, frame)
        
        predictions.append(prediction[0]) 
        
    df.at[idx, 'video_predictions'] = predictions
    try:
        df.at[idx, 'video_sentiment'] = int(Counter(predictions).most_common(1)[0][0])
    except:
        print("fail", end='...')
        
df['video_predictions'] = df['video_predictions'].apply(lambda x:[int(i) for i in x] if x else None)

0-1-2-3-4-5-6-7-8-9-10-11-12-13-14-15-16-17-18-19-20-21-22-23-24-25-26-27-28-29-30-31-32-33-34-35-36-37-38-39-40-41-42-43-44-45-46-47-48-49-50-51-52-53-fail...54-55-56-57-58-59-60-61-62-63-64-65-66-67-68-69-70-71-72-73-74-75-76-77-78-79-80-81-82-83-84-85-86-87-88-89-90-91-92-93-94-95-96-97-98-99-100-101-102-103-104-105-106-107-108-109-110-111-112-113-114-115-116-117-118-119-120-121-122-123-124-125-126-127-128-129-130-131-132-133-134-135-136-137-138-139-140-141-142-143-144-145-146-147-148-149-150-151-152-153-154-155-156-157-158-159-160-161-162-163-164-165-166-167-168-169-170-171-172-173-174-175-176-177-178-179-180-181-182-183-184-185-186-187-188-189-190-191-192-193-194-195-196-197-198-199-200-201-202-203-204-205-206-207-208-209-210-211-212-213-214-fail...215-216-217-218-219-220-221-222-223-224-225-226-227-228-229-230-231-232-233-234-235-236-237-238-239-240-241-242-243-244-245-246-247-248-249-250-251-252-253-254-255-256-257-258-259-260-261-262-263-264-265-266-267-268-269-270-271-272-273-

In [69]:
with open('predictions.pkl', "rb") as f:
    import pickle
    df = pickle.load(f)

In [70]:
audio_emotion_mapping = {
    1: "neutral",
    # 2: "calm", calm is changed to neutral
    2:"neutral",
    3: "happy",
    4: "sad",
    5: "angry",
    6: "fear",
    # 7: "disgust", disgust is changed to anger
    7:"angry",
    8: "surprise",
}

video_emotion_mapping = {
    0: "angry",
    # 1: "disgust",
    1:'angry',
    2: "fear",
    3: "happy",
    4: "neutral",
    5: "sad",
    6: "surprise",
}

text_emotion_mapping = {
    "anger": "angry",
    "fear":"fear",
    "surprise":"surprise",
    "sadness":"sad",
    "joy":"happy",
    "love": "happy",
    "neutral":"neutral",
}

Convert all three sentiments to a single uniform format

In [71]:
df['text_sentiment'] = df['text_sentiment'].map(text_emotion_mapping)
df['video_sentiment'] = df['video_sentiment'].map(video_emotion_mapping)
df['audio_sentiment'] = df['audio_sentiment'].map(audio_emotion_mapping)  

In [72]:
# df = df.dropna(subset=['text_sentiment', 'video_sentiment', 'audio_sentiment'])
df['hate'] = df['filename'].apply(lambda x: 1 if x.startswith('hate') else 0)

In [74]:
import pandas as pd
def encode_and_bind(original_dataframe, feature_to_encode):
    dummies = pd.get_dummies(original_dataframe[[feature_to_encode]])
    res = pd.concat([original_dataframe, dummies], axis=1)
    res = res.drop([feature_to_encode], axis=1)
    return(res)

df = encode_and_bind(df, 'text_sentiment')
df = encode_and_bind(df, 'video_sentiment')
df = encode_and_bind(df, 'audio_sentiment')

In [82]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split

X = df[[i for i in df.columns if '_sentiment_' in i]]
y = df['hate']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
rf_model = RandomForestClassifier(random_state=89)
rf_model.fit(X_train, y_train)

y_pred = rf_model.predict(X_test)

In [87]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

rf_model.fit(X_train, y_train)
y_pred = rf_model.predict(X_test)
# Compute evaluation metrics
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)

print(f'Accuracy: {accuracy}')
print(f'Precision: {precision}')
print(f'Recall: {recall}')
print(f'F1 Score: {f1}')


Accuracy: 0.4838709677419355
Precision: 0.41025641025641024
Recall: 0.5274725274725275
F1 Score: 0.46153846153846156


In [84]:
from sklearn.model_selection import GridSearchCV

param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [10, 20, None],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    'class_weight': [None, 'balanced']
}

grid_search = GridSearchCV(estimator=rf_model, param_grid=param_grid, cv=3, scoring='f1', verbose=2)
grid_search.fit(X_train, y_train)

# Best parameters and model
best_rf_model = grid_search.best_estimator_


Fitting 3 folds for each of 162 candidates, totalling 486 fits
[CV] END class_weight=None, max_depth=10, min_samples_leaf=1, min_samples_split=2, n_estimators=50; total time=   0.0s
[CV] END class_weight=None, max_depth=10, min_samples_leaf=1, min_samples_split=2, n_estimators=50; total time=   0.0s
[CV] END class_weight=None, max_depth=10, min_samples_leaf=1, min_samples_split=2, n_estimators=50; total time=   0.0s
[CV] END class_weight=None, max_depth=10, min_samples_leaf=1, min_samples_split=2, n_estimators=100; total time=   0.1s
[CV] END class_weight=None, max_depth=10, min_samples_leaf=1, min_samples_split=2, n_estimators=100; total time=   0.1s
[CV] END class_weight=None, max_depth=10, min_samples_leaf=1, min_samples_split=2, n_estimators=100; total time=   0.1s
[CV] END class_weight=None, max_depth=10, min_samples_leaf=1, min_samples_split=2, n_estimators=200; total time=   0.2s
[CV] END class_weight=None, max_depth=10, min_samples_leaf=1, min_samples_split=2, n_estimators=200;