## Импорт библиотек

In [5]:
!pip install pymorphy2

Collecting pymorphy2
  Downloading pymorphy2-0.9.1-py3-none-any.whl (55 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/55.5 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m55.5/55.5 kB[0m [31m1.8 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting dawg-python>=0.7.1 (from pymorphy2)
  Downloading DAWG_Python-0.7.2-py2.py3-none-any.whl (11 kB)
Collecting pymorphy2-dicts-ru<3.0,>=2.4 (from pymorphy2)
  Downloading pymorphy2_dicts_ru-2.4.417127.4579844-py2.py3-none-any.whl (8.2 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m8.2/8.2 MB[0m [31m4.7 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting docopt>=0.6 (from pymorphy2)
  Downloading docopt-0.6.2.tar.gz (25 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: docopt
  Building wheel for docopt (setup.py) ... [?25l[?25hdone
  Created wheel for docopt: filename=docopt-0.6.2-py2.py3-none-any.whl 

In [16]:
import os.path
import pandas as pd
import numpy as np
import librosa
import matplotlib.pyplot as plt
import re
import pymorphy2
import nltk
from nltk.stem import WordNetLemmatizer
nltk.download("wordnet")
nltk.download("omw-1.4")
nltk.download('stopwords')
nltk.download('punkt')
from nltk.tokenize import word_tokenize
from string import punctuation
import torch
import torch.nn as nn
import torch.optim as optim
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


## Импорт данных

In [7]:
csv_data = pd.read_csv('Data_ASR_2.csv')

In [8]:
def calc_sentiment_label(row):
  if row['sentiment'] > 0:
    return 1
  elif row['sentiment'] < 0:
    return 2 # чтобы не было -1 в лейблах
  else:
    return 0

csv_data['sentiment_label'] = csv_data.apply(calc_sentiment_label, axis=1)

In [283]:
y = csv_data['sentiment_label'].tolist()

In [9]:
csv_data.sample(3)

Unnamed: 0,video,start_time,end_time,sentiment,happy,sad,anger,surprise,disgust,fear,text,ASR,sentiment_label
15520,Xq7zLxYHxd8,0.0,3.975,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Rhet Rieger White Caspian studios on behalf of...,red ryder white caspian studios on behalf of ...,0
12186,JwzxqrD8tIo,28.816,35.5565,0.0,0.333333,0.0,0.0,0.0,0.0,0.0,"In fact, just when she's get off the treadmill...",in fact just when she gets off the treadmill ...,0
3530,238060,122.6335,131.335,-0.666667,0.333333,0.0,0.0,0.0,0.0,0.0,"Some people gave it a two, a few people were ...",some people gave it to a few people were gene...,2


## Текстовые данные

### Препроцессинг

In [107]:
text_data = csv_data['text'].tolist()
text_data[0]

'I see that a writer is somebody who has an incredible command of mechanics of the English language.'

In [108]:
def preprocess(string):
  result = word_tokenize(string)

  punctiations = list(punctuation)
  result = [i for i in result if (i not in punctiations)]

  result =  [i.lower() for i in result]

  stop_words = nltk.corpus.stopwords.words('english')
  result = [i for i in result if ( i not in stop_words )]

  wnl = WordNetLemmatizer()
  result = [wnl.lemmatize(word, pos="v") for word in result]

  return ' '.join(result)

In [109]:
preprocessed_text_data = [preprocess(string) for string in text_data]
preprocessed_text_data[0]

'see writer somebody incredible command mechanics english language'

### Извлечение признаков

#### TF-IDF

In [282]:
vectorizer = TfidfVectorizer(max_features=10000)
tf_idf_X = vectorizer.fit_transform(np.asarray(preprocessed_text_data)).toarray()

### Подготовка к обучению

In [284]:
X_train, X_test, y_train, y_test = train_test_split(tf_idf_X, y, test_size=0.2, random_state=42)

### Нейронная сеть

In [287]:
class TF_IDF_NN(nn.Module):
    def __init__(self, input_dim, output_dim, hidden_layers=(128, 64), dropout_rate=0.2):
      super(TF_IDF_NN, self).__init__()
      self.input_dim = input_dim
      self.output_dim = output_dim
      self.hidden_layers = hidden_layers
      self.dropout_rate = dropout_rate

      layers = []
      prev_dim = input_dim
      for units in hidden_layers:
          layers.append(nn.Linear(prev_dim, units))
          layers.append(nn.ReLU())
          layers.append(nn.Dropout(dropout_rate))
          prev_dim = units
      layers.append(nn.Linear(prev_dim, output_dim))
      layers.append(nn.Softmax(dim=1))

      self.model = nn.Sequential(*layers)

    def forward(self, x):
      return self.model(x)

    def train_model(self, X_train, y_train, batch_size=32, epochs=10, lr=0.001):
      criterion = nn.CrossEntropyLoss()
      optimizer = optim.Adam(self.parameters(), lr=lr)

      X_train_tensor = torch.tensor(X_train, dtype=torch.float32)
      y_train_tensor = torch.tensor(y_train, dtype=torch.long)

      train_dataset = torch.utils.data.TensorDataset(X_train_tensor, y_train_tensor)
      train_loader = torch.utils.data.DataLoader(train_dataset, batch_size=batch_size, shuffle=True)

      for epoch in range(epochs):
        self.train()
        running_loss = 0.0
        for i, data in enumerate(train_loader, 0):
          inputs, labels = data
          optimizer.zero_grad()
          outputs = self(inputs)
          loss = criterion(outputs, labels)
          loss.backward()
          optimizer.step()
          running_loss += loss.item()
        print(f'Epoch {epoch + 1}/{epochs}, Loss: {running_loss / len(train_loader)}')

    def predict(self, X):
      self.eval()
      with torch.no_grad():
        X_tensor = torch.tensor(X, dtype=torch.float32)
        outputs = self(X_tensor)
        _, predicted = torch.max(outputs, 1)
      return predicted.numpy()

In [288]:
model = TF_IDF_NN(input_dim=X_train.shape[1], output_dim=3)
model.train_model(X_train, y_train, epochs=10)

Epoch 1/10, Loss: 0.972075864826281
Epoch 2/10, Loss: 0.8766106591396725
Epoch 3/10, Loss: 0.8090150558047279
Epoch 4/10, Loss: 0.7613757305948186
Epoch 5/10, Loss: 0.7334210774333206
Epoch 6/10, Loss: 0.7141543867456954
Epoch 7/10, Loss: 0.7021271733893562
Epoch 8/10, Loss: 0.692619305193629
Epoch 9/10, Loss: 0.6851937895787951
Epoch 10/10, Loss: 0.6792588582153583


### Результаты

In [289]:
y_pred = model.predict(X_test)
f_score = f1_score(y_test, y_pred, average='weighted')
print("F1:", f_score)

F1: 0.5675780165738559


## Аудио данные

In [None]:
!unzip WAV_16000.zip

In [254]:
filenames = csv_data[['video', 'start_time', 'end_time']]
filenames.head(3)

Unnamed: 0,video,start_time,end_time
0,--qXJuDtHPw,23.199,30.325
1,-3g5yACwYnA,82.7645,100.555
2,-3g5yACwYnA,119.919,125.299


### Извлечение признаков

#### MFCC

In [201]:
def calculate_mfcc(row, n_mfcc=20, hop_length=512, maxlen=300):
    audio_path = row['video']
    audio_path = '/content/WAV_16000/' + audio_path + '.wav'

    start_time = row['start_time']
    end_time = row['end_time']
    duration = end_time - start_time
    audio, sr = librosa.load(audio_path, offset=start_time, duration=duration, sr=None)

    mfcc_features = librosa.feature.mfcc(y=audio, sr=sr, n_mfcc=n_mfcc, hop_length=hop_length)

    # приводим к одном длине
    if maxlen and mfcc_features.shape[1] > maxlen:
      mfcc_features = mfcc_features[:, :maxlen]
    elif maxlen and mfcc_features.shape[1] < maxlen:
      mfcc_features = np.pad(mfcc_features, ((0, 0), (0, maxlen - mfcc_features.shape[1]) ))
    return mfcc_features

In [None]:
mfcc_features = []
for index, row in filenames.iterrows():
  mfcc_features.append(calculate_mfcc(row))

### Подготовка к обучению

In [233]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

### Нейронная сеть

In [206]:
class MFCC_NN(nn.Module):
    def __init__(self, input_dim, output_dim, hidden_layers=(128, 64), dropout_rate=0.2):
      super(MFCC_NN, self).__init__()
      self.input_dim = input_dim
      self.output_dim = output_dim
      self.hidden_layers = hidden_layers
      self.dropout_rate = dropout_rate

      layers = []
      prev_dim = input_dim
      for units in hidden_layers:
          layers.append(nn.Linear(prev_dim, units))
          layers.append(nn.ReLU())
          layers.append(nn.Dropout(dropout_rate))
          prev_dim = units
      layers.append(nn.Linear(prev_dim, output_dim))
      layers.append(nn.Softmax(dim=1))

      self.model = nn.Sequential(*layers)
      self.sigmoid = nn.Sigmoid()

    def forward(self, x):
      x = self.model(x)
      x = self.sigmoid(x)
      return x

    def train_model(self, X_train, y_train, batch_size=32, epochs=10, lr=0.001):
      criterion = nn.CrossEntropyLoss()
      optimizer = optim.Adam(self.parameters(), lr=lr)

      X_train_tensor = torch.tensor(X_train, dtype=torch.float32)
      y_train_tensor = torch.tensor(y_train, dtype=torch.long)

      train_dataset = torch.utils.data.TensorDataset(X_train_tensor, y_train_tensor)
      train_loader = torch.utils.data.DataLoader(train_dataset, batch_size=batch_size, shuffle=True)

      for epoch in range(epochs):
        self.train()
        running_loss = 0.0
        for i, data in enumerate(train_loader, 0):
          inputs, labels = data
          optimizer.zero_grad()
          outputs = self(inputs)
          _, predicted = torch.max(outputs, 1)
          loss = criterion(predicted, labels)
          loss.backward()
          optimizer.step()
          running_loss += loss.item()

        print(f'Epoch {epoch + 1}/{epochs}, Loss: {running_loss / len(train_loader)}')

    def predict(self, X):
      self.eval()
      with torch.no_grad():
        X_tensor = torch.tensor(X, dtype=torch.float32)
        outputs = self(X_tensor)
        _, predicted = torch.max(outputs, 1)
      return predicted.numpy()

In [280]:
model = MFCC_NN(input_dim=X_train.shape[1], output_dim=3)
model.train_model(X_train, y_train, epochs=10)

Epoch 1/10, Loss: 0.9932601799352975
Epoch 2/10, Loss: 0.9522696561454447
Epoch 3/10, Loss: 0.893428182101837
Epoch 4/10, Loss: 0.8137438933950435
Epoch 5/10, Loss: 0.7990680043614886
Epoch 6/10, Loss: 0.7440440214147467
Epoch 7/10, Loss: 0.7276648837498956
Epoch 8/10, Loss: 0.7193072193753713
Epoch 9/10, Loss: 0.7089996109526046
Epoch 10/10, Loss: 0.7012024551072361


### Результаты

In [253]:
y_pred = model.predict(X_test)
f_score = f1_score(y_test, y_pred, average='weighted')
print("F1:", f_score)

F1: 0.4910878346170084
