## Импорт библиотек

In [1]:
!pip install pymorphy2

Collecting pymorphy2
  Downloading pymorphy2-0.9.1-py3-none-any.whl (55 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/55.5 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m55.5/55.5 kB[0m [31m2.4 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting dawg-python>=0.7.1 (from pymorphy2)
  Downloading DAWG_Python-0.7.2-py2.py3-none-any.whl (11 kB)
Collecting pymorphy2-dicts-ru<3.0,>=2.4 (from pymorphy2)
  Downloading pymorphy2_dicts_ru-2.4.417127.4579844-py2.py3-none-any.whl (8.2 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m8.2/8.2 MB[0m [31m55.6 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting docopt>=0.6 (from pymorphy2)
  Downloading docopt-0.6.2.tar.gz (25 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: docopt
  Building wheel for docopt (setup.py) ... [?25l[?25hdone
  Created wheel for docopt: filename=docopt-0.6.2-py2.py3-none-any.whl

In [2]:
import pandas as pd
import numpy as np
import librosa
import matplotlib.pyplot as plt
import re
import pymorphy2
import nltk
from nltk.stem import WordNetLemmatizer
nltk.download("wordnet")
nltk.download("omw-1.4")
nltk.download('stopwords')
nltk.download('punkt')
from nltk.tokenize import word_tokenize
from string import punctuation
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score
from sklearn.neighbors import KNeighborsClassifier

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


## Импорт данных

In [3]:
# Load the annotation table from the working directory.
csv_data = pd.read_csv(filepath_or_buffer='Data_ASR_2.csv')

In [4]:
def calc_sentiment_label(row):
  """Collapse the row's 'sentiment' score into a three-way class label.

  Args:
    row: mapping-like object (e.g. a DataFrame row) with a 'sentiment' entry.

  Returns:
    1 for a positive score, -1 for a negative score, 0 otherwise.
  """
  score = row['sentiment']
  # Each comparison contributes 0 or 1, so the difference is the sign of the
  # score; a value that is neither above nor below zero ends up as 0.
  return int(score > 0) - int(score < 0)

# Label every row; axis='columns' hands each row to the function as one record.
csv_data['sentiment_label'] = csv_data.apply(calc_sentiment_label, axis='columns')

In [5]:
# Fixed seed so the preview shows the same three rows on every run.
csv_data.sample(3, random_state=42)

Unnamed: 0,video,start_time,end_time,sentiment,happy,sad,anger,surprise,disgust,fear,text,ASR,sentiment_label
6956,3wHE78v9zr4,66.586168,81.412472,0.333333,0.0,0.333333,2.0,0.0,0.666667,0.0,We want them to transition their business mod...,we want them to transition their business mod...,1
4667,262165,139.386,141.586,-0.666667,0.333333,0.333333,0.333333,0.0,0.333333,0.0,And I think they just shot his coverage separ...,i think they just shot his coverage separately,-1
2990,225343,88.7855,90.522,-0.666667,0.333333,0.0,0.0,0.0,0.333333,0.0,There's nothing to it,there's nothing to it,-1


In [None]:
!unzip Audio.zip

## Текстовые данные

### Препроцессинг

In [6]:
# Values of the 'text' column as a plain Python list; show the first entry.
text_data = csv_data['text'].to_list()
text_data[0]

'I see that a writer is somebody who has an incredible command of mechanics of the English language.'

In [7]:
# Built once when the cell runs instead of on every call: reloading the
# stop-word corpus and constructing a lemmatizer per string is pure overhead,
# and set membership is cheaper than scanning a list for every token.
_PUNCTUATION_CHARS = frozenset(punctuation)
_ENGLISH_STOP_WORDS = frozenset(nltk.corpus.stopwords.words('english'))
_LEMMATIZER = WordNetLemmatizer()

def preprocess(string):
  """Normalize one English text for the vectorizers used in this notebook.

  Steps, in order: tokenize with ``word_tokenize``, drop punctuation tokens,
  lowercase, drop English stop words, lemmatize every remaining token as a
  verb (``pos="v"``).

  Args:
    string: raw text to normalize.

  Returns:
    The remaining lemmas joined by single spaces (possibly an empty string).
  """
  tokens = word_tokenize(string)

  # Drop tokens that consist only of punctuation characters. Testing every
  # character, rather than looking the whole token up among the single
  # punctuation characters, also removes tokens such as '...' or '--'.
  tokens = [tok for tok in tokens
            if not all(ch in _PUNCTUATION_CHARS for ch in tok)]

  # Lowercase before the stop-word check so capitalized stop words match.
  tokens = [tok.lower() for tok in tokens]
  tokens = [tok for tok in tokens if tok not in _ENGLISH_STOP_WORDS]

  lemmas = [_LEMMATIZER.lemmatize(tok, pos="v") for tok in tokens]

  return ' '.join(lemmas)

In [8]:
# Normalize every text; show the first result next to the raw example above.
preprocessed_text_data = list(map(preprocess, text_data))
preprocessed_text_data[0]

'see writer somebody incredible command mechanics english language'

### Извлечение признаков

#### Bag of words

In [9]:
# Term-count features with the vocabulary capped at 10000 entries, converted
# to a dense array.
# NOTE(review): the vocabulary is fitted on the whole corpus before the
# train/test split, so held-out texts influence the feature space — consider
# fitting on the training part only.
vectorizer = CountVectorizer(max_features=10000)
bow_X = (
    vectorizer
    .fit_transform(np.asarray(preprocessed_text_data))
    .toarray()
)

#### TF-IDF

In [11]:
# TF-IDF features with the vocabulary capped at 10000 entries, converted to a
# dense array.
# NOTE(review): this rebinds `vectorizer`, so the fitted CountVectorizer from
# the bag-of-words cell is no longer reachable; the weights are also fitted on
# the whole corpus before the train/test split.
vectorizer = TfidfVectorizer(max_features=10000)
tf_idf_X = (
    vectorizer
    .fit_transform(np.asarray(preprocessed_text_data))
    .toarray()
)

### Подготовка к обучению

In [16]:
def calculate_metrics(y_test, y_pred):
  """Score predictions against the reference labels.

  Args:
    y_test: reference labels.
    y_pred: predicted labels, aligned with ``y_test``.

  Returns:
    A two-element list: ``[accuracy, weighted-average F1]``.
  """
  accuracy = accuracy_score(y_test, y_pred)
  weighted_f1 = f1_score(y_test, y_pred, average='weighted')
  return [accuracy, weighted_f1]

In [None]:
# Target vector shared by all feature sets: the three-way sentiment label.
y = csv_data['sentiment_label'].to_list()

In [14]:
# Hold out 20% of the bag-of-words rows; the fixed seed keeps the split repeatable.
bow_X_train, bow_X_test, bow_y_train, bow_y_test = train_test_split(
    bow_X,
    y,
    test_size=0.2,
    random_state=42,
)

In [15]:
# Hold out 20% of the TF-IDF rows; the fixed seed keeps the split repeatable.
tf_idf_X_train, tf_idf_X_test, tf_idf_y_train, tf_idf_y_test = train_test_split(
    tf_idf_X,
    y,
    test_size=0.2,
    random_state=42,
)

### Обучение классификатора для BoW

In [17]:
# Column layout of the results tables, and an empty table for the
# bag-of-words scores.
metrics_names = ['Классификатор', 'Accuracy', 'F-score']
bow_results = pd.DataFrame(data=None, columns=metrics_names)

In [18]:
# k-NN with default settings on the bag-of-words split; the scores are stored
# as row 0 of the results table.
k_neigh = KNeighborsClassifier()
k_neigh.fit(bow_X_train, bow_y_train)
y_pred = k_neigh.predict(bow_X_test)
bow_results.loc[0] = ['K ближайших соседей', *calculate_metrics(bow_y_test, y_pred)]

### Обучение классификатора для TF-IDF

In [19]:
# Empty results table for the TF-IDF scores.
tf_idf_results = pd.DataFrame(data=None, columns=metrics_names)

In [20]:
# k-NN with default settings on the TF-IDF split; the scores are stored as
# row 0 of the results table.
k_neigh = KNeighborsClassifier()
k_neigh.fit(tf_idf_X_train, tf_idf_y_train)
y_pred = k_neigh.predict(tf_idf_X_test)
tf_idf_results.loc[0] = ['K ближайших соседей', *calculate_metrics(tf_idf_y_test, y_pred)]

### Результаты

In [21]:
# Accuracy and weighted F-score of the k-NN run on bag-of-words features.
bow_results

Unnamed: 0,Классификатор,Accuracy,F-score
0,K ближайших соседей,0.471779,0.41143


In [22]:
# Accuracy and weighted F-score of the k-NN run on TF-IDF features.
tf_idf_results

Unnamed: 0,Классификатор,Accuracy,F-score
0,K ближайших соседей,0.465031,0.33317


## Аудио данные

In [64]:
# Per-clip audio reference: file stem plus the clip's start and end times.
filenames = csv_data.loc[:, ['video', 'start_time', 'end_time']]
filenames.head(3)

Unnamed: 0,video,start_time,end_time
0,--qXJuDtHPw,23.199,30.325
1,-3g5yACwYnA,82.7645,100.555
2,-3g5yACwYnA,119.919,125.299


### Извлечение признаков

#### Спектрограмма

In [84]:
def calc_spec(row, audio_dir='/content/WAV_16000/'):
  """Compute the dB-scaled magnitude spectrogram of one annotated clip.

  Args:
    row: mapping-like object with 'video' (file stem), 'start_time' and
      'end_time'; the start time and the difference between the two are
      passed to ``librosa.load`` as ``offset`` and ``duration``.
    audio_dir: directory containing ``<video>.wav``. Defaults to the
      location this notebook originally hard-coded.

  Returns:
    ``librosa.amplitude_to_db`` of the STFT magnitude of the clip, with the
    clip's own maximum as the reference level.
  """
  import os  # local import keeps this cell independent of the import cell

  start_time = row['start_time']
  clip_length = row['end_time'] - start_time

  # str() keeps path building working when a numeric-looking id was parsed
  # as a number rather than as text.
  path = os.path.join(audio_dir, str(row['video']) + '.wav')

  # sr=None: keep the sampling rate stored in the file.
  audio, _ = librosa.load(path, offset=start_time, duration=clip_length, sr=None)
  magnitude = np.abs(librosa.stft(audio))
  return librosa.amplitude_to_db(magnitude, ref=np.max)

In [67]:
# One spectrogram per clip, in the row order of `filenames`.
spec_X = [calc_spec(clip) for _, clip in filenames.iterrows()]

spec_X[0]

array([[-34.503677, -36.815025, -49.970882, ..., -80.      , -71.728226,
        -70.23065 ],
       [-36.01568 , -35.914936, -40.584583, ..., -66.42352 , -65.89055 ,
        -66.74345 ],
       [-39.717934, -48.00886 , -39.195263, ..., -62.933533, -63.663197,
        -75.9989  ],
       ...,
       [-68.988335, -70.998116, -70.16452 , ..., -80.      , -80.      ,
        -80.      ],
       [-69.47138 , -65.11388 , -65.868484, ..., -80.      , -80.      ,
        -80.      ],
       [-75.390976, -68.05876 , -64.29671 , ..., -80.      , -80.      ,
        -80.      ]], dtype=float32)

#### MFCC

In [83]:
def calc_mfcc(row, audio_dir='/content/WAV_16000/'):
  """Compute the MFCC matrix of one annotated clip.

  Args:
    row: mapping-like object with 'video' (file stem), 'start_time' and
      'end_time'; the start time and the difference between the two are
      passed to ``librosa.load`` as ``offset`` and ``duration``.
    audio_dir: directory containing ``<video>.wav``. Defaults to the
      location this notebook originally hard-coded.

  Returns:
    The result of ``librosa.feature.mfcc`` for the clip, computed with the
    library's default settings at the file's own sampling rate.
  """
  import os  # local import keeps this cell independent of the import cell

  start_time = row['start_time']
  clip_length = row['end_time'] - start_time

  # str() keeps path building working when a numeric-looking id was parsed
  # as a number rather than as text.
  path = os.path.join(audio_dir, str(row['video']) + '.wav')

  # sr=None: keep the sampling rate stored in the file and pass it on.
  audio, sr = librosa.load(path, offset=start_time, duration=clip_length, sr=None)
  return librosa.feature.mfcc(y=audio, sr=sr)

In [72]:
# One MFCC matrix per clip, in the row order of `filenames`.
mfcc_X = [calc_mfcc(clip) for _, clip in filenames.iterrows()]

mfcc_X[0]

array([[-1.20045830e+02, -1.04120605e+02, -1.28849686e+02, ...,
        -3.57933411e+02, -3.43238800e+02, -3.51277069e+02],
       [ 1.32527344e+02,  1.24843773e+02,  1.27610634e+02, ...,
         1.01138458e+02,  1.04319824e+02,  1.07357849e+02],
       [-5.78380775e+01, -5.33083420e+01, -4.42336960e+01, ...,
        -1.25672121e+01, -2.09349585e+00,  8.83938909e-01],
       ...,
       [-3.19155407e+00, -1.74299860e+00, -1.94362533e+00, ...,
         4.65432835e+00,  4.91457367e+00,  5.03732920e+00],
       [-3.77361774e-02, -3.49693203e+00, -2.03262615e+00, ...,
        -3.99641252e+00, -1.13422174e+01, -6.83568001e+00],
       [ 7.09985161e+00,  8.43449402e+00,  5.30378723e+00, ...,
        -5.76650810e+00, -1.07871656e+01, -6.98580456e+00]], dtype=float32)

### Подготовка к обучению

In [73]:
# Hold out 20% of the spectrograms; the fixed seed keeps the split repeatable.
spec_X_train, spec_X_test, spec_y_train, spec_y_test = train_test_split(
    spec_X,
    y,
    test_size=0.2,
    random_state=42,
)

In [74]:
# Hold out 20% of the MFCC matrices; the fixed seed keeps the split repeatable.
mfcc_X_train, mfcc_X_test, mfcc_y_train, mfcc_y_test = train_test_split(
    mfcc_X,
    y,
    test_size=0.2,
    random_state=42,
)

### Обучение классификатора для спектрограмм

In [75]:
# Empty results table for the spectrogram scores.
spec_results = pd.DataFrame(data=None, columns=metrics_names)

In [85]:
# Each spectrogram is a 2-D array whose second axis (frames) grows with the
# clip duration, so the raw list cannot be fitted directly: the classifier
# needs one fixed-length vector per sample. Averaging over the frame axis
# leaves one value per frequency bin for every clip.
# NOTE(review): the time-average is one possible summary; confirm it matches
# the feature reduction used for the stored scores.
spec_train_features = np.stack([np.mean(clip, axis=1) for clip in spec_X_train])
spec_test_features = np.stack([np.mean(clip, axis=1) for clip in spec_X_test])

k_neigh = KNeighborsClassifier().fit(spec_train_features, spec_y_train)
y_pred = k_neigh.predict(spec_test_features)
spec_results.loc[0] = ['K ближайших соседей'] + calculate_metrics(spec_y_test, y_pred)

### Обучение классификатора для MFCC

In [86]:
# Empty results table for the MFCC scores.
mfcc_results = pd.DataFrame(data=None, columns=metrics_names)

In [88]:
# Each MFCC matrix is a 2-D array whose second axis (frames) grows with the
# clip duration, so the raw list cannot be fitted directly: the classifier
# needs one fixed-length vector per sample. Averaging over the frame axis
# leaves one value per coefficient for every clip.
# NOTE(review): the time-average is one possible summary; confirm it matches
# the feature reduction used for the stored scores.
mfcc_train_features = np.stack([np.mean(clip, axis=1) for clip in mfcc_X_train])
mfcc_test_features = np.stack([np.mean(clip, axis=1) for clip in mfcc_X_test])

k_neigh = KNeighborsClassifier().fit(mfcc_train_features, mfcc_y_train)
y_pred = k_neigh.predict(mfcc_test_features)
mfcc_results.loc[0] = ['K ближайших соседей'] + calculate_metrics(mfcc_y_test, y_pred)

### Результаты

In [93]:
# Accuracy and weighted F-score of the k-NN run on spectrogram features.
spec_results

Unnamed: 0,Классификатор,Accuracy,F-score
0,K ближайших соседей,0.39558,0.345852


In [96]:
# Accuracy and weighted F-score of the k-NN run on MFCC features.
mfcc_results

Unnamed: 0,Классификатор,Accuracy,F-score
0,K ближайших соседей,0.451012,0.417364
