In [177]:
import numpy as np
import pandas as pd
import seaborn as sns
import nltk
from typing import Dict, Tuple
from nltk.stem.porter import PorterStemmer
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer 
from nltk.corpus import stopwords
from sklearn.metrics import accuracy_score, balanced_accuracy_score, precision_score
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.naive_bayes import GaussianNB, MultinomialNB, ComplementNB, BernoulliNB
import matplotlib.pyplot as plt
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
%matplotlib inline 
sns.set(style="ticks")

In [6]:
data = pd.read_csv('Tweets.csv', sep=",")
data = data[['airline_sentiment', 'text']]
data.head()

Unnamed: 0,airline_sentiment,text
0,neutral,@VirginAmerica What @dhepburn said.
1,positive,@VirginAmerica plus you've added commercials t...
2,neutral,@VirginAmerica I didn't today... Must mean I n...
3,negative,@VirginAmerica it's really aggressive to blast...
4,negative,@VirginAmerica and it's a really big bad thing...


In [121]:
# Сформируем общий словарь для обучения моделей из обучающей и тестовой выборки
vocab_list = data['text'].tolist()
vocab_list[0:10]

['@VirginAmerica What @dhepburn said.',
 "@VirginAmerica plus you've added commercials to the experience... tacky.",
 "@VirginAmerica I didn't today... Must mean I need to take another trip!",
 '@VirginAmerica it\'s really aggressive to blast obnoxious "entertainment" in your guests\' faces &amp; they have little recourse',
 "@VirginAmerica and it's a really big bad thing about it",
 "@VirginAmerica seriously would pay $30 a flight for seats that didn't have this playing.\nit's really the only bad thing about flying VA",
 '@VirginAmerica yes, nearly every time I fly VX this “ear worm” won’t go away :)',
 '@VirginAmerica Really missed a prime opportunity for Men Without Hats parody, there. https://t.co/mWpG7grEZP',
 "@virginamerica Well, I didn't…but NOW I DO! :-D",
 "@VirginAmerica it was amazing, and arrived an hour early. You're too good to me."]

In [80]:
vocabVect = CountVectorizer()
vocabVect.fit(vocab_list)
corpusVocab = vocabVect.vocabulary_
print('Количество сформированных признаков - {}'.format(len(corpusVocab)))

Количество сформированных признаков - 15051


In [119]:
#лемматизация
vocab_list1 = []
lemmatizer = WordNetLemmatizer()
for i in range(len(vocab_list)):
    word_list = word_tokenize(vocab_list[i])
    output = ' '.join([lemmatizer.lemmatize(w) for w in word_list])
    vocab_list1.append(output)
print(vocab_list1[0:10])

['@ VirginAmerica What @ dhepburn said .', "@ VirginAmerica plus you 've added commercial to the experience ... tacky .", "@ VirginAmerica I did n't today ... Must mean I need to take another trip !", "@ VirginAmerica it 's really aggressive to blast obnoxious `` entertainment '' in your guest ' face & amp ; they have little recourse", "@ VirginAmerica and it 's a really big bad thing about it", "@ VirginAmerica seriously would pay $ 30 a flight for seat that did n't have this playing . it 's really the only bad thing about flying VA", '@ VirginAmerica yes , nearly every time I fly VX this “ ear worm ” won ’ t go away : )', '@ VirginAmerica Really missed a prime opportunity for Men Without Hats parody , there . http : //t.co/mWpG7grEZP', "@ virginamerica Well , I didn't…but NOW I DO ! : -D", "@ VirginAmerica it wa amazing , and arrived an hour early . You 're too good to me ."]


In [81]:
vocabVect = CountVectorizer()
vocabVect.fit(vocab_list1)
corpusVocab = vocabVect.vocabulary_
print('Количество сформированных признаков после лемматизации- {}'.format(len(corpusVocab)))

Количество сформированных признаков после лемматизации- 14423


In [118]:
#стемминг
vocab_list2 = []
ps = PorterStemmer() 
for i in range(len(vocab_list)):
    word_list = word_tokenize(vocab_list[i])
    output = ' '.join([ps.stem(w) for w in word_list])
    vocab_list2.append(output)
print(vocab_list2[0:10])

['@ virginamerica what @ dhepburn said .', "@ virginamerica plu you 've ad commerci to the experi ... tacki .", "@ virginamerica I did n't today ... must mean I need to take anoth trip !", "@ virginamerica it 's realli aggress to blast obnoxi `` entertain '' in your guest ' face & amp ; they have littl recours", "@ virginamerica and it 's a realli big bad thing about it", "@ virginamerica serious would pay $ 30 a flight for seat that did n't have thi play . it 's realli the onli bad thing about fli VA", '@ virginamerica ye , nearli everi time I fli VX thi “ ear worm ” won ’ t go away : )', '@ virginamerica realli miss a prime opportun for men without hat parodi , there . http : //t.co/mwpg7grezp', "@ virginamerica well , I didn't…but now I DO ! : -D", "@ virginamerica it wa amaz , and arriv an hour earli . you 're too good to me ."]


In [83]:
vocabVect = CountVectorizer()
vocabVect.fit(vocab_list2)
corpusVocab = vocabVect.vocabulary_
print('Количество сформированных признаков после стемминга- {}'.format(len(corpusVocab)))

Количество сформированных признаков после стемминга- 12575


In [147]:
#удаление стоп-слов
stop_words = set(stopwords.words('english'))
vocab_list3 = []
for i in range(len(vocab_list2)):
    word_list = word_tokenize(vocab_list2[i].lower())
    output = ' '.join([word for word in word_list if word not in stop_words])
    vocab_list3.append(output) 
vocabVect = CountVectorizer()
vocabVect.fit(vocab_list3)
corpusVocab = vocabVect.vocabulary_
print('Количество сформированных признаков после стемминга и удаления стоп-слов- {}'.format(len(corpusVocab)))

Количество сформированных признаков после стемминга и удаления стоп-слов- 12526


# Решение задачи анализа тональности

In [150]:
X_train, X_test, y_train, y_test = train_test_split(data['text'], data['airline_sentiment'], test_size=0.5, random_state=1)

In [171]:
def accuracy_score_for_classes(
    y_true: np.ndarray, 
    y_pred: np.ndarray) -> Dict[int, float]:
    """
    Вычисление метрики accuracy для каждого класса
    y_true - истинные значения классов
    y_pred - предсказанные значения классов
    Возвращает словарь: ключ - метка класса, 
    значение - Accuracy для данного класса
    """
    # Для удобства фильтрации сформируем Pandas DataFrame 
    d = {'t': y_true, 'p': y_pred}
    df = pd.DataFrame(data=d)
    # Метки классов
    classes = np.unique(y_true)
    # Результирующий словарь
    res = dict()
    # Перебор меток классов
    for c in classes:
        # отфильтруем данные, которые соответствуют 
        # текущей метке класса в истинных значениях
        temp_data_flt = df[df['t']==c]
        # расчет accuracy для заданной метки класса
        temp_acc = accuracy_score(
            temp_data_flt['t'].values, 
            temp_data_flt['p'].values)
        # сохранение результата в словарь
        res[c] = temp_acc
    return res

def print_accuracy_score_for_classes(
    y_true: np.ndarray, 
    y_pred: np.ndarray):
    """
    Вывод метрики accuracy для каждого класса
    """
    accs = accuracy_score_for_classes(y_true, y_pred)
    if len(accs)>0:
        print('Метка \t\t Accuracy')
    for i in accs:
        print('{} \t {}'.format(i, accs[i]))

In [172]:
def sentiment(v, c):
    model = Pipeline(
        [("vectorizer", v), 
         ("classifier", c)])
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    print_accuracy_score_for_classes(y_test, y_pred)

In [176]:
sentiment(CountVectorizer(), LogisticRegression(C=5.0))

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Метка 		 Accuracy
negative 	 0.8730744196137991
neutral 	 0.5811855670103093
positive 	 0.6591889559965487


In [181]:
sentiment(CountVectorizer(), MultinomialNB())

Метка 		 Accuracy
negative 	 0.9687567802126275
neutral 	 0.3382731958762887
positive 	 0.4823123382226057


In [178]:
sentiment(CountVectorizer(), ComplementNB())

Метка 		 Accuracy
negative 	 0.9264482534172271
neutral 	 0.42783505154639173
positive 	 0.6471095772217429


In [180]:
sentiment(CountVectorizer(), BernoulliNB())

Метка 		 Accuracy
negative 	 0.9681058798003905
neutral 	 0.3627577319587629
positive 	 0.4089732528041415


In [183]:
#Все методы довольно точно определяют негативные комментарии, однако определение нейтральных и позитивных проблематично.
#Однозначно лидирующего метода не выявлено
#Наилучшие результаты показали сочетание CountVectorizer + CNB, а также CountVectorizer + LogisticRegression