<h2>Подготовка набора данных и функций<h2>

<h4>Импортируем библиотки<h4>

In [1]:
import pandas as pd
import numpy as np
from collections import Counter
from gensim.models import Word2Vec
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

<h3>Подготовка данных<h3>

<h4>Посмотрим какие файлы находятся в папке "data"<h4>

In [2]:
%ls data

[0m[01;32mp00_tweets.zip[0m*         [01;32mprocessedNeutral.csv[0m*
[01;32mprocessedNegative.csv[0m*  [01;32mprocessedPositive.csv[0m*


<h4>В качестве примера рассмотрим содержимое файла 'processedNegative.csv'<h4>

In [3]:
neg_df = pd.read_csv('data/processedNegative.csv')
neg_df.head(10)

Unnamed: 0,How unhappy some dogs like it though,talking to my over driver about where I'm goinghe said he'd love to go to New York too but since Trump it's probably not,Does anybody know if the Rand's likely to fall against the dollar? I got some money I need to change into R but it keeps getting stronger unhappy,I miss going to gigs in Liverpool unhappy,There isnt a new Riverdale tonight ? unhappy,it's that A*dy guy from pop Asia and then the translator so they'll probs go with them around Aus unhappy,Who's that chair you're sitting in? Is this how I find out. Everyone knows now. You've shamed me in pu,don't like how jittery caffeine makes me sad,My area's not on the list unhappy think I'll go LibDems anyway,I want fun plans this weekend unhappy,...,and yet if parents invest in child's emotional education by taking child out of school on holiday early that's un,YG should have sent them to MCD. I want to see them holding the trophy unhappy anyways .9,i want more orientation unhappy,unhappy they not,YG should have sent them to MCD. I want to see them holding the trophy unhappy anyways .10,wish knock out lang talaga for the new school year are good and cooperative groupmates please unhappy,i miss so much unhappy,Same unhappy .1,Hi instant message your friend friend lang,hindi close friend? unhappy


<h4>Видно, что tweets находятся все в одной строке, что является не удобным<h4>

In [4]:
new_neg_df = neg_df.T
new_neg_df.reset_index(inplace=True)
new_neg_df

Unnamed: 0,index
0,How unhappy some dogs like it though
1,talking to my over driver about where I'm goin...
2,Does anybody know if the Rand's likely to fall...
3,I miss going to gigs in Liverpool unhappy
4,There isnt a new Riverdale tonight ? unhappy
...,...
1112,wish knock out lang talaga for the new school ...
1113,i miss so much unhappy
1114,Same unhappy .1
1115,Hi instant message your friend friend lang


<h4>Посмотрим информацию об объектах набора данных<h4>

In [5]:
new_neg_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1117 entries, 0 to 1116
Data columns (total 1 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   index   1117 non-null   object
dtypes: object(1)
memory usage: 8.9+ KB


<h4>Информация показывает, что объектов Nan нет<h4>

<h4>Для дальнейшей работы методы этого проекта предполагают получение предложения. Создадим переменную text с содержанием всех строк набора данных<h4>

In [6]:
text = " ".join([tweet[0] for tweet in new_neg_df.values.tolist()])
text[:300]

"How unhappy  some dogs like it though talking to my over driver about where I'm goinghe said he'd love to go to New York too but since Trump it's probably not Does anybody know if the Rand's likely to fall against the dollar? I got some money  I need to change into R but it keeps getting stronger un"

<h3>Используемые функции<h3>

<h4>В дальнейшем для уменьшения размера текста кода будут использоваться функции, Первый этап включает перечисленную выше обработку данных, обработку данных метода и создание набора для обучения. Рассмотрим этапы создания наборов для обучения.
Будем предполагать, что получаем несколько словарей, где ключом будут слова, а значениями ключей количество этих слов в наборе данных.<h4>

<h4>Создадим словарь<h4>

In [7]:
words = Counter(text.split(' '))
words

Counter({'How': 3,
         'unhappy': 781,
         '': 1367,
         'some': 15,
         'dogs': 4,
         'like': 47,
         'it': 89,
         'though': 6,
         'talking': 12,
         'to': 304,
         'my': 132,
         'over': 14,
         'driver': 9,
         'about': 32,
         'where': 14,
         "I'm": 55,
         'goinghe': 9,
         'said': 14,
         "he'd": 10,
         'love': 38,
         'go': 24,
         'New': 9,
         'York': 9,
         'too': 33,
         'but': 102,
         'since': 22,
         'Trump': 9,
         "it's": 54,
         'probably': 10,
         'not': 57,
         'Does': 2,
         'anybody': 1,
         'know': 13,
         'if': 26,
         'the': 192,
         "Rand's": 1,
         'likely': 1,
         'fall': 3,
         'against': 1,
         'dollar?': 1,
         'I': 281,
         'got': 20,
         'money': 1,
         'need': 17,
         'change': 1,
         'into': 3,
         'R': 1,
         'keeps

<h4>Чтобы заранее создать таблицу, узнаем количество уникальных слов<h4>

In [8]:
unic_words = set(words.keys())
unic_words

{'',
 'plans',
 'okay.',
 'liked',
 "He's",
 'Braam.',
 'absorbed',
 'Man',
 'ones',
 'ask.',
 ')',
 'me.',
 'braces',
 'fixing',
 'dream',
 'WAITED',
 'housewives',
 'students',
 'BDAY',
 'parents',
 'eveand',
 'Those',
 'translator',
 'busy',
 'chopsuey',
 'flavorful',
 '//',
 'oppa!',
 'reminds',
 'Meredith',
 'MORE',
 'dye',
 '!!!!',
 'daianeryoufato',
 'gray',
 "i'm",
 'freebet',
 'loyal',
 'Studio',
 'Could',
 'At',
 'any',
 'salons',
 'valid?',
 'cold.',
 'italian',
 '+I',
 'phonecase',
 'bastard',
 'You',
 'blocked',
 'Trying',
 'that!',
 'mate',
 'haih',
 'Someone',
 'slight',
 'froze',
 'One',
 'names',
 'U2',
 'netherton',
 'Cutest',
 'question',
 'girl!',
 'ending',
 'completed',
 'today..',
 "child's",
 'sangat',
 'faves',
 'with',
 'numbers',
 'Tokyo',
 'goodbye',
 'maghapon',
 '(band)',
 'live',
 'seen',
 'other',
 'PRECIOUS',
 'scenes.',
 "Rand's",
 'strong',
 'fun',
 'sections',
 "They're",
 'enough',
 'isa',
 'tumhari.',
 'applicableg-iinot',
 'as',
 'grabe',
 'lazy',

<h4>В таблице для обучения понадобятся такие поля как:<h4>
<h4>Существование слова в словаре нигативном, нейтральном и позитивном<h4>
<h4>Сколько раз это слово встречается в наборе данных негативном, нейтральном и позитивном и общее количество раз<h4>
<h4>TFIDF для негативного, нейтрального и позитивного наборов<h4>

In [9]:
neg_exist_index = 0
neut_exist_index = 1
pos_exist_index = 2
neg_count_index = 3
neut_count_index = 4
pos_count_index = 5
word_count_index = 6
neg_tfidf_index = 7
neut_tfidf_index = 8
pos_tfidf_index = 9

neg_tokens = words
neut_tokens = dict()
pos_tokens = dict()
df = np.zeros((len(unic_words), 10))
for i, word in enumerate(unic_words):
    if word in neg_tokens.keys():
        df[i,neg_exist_index] = 1
        df[i,neg_count_index] = neg_tokens[word]
    if word in neut_tokens.keys():
        df[i,neut_exist_index] = 1
        df[i,neut_count_index] = neut_tokens[word]
    if word in pos_tokens.keys():
        df[i,pos_exist_index] = 1
        df[i,pos_count_index] = pos_tokens[word]

df[:,word_count_index] = df[:,neg_count_index] + df[:,neut_count_index] + df[:,pos_count_index]
df[:,neg_tfidf_index] = df[:,neg_count_index] / df[:,word_count_index]
df[:,neut_tfidf_index] = df[:,neut_count_index] / df[:,word_count_index]
df[:,pos_tfidf_index] = df[:,pos_count_index] / df[:,word_count_index]

df

array([[1., 0., 0., ..., 1., 0., 0.],
       [1., 0., 0., ..., 1., 0., 0.],
       [1., 0., 0., ..., 1., 0., 0.],
       ...,
       [1., 0., 0., ..., 1., 0., 0.],
       [1., 0., 0., ..., 1., 0., 0.],
       [1., 0., 0., ..., 1., 0., 0.]])

In [10]:
metod_df = pd.DataFrame(df, columns=[
    'Negative', 'Neutral', 'Positive',
    'Negative counts', 'Neutral counts', 'Positive counts', 'Word counts',
    'Negative TFIDF', 'Neutral TFIDF', 'Positive TFIDF'])
metod_df["word"] = words
metod_df

Unnamed: 0,Negative,Neutral,Positive,Negative counts,Neutral counts,Positive counts,Word counts,Negative TFIDF,Neutral TFIDF,Positive TFIDF,word
0,1.0,0.0,0.0,1367.0,0.0,0.0,1367.0,1.0,0.0,0.0,How
1,1.0,0.0,0.0,3.0,0.0,0.0,3.0,1.0,0.0,0.0,unhappy
2,1.0,0.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,
3,1.0,0.0,0.0,3.0,0.0,0.0,3.0,1.0,0.0,0.0,some
4,1.0,0.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,dogs
...,...,...,...,...,...,...,...,...,...,...,...
3062,1.0,0.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,talaga
3063,1.0,0.0,0.0,4.0,0.0,0.0,4.0,1.0,0.0,0.0,cooperative
3064,1.0,0.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,groupmates
3065,1.0,0.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,hindi


<h4>Функции будут иметь следующий вид:<h4>

In [13]:
def metod_file_to_df(file_name):
    neg_fn, neut_fn, pos_fn = file_name

    neg_df = pd.read_csv(neg_fn).T.reset_index()
    neut_df = pd.read_csv(neut_fn).T.reset_index()
    pos_df = pd.read_csv(pos_fn).T.reset_index()
    
    neg_text = " ".join([tweet[0] for tweet in neg_df.values.tolist()])
    neut_text = " ".join([tweet[0] for tweet in neut_df.values.tolist()])
    pos_text = " ".join([tweet[0] for tweet in pos_df.values.tolist()])

    neg_words = Counter("metod")
    neut_words = Counter("metod")
    pos_words = Counter("metod")
    
    unic_words = list(set(neg_words.keys()) | set(neut_words.keys()) | set(pos_words.keys()))

    neg_exist_index = 0
    neut_exist_index = 1
    pos_exist_index = 2
    neg_count_index = 3
    neut_count_index = 4
    pos_count_index = 5
    word_count_index = 6
    neg_tfidf_index = 7
    neut_tfidf_index = 8
    pos_tfidf_index = 9

    df = np.zeros((len(unic_words), 10))
    for i, word in enumerate(unic_words):
        if word in neg_words.keys():
            df[i,neg_exist_index] = 1
            df[i,neg_count_index] = neg_words[word]
        if word in neut_words.keys():
            df[i,neut_exist_index] = 1
            df[i,neut_count_index] = neut_words[word]
        if word in pos_words.keys():
            df[i,pos_exist_index] = 1
            df[i,pos_count_index] = pos_words[word]

    df[:,word_count_index] = df[:,neg_count_index] + df[:,neut_count_index] + df[:,pos_count_index]
    df[:,neg_tfidf_index] = df[:,neg_count_index] / df[:,word_count_index]
    df[:,neut_tfidf_index] = df[:,neut_count_index] / df[:,word_count_index]
    df[:,pos_tfidf_index] = df[:,pos_count_index] / df[:,word_count_index]

    metod_df = pd.DataFrame(df, columns=[
        'Negative', 'Neutral', 'Positive',
        'Negative counts', 'Neutral counts', 'Positive counts', 'Word counts',
        'Negative TFIDF', 'Neutral TFIDF', 'Positive TFIDF'])
    metod_df["word"] = unic_words
    return metod_df


<h3>Параметры для моделей<h3>

<h4>Преобразуем слова в векторы<h4>

In [11]:
w2v = Word2Vec(sentences=[unic_words], min_count=1)
w2v.wv.vector_size

100

<h4>К сожелению, векторизация слов из набора данных занимает много времени, поэтому в дальнейшем будем использовать множество уникальных слов<h4>

<h4>Создадим входной параметр<h4>

In [12]:
X = np.zeros((len(metod_df.word), w2v.wv.vector_size))
for i, word in enumerate(metod_df.word):
    X[i,:w2v.wv.vector_size] = w2v.wv[word]
X

array([[ 1.14395749e-04, -4.51956381e-04, -9.80680343e-03, ...,
        -3.42480652e-03, -2.13523838e-03, -8.05156026e-03],
       [-4.07405617e-03, -2.87406310e-03, -9.01711232e-04, ...,
         9.88634955e-03, -7.33754411e-03,  7.13394862e-03],
       [ 3.48888640e-03, -2.25079129e-03,  7.54582090e-03, ...,
        -8.02841224e-03, -3.21169826e-03, -6.48772018e-03],
       ...,
       [-9.56930034e-03, -2.91930302e-03, -2.54906132e-03, ...,
        -3.61720286e-03, -3.46823712e-04, -3.11590661e-03],
       [-6.60704263e-03, -3.79024306e-03, -7.24422978e-03, ...,
        -5.63160796e-03, -9.69625078e-03, -4.85439785e-04],
       [ 4.24624095e-03,  1.70569832e-03, -4.52962331e-03, ...,
         4.29505436e-03,  8.74073157e-05, -4.25187312e-03]])

<h4>Создадим выходной параметр<h4>

In [13]:
Y = (-metod_df['Negative'] + metod_df['Positive']) * 0.5 **metod_df['Neutral']
Y

0      -1.0
1      -1.0
2      -1.0
3      -1.0
4      -1.0
       ... 
3062   -1.0
3063   -1.0
3064   -1.0
3065   -1.0
3066   -1.0
Length: 3067, dtype: float64

<h4>Разделим наши праметры на тренировочные и тестовые<h4>

In [14]:
train_X, test_X, train_Y, test_Y = train_test_split(
    X, Y, test_size=0.2, random_state=42)
train_X.shape[0] / X.shape[0], test_X.shape[0] / X.shape[0]

(0.7998043690903163, 0.20019563090968373)

<h4>Найдем модель с наилучшим гиперпараметром<h4>

In [15]:
clf = LogisticRegression(solver='saga')
train_Y[2000] = 0
param_grid = {
    'C': np.arange(1, 5)
}

search = GridSearchCV(clf, param_grid, n_jobs=-1, cv=5, refit=True, scoring='accuracy')

search.fit(train_X, train_Y)
search.best_params_

4 fits failed out of a total of 20.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
4 fits failed with the following error:
Traceback (most recent call last):
  File "/home/val/.local/lib/python3.8/site-packages/sklearn/model_selection/_validation.py", line 680, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/home/val/.local/lib/python3.8/site-packages/sklearn/linear_model/_logistic.py", line 1554, in fit
    raise ValueError(
ValueError: This solver needs samples of at least 2 classes in the data, but the data contains only one class: -1.0



{'C': 1}

<h4>Обучим модель и узнаем её точность на тестовой выборке<h4>

In [16]:
clf = LogisticRegression(C=search.best_params_['C'])
clf.fit(train_X, train_Y)
pred_Y = clf.predict(test_X)
accuracy_score(test_Y, pred_Y)

1.0

<h4>Объединим все действия в одну функцию<h4>

In [None]:
def model_selection_exist_word(metod_df, unic_words):
    
    w2v = Word2Vec(sentences=[unic_words], min_count=1)

    X = np.zeros((len(metod_df.word), w2v.wv.vector_size))
    for i, word in enumerate(metod_df.word):
        X[i,:w2v.wv.vector_size] = w2v.wv[word]
    Y = pd.cut((-metod_df['Negative'] + metod_df['Positive']) * 0.5 **metod_df['Neutral'], bins=[-2, -0.33, 0.33,2], labels=[-1, 0, 1])
    train_X, test_X, train_Y, test_Y = train_test_split(
        X, Y, test_size=0.2, random_state=42)

    clf = LogisticRegression(solver='saga')
    param_grid = {
        'C': np.arange(1, 5)
    }

    search = GridSearchCV(clf, param_grid, n_jobs=-1, cv=5, refit=True, scoring='accuracy')
    search.fit(train_X, train_Y)
    clf = LogisticRegression(C=search.best_params_['C'])
    clf.fit(train_X, train_Y)
    pred_Y = clf.predict(test_X)
    return accuracy_score(test_Y, pred_Y)

def model_selection_count_word(metod_df, unic_words):
    
    w2v = Word2Vec(sentences=[unic_words], min_count=1)

    X = np.zeros((len(metod_df.word), w2v.wv.vector_size + 3))
    for i, word in enumerate(metod_df.word):
        X[i,:w2v.wv.vector_size] = w2v.wv[word]
    X[:,w2v.wv.vector_size] = metod_df['Negative counts']
    X[:,w2v.wv.vector_size + 1] = metod_df['Neutral counts']
    X[:,w2v.wv.vector_size + 2] = metod_df['Positive counts']
    Y = pd.cut((-metod_df['Negative'] + metod_df['Positive']) * 0.5 **metod_df['Neutral'], bins=[-2, -0.33, 0.33,2], labels=[-1, 0, 1])
    train_X, test_X, train_Y, test_Y = train_test_split(
        X, Y, test_size=0.2, random_state=42)

    clf = LogisticRegression(solver='saga')
    param_grid = {
        'C': np.arange(1, 5)
    }

    search = GridSearchCV(clf, param_grid, n_jobs=-1, cv=5, refit=True, scoring='accuracy')
    search.fit(train_X, train_Y)
    clf = LogisticRegression(C=search.best_params_['C'])
    clf.fit(train_X, train_Y)
    pred_Y = clf.predict(test_X)
    return accuracy_score(test_Y, pred_Y)

def model_selection_tfidf(metod_df, unic_words):
    
    w2v = Word2Vec(sentences=[unic_words], min_count=1)

    X = np.zeros((len(metod_df.word), w2v.wv.vector_size))
    for i, word in enumerate(metod_df.word):
        X[i,:w2v.wv.vector_size] = w2v.wv[word]
    Y = pd.cut((-metod_df['Negative TFIDF'] + metod_df['Positive TFIDF']) * 0.5 **metod_df['Neutral TFIDF'], bins=[-2, -0.33, 0.33,2], labels=[-1, 0, 1])
    train_X, test_X, train_Y, test_Y = train_test_split(
        X, Y, test_size=0.2, random_state=42)

    clf = LogisticRegression(solver='saga')
    param_grid = {
        'C': np.arange(1, 5)
    }

    search = GridSearchCV(clf, param_grid, n_jobs=-1, cv=5, refit=True, scoring='accuracy')
    search.fit(train_X, train_Y)
    clf = LogisticRegression(C=search.best_params_['C'])
    clf.fit(train_X, train_Y)
    pred_Y = clf.predict(test_X)
    return accuracy_score(test_Y, pred_Y)