<h2>Лемматизация с удалением странных токенов<h2>

<h4>Импортируем библиотеки<h4>

In [2]:
import pandas as pd
import numpy as np
import nltk
from nltk.corpus import stopwords
from nltk.corpus import words
from collections import Counter
import spacy
from model_selction import model_selection_word_count, model_selection_word_exist, model_selection_tfidf

In [2]:
%ls data

[0m[01;32mp00_tweets.zip[0m*         [01;32mprocessedNeutral.csv[0m*
[01;32mprocessedNegative.csv[0m*  [01;32mprocessedPositive.csv[0m*


<h4>В качестве примера рассмотрим содержимое файла 'processedNegative.csv' после применения метода<h4>

In [3]:
neg_df = pd.read_csv('data/processedNegative.csv').T.reset_index()
neg_text = " ".join([tweet[0] for tweet in neg_df.values.tolist()])
correct_words = words.words()
nlp = spacy.load('en_core_web_sm', disable=['parser', 'ner'])
neg_lemms = [word.lemma_ for word in nlp(neg_text) if (not word.lemma_ in stopwords.words('english')) and (word.lemma_ in correct_words)]

neg_lemms

['unhappy',
 'dog',
 'like',
 'though',
 'talk',
 'driver',
 'I',
 'say',
 'would',
 'love',
 'go',
 'since',
 'probably',
 'anybody',
 'know',
 'Rand',
 'likely',
 'fall',
 'dollar',
 'I',
 'get',
 'money',
 'I',
 'need',
 'change',
 'r',
 'keep',
 'get',
 'strong',
 'unhappy',
 'I',
 'miss',
 'go',
 'gig',
 'unhappy',
 'new',
 'tonight',
 'unhappy',
 'guy',
 'pop',
 'translator',
 'prob',
 'go',
 'around',
 'Aus',
 'unhappy',
 'chair',
 'sit',
 'I',
 'find',
 'everyone',
 'know',
 'shame',
 'I',
 'pu',
 'like',
 'jittery',
 'caffeine',
 'make',
 'I',
 'sad',
 'area',
 'list',
 'unhappy',
 'think',
 'I',
 'go',
 'anyway',
 'I',
 'want',
 'fun',
 'plan',
 'weekend',
 'unhappy',
 'notice',
 'I',
 'unhappy',
 'recognize',
 'show',
 'bad',
 'track',
 'record',
 'get',
 'cancel',
 'unhappy',
 'dude',
 'go',
 'unhappy',
 'ask',
 'league',
 'check',
 'guy',
 'go',
 'sad',
 'would',
 'go',
 'prison',
 'unhappy',
 'miss',
 'cry',
 'area',
 'depend',
 'promote',
 'waste',
 'team',
 'I',
 'think

<h4>Функция, которая создасть набор данных для обучения моделей<h4>

In [4]:
def lemma_file_to_df(file_name):
    neg_fn, neut_fn, pos_fn = file_name

    neg_df = pd.read_csv(neg_fn).T.reset_index()
    neut_df = pd.read_csv(neut_fn).T.reset_index()
    pos_df = pd.read_csv(pos_fn).T.reset_index()
    
    neg_text = " ".join([tweet[0] for tweet in neg_df.values.tolist()])
    neut_text = " ".join([tweet[0] for tweet in neut_df.values.tolist()])
    pos_text = " ".join([tweet[0] for tweet in pos_df.values.tolist()])

    nlp = spacy.load('en_core_web_sm', disable=['parser', 'ner'])
    
    neg_words = Counter([word.lemma_ for word in nlp(neg_text)
            if (not word.lemma_ in stopwords.words('english')) and (word.lemma_ in correct_words)])
    neut_words = Counter([word.lemma_ for word in nlp(neut_text)
            if (not word.lemma_ in stopwords.words('english')) and (word.lemma_ in correct_words)])
    pos_words = Counter([word.lemma_ for word in nlp(pos_text)
            if (not word.lemma_ in stopwords.words('english')) and (word.lemma_ in correct_words)])
    
    unic_words = list(set(neg_words.keys()) | set(neut_words.keys()) | set(pos_words.keys()))

    neg_exist_index = 0
    neut_exist_index = 1
    pos_exist_index = 2
    neg_count_index = 3
    neut_count_index = 4
    pos_count_index = 5
    word_count_index = 6
    neg_tfidf_index = 7
    neut_tfidf_index = 8
    pos_tfidf_index = 9

    df = np.zeros((len(unic_words), 10))
    for i, word in enumerate(unic_words):
        if word in neg_words.keys():
            df[i,neg_exist_index] = 1
            df[i,neg_count_index] = neg_words[word]
        if word in neut_words.keys():
            df[i,neut_exist_index] = 1
            df[i,neut_count_index] = neut_words[word]
        if word in pos_words.keys():
            df[i,pos_exist_index] = 1
            df[i,pos_count_index] = pos_words[word]

    df[:,word_count_index] = df[:,neg_count_index] + df[:,neut_count_index] + df[:,pos_count_index]
    df[:,neg_tfidf_index] = df[:,neg_count_index] / df[:,word_count_index]
    df[:,neut_tfidf_index] = df[:,neut_count_index] / df[:,word_count_index]
    df[:,pos_tfidf_index] = df[:,pos_count_index] / df[:,word_count_index]

    lemma_df = pd.DataFrame(df, columns=[
        'Negative', 'Neutral', 'Positive',
        'Negative counts', 'Neutral counts', 'Positive counts', 'Word counts',
        'Negative TFIDF', 'Neutral TFIDF', 'Positive TFIDF'])
    lemma_df["word"] = unic_words
    return lemma_df, unic_words

<h4>Узнаем, как называются остальные файлы, содержащие исходный набор данных<h4>

In [5]:
%ls data

[0m[01;32mp00_tweets.zip[0m*         [01;32mprocessedNeutral.csv[0m*
[01;32mprocessedNegative.csv[0m*  [01;32mprocessedPositive.csv[0m*


<h4>Создадим набор данных для обучения<h4>

In [5]:
file_names = ('data/processedNegative.csv', 'data/processedNeutral.csv', 'data/processedPositive.csv')
lemma_df, unic_words = lemma_file_to_df(file_names)
lemma_df

Unnamed: 0,Negative,Neutral,Positive,Negative counts,Neutral counts,Positive counts,Word counts,Negative TFIDF,Neutral TFIDF,Positive TFIDF,word
0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.000000,1.000000,0.000000,charm
1,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.000000,1.000000,0.000000,woo
2,1.0,1.0,0.0,1.0,4.0,0.0,5.0,0.200000,0.800000,0.000000,appear
3,0.0,1.0,0.0,0.0,4.0,0.0,4.0,0.000000,1.000000,0.000000,draft
4,1.0,0.0,0.0,1.0,0.0,0.0,1.0,1.000000,0.000000,0.000000,Levi
...,...,...,...,...,...,...,...,...,...,...,...
3277,0.0,0.0,1.0,0.0,0.0,2.0,2.0,0.000000,0.000000,1.000000,effort
3278,1.0,0.0,0.0,1.0,0.0,0.0,1.0,1.000000,0.000000,0.000000,script
3279,1.0,1.0,0.0,1.0,6.0,0.0,7.0,0.142857,0.857143,0.000000,accept
3280,1.0,1.0,1.0,3.0,1.0,2.0,6.0,0.500000,0.166667,0.333333,dress


<h4>Узнаем полученную точность модели<h4>

In [6]:
word_exist_accuracy_score = model_selection_word_exist(lemma_df, unic_words)
word_count_accuracy_score = model_selection_word_count(lemma_df, unic_words)
tfidf_accuracy_score = model_selection_tfidf(lemma_df, unic_words)



In [7]:
print(f"""Accuracy score by word exist: {word_exist_accuracy_score}
Accuracy score by word count: {word_count_accuracy_score}
Fccuracy score by tfidf: {tfidf_accuracy_score}""")

Accuracy score by word exist: 0.54337899543379
Accuracy score by word count: 0.91324200913242
Fccuracy score by tfidf: 0.5525114155251142
