3 Przygotowanie danych do klasyfikacji

In [79]:
import pandas as pd
import string
from sklearn.base import BaseEstimator, TransformerMixin
import nltk
nltk.download('stopwords')
from nltk import PorterStemmer
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from random import randint

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\atago\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


3.1. Wczytanie danych z pliku

In [3]:
data_file_name = 'data_after_preparation.csv'
data = pd.read_csv(data_file_name)

3.2. Podział danych na uczące i testowe po określonej dacie

In [4]:
X = data['text']
y = data['bitcoin_trend']

In [5]:
data = data.sort_values('Date')
split_index = int(len(data) * 0.8)

split_date = data.iloc[split_index]['Date']
print("Data dzieląca 80% i 20% danych:", split_date)

Data dzieląca 80% i 20% danych: 2019-05-15


In [6]:
train_data = data[data['Date'] < split_date]
test_data = data[data['Date'] >= split_date]

# Podziel dane treningowe na cechy (X_train) i etykiety (y_train)
X_train = train_data['text']
y_train = train_data['bitcoin_trend']

# Podziel dane testowe na cechy (X_test) i etykiety (y_test)
X_test = test_data['text']
y_test = test_data['bitcoin_trend']

In [14]:
# Zamiana X na DataFrame
X_train = pd.DataFrame(X_train, columns=['text'])
X_test = pd.DataFrame(X_test, columns=['text'])

In [15]:
print(f'Type of X_train: {type(X_train)}')
print(f'Type of y_train: {type(y_train)}')
print(f'Type of X_test: {type(X_test)}')
print(f'Type of y_test: {type(y_test)}')

Type of X_train: <class 'pandas.core.frame.DataFrame'>
Type of y_train: <class 'pandas.core.series.Series'>
Type of X_test: <class 'pandas.core.frame.DataFrame'>
Type of y_test: <class 'pandas.core.series.Series'>


3.3. Klasy do przetwarzania danych tekstowych

Usuwanie znaków interpunkcyjnych

In [22]:
def remove_punctuation(text):
    cleaned = ''.join([word for word in text if word not in string.punctuation])
    return cleaned

class RemovePunctuationTransformer(BaseEstimator, TransformerMixin):
    def __init__(self, text_column):
        self.text_column = text_column

    def fit(self, X, y=None):
        return self

    def transform(self, X, y=None):
        X_ = X.copy()
        X_[self.text_column] = X_[self.text_column].apply(lambda x: remove_punctuation(x))
        return X_

Tokenizacja

In [23]:
def tokenizer(text):
    lower_text = text.lower()
    tokenized_text = nltk.word_tokenize(lower_text)
    return tokenized_text

class TokenizeTransformer(BaseEstimator, TransformerMixin):
    def __init__(self, text_column):
        self.text_column = text_column

    def fit(self, X, y=None):
        return self

    def transform(self, X, y=None):
        X_ = X.copy()
        X_[self.text_column] = X_[self.text_column].apply(lambda x: tokenizer(x))
        return X_


Stopwords

In [26]:
stopwords = nltk.corpus.stopwords.words('english')
print(f'Stopwords: {stopwords}')

def remove_stopwords(text):
    without_stopwords = [word for word in text if word not in stopwords]
    return without_stopwords


class RemoveStopwordsTransformer(BaseEstimator, TransformerMixin):
    def __init__(self, text_column):
        self.text_column = text_column

    def fit(self, X, y=None):
        return self

    def transform(self, X, y=None):
        X_ = X.copy()
        X_[self.text_column] = X_[self.text_column].apply(lambda x: remove_stopwords(x))
        return X_

Stopwords: ['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so'

Usuwanie krótkich tokenów

In [27]:
def remove_short_tokens(text):
    without_shorts = [word for word in text if len(word) > 2]
    return without_shorts


class RemoveShortTokensTransformer(BaseEstimator, TransformerMixin):
    def __init__(self, text_column):
        self.text_column = text_column

    def fit(self, X, y=None):
        return self

    def transform(self, X, y=None):
        X_ = X.copy()
        X_[self.text_column] = X_[self.text_column].apply(lambda x: remove_short_tokens(x))
        return X_

Stemmer

In [31]:
def stemming(text):
    stemmer = PorterStemmer()
    stemmed_words = [stemmer.stem(word) for word in text]
    return stemmed_words


class StemTransformer(BaseEstimator, TransformerMixin):
    def __init__(self, text_column):
        self.text_column = text_column

    def fit(self, X, y=None):
        return self

    def transform(self, X, y=None):
        X_ = X.copy()
        X_[self.text_column] = X_[self.text_column].apply(lambda x: stemming(x))
        return X_

3.4. Pipeline do przetwarzania danych tekstowych (bez klasyfikacji)

In [80]:
#testowy tekst
i = randint(0, len(data['text']))
print(i, '\n', data['text'][i])

3181762   #MarketAlert (Last 24h):
Top 100 price change: -1.1% (avg)
59 of the top 100 coins declining
bitcoin: +1.1% $BTC https://twitter.com/CoinWatcherBot/status/916868321974390784 … $…


In [84]:
text_prep_pipeline = Pipeline(steps=[
    ('remove_punctuation', RemovePunctuationTransformer(text_column='text')),
    ('tokenizer', TokenizeTransformer(text_column='text')),
    ('remove_stop_words', RemoveStopwordsTransformer(text_column='text')),
    ('remove_short_tokens', RemoveShortTokensTransformer(text_column='text')),
    ('stemmer', StemTransformer(text_column='text')),
    ('tfidf_vectorizer', TfidfVectorizer()),
    ('tfidf_vectorizer', TfidfVectorizer(min_df=0.1, max_df=0.8, ngram_range=(1,2))),
])

In [89]:
X_train = text_prep_pipeline.fit_transform(X_train)

In [90]:
X_train.at[3181762,'text']

'MarketAlert Last 24h\nTop 100 price change 11 avg\n59 of the top 100 coins declining\nbitcoin 11 BTC httpstwittercomCoinWatcherBotstatus916868321974390784\xa0… …'

In [78]:
X_train.at[1204853,'text']

'And...immutable in code.\n\nYou prefer the "code" of natural supply. This is code of arbitrary, but limited supply.\n\nCurrency is a game. Bitcoin plays the rules.'

In [63]:
X_train.index

Index([2796052, 2795651, 2795650, 2795649, 2795648, 2795647, 2795646, 2795652,
       2795645, 2795643,
       ...
       1204847, 1204848, 1204849, 1204850, 1204851, 1204852, 1204853, 1204854,
       1204845, 1139618],
      dtype='int64', length=3121628)

A tu zaczyna się klasyfikacja... [od tego momentu nie przeglądałam kodu - G]

Sprawdzenie jakości danych wchodzących do klasyfikatora

In [34]:
X_train.info()

print(data.columns)  # Wyświetlanie nazw kolumn w danych
print(X_train.columns)  # Wyświetlanie nazw kolumn w X_train

# Sprawdzenie, czy kolumna 'text' istnieje w danych
if 'text' in data.columns:
    print("Kolumna 'text' istnieje w danych")
else:
    print("Kolumna 'text' nie istnieje w danych")

# Sprawdzenie, czy kolumna 'text' istnieje w X_train
if 'text' in X_train.columns:
    print("Kolumna 'text' istnieje w X_train")
else:
    print("Kolumna 'text' nie istnieje w X_train")

Index(['Date', 'text', 'bitcoin_price', 'bitcoin_trend'], dtype='object')
Index(['text'], dtype='object')
Kolumna 'text' istnieje w danych
Kolumna 'text' istnieje w X_train


In [None]:
logistic_regression_pipeline.fit_transform(X_train, y_train)
# Parametry do przetestowania w grid search
parameters = {
    'tfidf_vectorizer__ngram_range': [(1, 1), (1, 2), (2, 2)],
    'tfidf_vectorizer__min_df': [1, 2, 3],
    'tfidf_vectorizer__max_df': [0.5, 0.75, 1.0],
    'model__C': [0.1, 1, 10]
}

# Utworzenie obiektu GridSearchCV
grid_search = GridSearchCV(estimator=test_pipe, param_grid=parameters, cv=5)

# Dopasowanie modelu grid search do danych
grid_search.fit(X_train['text'], y_train['bitcoin_trend'])

# Najlepsze parametry i wynik
print("Najlepsze parametry: ", grid_search.best_params_)
print("Najlepszy wynik: ", grid_search.best_score_)
nltk.download('punkt')
test_pipe.fit(X_train, y_train)
test_pipe.prediction(X_test)
y_pred = pipe_lg.predict(X_test)
# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)
print("Accuracy: {:.2f}".format(accuracy))
print("Precision: {:.2f}".format(precision))
print('Recall:{:.2f'.format(recall))
