3. Przygotowanie danych do klasyfikacji

In [1]:
import pandas as pd
import string
from sklearn.base import BaseEstimator, TransformerMixin
import nltk
nltk.download('stopwords')
from nltk import PorterStemmer
from sklearn.pipeline import Pipeline
from random import randint

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\atago\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


3.1. Wczytanie danych z pliku

In [2]:
# Load the input dataset into a DataFrame.
# The path is relative, so the CSV is looked up in the notebook's working directory.
# NOTE(review): presumably this file is the output of an earlier preparation step - confirm.
data_file_name = 'data_after_preparation.csv'
data = pd.read_csv(data_file_name)

In [3]:
# Show the column names, dtypes and memory footprint of the loaded frame.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3932611 entries, 0 to 3932610
Data columns (total 4 columns):
 #   Column         Dtype  
---  ------         -----  
 0   Date           object 
 1   text           object 
 2   bitcoin_price  float64
 3   bitcoin_trend  int64  
dtypes: float64(1), int64(1), object(2)
memory usage: 120.0+ MB


3.2. Podział danych na uczące i testowe po określonej dacie

In [4]:
# Text column and trend label for ALL rows, taken before the frame is sorted by date.
# NOTE(review): in the visible cells these two names are only used for y.info();
# the actual train/test sets are built later from train_data / test_data.
X = data['text']
y = data['bitcoin_trend']

In [5]:
# Show the dtype and non-null count of the label series.
y.info()

<class 'pandas.core.series.Series'>
RangeIndex: 3932611 entries, 0 to 3932610
Series name: bitcoin_trend
Non-Null Count    Dtype
--------------    -----
3932611 non-null  int64
dtypes: int64(1)
memory usage: 30.0 MB


In [6]:
# Order the rows by the 'Date' column; ignore_index=True renumbers the index 0..n-1
# in the new order, so row labels and row positions coincide afterwards.
# NOTE(review): 'Date' has dtype object (see the data.info() output), so this is a
# string sort. It matches chronological order only for ISO-style YYYY-MM-DD values
# such as the ones shown by data.head() - confirm the whole column uses that format.
data = data.sort_values('Date', ignore_index=True)

In [7]:
# Preview the first rows of the date-sorted frame.
data.head()

Unnamed: 0,Date,text,bitcoin_price,bitcoin_trend
0,2014-09-17,$447.00 at 01:00 UTC [24h Range: $443.57 - $46...,457.334015,-1
1,2014-09-17,BTCe Prices\nLAST: $444.00\n BID: $444.00\n AS...,457.334015,-1
2,2014-09-17,LIVE: Profit = $939.28 (0.55 %). BUY B382.20 @...,457.334015,-1
3,2014-09-17,$452.40 at 17:15 UTC [24h Range: $450.00 - $46...,457.334015,-1
4,2014-09-17,"@KQED, enjoy this little gift! @xeniar sent yo...",457.334015,-1


In [8]:
# Count missing values per column.
data.isna().sum()

Date             0
text             0
bitcoin_price    0
bitcoin_trend    0
dtype: int64

In [9]:
# Find the row located at the 80% mark of the date-sorted frame and take its date
# as the train/test boundary (the split itself is done on whole dates, not rows).
split_index = int(0.8 * len(data))

split_date = data['Date'].iloc[split_index]
print("Data dzieląca 80% i 20% danych:", split_date)

Data dzieląca 80% i 20% danych: 2019-05-15


In [10]:
# Split on the boundary date: rows dated strictly before split_date form the
# training set, rows dated on or after it form the test set.
train_data = data.loc[data['Date'].lt(split_date)]
test_data = data.loc[data['Date'].ge(split_date)]

# Training features (text) and labels (trend).
X_train, y_train = train_data['text'], train_data['bitcoin_trend']

# Test features (text) and labels (trend).
X_test, y_test = test_data['text'], test_data['bitcoin_trend']

In [11]:
# Show the size and dtype of the training texts after the split.
X_train.info()

<class 'pandas.core.series.Series'>
Index: 3121628 entries, 0 to 3121627
Series name: text
Non-Null Count    Dtype 
--------------    ----- 
3121628 non-null  object
dtypes: object(1)
memory usage: 47.6+ MB


In [12]:
# Zamiana X na DataFrame
X_train = pd.DataFrame(X_train, columns=['text'])
X_test = pd.DataFrame(X_test, columns=['text'])

In [13]:
# Print the container type of each split (features are DataFrames, labels are Series).
for _label, _obj in (('X_train', X_train), ('y_train', y_train),
                     ('X_test', X_test), ('y_test', y_test)):
    print(f'Type of {_label}: {type(_obj)}')

Type of X_train: <class 'pandas.core.frame.DataFrame'>
Type of y_train: <class 'pandas.core.series.Series'>
Type of X_test: <class 'pandas.core.frame.DataFrame'>
Type of y_test: <class 'pandas.core.series.Series'>


Zapisanie dat zbioru uczącego do pliku

In [14]:
# Dates of the training rows only; the test-period dates are not included.
dates = train_data['Date']

In [15]:
# Write the dates to 'dates.csv' without the index column (header row only + values).
dates.to_csv('dates.csv', index=False)

Zapisanie y do plików

In [16]:
# Persist the labels of both splits; index=False drops the row labels, so the files
# rely on row order alone to line up with the feature files.
y_train.to_csv('y_train.csv', index=False)
y_test.to_csv('y_test.csv', index=False)

3.3. Klasy do przetwarzania danych tekstowych

Usuwanie znaków interpunkcyjnych

In [17]:
# Translation table that deletes every character listed in string.punctuation.
# Built once at definition time so it is not recreated for every text.
_PUNCTUATION_TABLE = str.maketrans('', '', string.punctuation)


def remove_punctuation(text):
    """Return ``text`` with every ``string.punctuation`` character removed.

    Only the ASCII punctuation characters in ``string.punctuation`` are
    dropped; all other characters (letters, digits, whitespace, non-ASCII
    symbols) are kept in their original order.

    Parameters
    ----------
    text : str
        Raw text to clean.

    Returns
    -------
    str
        The cleaned text.
    """
    # str.translate does the filtering in a single pass instead of testing
    # each character with a Python-level loop.
    return text.translate(_PUNCTUATION_TABLE)

class RemovePunctuationTransformer(BaseEstimator, TransformerMixin):
    """Pipeline step that strips punctuation from one text column.

    Parameters
    ----------
    text_column : label of the DataFrame column holding the text strings.
    """

    def __init__(self, text_column):
        self.text_column = text_column

    def fit(self, X, y=None):
        """Nothing is learned from the data; return the transformer itself."""
        return self

    def transform(self, X, y=None):
        """Return a copy of ``X`` whose text column went through ``remove_punctuation``."""
        cleaned_frame = X.copy()
        column = self.text_column
        cleaned_frame[column] = cleaned_frame[column].apply(remove_punctuation)
        return cleaned_frame

Tokenizacja

In [18]:
def tokenizer(text):
    """Lower-case ``text`` and split it into tokens with ``nltk.word_tokenize``.

    Parameters
    ----------
    text : str
        Text to tokenize.

    Returns
    -------
    The token sequence produced by ``nltk.word_tokenize`` for the lower-cased text.

    NOTE(review): ``nltk.word_tokenize`` needs NLTK tokenizer data to be installed;
    the import cell of this notebook only downloads 'stopwords' - confirm the
    tokenizer resource is available in the target environment.
    """
    return nltk.word_tokenize(text.lower())

class TokenizeTransformer(BaseEstimator, TransformerMixin):
    """Pipeline step that replaces each text with its list of lower-cased tokens.

    Parameters
    ----------
    text_column : label of the DataFrame column holding the text strings.
    """

    def __init__(self, text_column):
        self.text_column = text_column

    def fit(self, X, y=None):
        """Nothing is learned from the data; return the transformer itself."""
        return self

    def transform(self, X, y=None):
        """Return a copy of ``X`` whose text column went through ``tokenizer``."""
        tokenized_frame = X.copy()
        column = self.text_column
        tokenized_frame[column] = tokenized_frame[column].apply(tokenizer)
        return tokenized_frame


Stopwords

In [19]:
# English stop-word list shipped with NLTK (kept as a list and printed for reference).
stopwords = nltk.corpus.stopwords.words('english')
print(f'Stopwords: {stopwords}')

# Set copy of the list above: membership tests against a set do not scan the
# whole list for every token. It is a snapshot - later changes to `stopwords`
# are not reflected unless this cell is re-run.
_STOPWORD_SET = frozenset(stopwords)


def remove_stopwords(text):
    """Drop English stop words from a sequence of tokens.

    Parameters
    ----------
    text : iterable of str
        Tokens of one document (the tokenizer in this notebook lower-cases them
        first; the comparison here is case-sensitive).

    Returns
    -------
    list of str
        The tokens that are not in the NLTK English stop-word list, in their
        original order.
    """
    return [word for word in text if word not in _STOPWORD_SET]


class RemoveStopwordsTransformer(BaseEstimator, TransformerMixin):
    """Pipeline step that removes stop words from each token list.

    Parameters
    ----------
    text_column : label of the DataFrame column holding the token lists.
    """

    def __init__(self, text_column):
        self.text_column = text_column

    def fit(self, X, y=None):
        """Nothing is learned from the data; return the transformer itself."""
        return self

    def transform(self, X, y=None):
        """Return a copy of ``X`` whose text column went through ``remove_stopwords``."""
        filtered_frame = X.copy()
        column = self.text_column
        filtered_frame[column] = filtered_frame[column].apply(remove_stopwords)
        return filtered_frame

Stopwords: ['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so'

Usuwanie krótkich tokenów

In [20]:
def remove_short_tokens(text):
    """Keep only the tokens that are at least three characters long.

    Parameters
    ----------
    text : iterable of str
        Tokens of one document.

    Returns
    -------
    list of str
        Tokens whose length is greater than 2, in their original order.
    """
    return [token for token in text if len(token) >= 3]

class RemoveShortTokensTransformer(BaseEstimator, TransformerMixin):
    """Pipeline step that drops tokens of one or two characters from each token list.

    Parameters
    ----------
    text_column : label of the DataFrame column holding the token lists.
    """

    def __init__(self, text_column):
        self.text_column = text_column

    def fit(self, X, y=None):
        """Nothing is learned from the data; return the transformer itself."""
        return self

    def transform(self, X, y=None):
        """Return a copy of ``X`` whose text column went through ``remove_short_tokens``."""
        filtered_frame = X.copy()
        column = self.text_column
        filtered_frame[column] = filtered_frame[column].apply(remove_short_tokens)
        return filtered_frame

Usunięcie linków

In [21]:
def remove_links(text):
    """Drop every token that starts with ``'http'``.

    The check is a plain prefix test, so it removes URL-like tokens such as
    ``'httpstcoabc'`` but also any ordinary word that happens to begin with
    ``'http'``.

    Parameters
    ----------
    text : iterable of str
        Tokens of one document.

    Returns
    -------
    list of str
        Tokens that do not start with ``'http'``, in their original order.
    """
    return [token for token in text if not token.startswith('http')]

class RemoveLinksTransformer(BaseEstimator, TransformerMixin):
    """Pipeline step that removes link tokens (those starting with 'http').

    Parameters
    ----------
    text_column : label of the DataFrame column holding the token lists.
    """

    def __init__(self, text_column):
        self.text_column = text_column

    def fit(self, X, y=None):
        """Nothing is learned from the data; return the transformer itself."""
        return self

    def transform(self, X, y=None):
        """Return a copy of ``X`` whose text column went through ``remove_links``."""
        filtered_frame = X.copy()
        column = self.text_column
        filtered_frame[column] = filtered_frame[column].apply(remove_links)
        return filtered_frame

Stemmer

In [22]:
def stemming(text):
    """Reduce every token to its Porter stem.

    Parameters
    ----------
    text : iterable of str
        Tokens of one document.

    Returns
    -------
    list of str
        The stem of each token, in the original order.
    """
    # The stemmer is created on the first call and cached on the function
    # object, so it is not rebuilt for every row the pipeline processes.
    stemmer = getattr(stemming, '_stemmer', None)
    if stemmer is None:
        stemmer = PorterStemmer()
        stemming._stemmer = stemmer
    return [stemmer.stem(word) for word in text]


class StemTransformer(BaseEstimator, TransformerMixin):
    """Pipeline step that stems every token in each token list.

    Parameters
    ----------
    text_column : label of the DataFrame column holding the token lists.
    """

    def __init__(self, text_column):
        self.text_column = text_column

    def fit(self, X, y=None):
        """Nothing is learned from the data; return the transformer itself."""
        return self

    def transform(self, X, y=None):
        """Return a copy of ``X`` whose text column went through ``stemming``."""
        stemmed_frame = X.copy()
        column = self.text_column
        stemmed_frame[column] = stemmed_frame[column].apply(stemming)
        return stemmed_frame

Złączenie tokenów z powrotem w tekst (przygotowanie do wektoryzacji)

In [23]:
class Return_String_Transformer(BaseEstimator, TransformerMixin):
    """Final pipeline step: join each token list back into one space-separated string.

    Unlike the other steps, ``transform`` returns a Series (the text column only),
    not a DataFrame.

    Parameters
    ----------
    text_column : label of the DataFrame column holding the token lists.
    """

    def __init__(self, text_column):
        self.text_column = text_column

    def fit(self, X, y=None):
        """Nothing is learned from the data; return the transformer itself."""
        return self

    def transform(self, X, y=None):
        """Return the text column as a Series of space-joined token strings.

        The input frame is left untouched; the result keeps the input's index
        and is named after the text column.
        """
        joined_text = X[self.text_column].apply(' '.join)
        return joined_text

3.4. Pipeline do przetwarzania danych tekstowych (bez klasyfikacji)

In [24]:
#testowy tekst
i = randint(0, len(data['text']))
print(i, '\n', data['text'][i])

500062 
 $851.00 at 06:45 UTC [24h Range: $805.67 - $851.04 Volume: 11261 BTC]


In [25]:
# Count missing texts in the training features before running the pipeline.
X_train.isna().sum()

text    0
dtype: int64

In [26]:
# Text preprocessing chain. Every step works on the 'text' column and the steps run
# in the order listed: punctuation is stripped from the raw string, the string is
# tokenized, the token list is filtered and stemmed, and the last step joins the
# remaining tokens back into a single string per row.
text_prep_pipeline = Pipeline(steps=[
    (step_name, transformer_class(text_column='text'))
    for step_name, transformer_class in (
        ('remove_punctuation', RemovePunctuationTransformer),
        ('tokenizer', TokenizeTransformer),
        ('remove_stop_words', RemoveStopwordsTransformer),
        ('remove_links', RemoveLinksTransformer),
        ('remove_short_tokens', RemoveShortTokensTransformer),
        ('stemmer', StemTransformer),
        ('return_string', Return_String_Transformer),
    )
])

In [27]:
# Run the preprocessing pipeline on the training texts and save the result.
# NOTE(review): X_train is rebound from a one-column DataFrame to the Series returned
# by the last pipeline step, so this cell cannot simply be re-run on its own - the
# cell that builds the X_train DataFrame has to be run again first.
X_train = text_prep_pipeline.fit_transform(X_train)
# index=False: only the processed text is written, rows are identified by order.
X_train.to_csv('X_train.csv', index=False)

In [28]:
# Run the same preprocessing pipeline on the test texts and save the result.
# NOTE(review): fit_transform is called on the test set. That is harmless as long as
# every step's fit() is a no-op (as in the transformers defined in this notebook), but
# it would leak test information if a step that learns from data were ever added -
# in that case the test set must go through transform() only.
X_test = text_prep_pipeline.fit_transform(X_test)
# index=False: only the processed text is written, rows are identified by order.
X_test.to_csv('X_test.csv', index=False)

Info

In [29]:
# Show the size and dtype of the processed training texts (now a Series).
X_train.info()

<class 'pandas.core.series.Series'>
Index: 3121628 entries, 0 to 3121627
Series name: text
Non-Null Count    Dtype 
--------------    ----- 
3121628 non-null  object
dtypes: object(1)
memory usage: 47.6+ MB


In [34]:
# Spot check on the training set: print one raw text next to its processed version.
# random.randint includes BOTH endpoints, so the upper bound has to be len - 1;
# with len(X_train) as the bound the draw could pick the first test-set row,
# which does not exist in X_train.
i = randint(0, len(X_train) - 1)
print(i, '\n', data['text'][i])
print(i, '\n', X_train[i])

1043144 
 仮想通貨業界:今後5年に起こりうる10のこと(前編) 

http://coinpost.jp/?p=10082 

成熟するまで10年かかるかもね....
この一年、不便になるばかりだし...

#ビットコイン #Bitcoin #BTC #ETH #LTC #リップル #XRP #仮想通貨 #アービトラージ #自動売買 #ツール
1043144 
 仮想通貨業界今後5年に起こりうる10のこと前編 成熟するまで10年かかるかもね この一年、不便になるばかりだし ビットコイン bitcoin btc eth ltc リップル xrp 仮想通貨 アービトラージ 自動売買 ツール


In [35]:
# Spot check on the test set: print one raw text next to its processed version.
# The draw covers the row labels that follow the training rows; both lookups are
# by label, which works because X_test keeps the row labels of `data`.
i = randint(len(X_train), len(X_train) + len(X_test) - 1)
print(i, '\n', data.loc[i, 'text'])
print(i, '\n', X_test.loc[i])

3767788 
 Win 10 #BTC  every 10 minutes through a special script

Link : https://t.co/yfr3tDpXPL

.

.

Buy 🔥🔥🔥: https://t.co/Z8Mde2l5fW
@jaxxcurry @Kdunn95 @ArmagedonMGB @JSprucie @J_Lyn_Brn2Shine
XZT
3767788 
 win btc everi minut special script link buy 🔥🔥🔥 jaxxcurri kdunn95 armagedonmgb jspruci jlynbrn2shin xzt
