In [189]:
import re
import pandas as pd
import string
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from sklearn.feature_extraction.text import CountVectorizer
from pandas import DataFrame

In [190]:
# Matches every ASCII punctuation character; compiled once instead of per call.
_PUNCTUATION_RE = re.compile(f"[{re.escape(string.punctuation)}]")
# Filled by _preprocessing_resources() on the first preprocessing() call.
_PREPROCESSING_RESOURCES = {}


def _preprocessing_resources():
    """Return the shared (stop-word set, stemmer) pair, building it on first use."""
    if not _PREPROCESSING_RESOURCES:
        _PREPROCESSING_RESOURCES["stop_words"] = frozenset(stopwords.words("english"))
        _PREPROCESSING_RESOURCES["stemmer"] = PorterStemmer()
    return _PREPROCESSING_RESOURCES["stop_words"], _PREPROCESSING_RESOURCES["stemmer"]


def preprocessing(text):
    """Normalise one line of text into a list of stemmed tokens.

    Steps, in order: lowercase, delete ASCII punctuation, tokenize with
    ``word_tokenize``, drop English stop words, stem with ``PorterStemmer``.

    Parameters
    ----------
    text : str
        Raw input line.

    Returns
    -------
    list of str
        Stemmed tokens in their original order; empty when nothing survives
        the filtering (e.g. a blank line).
    """
    text = _PUNCTUATION_RE.sub("", text.lower())
    # The stop-word list and stemmer are the same for every line, so they are
    # built once and reused instead of being recreated on each call.
    stop_words, stemmer = _preprocessing_resources()
    return [stemmer.stem(w) for w in word_tokenize(text) if w not in stop_words]

In [191]:
# Read the training file into a list of raw lines (trailing newlines kept).
# The encoding is pinned so decoding does not depend on the platform's default
# locale; UTF-8 is an assumption here -- TODO confirm against the data file.
with open("tweets_train.txt", "r", encoding="utf-8") as f:
    text = f.readlines()
# text -> list of lines

In [192]:
# corpus -> one list of preprocessed tokens per input line
corpus = list(map(preprocessing, text))

In [193]:
emotions = []
texts_for_vectorizer = []

for tokens in corpus:
    # A line that preprocesses to nothing (e.g. a blank or trailing line) has
    # no first token to use as the label; skip it instead of raising IndexError.
    if not tokens:
        continue
    label, *words = tokens
    emotions.append(label)
    texts_for_vectorizer.append(" ".join(words))

# emotions -> first token of each line, used as that line's label
# texts_for_vectorizer -> remaining tokens of each line, joined with spaces

In [194]:
# Cap on the vocabulary size kept by the bag-of-words model.
MAX_FEATURES = 500

# Fit the vocabulary and build the document-term count matrix in one pass.
vectorizer = CountVectorizer(max_features=MAX_FEATURES)
X = vectorizer.fit_transform(texts_for_vectorizer)
X

<Compressed Sparse Row sparse matrix of dtype 'int64'
	with 37333 stored elements and shape (6588, 500)>

In [195]:
# Vocabulary terms, in the same order as the columns of X.
features_names = vectorizer.get_feature_names_out()
# Number of terms kept in the vocabulary.
features_names.shape[0]

500

In [196]:
# Wrap the sparse count matrix in a DataFrame with one column per vocabulary term.
count_vectorized_df = DataFrame.sparse.from_spmatrix(X, columns=features_names)
# Preview a small window: first three rows, columns 400-402.
print(count_vectorized_df.iloc[:3,400:403].to_markdown())

|    |   someth |   son |   song |
|---:|---------:|------:|-------:|
|  0 |        0 |     0 |      0 |
|  1 |        0 |     0 |      0 |
|  2 |        0 |     0 |      0 |


In [197]:
# Show only the terms that actually occur in the document at position 3.
row_3 = count_vectorized_df.iloc[3]
nonzero_mask = row_3 != 0
row_3[nonzero_mask]

cant    1
deal    1
end     1
find    1
keep    1
like    1
may     1
say     1
talk    1
Name: 3, dtype: Sparse[int64, 0]

In [198]:
# Total occurrences of each term across all documents, 15 most frequent first.
term_totals = count_vectorized_df.sum()
term_totals.sort_values(ascending=False).head(15)

tomorrow    1126
go           733
day          667
night        641
may          533
tonight      501
see          439
time         429
im           422
get          398
today        389
game         382
saturday     379
friday       375
sunday       368
dtype: Sparse[int64, 0]

In [199]:
# Numeric code for each sentiment tag. The keys are the tags as they come out
# of preprocessing(), which stems every token including the leading label
# (presumably "positive"/"negative"/"neutral" in the raw file -- TODO confirm).
label_codes = {"posit": 1, "neg": -1, "neutral": 0}

# Translate the tags before building the Series, rather than assigning ints
# into a Series of strings: that produced an object-dtype column and is
# rejected by pandas' dedicated string dtype. Tags missing from the table are
# passed through unchanged, and re-running the cell on already-encoded
# values is a no-op.
emotions = pd.Series([label_codes.get(tag, tag) for tag in emotions])
count_vectorized_df["label"] = emotions
print(count_vectorized_df.iloc[350:354,499:501].to_markdown())

|     |   your |   label |
|----:|-------:|--------:|
| 350 |      0 |       1 |
| 351 |      1 |      -1 |
| 352 |      0 |       1 |
| 353 |      0 |       0 |
