In [41]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from joblib import dump, load # used for saving and loading sklearn objects
from scipy.sparse import save_npz, load_npz, csr_matrix # used for saving and loading sparse matrices
import os
import pandas as pd
import numpy as np

from sklearn.linear_model import SGDClassifier
from sklearn.model_selection import train_test_split

In [6]:
# Create the output folders for the fitted preprocessors and the vectorized
# matrices. exist_ok=True keeps this cell idempotent: re-running it when the
# folders are already on disk no longer raises FileExistsError.
os.makedirs('data_preprocessors', exist_ok=True)
os.makedirs("vectorized_data", exist_ok=True)

In [25]:
# Load the cleaned tweets, then keep the first half of the rows (rounded down)
# as the training set and preview its first rows.
data = pd.read_csv(filepath_or_buffer="./clean_tweet_data.csv")
n_train_rows = len(data) // 2
train_set = data.iloc[:n_train_rows]
train_set.head(n=5)

Unnamed: 0,created_at,source,original_text,clean_text,sentiment,polarity,subjectivity,lang,favorite_count,retweet_count,original_author,followers_count,friends_count,possibly_sensitive,hashtags,user_mentions,place,place_coord_boundaries
0,2022-04-22 22:17:05+00:00,"<a href=""http://twitter.com/download/android"" ...",RT @NorthstarCharts: The 10-year yield is tell...,RT the year yield be tell we that there be a h...,1,0.16,0.54,en,188,43,davideiacovozzi,18,55,False,"gold, silver, crypto",NorthstarCharts,,
1,2022-04-22 13:44:53+00:00,"<a href=""http://twitter.com/download/android"" ...",RT @MichaelAArouet: German 10y mortgage rate w...,RT German mortgage rate go from to can you hea...,1,0.15,0.175,en,179,32,davideiacovozzi,18,55,False,,MichaelAArouet,,
2,2022-04-22 06:10:34+00:00,"<a href=""http://twitter.com/download/android"" ...",RT @goldseek: When? https://t.co/kO2FfHKaZg,RT when,-1,0.0,0.0,en,193,26,davideiacovozzi,18,55,False,,goldseek,,
3,2022-04-21 17:22:09+00:00,"<a href=""http://twitter.com/download/android"" ...",RT @charliebilello: The 30-year mortgage rate ...,RT the year mortgage rate in the US rise to it...,1,0.0,0.183333,en,620,213,davideiacovozzi,18,55,False,,charliebilello,,
4,2022-04-21 10:32:26+00:00,"<a href=""http://twitter.com/download/android"" ...",RT @biancoresearch: Rates rise until something...,RT rate rise until something break be anything...,-1,-0.4,0.4,en,1787,417,davideiacovozzi,18,55,False,,biancoresearch,,


In [27]:
# Learn a single-token (unigram) vocabulary from the training tweets'
# clean_text column.
unigram_vectorizer = CountVectorizer(
    ngram_range=(1, 1),
)
unigram_vectorizer.fit(raw_documents=train_set['clean_text'].values)

CountVectorizer()

In [29]:
# Persist the fitted unigram vectorizer so it can be reloaded later with joblib.
dump(
    value=unigram_vectorizer,
    filename='data_preprocessors/unigram_vectorizer.joblib',
)

['data_preprocessors/unigram_vectorizer.joblib']

In [30]:
# Encode the training tweets with the fitted unigram vectorizer and store the
# resulting sparse matrix on disk.
X_train_unigram = unigram_vectorizer.transform(
    raw_documents=train_set['clean_text'].values
)
save_npz(file='vectorized_data/X_train_unigram.npz', matrix=X_train_unigram)

In [31]:
# Fit a TF-IDF transformer on the unigram matrix built from the training set.
unigram_tf_idf_transformer = TfidfTransformer()
unigram_tf_idf_transformer.fit(X=X_train_unigram)

TfidfTransformer()

In [32]:
# Persist the fitted unigram TF-IDF transformer next to the vectorizer.
dump(
    value=unigram_tf_idf_transformer,
    filename='data_preprocessors/unigram_tf_idf_transformer.joblib',
)

['data_preprocessors/unigram_tf_idf_transformer.joblib']

In [33]:
# Apply the fitted TF-IDF transformer to the unigram matrix and store the
# resulting sparse matrix on disk.
X_train_unigram_tf_idf = unigram_tf_idf_transformer.transform(
    X=X_train_unigram
)
save_npz(
    file='vectorized_data/X_train_unigram_tf_idf.npz',
    matrix=X_train_unigram_tf_idf,
)

In [35]:
# Learn a vocabulary of 1- and 2-token n-grams from the training tweets'
# clean_text column.
bigram_vectorizer = CountVectorizer(
    ngram_range=(1, 2),
)
bigram_vectorizer.fit(raw_documents=train_set['clean_text'].values)

CountVectorizer(ngram_range=(1, 2))

In [36]:
# Persist the fitted bigram vectorizer so it can be reloaded later with joblib.
dump(
    value=bigram_vectorizer,
    filename='data_preprocessors/bigram_vectorizer.joblib',
)

['data_preprocessors/bigram_vectorizer.joblib']

In [37]:
# Encode the training tweets with the fitted bigram vectorizer and store the
# resulting sparse matrix on disk.
X_train_bigram = bigram_vectorizer.transform(
    raw_documents=train_set['clean_text'].values
)
save_npz(file='vectorized_data/X_train_bigram.npz', matrix=X_train_bigram)

In [38]:
# Fit a TF-IDF transformer on the bigram matrix built from the training set.
bigram_tf_idf_transformer = TfidfTransformer()
bigram_tf_idf_transformer.fit(X=X_train_bigram)

TfidfTransformer()

In [39]:
# Persist the fitted bigram TF-IDF transformer next to the vectorizer.
dump(
    value=bigram_tf_idf_transformer,
    filename='data_preprocessors/bigram_tf_idf_transformer.joblib',
)

['data_preprocessors/bigram_tf_idf_transformer.joblib']

In [40]:
# Apply the fitted TF-IDF transformer to the bigram matrix and store the
# resulting sparse matrix on disk.
X_train_bigram_tf_idf = bigram_tf_idf_transformer.transform(
    X=X_train_bigram
)
save_npz(
    file='vectorized_data/X_train_bigram_tf_idf.npz',
    matrix=X_train_bigram_tf_idf,
)

In [42]:
def train_and_show_scores(
    X: csr_matrix,
    y: np.ndarray,
    title: str,
    train_size: float = 0.75,
    random_state: int = 42,
) -> None:
    """Fit an SGDClassifier on a stratified split of (X, y) and print its scores.

    The data is split into a fitting part and a validation part, a fresh
    ``SGDClassifier`` with default hyper-parameters is trained on the fitting
    part, and the ``clf.score`` value on each part is printed under ``title``.

    Parameters
    ----------
    X : csr_matrix
        Feature matrix with one row per sample.
    y : np.ndarray
        Labels aligned with the rows of ``X``; also used to stratify the split.
    title : str
        Heading printed on the line above the scores.
    train_size : float, default 0.75
        Value forwarded to ``train_test_split`` as ``train_size``; the
        remaining samples form the validation part.
    random_state : int, default 42
        Seed forwarded to both the split and the classifier so that repeated
        runs print the same scores.

    Returns
    -------
    None
        The scores are printed, not returned.
    """
    # Seeding the split keeps the train/validation partition stable across
    # re-runs; stratify=y keeps the label proportions equal in both parts.
    X_train, X_valid, y_train, y_valid = train_test_split(
        X, y, train_size=train_size, stratify=y, random_state=random_state
    )

    # SGD shuffles the samples during fitting, so it needs its own seed too.
    clf = SGDClassifier(random_state=random_state)
    clf.fit(X_train, y_train)
    train_score = clf.score(X_train, y_train)
    valid_score = clf.score(X_valid, y_valid)
    print(f'{title}\nTrain score: {round(train_score, 2)} ; Validation score: {round(valid_score, 2)}\n')

In [43]:
# Labels for the training rows, taken from the sentiment column as an array.
y_train = train_set.loc[:, 'sentiment'].values

In [44]:
# Score each feature representation with the same routine, in a fixed order:
# unigram counts, unigram TF-IDF, bigram counts, bigram TF-IDF.
for feature_matrix, label in [
    (X_train_unigram, 'Unigram Counts'),
    (X_train_unigram_tf_idf, 'Unigram Tf-Idf'),
    (X_train_bigram, 'Bigram Counts'),
    (X_train_bigram_tf_idf, 'Bigram Tf-Idf'),
]:
    train_and_show_scores(feature_matrix, y_train, label)

Unigram Counts
Train score: 1.0 ; Validation score: 0.84

Unigram Tf-Idf
Train score: 0.99 ; Validation score: 0.82

Bigram Counts
Train score: 1.0 ; Validation score: 0.78

Bigram Tf-Idf
Train score: 1.0 ; Validation score: 0.78

