In [1]:
import pandas as pd
import nltk
from nltk.corpus import stopwords
import re
from textaugment import EDA
from sklearn.preprocessing import LabelEncoder
from keras.preprocessing.text import Tokenizer
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, SpatialDropout1D, LSTM, Dense
from keras_tuner.tuners import RandomSearch
from keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split
import joblib

In [2]:
tweet = pd.read_csv("Tweets.csv")
tweet

Unnamed: 0,textID,text,selected_text,sentiment
0,cb774db0d1,"I`d have responded, if I were going","I`d have responded, if I were going",neutral
1,549e992a42,Sooo SAD I will miss you here in San Diego!!!,Sooo SAD,negative
2,088c60f138,my boss is bullying me...,bullying me,negative
3,9642c003ef,what interview! leave me alone,leave me alone,negative
4,358bd9e861,"Sons of ****, why couldn`t they put them on t...","Sons of ****,",negative
...,...,...,...,...
27476,4eac33d1c0,wish we could come see u on Denver husband l...,d lost,negative
27477,4f4c4fc327,I`ve wondered about rake to. The client has ...,", don`t force",negative
27478,f67aae2310,Yay good for both of you. Enjoy the break - y...,Yay good for both of you.,positive
27479,ed167662a5,But it was worth it ****.,But it was worth it ****.,positive


In [3]:
# dropping unusefull columns
tweet = tweet.drop(columns=['textID',"text"], axis=1)
tweet.shape

(27481, 2)

In [4]:
tweet = tweet.dropna()
tweet.shape

(27480, 2)

In [5]:
tweet = tweet.loc[(tweet['sentiment'] == 'positive') | (tweet['sentiment'] == 'negative')]
tweet

Unnamed: 0,selected_text,sentiment
1,Sooo SAD,negative
2,bullying me,negative
3,leave me alone,negative
4,"Sons of ****,",negative
6,fun,positive
...,...,...
27475,enjoy,positive
27476,d lost,negative
27477,", don`t force",negative
27478,Yay good for both of you.,positive


In [6]:
tweet.replace({'sentiment':{'negative':0, 'positive':1}}, inplace=True)
tweet

Unnamed: 0,selected_text,sentiment
1,Sooo SAD,0
2,bullying me,0
3,leave me alone,0
4,"Sons of ****,",0
6,fun,1
...,...,...
27475,enjoy,1
27476,d lost,0
27477,", don`t force",0
27478,Yay good for both of you.,1


In [7]:
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\martj\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [8]:
verification=tweet["sentiment"]
tweet['selected_text']=tweet['selected_text'].str.lower()
tweet.head(10)
tweet.shape[0]

16363

In [9]:
augmented_dfs = []
eda = EDA()
for i in range (tweet.shape[0]):
    augmented_text = eda.synonym_replacement(tweet['selected_text'].iloc[i])
    augmented_df = pd.DataFrame({'selected_text': [augmented_text], 'sentiment': tweet['sentiment'].iloc[i]})
    augmented_dfs.append(augmented_df)
tweet = pd.concat([tweet]+ augmented_dfs, ignore_index=True)
tweet

Unnamed: 0,selected_text,sentiment
0,sooo sad,0
1,bullying me,0
2,leave me alone,0
3,"sons of ****,",0
4,fun,1
...,...,...
32721,bask,1
32722,d at sea,0
32723,", don`t drive",0
32724,yay full for both of you.,1


In [10]:
text_cleaning_regex = "@S+|https?:S+|http?:S|[^A-Za-z0-9]+"
stop_words = set(stopwords.words('english'))
def clean_tweets(text, stem=False):
    text = re.sub(text_cleaning_regex, ' ', str(text).lower()).strip()
    tokens = []
    for token in text.split():
        if token not in stop_words:
            if stem:
                tokens.append(stemmer.stem(token))
            else:
                tokens.append(token)
    return " ".join(tokens)
tweet['selected_text'] = tweet['selected_text'].apply(lambda x: clean_tweets(x))
train_data, test_data = train_test_split(tweet, test_size=0.2, random_state=16)
print("Train Data size:", len(train_data))
print("Test Data size", len(test_data))
tokenizer = Tokenizer()
tokenizer.fit_on_texts(train_data['selected_text'])
vocab_size = len(tokenizer.word_index) + 1
print("Vocabulary Size:", vocab_size)
X_train = pad_sequences(tokenizer.texts_to_sequences(train_data['selected_text']), maxlen=30)
X_test = pad_sequences(tokenizer.texts_to_sequences(test_data['selected_text']), maxlen=30)
encoder = LabelEncoder()
encoder.fit(train_data['sentiment'].to_list())
y_train = encoder.transform(train_data['sentiment'].to_list())
y_test = encoder.transform(test_data['sentiment'].to_list())
y_train = y_train.reshape(-1, 1)
y_test = y_test.reshape(-1, 1)

Train Data size: 26180
Test Data size 6546
Vocabulary Size: 8881


In [12]:
def build_model(hp):
    max_features = 1000
    embed_dim = 200

    model = Sequential()
    model.add(Embedding(vocab_size, embed_dim, input_length=X_train.shape[1]))
    model.add(SpatialDropout1D(0.3))

    lstm_units = hp.Int('lstm_units', min_value=50, max_value=200, step=25)
    model.add(LSTM(lstm_units, dropout=0.5, recurrent_dropout=0.5))

    model.add(Dense(1, activation='sigmoid'))

    model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

    return model

tuner = RandomSearch(
    build_model,
    objective='val_loss',
    max_trials=5
)

tuner.search(X_train, y_train, epochs=10, validation_data=(X_train, y_train))
best_model = tuner.get_best_models()[0]
best_model.summary()

Trial 5 Complete [00h 04m 59s]
val_loss: 0.08286481350660324

Best val_loss So Far: 0.07980407029390335
Total elapsed time: 00h 48m 17s
Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 30, 200)           1776200   
                                                                 
 spatial_dropout1d (Spatial  (None, 30, 200)           0         
 Dropout1D)                                                      
                                                                 
 lstm (LSTM)                 (None, 100)               120400    
                                                                 
 dense (Dense)               (None, 1)                 101       
                                                                 
Total params: 1896701 (7.24 MB)
Trainable params: 1896701 (7.24 MB)
Non-trainable params: 0 (0.00 Byte)
______________

In [13]:
batch_size = 100
best_model.fit(X_train, y_train, epochs = 5, verbose = 1, shuffle=False)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.src.callbacks.History at 0x262e0797af0>

In [14]:
test_loss, test_accuracy = best_model.evaluate(X_test, y_test)

print("Test Loss: {:.6f}".format(test_loss))
print("Test Accuracy: {:.6f}".format(test_accuracy))

Test Loss: 0.296796
Test Accuracy: 0.914604


In [15]:
def predict_sentiment(model, tokenizer, max_length, phrase, bound=0.9):
    sequence = tokenizer.texts_to_sequences([phrase])
    padded_sequence = pad_sequences(sequence, maxlen=max_length)
    prediction = model.predict(padded_sequence)
    if prediction >= bound:
        sentiment_label = "positive"
    elif prediction <= (1 - bound):
        sentiment_label = "negative"
    else:
        sentiment_label = "neutral"

    return sentiment_label, prediction
phrase_a_predire = "Hello you"
label, score = predict_sentiment(best_model, tokenizer, 30, phrase_a_predire)

print(f"Phrase: {phrase_a_predire}")
print(f"Sentiment prédit: {label}")
print(f"Score de confiance: {score[0][0]}")

Phrase: Hello you
Sentiment prédit: positive
Score de confiance: 0.9967989325523376


In [16]:
# Save the best model to a file
joblib.dump(best_model, 'best_model_TweetSentimentPrediction.pkl')

['best_model_TweetSentimentPrediction.pkl']