In [2]:
import pandas as pd
from tqdm import tqdm

In [3]:
data = pd.read_csv('/kaggle/input/sentiment140/training.1600000.processed.noemoticon.csv', encoding='UTF-8', encoding_errors='ignore')
data.columns = ['target','id','date','flag','user','text']
data['target'] = data['target'].apply(lambda x: 0 if x==0 else 1)
# groups = data.groupby('target')
# n = 250000

# # Define a function to sample n rows from each group
# def sample_group(group):
#     return group.sample(n=n, replace=True)
# data = groups.apply(sample_group).reset_index(drop=True)

In [4]:
data.head()

Unnamed: 0,target,id,date,flag,user,text
0,0,1467810672,Mon Apr 06 22:19:49 PDT 2009,NO_QUERY,scotthamilton,is upset that he can't update his Facebook by ...
1,0,1467810917,Mon Apr 06 22:19:53 PDT 2009,NO_QUERY,mattycus,@Kenichan I dived many times for the ball. Man...
2,0,1467811184,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,ElleCTF,my whole body feels itchy and like its on fire
3,0,1467811193,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,Karoli,"@nationwideclass no, it's not behaving at all...."
4,0,1467811372,Mon Apr 06 22:20:00 PDT 2009,NO_QUERY,joy_wolf,@Kwesidei not the whole crew


In [5]:
data.drop(columns=['id','date','flag','user'], inplace=True)
data.head()

Unnamed: 0,target,text
0,0,is upset that he can't update his Facebook by ...
1,0,@Kenichan I dived many times for the ball. Man...
2,0,my whole body feels itchy and like its on fire
3,0,"@nationwideclass no, it's not behaving at all...."
4,0,@Kwesidei not the whole crew


In [6]:
import re

def clean_text(text):
    # remove words starting with @ symbol
    text = re.sub(r'@\w+\s?', '', text)
    
    # remove non-alphabetic characters and convert to lowercase
    text = re.sub(r'[^a-zA-Z\s]', '', text).lower()
    
    # remove unnecessary whitespace
    text = re.sub(r'\s+', ' ', text).strip()
    
    return text

def clean_data(data):
    # initialize progress bar
    tqdm.pandas()
    
    # apply clean_text function to each sentence in text column
    data['clean_text'] = data['text'].progress_apply(clean_text)
    
    return data

clean_data(data)

100%|██████████| 1599999/1599999 [00:17<00:00, 93119.16it/s] 


Unnamed: 0,target,text,clean_text
0,0,is upset that he can't update his Facebook by ...,is upset that he cant update his facebook by t...
1,0,@Kenichan I dived many times for the ball. Man...,i dived many times for the ball managed to sav...
2,0,my whole body feels itchy and like its on fire,my whole body feels itchy and like its on fire
3,0,"@nationwideclass no, it's not behaving at all....",no its not behaving at all im mad why am i her...
4,0,@Kwesidei not the whole crew,not the whole crew
...,...,...,...
1599994,1,Just woke up. Having no school is the best fee...,just woke up having no school is the best feel...
1599995,1,TheWDB.com - Very cool to hear old Walt interv...,thewdbcom very cool to hear old walt interview...
1599996,1,Are you ready for your MoJo Makeover? Ask me f...,are you ready for your mojo makeover ask me fo...
1599997,1,Happy 38th Birthday to my boo of alll time!!! ...,happy th birthday to my boo of alll time tupac...


In [7]:
from transformers import AutoTokenizer

def tokenize_data(data, model_name):
    # load pre-trained tokenizer
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    tqdm.pandas()
    # tokenize each sentence in clean_text column
    data['tokens'] = data['clean_text'].progress_apply(lambda x: tokenizer.encode(x, add_special_tokens=True))
    vocab_dict = tokenizer.get_vocab()
    return data, vocab_dict, tokenizer
model_name = 'bert-base-uncased'
tokenized_data, vocab_dict, tokenizer = tokenize_data(data, model_name)

Downloading (…)okenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

100%|██████████| 1599999/1599999 [02:56<00:00, 9043.76it/s] 


In [9]:
max_ = 0
for i in data['tokens'].values:
    if len(i) > max_:
        max_ = len(i)
max_

72

In [10]:
import numpy as np

def pad_tokens(data, max_length):
    """
    Pads the tokenized sentences in the 'tokens' column of the input DataFrame to a fixed length.
    
    Args:
        data (pandas.DataFrame): The input DataFrame with the 'tokens' column containing the tokenized sentences.
        max_length (int): The maximum length of the padded sentences.
    
    Returns:
        np.ndarray: A 2D NumPy array of shape (num_sentences, max_length) containing the padded token IDs.
    """
    # Get the tokenized sentences as a list
    tokenized_sentences = data['tokens'].tolist()
    
    # Initialize an empty array to hold the padded token IDs
    padded_tokens = np.zeros((len(tokenized_sentences), max_length))
    
    # Pad each sentence with zeros to the desired length and store in the padded_tokens array
    for i, tokens in enumerate(tokenized_sentences):
        padded_tokens[i, :min(len(tokens), max_length)] = tokens[:max_length]
    
    return padded_tokens
padded_tokens = pad_tokens(data, max_)



In [11]:
import tensorflow as tf
from tensorflow.keras.layers import Dense, Input, Embedding, Dropout, SpatialDropout1D, MultiHeadAttention, LayerNormalization
from tensorflow.keras.models import Model
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.preprocessing.sequence import pad_sequences

def build_transformer_model(num_heads, feed_forward_dim, num_layers, max_seq_length, vocab_size, embedding_dim):
    # Define the input layer
    inputs = Input(shape=(max_seq_length,), dtype=tf.int32)

    # Define the embedding layer
    embedding_layer = Embedding(input_dim=vocab_size, output_dim=embedding_dim, input_length=max_seq_length)(inputs)
    embedding_layer = SpatialDropout1D(0.2)(embedding_layer)

    # Define the transformer layers
    transformer_layers = []
    for i in range(num_layers):
        transformer_layer = MultiHeadAttention(num_heads=num_heads, key_dim=embedding_dim, dropout=0.2)(embedding_layer, embedding_layer)
        transformer_layer = Dropout(0.2)(transformer_layer)
        transformer_layer = LayerNormalization(epsilon=1e-6)(transformer_layer)
        transformer_layer = Dense(feed_forward_dim, activation='relu')(transformer_layer)
        transformer_layer = Dense(embedding_dim)(transformer_layer)
        transformer_layer = Dropout(0.2)(transformer_layer)
        transformer_layer = LayerNormalization(epsilon=1e-6)(transformer_layer)
        transformer_layers.append(transformer_layer)
    transformer_output = tf.keras.layers.concatenate(transformer_layers)

    # Define the output layer
    output_layer = Dense(1, activation='sigmoid')(transformer_output)

    # Create the model
    model = Model(inputs=inputs, outputs=output_layer)

    # Compile the model
    optimizer = Adam(lr=0.00001)
    model.compile(loss='binary_crossentropy', optimizer=optimizer, metrics=['accuracy'])

    return model


In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from tensorflow.keras.callbacks import EarlyStopping

X_train,X_val,y_train, y_val = train_test_split(padded_tokens,data['target'], test_size=0.2, random_state=42)

num_heads = 8
feed_forward_dim = 512
num_layers = 4
max_seq_length = max_
vocab_size = len(vocab_dict)
embedding_dim = 64

model = build_transformer_model(num_heads, feed_forward_dim, num_layers, max_seq_length, vocab_size, embedding_dim)
model.fit(X_train, y_train, validation_data=(X_val, y_val), epochs=5, batch_size=256, shuffle=True)
model.save('whole_senti_tweet_5_epoch.h5')




In [1]:
def predict_sentiment(model, tokenizer, sentence):
    # Clean and preprocess the input sentence
    clean_sentence = clean_text(sentence)
    tokenized_sentence = tokenizer.encode(clean_sentence, add_special_tokens=True)
    padded_sentence = pad_sequences([tokenized_sentence], maxlen=max_, dtype="long", 
                                     value=0, truncating="post", padding="post")
    # Predict the sentiment label
    predicted_label = model.predict(padded_sentence)
    print()

    sentiment_dict = {0: 'Negative', 1: 'Positive'}
    predicted_sentiment = sentiment_dict[1 if predicted_label[0][0][0] > 0.5 else 0]
    
    return predicted_sentiment

In [22]:
predict_sentiment(model,tokenizer,"I am not fine")




'Negative'