In [1]:
import pandas as pd

# Define column names for the dataset
columns = ['target', 'id', 'date', 'flag', 'user', 'text']

# Load the dataset into a pandas DataFrame
dataset_path = '/kaggle/input/sentiment140/training.1600000.processed.noemoticon.csv'  # Replace this with the actual path to your dataset file
df = pd.read_csv(dataset_path, encoding='latin-1', header=None, names=columns)

# Display the first few rows of the DataFrame
print(df.head())

   target          id                          date      flag  \
0       0  1467810369  Mon Apr 06 22:19:45 PDT 2009  NO_QUERY   
1       0  1467810672  Mon Apr 06 22:19:49 PDT 2009  NO_QUERY   
2       0  1467810917  Mon Apr 06 22:19:53 PDT 2009  NO_QUERY   
3       0  1467811184  Mon Apr 06 22:19:57 PDT 2009  NO_QUERY   
4       0  1467811193  Mon Apr 06 22:19:57 PDT 2009  NO_QUERY   

              user                                               text  
0  _TheSpecialOne_  @switchfoot http://twitpic.com/2y1zl - Awww, t...  
1    scotthamilton  is upset that he can't update his Facebook by ...  
2         mattycus  @Kenichan I dived many times for the ball. Man...  
3          ElleCTF    my whole body feels itchy and like its on fire   
4           Karoli  @nationwideclass no, it's not behaving at all....  


In [2]:
from sklearn.model_selection import train_test_split
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, Conv1D, GlobalMaxPooling1D, Dense, Dropout

# Preprocess the data
X = df['text']
y = df['target'].replace({4: 1})  # Convert polarity 4 to 1 for positive sentiment
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Tokenize the text data
max_words = 10000  # Maximum number of words to tokenize
tokenizer = Tokenizer(num_words=max_words)
tokenizer.fit_on_texts(X_train)
X_train_seq = tokenizer.texts_to_sequences(X_train)
X_test_seq = tokenizer.texts_to_sequences(X_test)

2024-05-19 13:20:30.711775: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-05-19 13:20:30.711908: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-05-19 13:20:30.881939: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


In [3]:
# Pad sequences to ensure uniform length for input to the model
max_sequence_length = 100  # Maximum length of sequences
X_train_pad = pad_sequences(X_train_seq, maxlen=max_sequence_length)
X_test_pad = pad_sequences(X_test_seq, maxlen=max_sequence_length)

# Define different filter sizes
filter_sizes = [ 5,7,10]

# Define embedding size
embedding_dim = 100  # You can experiment with different embedding dimensions


In [4]:
# Record results for different architectures
results = {}

for filter_size in filter_sizes:
    # Define model architecture
    model = Sequential([
        Embedding(input_dim=max_words, output_dim=embedding_dim, input_length=max_sequence_length),
        Conv1D(128, kernel_size=filter_size, activation='relu'),
        GlobalMaxPooling1D(),
        Dense(128, activation='relu'),
        Dropout(0.5),
        Dense(1, activation='sigmoid')
    ])
    
    # Compile the model
    model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
    
    # Train the model
    model.fit(X_train_pad, y_train, epochs=2, batch_size=64, validation_split=0.1, verbose=1)
    
    # Evaluate the model
    loss, accuracy = model.evaluate(X_test_pad, y_test, verbose=0)
    results[filter_size] = accuracy
    print(f'Filter size {filter_size}: Test Accuracy = {accuracy}')

# Print results
print("\nResults:")
for filter_size, accuracy in results.items():
    print(f'Filter size {filter_size}: Test Accuracy = {accuracy}')



Epoch 1/2
[1m18000/18000[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m629s[0m 35ms/step - accuracy: 0.7889 - loss: 0.4479 - val_accuracy: 0.8210 - val_loss: 0.3964
Epoch 2/2
[1m18000/18000[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m625s[0m 35ms/step - accuracy: 0.8315 - loss: 0.3786 - val_accuracy: 0.8229 - val_loss: 0.3900
Filter size 5: Test Accuracy = 0.8232687711715698
Epoch 1/2
[1m18000/18000[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m787s[0m 44ms/step - accuracy: 0.7897 - loss: 0.4481 - val_accuracy: 0.8209 - val_loss: 0.3957
Epoch 2/2
[1m18000/18000[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m793s[0m 44ms/step - accuracy: 0.8306 - loss: 0.3796 - val_accuracy: 0.8228 - val_loss: 0.3927
Filter size 7: Test Accuracy = 0.8223656415939331
Epoch 1/2
[1m18000/18000[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1039s[0m 58ms/step - accuracy: 0.7892 - loss: 0.4491 - val_accuracy: 0.8194 - val_loss: 0.3991
Epoch 2/2
[1m18000/18000[0m [32m━━━━━━━━━━━━━━━━━━━━[0m

In [5]:
import numpy as np

# Select 10 random indices from the test dataset
random_indices = np.random.choice(len(X_test), size=10, replace=False)

# Iterate over the selected indices
for idx in random_indices:
    # Get the text and true label for the selected index
    text = X_test.iloc[idx]
    true_label = y_test.iloc[idx]
    
    # Preprocess the text (tokenization and padding)
    text_seq = tokenizer.texts_to_sequences([text])
    text_pad = pad_sequences(text_seq, maxlen=max_sequence_length)
    
    # Predict the sentiment label for the text
    predicted_label = model.predict(text_pad)[0][0]
    predicted_label = round(predicted_label)  # Round to 0 or 1
    
    # Print the text, predicted label, and true label
    print(f'Text: {text}')
    print(f'Predicted Label: {predicted_label}, True Label: {true_label}')
    print('-----------------------------------------')


[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 97ms/step
Text: Up was such a good movie! I would've cried 3 times but @Jdiamondisme started sobbing and it made me laugh hahaha 
Predicted Label: 1, True Label: 1
-----------------------------------------
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 25ms/step
Text: It's officialy 4:21 AM and I can't sleep ! ! 
Predicted Label: 0, True Label: 0
-----------------------------------------
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 24ms/step
Text: Ohhh I hope you feel better   I appreciate you taking a look for me.  It's special to me and something I would really like to do
Predicted Label: 1, True Label: 0
-----------------------------------------
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 24ms/step
Text: BlueVoda &amp; VodaHost Features: http://bit.ly/cM16p  Check it out! 
Predicted Label: 1, True Label: 1
-----------------------------------------
[1m1/1[0m [32m━━━━━━━━