In [1]:
import pandas as pd

# Load the datasets
train_df = pd.read_csv('train_tweet.csv')
test_df = pd.read_csv('test_tweets.csv')

# Display the first few rows of each dataset
train_df.head()
test_df.head()


Unnamed: 0,id,tweet
0,31963,#studiolife #aislife #requires #passion #dedic...
1,31964,@user #white #supremacists want everyone to s...
2,31965,safe ways to heal your #acne!! #altwaystohe...
3,31966,is the hp and the cursed child book up for res...
4,31967,"3rd #bihday to my amazing, hilarious #nephew..."


In [2]:
import re
import nltk
from nltk.corpus import stopwords

# Download stopwords
nltk.download('stopwords')
stop_words = set(stopwords.words('english'))

# Function to clean text
def clean_text(text):
    # Remove user mentions, URLs, and special characters
    text = re.sub(r'@[\w]*', '', text)  # remove @user
    text = re.sub(r'http\S+|www\S+', '', text)  # remove URLs
    text = re.sub(r'[^a-zA-Z\s]', '', text)  # keep only letters
    text = text.lower()  # convert to lowercase
    # Remove stop words
    text = ' '.join(word for word in text.split() if word not in stop_words)
    return text

# Apply cleaning function to the tweet column in both datasets
train_df['cleaned_tweet'] = train_df['tweet'].apply(clean_text)
test_df['cleaned_tweet'] = test_df['tweet'].apply(clean_text)

# Display cleaned tweets
train_df[['tweet', 'cleaned_tweet']].head(), test_df[['tweet', 'cleaned_tweet']].head()


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Admin\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


(                                               tweet  \
 0   @user when a father is dysfunctional and is s...   
 1  @user @user thanks for #lyft credit i can't us...   
 2                                bihday your majesty   
 3  #model   i love u take with u all the time in ...   
 4             factsguide: society now    #motivation   
 
                                        cleaned_tweet  
 0  father dysfunctional selfish drags kids dysfun...  
 1  thanks lyft credit cant use cause dont offer w...  
 2                                     bihday majesty  
 3                        model love u take u time ur  
 4                      factsguide society motivation  ,
                                                tweet  \
 0  #studiolife #aislife #requires #passion #dedic...   
 1   @user #white #supremacists want everyone to s...   
 2  safe ways to heal your #acne!!    #altwaystohe...   
 3  is the hp and the cursed child book up for res...   
 4    3rd #bihday to my amazing, h

In [3]:
# Define a basic list of common stop words as a substitute for nltk's stopwords
basic_stop_words = {
    'i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your',
    'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', 
    "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 
    'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 
    'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 
    'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 
    'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 
    'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 
    'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', 'too', 'very', 's', 't', 'can', 'will', 'just', 
    'don', "don't", 'should', "should've", 'now', 'd', 'll', 'm', 'o', 're', 've', 'y', 'ain', 'aren', "aren't", 'couldn', 
    "couldn't", 'didn', "didn't", 'doesn', "doesn't", 'hadn', "hadn't", 'hasn', "hasn't", 'haven', "haven't", 'isn', 
    "isn't", 'ma', 'mightn', "mightn't", 'mustn', "mustn't", 'needn', "needn't", 'shan', "shan't", 'shouldn', "shouldn't", 
    'wasn', "wasn't", 'weren', "weren't", 'won', "won't", 'wouldn', "wouldn't"
}

# Update the cleaning function to use the basic_stop_words
def clean_text_basic(text):
    text = re.sub(r'@[\w]*', '', text)  # remove @user mentions
    text = re.sub(r'http\S+|www\S+', '', text)  # remove URLs
    text = re.sub(r'[^a-zA-Z\s]', '', text)  # keep only letters
    text = text.lower()  # convert to lowercase
    # Remove stop words
    text = ' '.join(word for word in text.split() if word not in basic_stop_words)
    return text

# Apply the updated cleaning function to the tweet column in both datasets
train_df['cleaned_tweet'] = train_df['tweet'].apply(clean_text_basic)
test_df['cleaned_tweet'] = test_df['tweet'].apply(clean_text_basic)

# Display cleaned tweets
train_df[['tweet', 'cleaned_tweet']].head(), test_df[['tweet', 'cleaned_tweet']].head()


(                                               tweet  \
 0   @user when a father is dysfunctional and is s...   
 1  @user @user thanks for #lyft credit i can't us...   
 2                                bihday your majesty   
 3  #model   i love u take with u all the time in ...   
 4             factsguide: society now    #motivation   
 
                                        cleaned_tweet  
 0  father dysfunctional selfish drags kids dysfun...  
 1  thanks lyft credit cant use cause dont offer w...  
 2                                     bihday majesty  
 3                        model love u take u time ur  
 4                      factsguide society motivation  ,
                                                tweet  \
 0  #studiolife #aislife #requires #passion #dedic...   
 1   @user #white #supremacists want everyone to s...   
 2  safe ways to heal your #acne!!    #altwaystohe...   
 3  is the hp and the cursed child book up for res...   
 4    3rd #bihday to my amazing, h

In [None]:
!pip uninstall numpy
!pip install numpy==1.24.3  # Or another compatible version below 2.0


In [4]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Define parameters
max_vocab_size = 10000  # Maximum vocabulary size for tokenizer
max_sequence_length = 100  # Maximum sequence length for padding

# Tokenize and pad sequences
tokenizer = Tokenizer(num_words=max_vocab_size, oov_token="<OOV>")
tokenizer.fit_on_texts(train_df['cleaned_tweet'])

# Convert text to sequences and pad them
X_train = tokenizer.texts_to_sequences(train_df['cleaned_tweet'])
X_train_padded = pad_sequences(X_train, maxlen=max_sequence_length, padding='post', truncating='post')

# Prepare labels
y_train = train_df['label'].values

# Check the shape of the padded sequences and labels
X_train_padded.shape, y_train.shape



A module that was compiled using NumPy 1.x cannot be run in
NumPy 2.0.2 as it may crash. To support both 1.x and 2.x
versions of NumPy, modules must be compiled with NumPy 2.0.
Some module may need to rebuild instead e.g. with 'pybind11>=2.12'.

If you are a user of the module, the easiest solution will be to
downgrade to 'numpy<2' or try to upgrade the affected module.
We expect that some modules will need time to support NumPy 2.

Traceback (most recent call last):  File "<frozen runpy>", line 198, in _run_module_as_main
  File "<frozen runpy>", line 88, in _run_code
  File "C:\Users\Admin\AppData\Roaming\Python\Python312\site-packages\ipykernel_launcher.py", line 18, in <module>
    app.launch_new_instance()
  File "C:\Users\Admin\AppData\Roaming\Python\Python312\site-packages\traitlets\config\application.py", line 1075, in launch_instance
    app.start()
  File "C:\Users\Admin\AppData\Roaming\Python\Python312\site-packages\ipykernel\kernelapp.py", line 739, in start
    self.io_lo

ImportError: 
A module that was compiled using NumPy 1.x cannot be run in
NumPy 2.0.2 as it may crash. To support both 1.x and 2.x
versions of NumPy, modules must be compiled with NumPy 2.0.
Some module may need to rebuild instead e.g. with 'pybind11>=2.12'.

If you are a user of the module, the easiest solution will be to
downgrade to 'numpy<2' or try to upgrade the affected module.
We expect that some modules will need time to support NumPy 2.




A module that was compiled using NumPy 1.x cannot be run in
NumPy 2.0.2 as it may crash. To support both 1.x and 2.x
versions of NumPy, modules must be compiled with NumPy 2.0.
Some module may need to rebuild instead e.g. with 'pybind11>=2.12'.

If you are a user of the module, the easiest solution will be to
downgrade to 'numpy<2' or try to upgrade the affected module.
We expect that some modules will need time to support NumPy 2.

Traceback (most recent call last):  File "<frozen runpy>", line 198, in _run_module_as_main
  File "<frozen runpy>", line 88, in _run_code
  File "C:\Users\Admin\AppData\Roaming\Python\Python312\site-packages\ipykernel_launcher.py", line 18, in <module>
    app.launch_new_instance()
  File "C:\Users\Admin\AppData\Roaming\Python\Python312\site-packages\traitlets\config\application.py", line 1075, in launch_instance
    app.start()
  File "C:\Users\Admin\AppData\Roaming\Python\Python312\site-packages\ipykernel\kernelapp.py", line 739, in start
    self.io_lo

ImportError: 
A module that was compiled using NumPy 1.x cannot be run in
NumPy 2.0.2 as it may crash. To support both 1.x and 2.x
versions of NumPy, modules must be compiled with NumPy 2.0.
Some module may need to rebuild instead e.g. with 'pybind11>=2.12'.

If you are a user of the module, the easiest solution will be to
downgrade to 'numpy<2' or try to upgrade the affected module.
We expect that some modules will need time to support NumPy 2.




A module that was compiled using NumPy 1.x cannot be run in
NumPy 2.0.2 as it may crash. To support both 1.x and 2.x
versions of NumPy, modules must be compiled with NumPy 2.0.
Some module may need to rebuild instead e.g. with 'pybind11>=2.12'.

If you are a user of the module, the easiest solution will be to
downgrade to 'numpy<2' or try to upgrade the affected module.
We expect that some modules will need time to support NumPy 2.

Traceback (most recent call last):  File "<frozen runpy>", line 198, in _run_module_as_main
  File "<frozen runpy>", line 88, in _run_code
  File "C:\Users\Admin\AppData\Roaming\Python\Python312\site-packages\ipykernel_launcher.py", line 18, in <module>
    app.launch_new_instance()
  File "C:\Users\Admin\AppData\Roaming\Python\Python312\site-packages\traitlets\config\application.py", line 1075, in launch_instance
    app.start()
  File "C:\Users\Admin\AppData\Roaming\Python\Python312\site-packages\ipykernel\kernelapp.py", line 739, in start
    self.io_lo

ImportError: 
A module that was compiled using NumPy 1.x cannot be run in
NumPy 2.0.2 as it may crash. To support both 1.x and 2.x
versions of NumPy, modules must be compiled with NumPy 2.0.
Some module may need to rebuild instead e.g. with 'pybind11>=2.12'.

If you are a user of the module, the easiest solution will be to
downgrade to 'numpy<2' or try to upgrade the affected module.
We expect that some modules will need time to support NumPy 2.



SystemError: initialization of _pywrap_checkpoint_reader raised unreported exception

In [None]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout
from tensorflow.keras.callbacks import EarlyStopping

# Model parameters
embedding_dim = 64
lstm_units = 128

# Build the LSTM model
model = Sequential([
    Embedding(input_dim=max_vocab_size, output_dim=embedding_dim, input_length=max_sequence_length),
    LSTM(units=lstm_units, return_sequences=True),
    Dropout(0.2),
    LSTM(units=lstm_units),
    Dropout(0.2),
    Dense(64, activation='relu'),
    Dense(1, activation='sigmoid')  # Sigmoid for binary classification
])

# Compile the model
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# Train the model
early_stopping = EarlyStopping(monitor='val_loss', patience=3, restore_best_weights=True)
history = model.fit(X_train_padded, y_train, epochs=10, batch_size=32, validation_split=0.2, callbacks=[early_stopping])

# Model evaluation and test predictions
# Prepare test data
X_test = tokenizer.texts_to_sequences(test_df['cleaned_tweet'])
X_test_padded = pad_sequences(X_test, maxlen=max_sequence_length, padding='post', truncating='post')

# Predict sentiment for the test dataset
test_predictions = (model.predict(X_test_padded) > 0.5).astype("int32")

# Save predictions to a DataFrame and export as CSV
test_df['label'] = test_predictions
test_df[['id', 'label']].to_csv('test_predictions.csv', index=False)

print("Model training complete. Predictions saved to 'test_predictions.csv'.")
