<a href="https://colab.research.google.com/github/murali-marimekala/ml_sentiment_analysis/blob/main/notebooks/ml_logistic_regression_model.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install tensorflow-text==2.13.*

Collecting tensorflow-text==2.13.*
  Downloading tensorflow_text-2.13.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (2.0 kB)
Collecting tensorflow<2.14,>=2.13.0 (from tensorflow-text==2.13.*)
  Downloading tensorflow-2.13.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (3.4 kB)
Collecting gast<=0.4.0,>=0.2.1 (from tensorflow<2.14,>=2.13.0->tensorflow-text==2.13.*)
  Downloading gast-0.4.0-py3-none-any.whl.metadata (1.1 kB)
Collecting keras<2.14,>=2.13.1 (from tensorflow<2.14,>=2.13.0->tensorflow-text==2.13.*)
  Downloading keras-2.13.1-py3-none-any.whl.metadata (2.4 kB)
Collecting numpy<=1.24.3,>=1.22 (from tensorflow<2.14,>=2.13.0->tensorflow-text==2.13.*)
  Downloading numpy-1.24.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (5.6 kB)
Collecting tensorboard<2.14,>=2.13 (from tensorflow<2.14,>=2.13.0->tensorflow-text==2.13.*)
  Downloading tensorboard-2.13.0-py3-none-any.whl.metadata (1.8 kB)
Collecting tensorflow-es

In [3]:
import pandas as pd
import zipfile
from google.colab import files
from sklearn.model_selection import train_test_split
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Embedding, LSTM, Dense
from keras.optimizers import Adam
from sklearn.metrics import accuracy_score

In [4]:
# Prompt user to upload the zip file
# The returned mapping is keyed by uploaded file name; the extraction cell
# iterates over those keys and opens each one as a zip archive, so only zip
# files should be selected here.
uploaded = files.upload()

Saving training_dataset_1.6m.zip to training_dataset_1.6m.zip


In [5]:
# Unpack every uploaded archive into the local 'data' directory.
# Iterating the mapping yields its keys, i.e. the uploaded file names.
for archive_name in uploaded:
    with zipfile.ZipFile(archive_name) as archive:  # read mode is the default
        archive.extractall('data')

In [6]:
# Load dataset
# The column names are supplied here rather than read from the file, so the CSV
# is parsed with header=None: without it pandas treats the first record as a
# header row, and that record is silently lost once the columns are renamed.
# NOTE(review): assumes the CSV has no header line of its own — confirm.
data = pd.read_csv(
    'data/training_dataset_1.6m.csv',
    encoding='latin-1',
    header=None,
    names=['target', 'ids', 'date', 'flag', 'user', 'text'],
)

In [7]:
# Preprocessing
X = data['text']
# The model ends in a single sigmoid unit trained with binary cross-entropy,
# and predictions are thresholded to 0/1 before being scored, so the labels
# must be 0/1 as well. Any positive target code is mapped to 1 and 0 stays 0;
# labels that are already 0/1 pass through unchanged.
# NOTE(review): this dataset is presumed to encode positive sentiment as 4 — confirm.
y = (data['target'] > 0).astype(int)
# Fixed random_state keeps the 80/20 split reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [8]:
# Tokenization and Padding
# The vocabulary is fitted on the training texts only; the test texts are
# encoded with that same fitted tokenizer.
tokenizer = Tokenizer(num_words=5000)
tokenizer.fit_on_texts(X_train)

# Texts -> integer sequences (train first, then test).
X_train_seq, X_test_seq = (
    tokenizer.texts_to_sequences(texts) for texts in (X_train, X_test)
)

# Integer sequences -> fixed-length arrays of 100 tokens.
X_train_pad, X_test_pad = (
    pad_sequences(sequences, maxlen=100) for sequences in (X_train_seq, X_test_seq)
)

In [9]:
# Model Building
# Embedding -> LSTM -> single sigmoid unit, declared as one layer list.
# input_dim and input_length mirror the tokenizer's num_words=5000 and the
# padding maxlen=100 used in the tokenization cell.
model = Sequential([
    Embedding(input_dim=5000, output_dim=128, input_length=100),
    LSTM(128),
    Dense(1, activation='sigmoid'),
])

In [10]:
# Model Compilation
# Binary cross-entropy matches the single sigmoid output; accuracy is tracked
# as the reported metric.
model.compile(
    loss='binary_crossentropy',
    optimizer=Adam(learning_rate=0.001),
    metrics=['accuracy'],
)

In [None]:
# Training
# Five epochs in batches of 64; validation_split=0.2 holds part of the
# training arrays out for per-epoch validation.
model.fit(
    X_train_pad,
    y_train,
    batch_size=64,
    epochs=5,
    validation_split=0.2,
)

Epoch 1/5

In [None]:
# Prediction
# Score the held-out split: sigmoid outputs above 0.5 become class 1, the rest 0.
y_prob = model.predict(X_test_pad)
y_pred = (y_prob > 0.5).astype(int)
# NOTE(review): the comparison is only meaningful if y_test uses the same 0/1
# encoding as the thresholded predictions — confirm.
test_accuracy = accuracy_score(y_test, y_pred)
print(f'Accuracy: {test_accuracy}')

In [None]:
# Example Prediction
# Run one hand-written sentence through the same tokenize -> pad -> predict
# path used for the test split; the printed value is the raw model output.
example_text = ["I love this product!"]
example_seq = tokenizer.texts_to_sequences(example_text)
example_pad = pad_sequences(example_seq, maxlen=100)
example_score = model.predict(example_pad)
print(f'Predicted Sentiment: {example_score}')