In [1]:
import sys
import os
sys.path.append(os.path.abspath('..'))
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, ConfusionMatrixDisplay
import matplotlib.pyplot as plt
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.optimizers import Adam
from utils import preprocess_data, extract_features
from datasets import load_dataset

[nltk_data] Downloading package punkt to /Users/markmaci/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/markmaci/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [2]:
dataset = load_dataset("sentiment140")
df = pd.DataFrame(dataset['train'])

print(df.head())


You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this dataset from the next major release of `datasets`.


                                                text  \
0  @switchfoot http://twitpic.com/2y1zl - Awww, t...   
1  is upset that he can't update his Facebook by ...   
2  @Kenichan I dived many times for the ball. Man...   
3    my whole body feels itchy and like its on fire    
4  @nationwideclass no, it's not behaving at all....   

                           date             user  sentiment     query  
0  Mon Apr 06 22:19:45 PDT 2009  _TheSpecialOne_          0  NO_QUERY  
1  Mon Apr 06 22:19:49 PDT 2009    scotthamilton          0  NO_QUERY  
2  Mon Apr 06 22:19:53 PDT 2009         mattycus          0  NO_QUERY  
3  Mon Apr 06 22:19:57 PDT 2009          ElleCTF          0  NO_QUERY  
4  Mon Apr 06 22:19:57 PDT 2009           Karoli          0  NO_QUERY  


In [3]:
# Rename 'sentiment' to 'target'
df = df.rename(columns={"sentiment": "target"})

# Drop rows with NaN values in 'target' and 'text'
df.dropna(subset=['target', 'text'], inplace=True)

# Preprocess data
df = preprocess_data(df)


In [4]:
# Feature extraction method: 'tfidf' or 'glove'
feature_method = 'tfidf'
glove_path = '../data/glove.840B.300d.txt'

# Extract features
X = extract_features(df, method=feature_method, glove_path=glove_path)
y = df['target'].values

# Encode the target labels
le = LabelEncoder()
y = le.fit_transform(y)

# Ensure feature matrix X and target vector y are of the same length
assert len(X) == len(y), "Mismatch in lengths of X and y"

print("Feature extraction complete. Proceeding with training...")

Feature extraction complete. Proceeding with training...


In [5]:
# Split the data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [6]:
simple_model = Sequential([
    Dense(10, input_shape=(X_train.shape[1],), activation='relu'),
    Dense(1, activation='sigmoid')
])

simple_model.compile(optimizer=Adam(learning_rate=0.001), loss='binary_crossentropy', metrics=['accuracy'])

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)
2024-05-27 21:35:29.306647: I metal_plugin/src/device/metal_device.cc:1154] Metal device set to: Apple M1 Max
2024-05-27 21:35:29.306731: I metal_plugin/src/device/metal_device.cc:296] systemMemory: 32.00 GB
2024-05-27 21:35:29.306736: I metal_plugin/src/device/metal_device.cc:313] maxCacheSize: 10.67 GB
2024-05-27 21:35:29.306761: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:305] Could not identify NUMA node of platform GPU ID 0, defaulting to 0. Your kernel may not have been built with NUMA support.
2024-05-27 21:35:29.306810: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:271] Created TensorFlow device (/job:localhost/replica:0/task:0/device:GPU:0 with 0 MB memory) -> physical PluggableDevice (device: 0, name: METAL, pci bus id: <undefined>)


In [None]:
# Train the model
history = model.fit(X_train, y_train, epochs=10, batch_size=32, validation_split=0.2)


In [None]:
# Make predictions
y_pred = (model.predict(X_test) > 0.5).astype("int32")


In [None]:
# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred)
print(f"Accuracy: {accuracy}")
print(f"Classification Report:\n {report}")


In [None]:
# Plot confusion matrix
cm = confusion_matrix(y_test, y_pred)
disp = ConfusionMatrixDisplay(confusion_matrix=cm)
disp.plot(cmap=plt.cm.Blues)
plt.title('Confusion Matrix')
plt.show()
