# Importing libraries

In [66]:
import pandas as pd
import numpy as np
import re  # Added for regex
from sklearn.model_selection import train_test_split
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import SimpleRNN, Dense, Embedding
from tensorflow.keras.regularizers import l2

# Reading the Dataset

In [27]:
df = pd.read_csv("twitter_training.csv")
df.head()

Unnamed: 0,2401,Borderlands,Positive,"im getting on borderlands and i will murder you all ,"
0,2401,Borderlands,Positive,I am coming to the borders and I will kill you...
1,2401,Borderlands,Positive,im getting on borderlands and i will kill you ...
2,2401,Borderlands,Positive,im coming on borderlands and i will murder you...
3,2401,Borderlands,Positive,im getting on borderlands 2 and i will murder ...
4,2401,Borderlands,Positive,im getting into borderlands and i can murder y...


In [28]:
df.rename(columns={"2401": "ID", "Borderlands": "Company","Positive":"Sentiment","im getting on borderlands and i will murder you all ,":"Text"},inplace=True)

In [29]:
df.head()

Unnamed: 0,ID,Company,Sentiment,Text
0,2401,Borderlands,Positive,I am coming to the borders and I will kill you...
1,2401,Borderlands,Positive,im getting on borderlands and i will kill you ...
2,2401,Borderlands,Positive,im coming on borderlands and i will murder you...
3,2401,Borderlands,Positive,im getting on borderlands 2 and i will murder ...
4,2401,Borderlands,Positive,im getting into borderlands and i can murder y...


In [30]:
df["Sentiment"].value_counts()

Sentiment
Negative      22542
Positive      20831
Neutral       18318
Irrelevant    12990
Name: count, dtype: int64

In [31]:
#dropping "irrelevnat" sentiment
df = df[df["Sentiment"]!="Irrelevant"]


df["Sentiment"].value_counts()

Sentiment
Negative    22542
Positive    20831
Neutral     18318
Name: count, dtype: int64

# Text cleaning and Sentiment Encoding

In [32]:
df["Text"] = df["Text"].str.lower()
df["Text"] = df["Text"].replace(r'[^a-z0-9\s]', '', regex=True)

In [35]:
df["Sentiment"]=df["Sentiment"].map({"Positive":1,"Negative":0,"Neutral":2})

In [36]:
df

Unnamed: 0,ID,Company,Sentiment,Text
0,2401,Borderlands,1,i am coming to the borders and i will kill you...
1,2401,Borderlands,1,im getting on borderlands and i will kill you all
2,2401,Borderlands,1,im coming on borderlands and i will murder you...
3,2401,Borderlands,1,im getting on borderlands 2 and i will murder ...
4,2401,Borderlands,1,im getting into borderlands and i can murder y...
...,...,...,...,...
74676,9200,Nvidia,1,just realized that the windows partition of my...
74677,9200,Nvidia,1,just realized that my mac window partition is ...
74678,9200,Nvidia,1,just realized the windows partition of my mac ...
74679,9200,Nvidia,1,just realized between the windows partition of...


In [37]:
df.isna().sum()

ID             0
Company        0
Sentiment      0
Text         571
dtype: int64

In [38]:
df.dropna(inplace=True)
df.isna().sum()

ID           0
Company      0
Sentiment    0
Text         0
dtype: int64

# Tokenization and Padding

In [45]:
# shuffle the DataFrame rows
df = df.sample(frac = 1)

In [46]:
max_features = 5000  
max_length = 200    

tokenizer = Tokenizer(num_words=max_features)
tokenizer.fit_on_texts(df["Text"])
X = pad_sequences(tokenizer.texts_to_sequences(df["Text"]), maxlen=max_length)
y = df['Sentiment'].values

In [47]:
X_train, X_val, y_train, y_val = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

In [67]:
model = Sequential([
    Embedding(input_dim=max_features, output_dim=16, input_length=max_length),
    SimpleRNN(64, activation='tanh', return_sequences=False, kernel_regularizer=l2(0.01)), 
    Dense(3, activation='softmax', kernel_regularizer=l2(0.01)) 
])

model.compile(
    loss='sparse_categorical_crossentropy',
    optimizer='adam',
    metrics=['accuracy']
)


In [68]:
history = model.fit(
    X_train, y_train,
    epochs=5,
    batch_size=32,
    validation_data=(X_val, y_val),
    verbose=1
)


Epoch 1/5
[1m1528/1528[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m52s[0m 33ms/step - accuracy: 0.4603 - loss: 1.1275 - val_accuracy: 0.6985 - val_loss: 0.7908
Epoch 2/5
[1m1528/1528[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m46s[0m 30ms/step - accuracy: 0.7169 - loss: 0.7551 - val_accuracy: 0.7312 - val_loss: 0.7167
Epoch 3/5
[1m1528/1528[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m43s[0m 28ms/step - accuracy: 0.7405 - loss: 0.7137 - val_accuracy: 0.7460 - val_loss: 0.6982
Epoch 4/5
[1m1528/1528[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m43s[0m 28ms/step - accuracy: 0.7762 - loss: 0.6311 - val_accuracy: 0.7223 - val_loss: 0.7439
Epoch 5/5
[1m1528/1528[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m44s[0m 29ms/step - accuracy: 0.7886 - loss: 0.6148 - val_accuracy: 0.7511 - val_loss: 0.6781


# Try the model on Test data

In [50]:
#read the data
df_test = pd.read_csv("twitter_validation.csv",names=["ID","Company","Sentiment","Text"])
df_test.head()

Unnamed: 0,ID,Company,Sentiment,Text
0,3364,Facebook,Irrelevant,I mentioned on Facebook that I was struggling ...
1,352,Amazon,Neutral,BBC News - Amazon boss Jeff Bezos rejects clai...
2,8312,Microsoft,Negative,@Microsoft Why do I pay for WORD when it funct...
3,4371,CS-GO,Negative,"CSGO matchmaking is so full of closet hacking,..."
4,4433,Google,Neutral,Now the President is slapping Americans in the...


In [51]:
#dropping "irrelevnat" sentiment
df_test = df_test[df_test["Sentiment"]!="Irrelevant"]

In [52]:
df_test["Text"] = df_test["Text"].str.lower()
df_test["Text"] = df_test["Text"].replace(r'[^a-z0-9\s]', '', regex=True)

In [53]:
df_test["Sentiment"]=df_test["Sentiment"].map({"Positive":1,"Negative":0,"Neutral":2})

In [54]:
df_test.dropna(inplace=True)

In [55]:
df_test = df_test.sample(frac = 1)

In [56]:
x_test = pad_sequences(tokenizer.texts_to_sequences(df_test["Text"]), maxlen=max_length)
y_test = df_test['Sentiment'].values

In [69]:
score = model.evaluate(x_test, y_test, verbose=0)
print(f"Test accuracy: {score[1]:.2f}")

Test accuracy: 0.82


In [70]:
y_pred = model.predict(x_test)
y_pred_classes = np.argmax(y_pred, axis=1)

[1m26/26[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 18ms/step


In [71]:
from sklearn.metrics import classification_report

print(classification_report(y_test, y_pred_classes))

              precision    recall  f1-score   support

           0       0.81      0.89      0.85       266
           1       0.77      0.90      0.83       277
           2       0.92      0.68      0.79       285

    accuracy                           0.82       828
   macro avg       0.83      0.82      0.82       828
weighted avg       0.83      0.82      0.82       828

