In [None]:
import numpy as np
import pandas as pd

#visualization
import matplotlib.pyplot as plt
import seaborn as sns

#nlp
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

#text vectorization
from sklearn.feature_extraction.text import TfidfVectorizer

#model selection
from sklearn.model_selection import train_test_split

#classifiers
from sklearn.linear_model import LogisticRegression


#classification reports
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix, accuracy_score

#warnings
import warnings

#settings
# Suppress every warning for the rest of the session; note that this also
# hides deprecation notices raised by the libraries imported above.
warnings.filterwarnings("ignore")
%matplotlib inline

In [None]:
import nltk

# NLTK resources fetched by this notebook, in download order.
NLTK_RESOURCES = (
    'punkt',
    'stopwords',
    'wordnet',
    'maxent_ne_chunker',
    'words',
    'averaged_perceptron_tagger',
)

download_results = [nltk.download(resource) for resource in NLTK_RESOURCES]
# Echo the status of the final download as the cell's output.
download_results[-1]

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package maxent_ne_chunker to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping chunkers/maxent_ne_chunker.zip.
[nltk_data] Downloading package words to /root/nltk_data...
[nltk_data]   Unzipping corpora/words.zip.
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.


True

In [None]:
# Upload the dataset (Twitter_Data.csv) through the Colab file picker.
# The CSV is parsed into a DataFrame in the next cell.
import pandas as pd
import io

from google.colab import files


# `uploaded` is indexed by file name in the next cell to obtain the CSV bytes.
uploaded = files.upload()

Saving Twitter_Data.csv to Twitter_Data.csv


In [None]:
import pandas as pd
import io

# Wrap the uploaded bytes in a file-like buffer so pandas can parse them.
csv_bytes = uploaded['Twitter_Data.csv']
tweet_df = pd.read_csv(io.BytesIO(csv_bytes))
print(tweet_df)


                                               clean_text  category
0       when modi promised “minimum government maximum...      -1.0
1       talk all the nonsense and continue all the dra...       0.0
2       what did just say vote for modi  welcome bjp t...       1.0
3       asking his supporters prefix chowkidar their n...       1.0
4       answer who among these the most powerful world...       1.0
...                                                   ...       ...
162975  why these 456 crores paid neerav modi not reco...      -1.0
162976  dear rss terrorist payal gawar what about modi...      -1.0
162977  did you cover her interaction forum where she ...       0.0
162978  there big project came into india modi dream p...       0.0
162979  have you ever listen about like gurukul where ...       1.0

[162980 rows x 2 columns]


In [None]:
# Translate the numeric sentiment codes into readable class names.
SENTIMENT_NAMES = {-1: 'Negative', 0: 'Neutral', 1: 'Positive'}
tweet_df['category'] = tweet_df['category'].replace(SENTIMENT_NAMES)
tweet_df.head()

Unnamed: 0,clean_text,category
0,when modi promised “minimum government maximum...,Negative
1,talk all the nonsense and continue all the dra...,Neutral
2,what did just say vote for modi welcome bjp t...,Positive
3,asking his supporters prefix chowkidar their n...,Positive
4,answer who among these the most powerful world...,Positive


In [None]:
# Summarise column dtypes and non-null counts to spot missing values.
tweet_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 162980 entries, 0 to 162979
Data columns (total 2 columns):
 #   Column      Non-Null Count   Dtype 
---  ------      --------------   ----- 
 0   clean_text  162976 non-null  object
 1   category    162973 non-null  object
dtypes: object(2)
memory usage: 2.5+ MB


In [None]:
# Count the missing entries in each column before cleaning.
tweet_df.isnull().sum()

clean_text    4
category      7
dtype: int64

In [None]:
# Drop every row with a missing tweet or a missing label.
# inplace=True mutates tweet_df itself, so the unfiltered frame is no longer available afterwards.
tweet_df.dropna(inplace=True)

In [None]:
# Re-count the missing entries to confirm that none remain after dropna.
tweet_df.isnull().sum()

clean_text    0
category      0
dtype: int64

In [None]:
import re
import string
import nltk
# Fetch NLTK's stopword corpus, which clean_text reads via stopwords.words('english').
nltk.download('stopwords')
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

_STOPWORD_CACHE = {}  # language name -> frozenset of stopwords, filled on first use


def clean_text(text, stop_words=None):
    """Normalise one tweet before it is tokenised.

    The steps run in this order: strip digits, lower-case, collapse runs of
    whitespace, drop punctuation (every character that is neither a word
    character nor whitespace) and remove stopwords.

    Args:
        text: The raw tweet as a ``str``.
        stop_words: Optional collection of lower-case words to remove. When
            omitted, NLTK's English stopword list is used; it is loaded once
            and cached as a frozenset for all later calls.

    Returns:
        The cleaned text, with the surviving words joined by single spaces.
    """
    if stop_words is None:
        stop_words = _STOPWORD_CACHE.get('english')
        if stop_words is None:
            # Load the list a single time: calling stopwords.words() for every
            # word of every tweet (and scanning the resulting list) dominated
            # the runtime of the cleaning step.
            stop_words = frozenset(stopwords.words('english'))
            _STOPWORD_CACHE['english'] = stop_words

    text = re.sub(r'\d+', '', text)  # remove numbers
    text = text.lower()  # convert text to lower case
    text = re.sub(r'\s+', ' ', text)  # collapse runs of whitespace
    text = re.sub(r'[^\w\s]', '', text)  # remove punctuation
    kept_words = [word for word in text.split() if word not in stop_words]
    return ' '.join(kept_words)

# Clean every tweet and overwrite the 'clean_text' column with the result.
tweet_df['clean_text'] = tweet_df['clean_text'].apply(clean_text)

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [None]:
import nltk

# Fetch the 'punkt' tokenizer data before tokenizing.
nltk.download('punkt')


def count_tokens(sentence):
    """Return the number of tokens word_tokenize finds in ``sentence``."""
    return len(word_tokenize(sentence))


# Add a column holding the token count of every cleaned tweet.
tweet_df['sentence_length'] = tweet_df['clean_text'].apply(count_tokens)

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


In [None]:
#Split data into the independent variable (X, the tweet text) and the dependent variable (y, the sentiment label):
X = tweet_df['clean_text']
y = tweet_df['category']

In [None]:
#Prepare the text data for the model:
#For integer encoding and padding, you can use TensorFlow's Tokenizer and pad_sequences:

In [None]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Build the vocabulary from every cleaned tweet, then replace each tweet with
# its sequence of integer word indices.
# NOTE(review): the tokenizer is fitted on all of X, i.e. before the
# train/test split made later in the notebook, so test-set vocabulary is
# visible during preprocessing.
tokenizer = Tokenizer()
tokenizer.fit_on_texts(X)
X_encoded = tokenizer.texts_to_sequences(X)
# padding='pre' inserts the padding at the start of each sequence; the result
# is a 2-D array whose second dimension is later used as the model input length.
X_padded = pad_sequences(X_encoded, padding='pre')


In [None]:
#Build an LSTM model:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout

# +1 because the tokenizer's word indices start at 1 (0 is the padding value).
vocab_size = len(tokenizer.word_index) + 1
max_length = X_padded.shape[1]

# Embedding -> LSTM -> softmax over the three sentiment classes.
model = Sequential([
    Embedding(vocab_size, 100, input_length=max_length),
    LSTM(100),
    Dense(3, activation='softmax'),
])

model.compile(
    optimizer='adam',
    loss='categorical_crossentropy',
    metrics=['accuracy'],
)


In [None]:
#Do dummy variable creation for the dependent variable:
# One indicator column per category value; the column order determines which
# output unit of the model corresponds to which class.
y_encoded = pd.get_dummies(y)

In [None]:
#Split the data into train and test sets:
from sklearn.model_selection import train_test_split
# Hold out 20% of the rows for testing; random_state pins the shuffle so the split is reproducible.
X_train, X_test, y_train, y_test = train_test_split(X_padded, y_encoded, test_size=0.2, random_state=42)

In [None]:
#Train the model:
# NOTE(review): the held-out test split is also passed as validation data, so
# the validation metrics are not independent of the final test evaluation.
model.fit(X_train, y_train, epochs=10, batch_size=32, validation_data=(X_test, y_test))

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.src.callbacks.History at 0x7a272669bbb0>

In [None]:
#Convert the predicted class probabilities into class labels:
y_pred = model.predict(X_test)
# Every row of y_pred holds one probability per class, in the column order of
# y_encoded. Pick the most likely class of each row and map it back to its
# label, so the predictions keep all three classes and use the same string
# labels as the one-hot target columns (collapsing them to 0/1 integers makes
# classification_report reject the mix of strings and numbers).
y_pred_norm = y_encoded.columns[y_pred.argmax(axis=1)].to_numpy()



In [None]:
#Print Classification report:
from sklearn.metrics import classification_report
# y_test.idxmax(axis=1) turns every one-hot row back into its column name
# (Negative / Neutral / Positive), so y_pred_norm has to contain those same
# string labels; mixing strings with numbers makes scikit-learn raise a ValueError.
print(classification_report(y_test.idxmax(axis=1), y_pred_norm))

ValueError: Mix of label input types (string and number)

In [None]:
# Debugging aid: show the dtype of each one-hot target column and the Python
# type of a single prediction to diagnose the label-type mismatch.
print(y_test.dtypes)
print(type(y_pred_norm[0]))

Negative    uint8
Neutral     uint8
Positive    uint8
dtype: object
<class 'int'>


In [None]:
# Collapse each one-hot row of y_test to its class label (the name of the
# column holding the 1). The reduction has to run row by row (axis=1):
# DataFrame.apply defaults to axis=0, which reduces each of the three columns
# instead and yields only three values rather than one per test sample.
# Predictions compared against y_test must use these same string labels.
# The isinstance guard keeps the cell safe to re-run once y_test is a Series.
if isinstance(y_test, pd.DataFrame):
    y_test = y_test.idxmax(axis=1)

In [None]:
from sklearn.metrics import classification_report

# Per-class precision, recall and F1. Both arguments must be equally long
# sequences with one entry per test sample and the same kind of label.
print(classification_report(y_test, y_pred_norm))

ValueError: Found input variables with inconsistent numbers of samples: [3, 32594]