In [1]:
import pandas as pd
import numpy as np

Positive_sentiment_data =pd.read_csv('pos.csv',header=None,index_col=None)
Negative_sentiment_data=pd.read_csv('neg.csv',header=None,index_col=None)
Neutral_sentiment_data=pd.read_csv('neutral.csv',header=None,index_col=None)


In [2]:
Positive_sentiment_data[0].iloc[0]

'Henry Selick’s first movie since 2009’s Coraline. His fifth stop-motion masterpiece.'

In [3]:
Negative_sentiment_data[0].iloc[1]

"Small pleasures aside, the movie doesn't offer anything particularly memorable or inventive."

In [4]:
Neutral_sentiment_data[0].iloc[4753]

'कहते ैं ुाँ “सहस््ा्ी शहर” ैं – ाा ्ा ा सहस््ा्ी ीी ा े ा ीं ै ैॉ ें ैी ी ै ्् े ा् ो ीं ा ो ि् ाऊटर ा े ा ा ा ूँ ा ै ो ैॉ ा े ा ें े ूिा ीएचबीी ुाँ ा ि्ु् ो् ो ीं ा ा ै ाा ा ु ु ्ा ुाँ ैे े्ो ें ्े ि ाे े ि ्ा – ाँ ो ं े ाी िी ्ा े ि ू ें ो िा ा ा ाे ैं ूिों ो ्ा ाू ि िा े िी ोे ें ेे ाँ ें ा ा् े े्ो े ीं ्ाा िी ी ै '

In [5]:
#!pip install langdetect


In [6]:
import re
import spacy
from contractions import fix
from langdetect import detect, DetectorFactory
from langdetect.lang_detect_exception import LangDetectException

# Ensure consistent language detection results
DetectorFactory.seed = 0

In [7]:
# Load the SpaCy model
nlp = spacy.load("en_core_web_sm")

In [8]:
# Function to remove digits from text
def remove_digits(text):
    text = str(text)
    return re.sub(r'\d+', '', text)

# Function to expand contractions (e.g., "don't" → "do not")
def expand_contractions(text):
    return fix(text)

# Function to normalize elongated words (e.g., "soooo" → "soo")
def normalize_elongated_words(text):
    return re.sub(r'(.)\1+', r'\1\1', text)  # replaces "soooo" with "soo"

# Function to remove URLs, mentions (@user), and hashtags (#topic)
def remove_urls_mentions_hashtags(text):
    # Remove URLs (http:// or https:// links)
    text = re.sub(r'http\S+|www\S+|https\S+', '', text, flags=re.MULTILINE)
    # Remove mentions (@user) and hashtags (#topic)
    text = re.sub(r'@\w+|#\w+', '', text)
    return text

# Function to remove special characters and punctuation
def remove_special_characters(text):
    return re.sub(r'[^\w\s]', "", text)  # Remove everything except alphanumeric and spaces

# Function to clean HTML tags from text
def remove_html_tags(text):
    return re.sub(r"<.*?>", "", text)

# Function to handle negations
def handle_negations(doc):
    tokens = []
    skip_next = False
    for i, token in enumerate(doc):
        # Skip token if it was handled as a negation (already replaced)
        if skip_next:
            skip_next = False
            continue
        
        # If the token is a negation word (like 'not'), append "not" and the next important word separately
        if token.dep_ == 'neg' and i + 1 < len(doc):
            next_token = doc[i + 1]
            if not next_token.is_stop:
                tokens.append('not')
                tokens.append(next_token.lemma_)
                skip_next = True  # Skip the next token because it's already handled
        # Regular token processing
        elif token.dep_ != 'neg' and not token.is_stop and token.pos_ not in ['PUNCT', 'PRON']:
            tokens.append(token.lemma_)
    return tokens

# Function to retain important POS and handle negations
def retain_important_pos(sentence):
    doc = nlp(sentence.lower())
    # Handle negations and keep adjectives, adverbs, verbs, and nouns
    important_words = handle_negations(doc)
    return ' '.join(important_words)

# Function to remove non-English words
def remove_non_english_words(text):
    """
    Remove words that are not in English from the given text.

    Parameters:
        text (str): The input text.
    
    Returns:
        str: The text containing only English words or those detected as English.
    """
    words = text.split()  # Split text into words
    retained_words = []
    for word in words:
        try:
            # Retain the word if detected as English
            if detect(word) == "en" or word.isascii():
                retained_words.append(word)
        except LangDetectException:
            retained_words.append(word)  # Retain the word if detection fails
    return " ".join(retained_words)


In [10]:
# Function to clean and process the text using all steps
def full_text_cleaning(text):
    # Step 1: Remove digits
    text = remove_digits(text)
    
    # Step 2: Remove URLs, mentions, and hashtags
    text = remove_urls_mentions_hashtags(text)
    
    # Step 3: Remove HTML tags
    text = remove_html_tags(text)
    
    # Step 4: Remove special characters and punctuation
    text = remove_special_characters(text)
    
    # Step 5: Expand contractions
    text = expand_contractions(text)
    
    # Step 6: Normalize elongated words
    text = normalize_elongated_words(text)
    
    # Step 7: Lemmatize the text, retain important POS, and handle negations
    text = retain_important_pos(text)
    
    # Step 8: Filter non-English text
    text = remove_non_english_words(text)
    
    return text

In [11]:
# Apply the full_text_cleaning and split instead of word_tokenize
Positive_sentiment_data[0] = Positive_sentiment_data[0].apply(lambda x: full_text_cleaning(x).split())
Negative_sentiment_data[0] = Negative_sentiment_data[0].apply(lambda x: full_text_cleaning(x).split())
Neutral_sentiment_data[0] = Neutral_sentiment_data[0].apply(lambda x: full_text_cleaning(x).split())


In [12]:
Positive_sentiment_data[0].iloc[0]

['henry', 'selick', 'movie', 'coraline', 'fifth', 'stopmotion', 'masterpiece']

In [13]:
Negative_sentiment_data[0].iloc[0]

['cast',
 'read',
 'like',
 'vogue',
 'oscar',
 'party',
 'guest',
 'list',
 'valentine',
 'day',
 'cantmiss',
 'cinema',
 'instead',
 'standard',
 'hollywood',
 'schmaltz']

In [14]:
Neutral_sentiment_data[0].iloc[4753]

[]

In [15]:
# import the library 'nltk'.if you don't have this library.then
# use the command ' # !pip install nltk '
#from nltk.tokenize import word_tokenize

In [16]:
# combine all three sentiments into single array
combined_data_set = np.concatenate((Positive_sentiment_data[0], Neutral_sentiment_data[0],Negative_sentiment_data[0]))

In [17]:
combined_data_set[0]

['henry', 'selick', 'movie', 'coraline', 'fifth', 'stopmotion', 'masterpiece']

In [18]:
len(Positive_sentiment_data)

28028

In [19]:
len(Negative_sentiment_data)

28926

In [20]:
len(Neutral_sentiment_data)

27673

In [21]:
len(combined_data_set)

84627

In [22]:
#now we are converting positive sentiment to '1', negative sentiment to '-1', and neutral to '0'
converted_data_set = np.concatenate((np.ones(len(Positive_sentiment_data), dtype=int),np.zeros(len(Neutral_sentiment_data),dtype=int),-1*np.ones(len(Negative_sentiment_data), dtype=int)))

In [23]:
len(combined_data_set)

84627

In [24]:
len(converted_data_set)

84627

In [25]:
type(combined_data_set)


numpy.ndarray

In [26]:
#array to list
combined_data_set = combined_data_set.tolist()

In [27]:
type(combined_data_set)


list

In [28]:
converted_data_set[0]

1

In [29]:
converted_data_set[33333]

0

In [30]:
converted_data_set[77777]

-1

In [31]:
from gensim.models.word2vec import Word2Vec
from gensim.corpora.dictionary import Dictionary
from keras.preprocessing import sequence
import multiprocessing

'''
Word2Vec: This is a popular model from the gensim library used for training word embeddings. 
         It converts words into vectors of real numbers in a high-dimensional space, capturing their semantic meaning based on their context in a corpus. 
         It's typically used in NLP tasks.
         
Dictionary: This is a mapping between words and their integer IDs. 
            It helps to create a vocabulary of unique words from a corpus, assigning each word a unique ID, 
            which is useful for topic modeling and other NLP tasks where you need a compact representation of the text.

sequence: This module from Keras provides utilities for processing sequences, like padding or truncating them to a fixed length. 
         It's commonly used for preparing data to be fed into models, especially recurrent neural networks (RNNs),
        where input sequences need to have a uniform length.

multiprocessing: This module allows for parallel execution of code. 
               It can be used to speed up tasks like training the Word2Vec model by leveraging multiple CPU cores.

               '''

"\nWord2Vec: This is a popular model from the gensim library used for training word embeddings. \n         It converts words into vectors of real numbers in a high-dimensional space, capturing their semantic meaning based on their context in a corpus. \n         It's typically used in NLP tasks.\n         \nDictionary: This is a mapping between words and their integer IDs. \n            It helps to create a vocabulary of unique words from a corpus, assigning each word a unique ID, \n            which is useful for topic modeling and other NLP tasks where you need a compact representation of the text.\n\nsequence: This module from Keras provides utilities for processing sequences, like padding or truncating them to a fixed length. \n         It's commonly used for preparing data to be fed into models, especially recurrent neural networks (RNNs),\n        where input sequences need to have a uniform length.\n\nmultiprocessing: This module allows for parallel execution of code. \n        

In [32]:
cpu_count = multiprocessing.cpu_count() #Returns the number of available CPU cores on your system. More cores allow for faster parallel processing, especially when training models.
vocab_dim = 150 # The dimensionality of the word vectors (embeddings) generated by Word2Vec. A higher dimension can capture more information but increases memory and computation.
n_iterations = 15 #The number of passes the Word2Vec algorithm makes over the training data. More iterations can lead to better embeddings but take more time.
n_exposures = 15 #The minimum number of occurrences of a word for it to be included in the model's vocabulary. Words appearing fewer times are ignored.
window_size = 7 # The number of words before and after a target word that Word2Vec will consider as its context. Larger windows capture broader context.
n_epoch = 10 # The number of times the entire dataset is passed through the neural network during training. More epochs can improve the model but may risk overfitting.
input_length = 100 # The length of input sequences for the model. Sentences longer than this will be truncated, and shorter ones will be padded.
maxlen = 100 # The maximum allowable length for input sequences. Sentences longer than this value are truncated, while shorter ones are padded to this length.


In [33]:
def create_dictionaries(model=None,combined_data_set=None):
    if (combined_data_set is not None) and (model is not None):
        gensim_dict = Dictionary()
        gensim_dict.doc2bow(model.wv.index_to_key,allow_update=True)
        
        w2indx = {v: k+1 for k, v in gensim_dict.items()}
        w2vec = {word: model.wv[word] for word in w2indx.keys()}

        def parse_dataset(combined_data_set): # for transform words to integers
            data=[]
            for sentence in combined_data_set:
                new_txt = []
                for word in sentence:
                    try:
                        new_txt.append(w2indx[word])
                    except:
                        new_txt.append(0) 
                data.append(new_txt)
            return data 
            
        combined_data_set = parse_dataset(combined_data_set)
        combined_data_set = sequence.pad_sequences(combined_data_set, maxlen=maxlen)
        return w2indx, w2vec,combined_data_set
    else:
        print( 'You are not provided any data')



def word2vec_train(combined_data_set):

    model = Word2Vec(vector_size=vocab_dim,
                     min_count=n_exposures,
                     window=window_size,
                     workers=cpu_count,
                     epochs=n_iterations)
    model.build_vocab(combined_data_set) 
    model.train(combined_data_set, total_examples=model.corpus_count, epochs=model.epochs)
    model.save('Word2vec_model.pkl')
    index_dict, word_vectors,combined_data_set = create_dictionaries(model=model,combined_data_set=combined_data_set)
    return   index_dict, word_vectors,combined_data_set

#print ('Training a Word2vec model...')
index_dict, word_vectors,combined_data_set = word2vec_train(combined_data_set)

In [34]:
combined_data_set

array([[   0,    0,    0, ..., 2236, 5803, 3720],
       [   0,    0,    0, ..., 4125, 2074,    0],
       [   0,    0,    0, ..., 5535, 4379, 2867],
       ...,
       [   0,    0,    0, ...,  242, 4484, 2040],
       [   0,    0,    0, ...,    0, 4535,  326],
       [   0,    0,    0, ..., 1022, 3583,    0]])

In [35]:
from sklearn.model_selection  import train_test_split
from keras.models import Sequential
from keras.layers import Embedding, LSTM, Dense, Dropout, Activation
from keras.models import model_from_json
np.random.seed(1337)  # For Reproducibility
import sys
sys.setrecursionlimit(1000000)
import yaml
import keras

In [36]:
batch_size = 32

In [37]:
def get_data(index_dict,word_vectors,X,Y):

    n_symbols = len(index_dict) + 1
    embedding_weights = np.zeros((n_symbols, vocab_dim))
    for word, index in index_dict.items():
        embedding_weights[index, :] = word_vectors[word]
    x_train, x_test, y_train, y_test = train_test_split(X, Y, test_size=0.2)
    y_train = keras.utils.to_categorical(y_train,num_classes=3) 
    y_test = keras.utils.to_categorical(y_test,num_classes=3)
    # print x_train.shape,y_train.shape
    return n_symbols,embedding_weights,x_train,y_train,x_test,y_test

In [38]:
n_symbols,embedding_weights,x_train,y_train,x_test,y_test=get_data(index_dict, word_vectors,combined_data_set,converted_data_set)

In [54]:
def train_lstm(n_symbols,embedding_weights,x_train,y_train,x_test,y_test):
    # Defining a Simple Keras Model.
    model = Sequential()  
    model.add(Embedding(output_dim=vocab_dim,
                        input_dim=n_symbols,
                        mask_zero=True,
                        weights=[embedding_weights],
                        input_length=input_length))  # Adding Input Length
    model.add(LSTM(units=50, activation='tanh', recurrent_activation='hard_sigmoid'))
    model.add(Dropout(0.5))
    model.add(Dense(3, activation='softmax')) 
    model.add(Activation('softmax'))

    
    model.compile(loss='categorical_crossentropy',
                  optimizer='adam',metrics=['accuracy'])


    model.fit(x_train, y_train, batch_size=batch_size, epochs=4,verbose=1)
    # Predicting y_test using X_test
    y_pred = model.predict(x_test, batch_size=batch_size)
    
    # Convert predicted probabilities to class labels (assuming categorical)
    y_pred_classes = y_pred.argmax(axis=-1)
    
    # Convert one-hot encoded y_test to class labels
    y_test_classes = y_test.argmax(axis=-1)
    
    # Calculate accuracy between predicted y_test and actual y_test
    from sklearn.metrics import accuracy_score
    accuracy = accuracy_score(y_test_classes, y_pred_classes)

    #print( "Evaluate...")
    score = model.evaluate(x_test, y_test,batch_size=batch_size)
                                

    json_string = model.to_json()
    with open('lstm.json', 'w') as json_file:
        json_file.write( json_string)
    model.save_weights('lstm.weights.h5')
    print ('Test score:')
    print(score)
    print(f'Accuracy: {accuracy * 100:.2f}%')


In [55]:
train_lstm(n_symbols,embedding_weights,x_train,y_train,x_test,y_test)

Epoch 1/4




[1m2116/2116[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m68s[0m 31ms/step - accuracy: 0.7271 - loss: 0.8325
Epoch 2/4
[1m2116/2116[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m68s[0m 32ms/step - accuracy: 0.8487 - loss: 0.7063
Epoch 3/4
[1m2116/2116[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m67s[0m 32ms/step - accuracy: 0.8792 - loss: 0.6738
Epoch 4/4
[1m2116/2116[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m65s[0m 31ms/step - accuracy: 0.8991 - loss: 0.6533
[1m529/529[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 7ms/step
[1m529/529[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 7ms/step - accuracy: 0.8574 - loss: 0.6917
Test score:
[0.6872574687004089, 0.8628146052360535]
Accuracy: 86.28%


In [56]:
def input_transform(string):
    # Step 1: Clean the input string using the new cleaning pipeline
    cleaned_text = full_text_cleaning(string)
    
    # Step 2: Tokenize the cleaned string (already lemmatized and cleaned)
    words = cleaned_text.split()  # Tokenize based on space since SpaCy has cleaned it already
    
    # Step 3: Reshape the tokens for the model
    words_array = np.array(words).reshape(1, -1)
    # Load the pre-trained Word2Vec model
    model = Word2Vec.load('Word2vec_model.pkl')
    
    # Step 4: Use the pre-trained Word2Vec model (or any model) for vector representation
    _, _, combined_data_set = create_dictionaries(model, words_array)
    
    return combined_data_set

In [57]:
from keras.models import model_from_json
import streamlit as st

def lstm_predict(string):
    # Load the model architecture from JSON
    with open('lstm.json', 'r') as json_file:
        model_json = json_file.read()
    model = model_from_json(model_json)

    # Load the weights
    model.load_weights('lstm.weights.h5')
    model.compile(loss='categorical_crossentropy',
                  optimizer='adam', metrics=['accuracy'])

    # Preprocess the input
    data = input_transform(string)
    print(f"Input shape before reshaping: {data.shape}")
    data = data.reshape(1, -1)  # Ensure correct input shape

    # Get prediction from model
    prediction = model.predict(data)
    result = np.argmax(prediction, axis=-1)

    # Map result to sentiment
    if result[0] == 1:
        print( "Positive")
    elif result[0] == 0:
        print( "Neutral")
    else:
        print( "Negative")



In [58]:
s = " Hi, Brother "
lstm_predict(s) 

Input shape before reshaping: (1, 100)
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 166ms/step
Neutral


In [60]:
s = " happy be happy "
lstm_predict(s) 

Input shape before reshaping: (1, 100)
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 190ms/step
Positive


In [64]:
s = " bad to bad "
lstm_predict(s) 

Input shape before reshaping: (1, 100)
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 228ms/step
Negative
