### imports

In [2]:

import pandas as pd
import numpy as np

from sklearn import metrics
from sklearn.metrics import f1_score, precision_score, recall_score
from sklearn.model_selection import train_test_split

# Corpus Processing
import re
import nltk
# nltk.download('punkt')
# nltk.download('wordnet')

# from nltk import word_tokenize, sent_tokenize
from nltk.stem import WordNetLemmatizer

In [1]:
import tensorflow as tf
import keras
from tensorflow.keras import layers

# from tensorflow import keras
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.layers import SimpleRNN, LSTM, GRU, Bidirectional, Dense, Embedding, Dropout
from tensorflow.keras.models import Sequential

from tensorflow.keras.layers.experimental.preprocessing import TextVectorization




### preprocessing

In [3]:
reviews = pd.read_csv('../datasets/annotated_data_sentiment.csv', encoding='utf-8')
reviews.head(5)

Unnamed: 0,index,created_at,custom_comment,sentiment
0,30,01/03/2023 22:06,Great guy!!!,1
1,33,02/03/2023 06:46,he is the best driver,1
2,45,02/03/2023 14:15,The best driver!),1
3,52,02/03/2023 17:11,Perfect. Recommend,1
4,53,02/03/2023 17:41,Perfect trip,1


In [4]:
stopwords = ['a', 'about', 'an', 'am' 'and', 'are', 'as', 'at', 'be', 'been', 'but', 'by', 'can', \
             'even', 'ever', 'for', 'from', 'get', 'had', 'has', 'have', 'he', 'her', 'hers', 'his', \
             'how', 'i', 'if', 'in', 'into', 'is', 'it', 'its', 'just', 'me', 'my', 'of', 'on', 'or', \
             'see', 'seen', 'she', 'so', 'than', 'that', 'the', 'their', 'there', 'they', 'this', \
             'to', 'was', 'we', 'were', 'what', 'when', 'which', 'who', 'will', 'with', 'you']

short_forms = {
    "don't": "do not",
    "can't": "cannot",
    "won't": "will not",
    "shouldn't": "should not",
    "wouldn't": "would not",
    "didn't": "did not",
    "doesn't": "does not",
    "isn't": "is not",
    "aren't": "are not",
    "wasn't": "was not",
    "weren't": "were not",
    "hasn't": "has not",
    "haven't": "have not",
    "it's": "it is",
    "I'm": "I am",
    "you're": "you are",
    "he's": "he is",
    "she's": "she is",
    "we're": "we are",
    "they're": "they are",
    "I've": "I have",
    "you've": "you have",
    "we've": "we have",
    "they've": "they have",
    "couldn't": "could not",
    "should've": "should have",
    "would've": "would have",
    "might've": "might have",
    "must've": "must have",
    # Add more short forms and their full forms as needed
}

def make_lower(text):
     return text.lower()
     

def replace_short_forms(text):
    # Create a regular expression pattern to match short forms as standalone words
    pattern = r'\b(?:{})\b'.format('|'.join(short_forms.keys()), re.IGNORECASE)
    
    # Replace short forms with their corresponding full forms using a lambda function
    full_forms_text = re.sub(pattern, lambda match: short_forms[match.group(0)], text)
    
    return full_forms_text


# (?) remove quotation marks, unnecessary punctuation, [{}[]\/+*%|^%#@!?()]
def punctuation_remover(text):
    pattern = r'[{}\[\]\\\/\+\*%\|\^%#@\(\)\$\?\!\"]'
    return re.sub(pattern, '', text)


def lemma_stopwords_token(text):
      le=WordNetLemmatizer()
      word_tokens=nltk.word_tokenize(text)
      word_tokens =[token for token in word_tokens if token.isalpha()]
      tokens=[le.lemmatize(token) for token in word_tokens if token not in stopwords and len(token)>2]
      processed_text =" ".join(tokens)
      return processed_text


# main preprocessing function
def preprocessing(text):
    reviews = make_lower(text)
    reviews = replace_short_forms(reviews)
    reviews = punctuation_remover(reviews)
    reviews = lemma_stopwords_token(reviews)
    return reviews

In [5]:
text = "he's She's The ?? wor@@!@!st driver"
new_text = preprocessing(text)
print(new_text)
type(new_text)

worst driver


str

In [6]:
data = [preprocessing(custom_comment) for custom_comment in reviews['custom_comment'].to_list()]
# data = reviews['custom_comment'].values.tolist()
labels = reviews['sentiment'].values.tolist()

labels = [0 if label == -1 else label for label in labels]
labels = np.array(labels)
data = np.array(data)
print(data)

['great guy' 'best driver' 'best driver' ...
 'driver cancelled trip last second then missed appointment doctor veey bad service'
 'unprofessional and rude'
 'inexperienced driver too quick city and doe not break turn made drive absolutely uncomfortable']


In [7]:
# constants
embedding_dim = 50   #hyperparameter representing the dimensionality of the embedding space, in NLP tasks often set to a value between 50 and 300
                    #each word index in the vocabulary will be represented as a 50-dimensional vector in the embedding space
sequence_length =  100 #Keeping a fixed length of all reviews to max 400 words
max_vocab_len = 10000  # vocabulary size limits the number of unique tokens (words or subwords)
                    # Setting an upper limit helps control the size of the model and prevents it from learning an excessively large vocabulary.
batch_size = 64 #Number of samples to work through before updating the internal model parameters via back propagation. The higher the batch, the more memory you need.
epochs = 6

In [8]:
# split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(data, labels, test_size=0.2, random_state=42)
train_dataset = tf.data.Dataset.from_tensor_slices((X_train, y_train))
test_dataset = tf.data.Dataset.from_tensor_slices((X_test, y_test))
# X_train_dataset = tf.data.Dataset.from_tensor_slices(X_train)


In [9]:
X_train.shape, X_test.shape, y_train.shape, y_test.shape

((4125,), (1032,), (4125,), (1032,))

### vectorizing layer

In [10]:
# instantiate text vectorization layer
# We are using this layer to normalize, split, and map
# strings to integers, so we set our 'output_mode' to 'int'.
# we're using the default split function, + custom preprocessing

vectorize_layer = keras.layers.TextVectorization(
    standardize='lower',
    max_tokens=max_vocab_len - 1,
    output_mode="int",
    output_sequence_length=sequence_length)

# Now that the vectorize_layer has been created, call `adapt` on a text-only
# dataset to create the vocabulary. You don't have to batch, but for very large
# datasets this means you're not keeping spare copies of the dataset in memory.

# fit the text vector to the training text
vectorize_layer.adapt(X_train)





In [11]:
#  create a function to see the result of using this layer to preprocess some data.
def vectorize_text(text, label):
  text = tf.expand_dims(text, -1)
  return vectorize_layer(text), label

# Vectorize the data if not include in Model
train_ds = train_dataset.map(vectorize_text)
test_ds = test_dataset.map(vectorize_text)

In [12]:
import random
random_sentence = random.choice(X_train)
print(f"original text : \n {random_sentence} \n\n Vectorized format : \n{vectorize_layer([random_sentence])} \n\n shape : {vectorize_layer([random_sentence]).shape}")

original text : 
 texting while driving 

 Vectorized format : 
[[425  66  16   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
    0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
    0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
    0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
    0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
    0   0   0   0   0   0   0   0   0   0]] 

 shape : (1, 100)


In [13]:
print(f" 5 most common words : {vectorize_layer.get_vocabulary()[:5]} \n \n 5 least common words : {vectorize_layer.get_vocabulary()[-5:]}")

 5 most common words : ['', '[UNK]', 'and', 'driver', 'not'] 
 
 5 least common words : ['abruptly', 'abrupt', 'abroad', 'ability', 'ab']


### build and train

In [None]:
# lstm model without PreProcessing layer

lstm_model = Sequential([
    layers.Embedding( 
    input_dim = max_vocab_len + 1, # int, the size of our vocabulary, maximum integer index + 1 
    output_dim = embedding_dim, # int, dimensions to which each words shall be mapped
    input_length = sequence_length, #Length of input sequences
    mask_zero=True #to ignore padding
    ),
    layers.LSTM(units=64, return_sequences=True),
    layers.LSTM(units=32),
    layers.Dense(units=32, activation="relu"),
    layers.Dropout(rate=0.25),
    layers.Dense(units=1, activation="sigmoid")])

# compiling model
lstm_model.compile(
    optimizer=tf.keras.optimizers.Adam(learning_rate=1e-3), #1e-3 = 0.001
    loss=tf.keras.losses.BinaryCrossentropy(),
    # metrics=[tf.keras.metrics.Accuracy(),
    #          tf.keras.metrics.FalseNegatives(),
    metrics=["accuracy"]) 

# Display a summary of the models structure
lstm_model.summary()

In [15]:
# lstm model with PreProcessing layer
inputs = layers.Input(shape = (1,), dtype = 'string')
x = vectorize_layer(inputs)
x = layers.Embedding( 
    input_dim = max_vocab_len + 1, # int, the size of our vocabulary, maximum integer index + 1 
    output_dim = embedding_dim, # int, dimensions to which each words shall be mapped
    input_length = sequence_length, #Length of input sequences
    mask_zero=True #to ignore padding
    )(x)
x = layers.LSTM(units=64, return_sequences=True)(x)
x = layers.LSTM(units=32)(x)
x = layers.Dense(units=32, activation = 'relu')(x)
x = layers.Dropout(rate=0.25)(x)

predictions = layers.Dense(units=1, activation="sigmoid")(x)

lstm_model = tf.keras.Model(inputs, predictions, name = 'LSTM_MODEL')
lstm_model.compile(
    optimizer=tf.keras.optimizers.Adam(learning_rate=1e-3), #1e-3 = 0.001
    loss=tf.keras.losses.BinaryCrossentropy(),
    # metrics=[tf.keras.metrics.Accuracy(),
    #          tf.keras.metrics.FalseNegatives(),
    metrics=["accuracy"])

lstm_model.summary()

Model: "LSTM_MODEL"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_1 (InputLayer)        [(None, 1)]               0         
                                                                 
 text_vectorization (TextVe  (None, 100)               0         
 ctorization)                                                    
                                                                 
 embedding_1 (Embedding)     (None, 100, 50)           500050    
                                                                 
 lstm_2 (LSTM)               (None, 100, 64)           29440     
                                                                 
 lstm_3 (LSTM)               (None, 32)                12416     
                                                                 
 dense_2 (Dense)             (None, 32)                1056      
                                                        

In [16]:
# training the model
history = lstm_model.fit(X_train, y_train, #train_dataset#Training data : features (review) and classes (positive or negative)
                    batch_size=batch_size, #Number of samples to work through before updating the internal model parameters via back propagation. The higher the batch, the more memory you need.
                    epochs=epochs,
                    verbose=1) 


Epoch 1/6

Epoch 2/6
Epoch 3/6
Epoch 4/6
Epoch 5/6
Epoch 6/6


In [17]:
results = lstm_model.evaluate(X_test, y_test)
print ('Test loss: {0}, Test accuracy: {1}'.format(results[0],results[1]))

Test loss: 0.27682918310165405, Test accuracy: 0.942829430103302


In [20]:
class Config:
    WORK_DIR = 'D:/DEV2/Text-Classification-project/'
    DATASET_SENTIMENT = WORK_DIR + '/datasets/annotated_data_sentiment.csv'
    DATASET_TOPIC = WORK_DIR + '/datasets/annotated_data_topic_classification.csv'
    SRC_PATH = WORK_DIR + '/src'
    SENTIMENT_MODEL_PATH = WORK_DIR + '/models/sentiment_model'
    TOPIC_MODEL_PATH = WORK_DIR + '/models/topic_model'

# constants
embedding_dim = 50
sequence_length = 100 
max_vocab_len = 10000
batch_size = 64
epochs = 6

In [112]:
def preprocess_data(file_path):
    # read CSV into pandas dataframe
    reviews = pd.read_csv(file_path, encoding='utf-8')

    # preprocess text with methods defined in preprocessing.py
    data = [preprocessing(custom_comment) for custom_comment in reviews['custom_comment'].to_list()]
    labels = reviews['sentiment'].values.tolist()
    labels = [0 if label == -1 else label for label in labels]

    # convert labels and data into numpy arrays
    labels = np.array(labels)
    data = np.array(data)

    # split the data into training and testing sets
    X_train, X_test, y_train, y_test = train_test_split(data, labels, test_size=0.2, random_state=42)

    return X_train, X_test, y_train, y_test 


def build_lstm_model(max_vocab_len, embedding_dim, sequence_length):
    # lstm model with PreProcessing layer
    inputs = layers.Input(shape = (1,), dtype = 'string')
    x = vectorize_layer(inputs)
    x = layers.Embedding( 
        input_dim = max_vocab_len + 1, # int, the size of our vocabulary, maximum integer index + 1 
        output_dim = embedding_dim, # int, dimensions to which each words shall be mapped
        input_length = sequence_length, #Length of input sequences
        mask_zero=True #to ignore padding
        )(x)
    x = layers.LSTM(units=64, return_sequences=True)(x)
    x = layers.LSTM(units=32)(x)
    x = layers.Dense(units=32, activation = 'relu')(x)
    x = layers.Dropout(rate=0.25)(x)

    predictions = layers.Dense(units=1, activation="sigmoid")(x)

    lstm_model = tf.keras.Model(inputs, predictions, name = 'LSTM_MODEL')
    lstm_model.compile(
        optimizer=tf.keras.optimizers.Adam(learning_rate=1e-3), #1e-3 = 0.001
        loss=tf.keras.losses.BinaryCrossentropy(),
        metrics=["accuracy"])

    lstm_model.summary()
    return lstm_model


def train_lstm_model(lstm_model, X_train, y_train, batch_size, epochs, verbose=1):
    history = lstm_model.fit(X_train, y_train, 
                        batch_size=batch_size,
                        epochs=epochs,
                        verbose=verbose)
    
    results = lstm_model.evaluate(X_test, y_test)
    print ('Test loss: {0}, Test accuracy: {1}'.format(results[0],results[1]))

    return history



if __name__ == "__main__":

    # define the path to the data
    file_path = '../datasets/annotated_data_sentiment.csv'

    # Preprocess data
    X_train, X_test, y_train, y_test = preprocess_data(Config.DATASET_SENTIMENT)

    # Build LSTM model
    lstm_model = build_lstm_model(max_vocab_len, embedding_dim, sequence_length)

    # Train LSTM model
    history = train_lstm_model(lstm_model, X_train, y_train, batch_size, epochs)


Model: "LSTM_MODEL"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_5 (InputLayer)        [(None, 1)]               0         
                                                                 
 text_vectorization_3 (TextV  (None, 100)              0         
 ectorization)                                                   
                                                                 
 embedding_4 (Embedding)     (None, 100, 50)           500050    
                                                                 
 lstm_8 (LSTM)               (None, 100, 64)           29440     
                                                                 
 lstm_9 (LSTM)               (None, 32)                12416     
                                                                 
 dense_8 (Dense)             (None, 32)                1056      
                                                        

In [None]:
class SentimentClassificationModelBuid:
    def __init__(self, max_vocab_len, embedding_dim, sequence_length, batch_size, epochs, work_dir):
        self.max_vocab_len = Config.max_vocab_len
        self.embedding_dim = Config.embedding_dim
        self.sequence_length = Config.sequence_length
        self.batch_size = Config.batch_size
        self.epochs = Config.epochs
        self.work_dir = Config.WORK_DIR
        self.dataset_sentiment = Config.DATASET_SENTIMENT
        self.vectorize_layer = layers.TextVectorization(
            max_tokens=max_vocab_len,
            output_mode='int',
            output_sequence_length=sequence_length
        )



    def preprocess_data(file_path):
        # read CSV into pandas dataframe
        reviews = pd.read_csv(file_path, encoding='utf-8')

        # preprocess text with methods defined in preprocessing.py
        data = [preprocessing(custom_comment) for custom_comment in reviews['custom_comment'].to_list()]
        labels = reviews['sentiment'].values.tolist()
        labels = [0 if label == -1 else label for label in labels]

        # convert labels and data into numpy arrays
        labels = np.array(labels)
        data = np.array(data)

        # split the data into training and testing sets
        X_train, X_test, y_train, y_test = train_test_split(data, labels, test_size=0.2, random_state=42)

        return X_train, X_test, y_train, y_test 


    def build_lstm_model(max_vocab_len, embedding_dim, sequence_length):
        # lstm model with PreProcessing layer
        inputs = layers.Input(shape = (1,), dtype = 'string')
        x = vectorize_layer(inputs)
        x = layers.Embedding( 
            input_dim = max_vocab_len + 1, # int, the size of our vocabulary, maximum integer index + 1 
            output_dim = embedding_dim, # int, dimensions to which each words shall be mapped
            input_length = sequence_length, #Length of input sequences
            mask_zero=True #to ignore padding
            )(x)
        x = layers.LSTM(units=64, return_sequences=True)(x)
        x = layers.LSTM(units=32)(x)
        x = layers.Dense(units=32, activation = 'relu')(x)
        x = layers.Dropout(rate=0.25)(x)

        predictions = layers.Dense(units=1, activation="sigmoid")(x)

        lstm_model = tf.keras.Model(inputs, predictions, name = 'SENTIMENT_MODEL')
        lstm_model.compile(
            optimizer=tf.keras.optimizers.Adam(learning_rate=1e-3), #1e-3 = 0.001
            loss=tf.keras.losses.BinaryCrossentropy(),
            metrics=["accuracy"])

        lstm_model.summary()
        return lstm_model


    def train_lstm_model(lstm_model, X_train, y_train, batch_size, epochs, verbose=1):
        history = lstm_model.fit(X_train, y_train, 
                            batch_size=batch_size,
                            epochs=epochs,
                            verbose=verbose)
        
        results = lstm_model.evaluate(X_test, y_test)
        print ('Test loss: {0}, Test accuracy: {1}'.format(results[0],results[1]))

        return history



    if __name__ == "__main__":

        # define the path to the data
        file_path = '../datasets/annotated_data_sentiment.csv'

        # Preprocess data
        X_train, X_test, y_train, y_test = preprocess_data(Config.DATASET_SENTIMENT)

        # Build LSTM model
        lstm_model = build_lstm_model(max_vocab_len, embedding_dim, sequence_length)

        # Train LSTM model
        history = train_lstm_model(lstm_model, X_train, y_train, batch_size, epochs)


### save, load, test

In [21]:
lstm_model.save(Config.SENTIMENT_MODEL_PATH, 'lstm_model', save_format='tf')
# lstm_model.save('lstm_model', save_format='tf')

INFO:tensorflow:Assets written to: D:/DEV2/Text-Classification-project//models/sentiment_model\assets


INFO:tensorflow:Assets written to: D:/DEV2/Text-Classification-project//models/sentiment_model\assets


In [16]:
new_lstm_model = tf.keras.models.load_model('lstm_model')

# Show the model architecture
new_lstm_model.summary()

Model: "LSTM_MODEL"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_4 (InputLayer)        [(None, 1)]               0         
                                                                 
 text_vectorization_3 (TextV  (None, 100)              0         
 ectorization)                                                   
                                                                 
 embedding_3 (Embedding)     (None, 100, 50)           500050    
                                                                 
 lstm_6 (LSTM)               (None, 100, 64)           29440     
                                                                 
 lstm_7 (LSTM)               (None, 32)                12416     
                                                                 
 dense_6 (Dense)             (None, 32)                1056      
                                                        

In [17]:
examples = [
  "The ride was great!",
  "The ride was okay.",
  "the ride was terrible...",
  # "driver is a scam",
  # "reckless driver!",
  # "it took longer time, but the driver was good",
  # "vehicle of premium class",
  # "Great experience.I'd recommend this service to my friends.",
  # "The driver swore and racist!",
  # "The drivers behaviour is inappropriate",
  "smooth ride",
]



preprocessed_examples = []
for example in examples:
    preprocessed_example = preprocessing(example)
    preprocessed_examples.append(preprocessed_example)

print(preprocessed_examples)
new_lstm_model.predict(preprocessed_examples)

['ride great', 'ride okay', 'ride terrible', 'smooth ride']


array([[0.9935254 ],
       [0.96347564],
       [0.40340927],
       [0.9866237 ]], dtype=float32)

In [None]:
# from preprocessing import preprocessing

def preprocess_new_data(new_data):
    preprocessed_data = [preprocessing(custom_comment) for custom_comment in new_data]
    return preprocessed_data


def predict_on_new_data(lstm_model, preprocessed_data):
    # Make predictions
    predictions = lstm_model.predict(preprocessed_data)
    if predictions[0] <= 0.5:
        sentiment = "negative"
    else:
        sentiment = "positive"
    return f"Sentiment: {sentiment}"


if __name__ == "__main__":

    # load the model
    lstm_model = tf.keras.models.load_model('lstm_model')
    
    # new_data = input("Please, insert you review: ... ")
    new_data = ["reckless driver!",
                """The drivers behaviour is inappropriate""",
                """I had a couple of rides with your service before and they were nice, but this time there wasn't a seatbelt which I believe is totally not OK. and the driver just said Are yoou going or not??""",
                """The driver swore and racist!""",
                """Nice driver, really helpful""",
                """vehicle of premium class""",
                """Great experience.I'd recommend this service to my friends.""",
                """smooth ride""",
                """it took longer time, but the driver was good"""]

    # preprocess new data
    preprocessed_new_data = preprocess_new_data(new_data)

    # Make predictions on new data
    predictions = predict_on_new_data(lstm_model, preprocessed_new_data)
    print (predictions)