In [None]:

import pandas as pd
import numpy as np

from sklearn import metrics
from sklearn.metrics import f1_score, precision_score, recall_score
from sklearn.model_selection import train_test_split

# Corpus Processing
import re
import nltk
# nltk.download('punkt')
# nltk.download('wordnet')

# from nltk import word_tokenize, sent_tokenize
from nltk.stem import WordNetLemmatizer

In [None]:
import tensorflow as tf
import keras
from tensorflow.keras import layers

# from tensorflow import keras
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.layers import SimpleRNN, LSTM, GRU, Bidirectional, Dense, Embedding, Dropout
from tensorflow.keras.models import Sequential

from tensorflow.keras.layers.experimental.preprocessing import TextVectorization

### preprocessing

In [3]:
# loading dataset
reviews = pd.read_csv("../datasets/annotated_data_topic_classification.csv", encoding='utf-8')
reviews.head(5)

Unnamed: 0,index,custom_comment,sentiment,1 Pricing and Fairness,2 Driver professionalism,3 Driver behaviour,4 Customer Service,5 Application,6 Lost things,7 Vehicle Condition,8 Safety & reliability,9 General bad,10 Other
0,49,The driver didn’t pay back the rest of the money,-1,1,0,1,0,0,0,0,0,0,0
1,2247,He is the worst driver in Utaxi and I would de...,-1,0,0,1,0,0,0,0,0,0,0
2,2272,A simple “hello” or response would be perfect....,-1,0,0,1,0,0,0,0,0,0,0
3,3229,Not so great service,-1,0,0,0,0,0,0,0,0,1,0
4,3576,"Worst driver ever, he’s a racist was screaming...",-1,0,0,1,0,0,0,0,0,0,0


In [4]:
stopwords = ['a', 'about', 'an', 'am' 'and', 'are', 'as', 'at', 'be', 'been', 'but', 'by', 'can', \
             'even', 'ever', 'for', 'from', 'get', 'had', 'has', 'have', 'he', 'her', 'hers', 'his', \
             'how', 'i', 'if', 'in', 'into', 'is', 'it', 'its', 'just', 'me', 'my', 'of', 'on', 'or', \
             'see', 'seen', 'she', 'so', 'than', 'that', 'the', 'their', 'there', 'they', 'this', \
             'to', 'was', 'we', 'were', 'what', 'when', 'which', 'who', 'will', 'with', 'you']

short_forms = {
    "don't": "do not",
    "can't": "cannot",
    "won't": "will not",
    "shouldn't": "should not",
    "wouldn't": "would not",
    "didn't": "did not",
    "doesn't": "does not",
    "isn't": "is not",
    "aren't": "are not",
    "wasn't": "was not",
    "weren't": "were not",
    "hasn't": "has not",
    "haven't": "have not",
    "it's": "it is",
    "I'm": "I am",
    "you're": "you are",
    "he's": "he is",
    "she's": "she is",
    "we're": "we are",
    "they're": "they are",
    "I've": "I have",
    "you've": "you have",
    "we've": "we have",
    "they've": "they have",
    "couldn't": "could not",
    "should've": "should have",
    "would've": "would have",
    "might've": "might have",
    "must've": "must have",
    # Add more short forms and their full forms as needed
}

def make_lower(text):
     return text.lower()
     

def replace_short_forms(text):
    # Create a regular expression pattern to match short forms as standalone words
    pattern = r'\b(?:{})\b'.format('|'.join(short_forms.keys()), re.IGNORECASE)
    
    # Replace short forms with their corresponding full forms using a lambda function
    full_forms_text = re.sub(pattern, lambda match: short_forms[match.group(0)], text)
    
    return full_forms_text


# (?) remove quotation marks, unnecessary punctuation, [{}[]\/+*%|^%#@!?()]
def punctuation_remover(text):
    pattern = r'[{}\[\]\\\/\+\*%\|\^%#@\(\)\$\?\!\"]'
    return re.sub(pattern, '', text)


def lemma_stopwords_token(text):
      le=WordNetLemmatizer()
      word_tokens=nltk.word_tokenize(text)
      word_tokens =[token for token in word_tokens if token.isalpha()]
      tokens=[le.lemmatize(token) for token in word_tokens if token not in stopwords and len(token)>2]
      processed_text =" ".join(tokens)
      return processed_text


# main preprocessing function
def preprocessing(text):
    reviews = make_lower(text)
    reviews = replace_short_forms(reviews)
    reviews = punctuation_remover(reviews)
    reviews = lemma_stopwords_token(reviews)
    return reviews

In [5]:
# take reviews from dataset and preprocess them + convert to list + numpy.ndarray
data = [preprocessing(custom_comment) for custom_comment in reviews['custom_comment'].to_list()]
data = np.array(data)

topics = reviews.columns.values[3:].tolist()
labels = reviews[topics].values
labels = np.array(labels)



In [100]:
print(topics)
type(data)

['1 Pricing and Fairness', '2 Driver professionalism', '3 Driver behaviour', '4 Customer Service', '5 Application', '6 Lost things', '7 Vehicle Condition', '8 Safety & reliability', '9 General bad', '10 Other']


numpy.ndarray

In [6]:
# split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(data, labels, test_size=0.2, random_state=42)
train_dataset = tf.data.Dataset.from_tensor_slices((X_train, y_train))
test_dataset = tf.data.Dataset.from_tensor_slices((X_test, y_test))
# X_train_dataset = tf.data.Dataset.from_tensor_slices(X_train)


In [7]:
X_train.shape, X_test.shape, y_train.shape, y_test.shape

((3808,), (952,), (3808, 10), (952, 10))

#### Vectorization

In [8]:
# constants
embedding_dim = 50   #hyperparameter representing the dimensionality of the embedding space, in NLP tasks often set to a value between 50 and 300
                    #each word index in the vocabulary will be represented as a 50-dimensional vector in the embedding space
sequence_length =  100 #Keeping a fixed length of all reviews to max 400 words
max_vocab_len = 10000  # vocabulary size limits the number of unique tokens (words or subwords)
                    # Setting an upper limit helps control the size of the model and prevents it from learning an excessively large vocabulary.
batch_size = 64 #Number of samples to work through before updating the internal model parameters via back propagation. The higher the batch, the more memory you need.
epochs = 10

In [9]:
# instantiate text vectorization layer

vectorize_layer = keras.layers.TextVectorization(
    standardize='lower',
    max_tokens=max_vocab_len - 1,
    output_mode="int",
    output_sequence_length=sequence_length)

# fit the text vector to the training text
vectorize_layer.adapt(X_train)





In [10]:
#  create a function to see the result of using this layer to preprocess some data.
def vectorize_text(text, label):
  text = tf.expand_dims(text, -1)
  return vectorize_layer(text), label

# Vectorize the data if not include in Model
train_ds = train_dataset.map(vectorize_text)
test_ds = test_dataset.map(vectorize_text)

In [11]:
import random
random_sentence = random.choice(X_train)
print(f"original text : \n {random_sentence} \n\n Vectorized format : \n{vectorize_layer([random_sentence])} \n\n shape : {vectorize_layer([random_sentence]).shape}")

original text : 
 advertising other service 

 Vectorized format : 
[[18 13  6  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0
   0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0
   0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0
   0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0
   0  0  0  0]] 

 shape : (1, 100)


In [12]:
print(f" 5 most common words : {vectorize_layer.get_vocabulary()[:5]} \n \n 5 least common words : {vectorize_layer.get_vocabulary()[-5:]}")

 5 most common words : ['', '[UNK]', 'and', 'driver', 'not'] 
 
 5 least common words : ['abrupt', 'abroad', 'ability', 'abandoned', 'ab']


### build and train

In [13]:
# lstm model with PreProcessing layer
inputs = layers.Input(shape = (1,), dtype = 'string')
x = vectorize_layer(inputs)
x = layers.Embedding( 
    input_dim = max_vocab_len + 1, # int, the size of our vocabulary, maximum integer index + 1 
    output_dim = embedding_dim, # int, dimensions to which each words shall be mapped
    input_length = sequence_length, #Length of input sequences
    mask_zero=True #to ignore padding
    )(x)
x = layers.LSTM(units=64, return_sequences=True)(x)
x = layers.LSTM(units=32)(x)
x = layers.Dense(units=32, activation = 'relu')(x)
x = layers.Dropout(rate=0.25)(x)

predictions = layers.Dense(units=len(topics), activation="sigmoid")(x)

model = tf.keras.Model(inputs, predictions, name = 'TOPIC_MODEL')
model.compile(
    optimizer=tf.keras.optimizers.Adam(learning_rate=1e-3), #1e-3 = 0.001
    loss=tf.keras.losses.BinaryCrossentropy(),
    # metrics=[tf.keras.metrics.Accuracy(),
    #          tf.keras.metrics.FalseNegatives(),
    metrics=["accuracy"])

model.summary()

Model: "TOPIC_MODEL"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_1 (InputLayer)        [(None, 1)]               0         
                                                                 
 text_vectorization (TextVe  (None, 100)               0         
 ctorization)                                                    
                                                                 
 embedding (Embedding)       (None, 100, 50)           500050    
                                                                 
 lstm (LSTM)                 (None, 100, 64)           29440     
                                                                 
 lstm_1 (LSTM)               (None, 32)                12416     
                                                                 
 dense (Dense)               (None, 32)                1056      
                                                       

In [14]:
# training the model
history = model.fit(X_train, y_train, #train_dataset#Training data : features (review) and classes (positive or negative)
                    batch_size=batch_size, #Number of samples to work through before updating the internal model parameters via back propagation. The higher the batch, the more memory you need.
                    epochs=epochs,
                    verbose=1) 


Epoch 1/10

Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [15]:
results = model.evaluate(X_test, y_test)
print ('Test loss: {0}, Test accuracy: {1}'.format(results[0],results[1]))

Test loss: 0.19560971856117249, Test accuracy: 0.6838235259056091


In [54]:
class Config:
    WORK_DIR = 'D:/DEV2/Text-Classification-project/'
    DATASET_SENTIMENT = WORK_DIR + '/datasets/annotated_data_sentiment.csv'
    DATASET_TOPIC = WORK_DIR + '/datasets/annotated_data_topic_classification.csv'
    SRC_PATH = WORK_DIR + '/src'
    SENTIMENT_MODEL_PATH = WORK_DIR + '/models/sentiment_model'
    TOPIC_MODEL_PATH = WORK_DIR + '/models/topic_model'

    # constants
    embedding_dim = 50
    sequence_length = 100 
    max_vocab_len = 10000
    batch_size = 64
    epochs = 10
    num_topics = 10
    TOPICS = ['1 Pricing and Fairness', '2 Driver professionalism', '3 Driver behaviour', '4 Customer Service', '5 Application', '6 Lost things', '7 Vehicle Condition', '8 Safety & reliability', '9 General bad', '10 Other']
  

[[0.4641065]]


In [None]:
class TopicClassificationModelBuild:
    def __init__(self, config):
        self.max_vocab_len = config.max_vocab_len
        self.embedding_dim = config.embedding_dim
        self.sequence_length = config.sequence_length
        self.batch_size = config.batch_size
        self.epochs = config.epochs
        self.num_topics = config.num_topics
        self.work_dir = config.WORK_DIR
        self.dataset_sentiment = config.DATASET_SENTIMENT
        self.vectorize_layer = layers.TextVectorization(
            max_tokens=config.max_vocab_len,
            output_mode='int',
            output_sequence_length=config.sequence_length
        )

    
    def preprocess_data(self, dataset_path):
        # read CSV into pandas dataframe
        reviews = pd.read_csv(dataset_path, encoding='utf-8')

        # preprocess text with methods defined in preprocessing.py
        data = [preprocessing(custom_comment) for custom_comment in reviews['custom_comment'].to_list()]
        data = np.array(data)

        topics = reviews.columns.values[3:].tolist()
        labels = reviews[topics].values
        labels = np.array(labels)

        # split the data into training and testing sets
        X_train, X_test, y_train, y_test = train_test_split(data, labels, test_size=0.2, random_state=42)
        self.vectorize_layer.adapt(X_train)

        return X_train, X_test, y_train, y_test
    


    def build_lstm_model(self):
        # lstm model with PreProcessing layer
        inputs = layers.Input(shape = (1,), dtype = 'string')
        x = self.vectorize_layer(inputs)
        x = layers.Embedding( 
            input_dim = self.max_vocab_len + 1, # int, the size of our vocabulary, maximum integer index + 1 
            output_dim = self.embedding_dim, # int, dimensions to which each words shall be mapped
            input_length = self.sequence_length, #Length of input sequences
            mask_zero=True #to ignore padding
            )(x)
        x = layers.LSTM(units=64, return_sequences=True)(x)
        x = layers.LSTM(units=32)(x)
        x = layers.Dense(units=32, activation = 'relu')(x)
        x = layers.Dropout(rate=0.25)(x)

        predictions = layers.Dense(units=self.num_topics, activation="sigmoid")(x)

        model = tf.keras.Model(inputs, predictions, name = 'TOPIC_MODEL')
        model.compile(
            optimizer=tf.keras.optimizers.Adam(learning_rate=1e-3), #1e-3 = 0.001
            loss=tf.keras.losses.BinaryCrossentropy(),
            # metrics=[tf.keras.metrics.Accuracy(),
            #          tf.keras.metrics.FalseNegatives(),
            metrics=["accuracy"])

        model.summary()
        return model
    

    def train_model(self, model, X_train, y_train, X_test, y_test, verbose=1):
        try:
            history = model.fit(X_train, y_train, 
                                batch_size=self.batch_size,
                                epochs=self.epochs,
                                verbose=verbose)
            
            results = model.evaluate(X_test, y_test)
            print ('Test loss: {0}, Test accuracy: {1}'.format(results[0],results[1]))

            return history
        except Exception as e:
            print(f"An error occurred during model training: {e}")
            return None
    

    
    def run_training(self, dataset_path):
        # Preprocess data
        X_train, X_test, y_train, y_test = self.preprocess_data(dataset_path)

        # Build a model
        model = self.build_model()

        # Train LSTM model
        history = self.train_lstm_model(model, X_train, y_train, X_test, y_test)

        # Return relevant information or results
        return {
            'X_train': X_train,
            'X_test': X_test,
            'y_train': y_train,
            'y_test': y_test,
            'lstm_model': model,
            'training_history': history
        }


# Initialize the model
model = TopicClassificationModelBuild(config=Config)

# Run the entire pipeline
model.run_training(dataset_path = Config.DATASET_TOPIC)
    


### save, load, test

In [42]:
model.save(Config.TOPIC_MODEL_PATH, 'topic_model', save_format='tf')
# lstm_model.save('lstm_model', save_format='tf')

INFO:tensorflow:Assets written to: D:/DEV2/Text-Classification-project//models/topic_model\assets


INFO:tensorflow:Assets written to: D:/DEV2/Text-Classification-project//models/topic_model\assets


In [40]:
new_topic_model = tf.keras.models.load_model(Config.TOPIC_MODEL_PATH)

# Show the model architecture
new_topic_model.summary()

Model: "TOPIC_MODEL"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_1 (InputLayer)        [(None, 1)]               0         
                                                                 
 text_vectorization (TextVe  (None, 100)               0         
 ctorization)                                                    
                                                                 
 embedding (Embedding)       (None, 100, 50)           500050    
                                                                 
 lstm (LSTM)                 (None, 100, 64)           29440     
                                                                 
 lstm_1 (LSTM)               (None, 32)                12416     
                                                                 
 dense (Dense)               (None, 32)                1056      
                                                       

In [177]:
examples = [
    "The driver started idle",
    # "driver is a cheat",
    # "The drivers behaviour is inappropriate",
    # "I had a couple of rides with your service before and they were nice, but this time there wasn't a seatbelt which I believe is totally not OK. and the driver just said Are yoou going or not??",
    # "I lost my laptop!",
    # "driver is a cheat. I lost my laptop!",
    # "bad",
    # "your customer service never replies",
    # "bad smell",
    # "want refund",
]

topics =  ['1 Pricing and Fairness', '2 Driver professionalism', '3 Driver behaviour', '4 Customer Service', '5 Application', '6 Lost things', '7 Vehicle Condition', '8 Safety & reliability', '9 General bad', '10 Other']
preprocessed_examples = []
for example in examples:
    preprocessed_example = preprocessing(example)
    preprocessed_examples.append(preprocessed_example)

print(preprocessed_examples)
model_predictions = new_topic_model.predict(preprocessed_examples)

# Set a threshold for considering a topic present (e.g., 0.5)
threshold = 0.3

# Get the topics where the probabilities are above the threshold for each row
predicted_topics = np.where(model_predictions >= threshold)

# # Output the results
# for i in range(predicted_topics[0].size):
#     print(f"Review {i+1}: {examples[i]}")
#     for topic_index in predicted_topics[1][predicted_topics[0] == i]:
#         print(f"  - Topic {topics[topic_index]}")
#     print("\n")

print(model_predictions)



['driver started idle']
[[9.9873811e-01 2.6529160e-06 6.9647026e-04 6.7480443e-07 2.9194103e-07
  2.7279872e-09 4.1279651e-17 8.4031821e-19 6.7310134e-06 1.6558518e-09]]


In [43]:
# load models
sentiment_model = tf.keras.models.load_model(Config.SENTIMENT_MODEL_PATH)
topic_model = tf.keras.models.load_model(Config.TOPIC_MODEL_PATH)

In [51]:
review = ["really bad  driver"]

In [52]:
topics = ['1 Pricing and Fairness', '2 Driver professionalism', '3 Driver behaviour', '4 Customer Service', '5 Application', '6 Lost things', '7 Vehicle Condition', '8 Safety & reliability', '9 General bad', '10 Other']
    
def predict_classes(predictions):
    predicted_labels2 = []
    for (indx, probability) in enumerate(predictions[0]) :
        if probability>=0.1:
            predicted_labels2.append(indx)
    return [topics[label] for label in predicted_labels2]        


def get_classification(review):
    predicted_topics = []
    for review in review:
        preprocessed_review = preprocessing(review)
        sentiment = sentiment_model.predict([preprocessed_review])
        print(sentiment)
        if sentiment[0] <= 0.5:
            sentiment = "negative"
            # predicted_result.append("Sentiment: negative.\n Predicted topic(s): ")
            # predicted_result.append("negative")
            topic_predictions = topic_model.predict([preprocessed_review]) # in -> [] out ->[]
            predicted_topics = predict_classes(topic_predictions)
            # predicted_result.append(predicted_topics)
            negative_result = f"Sentiment: {[sentiment]}\nPredicted topic(s): {predicted_topics}"
            return negative_result
        else:
            # predicted_result.append("positive")
            sentiment = "positive"
            positive_result = f"Sentiment: {[sentiment]}"
            return positive_result
                

test = get_classification(review)
print(test)
  


[[0.00051582]]
Sentiment: ['negative']
Predicted topic(s): ['3 Driver behaviour']


In [56]:
class Classificator:
    def __init__(self, config):
        # load models
        self.sentiment_model = tf.keras.models.load_model(config.SENTIMENT_MODEL_PATH)
        self.topic_model = tf.keras.models.load_model(config.TOPIC_MODEL_PATH)
        self.topics = config.TOPICS
        
    
    def predict_classes(self, predictions):
        predicted_labels2 = []
        for (indx, probability) in enumerate(predictions[0]):
            if probability >= 0.3:
                predicted_labels2.append(indx)
        return [self.topics[label] for label in predicted_labels2]        


    def get_classification(self, review):
        try:
            preprocessed_review = preprocessing(review)
            sentiment = self.sentiment_model.predict([preprocessed_review])
            
            if sentiment[0] <= 0.5:
                sentiment = "negative"
                topic_predictions = self.topic_model.predict([preprocessed_review]) 
                predicted_topics = self.predict_classes(topic_predictions)
                negative_result = f"Sentiment: {[sentiment]}\nPredicted topic(s): {predicted_topics}"
                return negative_result
            else:
                sentiment = "positive"
                positive_result = f"Sentiment: {[sentiment]}"
                return positive_result                    
        
        except Exception as e:
            return f"Error processing the review: {str(e)}"



user_review = input("Enter a review: ")
classificator = Classificator(Config) 
result = classificator.get_classification(user_review)
print(result)



Sentiment: ['positive']


In [None]:
!pip install gradio

In [1]:
import gradio as gr
# import nltk
# from nltk.sentiment.vader import SentimentIntensityAnalyzer

# nltk.download("vader_lexicon")
# sid = SentimentIntensityAnalyzer()


demo = gr.Interface(
    fn=Classificator, 
    inputs=[gr.Textbox(label="Enter a review here...")], 
    outputs=[gr.Textbox(label="Topic")],
    examples=[["It was wonderful!"]])

demo.launch(share=True)

  from .autonotebook import tqdm as notebook_tqdm


NameError: name 'Classificator' is not defined