In [1]:

import joblib 
import pandas as pd
import numpy as np

from sklearn import metrics
from sklearn.metrics import f1_score, precision_score, recall_score, accuracy_score
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.multiclass import OneVsRestClassifier
from sklearn.svm import LinearSVC
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn import *

# Corpus Processing
import re
import nltk
nltk.download('punkt')
nltk.download('wordnet')

from nltk import word_tokenize, sent_tokenize # tokenizing
from nltk.stem import WordNetLemmatizer  # lammatizer from WordNet




[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\olkos\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\olkos\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


#### data cleaning & preprocessing:

In [2]:
stopwords = ['a', 'about', 'an', 'am' 'and', 'are', 'as', 'at', 'be', 'been', 'but', 'by', 'can', \
             'even', 'ever', 'for', 'from', 'get', 'had', 'has', 'have', 'he', 'her', 'hers', 'his', \
             'how', 'i', 'if', 'in', 'into', 'is', 'it', 'its', 'just', 'me', 'my', 'of', 'on', 'or', \
             'see', 'seen', 'she', 'so', 'than', 'that', 'the', 'their', 'there', 'they', 'this', \
             'to', 'was', 'we', 'were', 'what', 'when', 'which', 'who', 'will', 'with', 'you']


short_forms = {
    "don't": "do not",
    "can't": "cannot",
    "won't": "will not",
    "shouldn't": "should not",
    "wouldn't": "would not",
    "didn't": "did not",
    "doesn't": "does not",
    "isn't": "is not",
    "aren't": "are not",
    "wasn't": "was not",
    "weren't": "were not",
    "hasn't": "has not",
    "haven't": "have not",
    "it's": "it is",
    "I'm": "I am",
    "you're": "you are",
    "he's": "he is",
    "she's": "she is",
    "we're": "we are",
    "they're": "they are",
    "I've": "I have",
    "you've": "you have",
    "we've": "we have",
    "they've": "they have",
    "couldn't": "could not",
    "should've": "should have",
    "would've": "would have",
    "might've": "might have",
    "must've": "must have",
    # Add more short forms and their full forms as needed
}

def replace_short_forms(text):
    # Create a regular expression pattern to match short forms as standalone words
    pattern = r'\b(?:{})\b'.format('|'.join(short_forms.keys()), re.IGNORECASE)
    
    # Replace short forms with their corresponding full forms using a lambda function
    full_forms_text = re.sub(pattern, lambda match: short_forms[match.group(0)], text)
    
    return full_forms_text


# (?) remove quotation marks, unnecessary punctuation, [{}[]\/+*%|^%#@!?()]
def punctuation_remover(text):
    pattern = r'[{}\[\]\\\/\+\*%\|\^%#@\(\)\$\"]'
    return re.sub(pattern, ' ', text)

# lemmatizing, tokenization, isalpha, stopwords
def lemma_stopwords_token(text):
      le=WordNetLemmatizer()
      word_tokens=nltk.word_tokenize(text)
      word_tokens =[token for token in word_tokens if token.isalpha()]
      tokens=[le.lemmatize(token) for token in word_tokens if token not in stopwords and len(token)>2]
      processed_text =" ".join(tokens)
      return processed_text

In [3]:
# main preprocessing function
def preprocessing(text):
    reviews = replace_short_forms(text)
    reviews = punctuation_remover(reviews)
    reviews = lemma_stopwords_token(reviews)
    return reviews

#### loading models

In [4]:
# Load the saved sentiment model
sentiment_model = joblib.load('model_sentiment_Naive_Bayes_SMOTE.joblib')

# Load the saved topic classification model
topic_classification_model = joblib.load('model_topic_classification.joblib')

In [11]:
# loading dataset, preprocessing, vectorizer
reviews = pd.read_csv("annotated_data_sentiment.csv", encoding='utf-8')
data = reviews['custom_comment'].apply(preprocessing)
label = reviews['sentiment']

vectorizer = TfidfVectorizer()
features = vectorizer.fit_transform(data)

#### classification method

In [12]:
def classification(text):
 
  topics = ['1 Pricing and Fairness', '2 Driver professionalism', '3 Driver behaviour', '4 Customer Service', '5 Application', '6 Lost things', '7 Vehicle Condition', '8 Safety & reliability', '9 General bad', '10 Other']
  predicted_result = []
  
  preprocessed_text = preprocessing(text)
  features = vectorizer.transform([preprocessed_text])
  prediction_sentiment = sentiment_model.predict(features)

  if prediction_sentiment[0] == -1:
    predicted_result.append("Sentiment: negative.\nPredicted topic(s): ")
    prediction = topic_classification_model.predict([preprocessed_text])
    for i, topic in enumerate(topics):
      if prediction[0][i] == 1:  
        # predicted_result.append(topic) 
        predicted_result.append(''.join(topic))
  else:
    predicted_result.append("Sentiment: positive")

  return ' '.join(predicted_result) 
    

In [24]:
reviews_example = ["""The driver didn't speak much and it was really good""",
                   """Nice driver, really helpful""",
                """vehicle of premium class""",
                """Great experience.I'd recommend this service to my friends."""

                """The driver started idle""",
                """want refund""",
                """driver didn't come to our location and switched on idle""",
                """The driver swore and racist""",
                """I had a couple of rides with your service before and they were nice, but this time there wasn't a seatbelt which I believe is totally not OK. and the driver just said Are yoou going or not??""",
                """left  my phone in the car!""",
                """I lost my wallet! and your customer support never answered my message""",
                """your customer service never replies""",
                """your app doesn't let me choose 2 locations""",
                """driver is a cheat. I lost my laptop!""",
                """bad smell in the car. dirty""",
                """bad""",
                """it was cold in the car""",
                """too fast driving style.""",
                """no seat belt at the back seat."""        
                ]

for i, review in enumerate(reviews_example):
  # print(f"{review[0]}. {prepare_new_reviews(review[1])}")
  print(f"{i}. {review}")
  print(classification(review))
  print("")

0. The driver didn't speak much and it was really good
Sentiment: positive

1. Nice driver, really helpful
Sentiment: positive

2. vehicle of premium class
Sentiment: positive

3. Great experience.I'd recommend this service to my friends.The driver started idle
Sentiment: positive

4. want refund
Sentiment: negative.
Predicted topic(s):  1 Pricing and Fairness

5. driver didn't come to our location and switched on idle
Sentiment: negative.
Predicted topic(s):  1 Pricing and Fairness 2 Driver professionalism

6. The driver swore and racist
Sentiment: negative.
Predicted topic(s):  3 Driver behaviour

7. I had a couple of rides with your service before and they were nice, but this time there wasn't a seatbelt which I believe is totally not OK. and the driver just said Are yoou going or not??
Sentiment: negative.
Predicted topic(s):  3 Driver behaviour

8. left  my phone in the car!
Sentiment: negative.
Predicted topic(s):  6 Lost things

9. I lost my wallet! and your customer support nev

#### classification method for gradio visualisation

In [23]:

def getClassification(text):
  sentiment_model = joblib.load('model_sentiment_Naive_Bayes.joblib')
  topic_classification_model = joblib.load('model_topic_classification.joblib')
  topics = ['1 Pricing and Fairness', '2 Driver professionalism', '3 Driver behaviour', '4 Customer Service', '5 Application', '6 Lost things', '7 Vehicle Condition', '8 Safety & reliability', '9 General bad', '10 Other']
  

  predicted_topics = []
  preprocessed_text = preprocessing(text)
  prediction_sentiment = sentiment_model.predict([preprocessed_text])
  result = []
  if prediction_sentiment[0] == -1:
    result.append("negative")
    prediction = topic_classification_model.predict([preprocessed_text])
    for i, topic in enumerate(topics):
      if prediction[0][i] == 1:
        predicted_topics.append(topic)
    result.extend(predicted_topics)
  else :
    result.append("positive")
  
  return ", ".join(result)

In [None]:
!pip install transformers
!pip install gradio

In [24]:
import gradio as gr
import nltk
from nltk.sentiment.vader import SentimentIntensityAnalyzer

nltk.download("vader_lexicon")
# sid = SentimentIntensityAnalyzer()



demo = gr.Interface(
    fn=getClassification, 
    inputs=gr.Textbox(placeholder="Enter a review here..."), 
    outputs=["label"], 
    interpretation="default",
    examples=[["It was wonderful!"]])

demo.launch(share=True)

[nltk_data] Downloading package vader_lexicon to
[nltk_data]     C:\Users\olkos\AppData\Roaming\nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


Running on local URL:  http://127.0.0.1:7860
Running on public URL: https://a79a2b281fc2a70025.gradio.live

This share link expires in 72 hours. For free permanent hosting and GPU upgrades, run `gradio deploy` from Terminal to deploy to Spaces (https://huggingface.co/spaces)


