In [1]:
# data visualisation and manipulation
import pandas as pd #import the Pandas library to read the CSV and save it in a data frame
import numpy  as np

# Visualization and Analysis
import matplotlib.pyplot  as plt
import matplotlib.cm      as cm
import seaborn            as sns
from sklearn.metrics import silhouette_samples, silhouette_score
from wordcloud import WordCloud


# Corpus Processing
import re
import nltk
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')

from nltk import word_tokenize, sent_tokenize # tokenizing
from nltk.stem import WordNetLemmatizer  # lammatizer from WordNet
from nltk.corpus import stopwords  #stopwords
import emoji

# vectorizers for creating the document-term-matrix (DTM)
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.preprocessing            import normalize

# (?) stop-words: this list includes 'not'
# stop_words=set(nltk.corpus.stopwords.words('english'))

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\olkos\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\olkos\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\olkos\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [2]:
# loading dataset
reviews = pd.read_csv("reviews_taxi_data - data.csv", encoding='utf-8')
reviews.head(5)

Unnamed: 0,order_id,feedback_id,rider_id,driver_id,created_at,stars,custom_comment
0,0d47c51b-a1df-4901-97eb-58ee193d9655,b89cf504-f2fb-4ad4-a113-1112f6da5265,9d9e0f25-a383-444e-afe7-01ed085d216f,d6c92134-8f65-4da4-b6ab-a2a5c4bbaf9f,2023-03-01 01:49:32,1,very not polite
1,ae23c8d2-496a-47a8-8fb4-bd48f3e6f342,93748ca6-e77a-46e9-8460-f9c65b3781f6,576e7518-c1af-4697-9182-4251a488cbec,a635ef51-6da4-446a-9161-45d0a48ec2d2,2023-03-01 03:40:14,3,not a pleasant interaction overall. The driver...
2,def53af0-8c66-4759-ac16-7143b9e116d5,7edf5cf5-aa9f-4be6-8ca7-cd03c1aa66c0,e24c35a0-1cbf-43a1-9cae-171a89d1e7c1,57c5de49-2a44-46f8-800c-71a39490ea92,2023-03-01 06:46:58,1,Driver is a theif. Our travel cost is 154 and ...
3,365c9450-344e-4608-8abb-a3e8b6ac7019,32e24028-cb9e-4499-bb48-142a0920b61b,b106a159-2d28-4019-bc7f-1feb31d0536e,254a79db-0421-4c3a-afae-3d93fd5fd5ac,2023-03-01 07:44:43,1,he was rude and did not drop me to my destination
4,52a81772-00cb-4ef0-8935-53ce4cd4c5e2,4372a23a-e506-4918-8006-71238e7eefa4,76311bc2-abbf-4c92-b07c-4716ad41c1ee,cee9eab5-9dc8-4565-b795-b4467059db7d,2023-03-01 07:55:51,1,Drove recklessly. Said he was a race driver. W...


**data cleaning & preprocessing:**

In [3]:
stopwords = ['a', 'about', 'an', 'am' 'and', 'are', 'as', 'at', 'be', 'been', 'but', 'by', 'can', \
             'even', 'ever', 'for', 'from', 'get', 'had', 'has', 'have', 'he', 'her', 'hers', 'his', \
             'how', 'i', 'if', 'in', 'into', 'is', 'it', 'its', 'just', 'me', 'my', 'of', 'on', 'or', \
             'see', 'seen', 'she', 'so', 'than', 'that', 'the', 'their', 'there', 'they', 'this', \
             'to', 'was', 'we', 'were', 'what', 'when', 'which', 'who', 'will', 'with', 'you']

In [4]:
short_forms = {
    "don't": "do not",
    "can't": "cannot",
    "won't": "will not",
    "shouldn't": "should not",
    "wouldn't": "would not",
    "didn't": "did not",
    "doesn't": "does not",
    "isn't": "is not",
    "aren't": "are not",
    "wasn't": "was not",
    "weren't": "were not",
    "hasn't": "has not",
    "haven't": "have not",
    "it's": "it is",
    "I'm": "I am",
    "you're": "you are",
    "he's": "he is",
    "she's": "she is",
    "we're": "we are",
    "they're": "they are",
    "I've": "I have",
    "you've": "you have",
    "we've": "we have",
    "they've": "they have",
    "couldn't": "could not",
    "should've": "should have",
    "would've": "would have",
    "might've": "might have",
    "must've": "must have",
    # Add more short forms and their full forms as needed
}

def replace_short_forms(text):
    # Create a regular expression pattern to match short forms as standalone words
    pattern = r'\b(?:{})\b'.format('|'.join(short_forms.keys()), re.IGNORECASE)
    
    # Replace short forms with their corresponding full forms using a lambda function
    full_forms_text = re.sub(pattern, lambda match: short_forms[match.group(0)], text)
    
    return full_forms_text


# (?) remove quotation marks, unnecessary punctuation, [{}[]\/+*%|^%#@!?()]
def punctuation_remover(text):
    pattern = r'[{}\[\]\\\/\+\*%\|\^%#@\(\)\$\"]'
    return re.sub(pattern, ' ', text)


In [5]:
def lemma_stopwords_token(text):
      le=WordNetLemmatizer()
      word_tokens=word_tokenize(text)
      tokens=[le.lemmatize(w) for w in word_tokens if w not in stopwords and len(w)>2]
      processed_text =" ".join(tokens)
      return processed_text

In [7]:
# main preprocessing function
def text_preprocessing(text):
    reviews = replace_short_forms(text)
    reviews = punctuation_remover(reviews)
    reviews = lemma_stopwords_token(reviews)
    return reviews
    

In [9]:
reviews['clean_custom_comment']=reviews['custom_comment'].apply(text_preprocessing)
# print(reviews['clean_custom_comment'])
reviews.head(5)

Unnamed: 0,order_id,feedback_id,rider_id,driver_id,created_at,stars,custom_comment,clean_custom_comment
0,0d47c51b-a1df-4901-97eb-58ee193d9655,b89cf504-f2fb-4ad4-a113-1112f6da5265,9d9e0f25-a383-444e-afe7-01ed085d216f,d6c92134-8f65-4da4-b6ab-a2a5c4bbaf9f,2023-03-01 01:49:32,1,very not polite,very not polite
1,ae23c8d2-496a-47a8-8fb4-bd48f3e6f342,93748ca6-e77a-46e9-8460-f9c65b3781f6,576e7518-c1af-4697-9182-4251a488cbec,a635ef51-6da4-446a-9161-45d0a48ec2d2,2023-03-01 03:40:14,3,not a pleasant interaction overall. The driver...,not pleasant interaction overall The driver se...
2,def53af0-8c66-4759-ac16-7143b9e116d5,7edf5cf5-aa9f-4be6-8ca7-cd03c1aa66c0,e24c35a0-1cbf-43a1-9cae-171a89d1e7c1,57c5de49-2a44-46f8-800c-71a39490ea92,2023-03-01 06:46:58,1,Driver is a theif. Our travel cost is 154 and ...,Driver theif Our travel cost 154 and gave 200....
3,365c9450-344e-4608-8abb-a3e8b6ac7019,32e24028-cb9e-4499-bb48-142a0920b61b,b106a159-2d28-4019-bc7f-1feb31d0536e,254a79db-0421-4c3a-afae-3d93fd5fd5ac,2023-03-01 07:44:43,1,he was rude and did not drop me to my destination,rude and did not drop destination
4,52a81772-00cb-4ef0-8935-53ce4cd4c5e2,4372a23a-e506-4918-8006-71238e7eefa4,76311bc2-abbf-4c92-b07c-4716ad41c1ee,cee9eab5-9dc8-4565-b795-b4467059db7d,2023-03-01 07:55:51,1,Drove recklessly. Said he was a race driver. W...,Drove recklessly Said race driver Was very rud...


**Extract TF-IDF Features**

In [11]:
vect =TfidfVectorizer() # to play with. min_df,max_df,max_features etc...
vect_text=vect.fit_transform(reviews['clean_custom_comment'])

In [None]:
print(vect_text.shape)
print(vect_text)

**K-Means**
###### Function that runs the K-Means algorithm max_k times and returns a dictionary of each k result

In [12]:
from sklearn.cluster import KMeans 

In [13]:
k=10
kmeans_model = KMeans(n_clusters=k, random_state=0)
kmeans_model.fit(vect_text)

In [14]:
reviews['cluster']= kmeans_model.labels_
reviews.head()

Unnamed: 0,order_id,feedback_id,rider_id,driver_id,created_at,stars,custom_comment,clean_custom_comment,cluster
0,0d47c51b-a1df-4901-97eb-58ee193d9655,b89cf504-f2fb-4ad4-a113-1112f6da5265,9d9e0f25-a383-444e-afe7-01ed085d216f,d6c92134-8f65-4da4-b6ab-a2a5c4bbaf9f,2023-03-01 01:49:32,1,very not polite,very not polite,4
1,ae23c8d2-496a-47a8-8fb4-bd48f3e6f342,93748ca6-e77a-46e9-8460-f9c65b3781f6,576e7518-c1af-4697-9182-4251a488cbec,a635ef51-6da4-446a-9161-45d0a48ec2d2,2023-03-01 03:40:14,3,not a pleasant interaction overall. The driver...,not pleasant interaction overall The driver se...,4
2,def53af0-8c66-4759-ac16-7143b9e116d5,7edf5cf5-aa9f-4be6-8ca7-cd03c1aa66c0,e24c35a0-1cbf-43a1-9cae-171a89d1e7c1,57c5de49-2a44-46f8-800c-71a39490ea92,2023-03-01 06:46:58,1,Driver is a theif. Our travel cost is 154 and ...,Driver theif Our travel cost 154 and gave 200....,2
3,365c9450-344e-4608-8abb-a3e8b6ac7019,32e24028-cb9e-4499-bb48-142a0920b61b,b106a159-2d28-4019-bc7f-1feb31d0536e,254a79db-0421-4c3a-afae-3d93fd5fd5ac,2023-03-01 07:44:43,1,he was rude and did not drop me to my destination,rude and did not drop destination,4
4,52a81772-00cb-4ef0-8935-53ce4cd4c5e2,4372a23a-e506-4918-8006-71238e7eefa4,76311bc2-abbf-4c92-b07c-4716ad41c1ee,cee9eab5-9dc8-4565-b795-b4467059db7d,2023-03-01 07:55:51,1,Drove recklessly. Said he was a race driver. W...,Drove recklessly Said race driver Was very rud...,2


In [15]:
# creating csv file with clusters
reviews.to_csv('reviews_taxi_data-clustered.csv', index=False, encoding='utf-8')   

In [21]:
# output the result to a text file.

clusters = reviews.groupby('cluster')    

# Reorder the DataFrame based on the numeric column within each group
reviews_sorted = pd.concat([clusters.get_group(x) for x in clusters.groups])
reviews_sorted.to_csv('cluster_reorder.csv', index=False, encoding='utf-8')


In [22]:
# loading dataset
clustered_reviews = pd.read_csv("cluster_reorder.csv", encoding='utf-8')
clustered_reviews.head(5)

Unnamed: 0,order_id,feedback_id,rider_id,driver_id,created_at,stars,custom_comment,clean_custom_comment,cluster
0,52a81772-00cb-4ef0-8935-53ce4cd4c5e5,4372a23a-e506-4918-8006-71238e7eefa7,76311bc2-abbf-4c92-b07c-4716ad41c1ee,cee9eab5-9dc8-4565-b795-b4467059db7d,2023-03-02 13:07:41,1,"Very bad behaviour,took me to go deliver somet...",Very bad behaviour took deliver something frie...,0
1,0d47c51b-a1df-4901-97eb-58ee193d9662,7564051c-3c68-4b5c-bd98-6b5fbbd441f6,b40ff92e-f705-4763-a202-58bde64b22c7,9959e558-5d5e-4aa8-9140-e0d0339fdad2,2023-03-04 00:24:07,1,His behaviour is so rude and bad,His behaviour rude and bad,0
2,52a81772-00cb-4ef0-8935-53ce4cd4c5e9,a34de6a9-5f8d-460d-9758-bd7d2cd5aece,d03f506c-6d2c-4481-93a4-c720451ee1c7,d8a2b73e-a5a7-4021-ad43-e1de9fe50f4f,2023-03-04 07:19:28,2,Very bad,Very bad,0
3,71b8b974-ff3f-4033-9bc5-6fa48943f393,a5bbc402-caec-4f71-8e93-8f1407cffe3d,2c861293-e7bc-42f0-92b6-71cffc21ec33,cba6965b-240c-4018-8ecf-2a0861a7523d,2023-03-04 15:32:44,2,Very bad and driver lied too much,Very bad and driver lied too much,0
4,365c9450-344e-4608-8abb-a3e8b6ac7030,be8757eb-1e99-4ea1-9791-83aa80fcfd73,2cb2ec40-0856-4f62-ada0-b2abb33a2c05,0f45528a-920e-42a0-9565-bee6af4fd750,2023-03-06 13:57:38,2,he was so impolite and has bad behavior,impolite and bad behavior,0


In [23]:
print("Cluster centroids: \n")
order_centroids = kmeans_model.cluster_centers_.argsort()[:, ::-1]
terms = vect.get_feature_names()

for i in range(k):
    print("Cluster %d:" % i)
    for j in order_centroids[i, :10]: #print out 10 feature terms of each cluster
        print (' %s' % terms[j])
    print('------------')

Cluster centroids: 

Cluster 0:
 bad
 very
 behavior
 behaviour
 and
 driver
 car
 smell
 attitude
 driving
------------
Cluster 1:
 good
 best
 driver
 very
 the
 and
 trip
 english
 one
 music
------------
Cluster 2:
 driver
 and
 very
 car
 driving
 the
 late
 trip
 too
 perfect
------------
Cluster 3:
 rude
 driver
 very
 and
 the
 was
 racist
 arrogant
 shouting
 angry
------------
Cluster 4:
 not
 and
 utaxi
 did
 driver
 time
 ride
 the
 car
 all
------------
Cluster 5:
 advertising
 other
 service
 ﾟヮﾟ
 europe
 etap
 etc
 ethic
 ethical
 ethically
------------
Cluster 6:
 wrong
 address
 location
 place
 dropped
 and
 went
 arrived
 took
 driver
------------
Cluster 7:
 nice
 very
 driver
 and
 good
 really
 car
 music
 man
 guy
------------
Cluster 8:
 great
 service
 driver
 thanks
 guy
 trip
 and
 car
 music
 very
------------
Cluster 9:
 didn
 and
 driver
 change
 give
 rude
 idle
 location
 him
 money
------------


