In [None]:
# data visualisation and manipulation
import pandas as pd #import the Pandas library to read the CSV and save it in a data frame
import numpy  as np

# Visualization and Analysis
import matplotlib.pyplot  as plt
import matplotlib.cm      as cm
import seaborn            as sns
from sklearn.metrics import silhouette_samples, silhouette_score
from wordcloud import WordCloud


# Corpus Processing
import re
import nltk
# nltk.download('stopwords')
# nltk.download('punkt')
# nltk.download('wordnet')

from nltk import word_tokenize, sent_tokenize # tokenizing
from nltk.stem import WordNetLemmatizer  # lammatizer from WordNet
from nltk.corpus import stopwords  #stopwords
import emoji

# vectorizers for creating the document-term-matrix (DTM)
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.preprocessing            import normalize

# (?) stop-words: this list includes 'not'
# stop_words=set(nltk.corpus.stopwords.words('english'))

In [2]:
# loading dataset
reviews = pd.read_csv('../datasets/reviews_taxi_data - data.csv', encoding='utf-8')
reviews.head(5)

Unnamed: 0,order_id,feedback_id,rider_id,driver_id,created_at,stars,custom_comment
0,0d47c51b-a1df-4901-97eb-58ee193d9655,b89cf504-f2fb-4ad4-a113-1112f6da5265,9d9e0f25-a383-444e-afe7-01ed085d216f,d6c92134-8f65-4da4-b6ab-a2a5c4bbaf9f,2023-03-01 01:49:32,1,very not polite
1,ae23c8d2-496a-47a8-8fb4-bd48f3e6f342,93748ca6-e77a-46e9-8460-f9c65b3781f6,576e7518-c1af-4697-9182-4251a488cbec,a635ef51-6da4-446a-9161-45d0a48ec2d2,2023-03-01 03:40:14,3,not a pleasant interaction overall. The driver...
2,def53af0-8c66-4759-ac16-7143b9e116d5,7edf5cf5-aa9f-4be6-8ca7-cd03c1aa66c0,e24c35a0-1cbf-43a1-9cae-171a89d1e7c1,57c5de49-2a44-46f8-800c-71a39490ea92,2023-03-01 06:46:58,1,Driver is a theif. Our travel cost is 154 and ...
3,365c9450-344e-4608-8abb-a3e8b6ac7019,32e24028-cb9e-4499-bb48-142a0920b61b,b106a159-2d28-4019-bc7f-1feb31d0536e,254a79db-0421-4c3a-afae-3d93fd5fd5ac,2023-03-01 07:44:43,1,he was rude and did not drop me to my destination
4,52a81772-00cb-4ef0-8935-53ce4cd4c5e2,4372a23a-e506-4918-8006-71238e7eefa4,76311bc2-abbf-4c92-b07c-4716ad41c1ee,cee9eab5-9dc8-4565-b795-b4467059db7d,2023-03-01 07:55:51,1,Drove recklessly. Said he was a race driver. W...


#### **data cleaning & preprocessing:**

In [3]:
stopwords = ['a', 'about', 'an', 'am' 'and', 'are', 'as', 'at', 'be', 'been', 'but', 'by', 'can', \
             'even', 'ever', 'for', 'from', 'get', 'had', 'has', 'have', 'he', 'her', 'hers', 'his', \
             'how', 'i', 'if', 'in', 'into', 'is', 'it', 'its', 'just', 'me', 'my', 'of', 'on', 'or', \
             'see', 'seen', 'she', 'so', 'than', 'that', 'the', 'their', 'there', 'they', 'this', \
             'to', 'was', 'we', 'were', 'what', 'when', 'which', 'who', 'will', 'with', 'you']

In [5]:
short_forms = {
    "don't": "do not",
    "can't": "cannot",
    "won't": "will not",
    "shouldn't": "should not",
    "wouldn't": "would not",
    "didn't": "did not",
    "doesn't": "does not",
    "isn't": "is not",
    "aren't": "are not",
    "wasn't": "was not",
    "weren't": "were not",
    "hasn't": "has not",
    "haven't": "have not",
    "it's": "it is",
    "I'm": "I am",
    "you're": "you are",
    "he's": "he is",
    "she's": "she is",
    "we're": "we are",
    "they're": "they are",
    "I've": "I have",
    "you've": "you have",
    "we've": "we have",
    "they've": "they have",
    "couldn't": "could not",
    "should've": "should have",
    "would've": "would have",
    "might've": "might have",
    "must've": "must have",
    # Add more short forms and their full forms as needed
}

def replace_short_forms(text):
    # Create a regular expression pattern to match short forms as standalone words
    pattern = r'\b(?:{})\b'.format('|'.join(short_forms.keys()), re.IGNORECASE)
    
    # Replace short forms with their corresponding full forms using a lambda function
    full_forms_text = re.sub(pattern, lambda match: short_forms[match.group(0)], text)
    
    return full_forms_text


# (?) remove quotation marks, unnecessary punctuation, [{}[]\/+*%|^%#@!?()]
def punctuation_remover(text):
    pattern = r'[{}\[\]\\\/\+\*%\|\^%#@\(\)\$\"]'
    return re.sub(pattern, ' ', text)


In [6]:
def lemma_stopwords_token(text):
      le=WordNetLemmatizer()
      word_tokens=word_tokenize(text)
      tokens=[le.lemmatize(w) for w in word_tokens if w not in stopwords and len(w)>2]
      processed_text =" ".join(tokens)
      return processed_text

In [7]:
# main preprocessing function
def text_preprocessing(text):
    reviews = replace_short_forms(text)
    reviews = punctuation_remover(reviews)
    reviews = lemma_stopwords_token(reviews)
    return reviews
    

In [8]:
reviews['clean_custom_comment']=reviews['custom_comment'].apply(text_preprocessing)
reviews.head(5)

Unnamed: 0,order_id,feedback_id,rider_id,driver_id,created_at,stars,custom_comment,clean_custom_comment
0,0d47c51b-a1df-4901-97eb-58ee193d9655,b89cf504-f2fb-4ad4-a113-1112f6da5265,9d9e0f25-a383-444e-afe7-01ed085d216f,d6c92134-8f65-4da4-b6ab-a2a5c4bbaf9f,2023-03-01 01:49:32,1,very not polite,very not polite
1,ae23c8d2-496a-47a8-8fb4-bd48f3e6f342,93748ca6-e77a-46e9-8460-f9c65b3781f6,576e7518-c1af-4697-9182-4251a488cbec,a635ef51-6da4-446a-9161-45d0a48ec2d2,2023-03-01 03:40:14,3,not a pleasant interaction overall. The driver...,not pleasant interaction overall The driver se...
2,def53af0-8c66-4759-ac16-7143b9e116d5,7edf5cf5-aa9f-4be6-8ca7-cd03c1aa66c0,e24c35a0-1cbf-43a1-9cae-171a89d1e7c1,57c5de49-2a44-46f8-800c-71a39490ea92,2023-03-01 06:46:58,1,Driver is a theif. Our travel cost is 154 and ...,Driver theif Our travel cost 154 and gave 200....
3,365c9450-344e-4608-8abb-a3e8b6ac7019,32e24028-cb9e-4499-bb48-142a0920b61b,b106a159-2d28-4019-bc7f-1feb31d0536e,254a79db-0421-4c3a-afae-3d93fd5fd5ac,2023-03-01 07:44:43,1,he was rude and did not drop me to my destination,rude and did not drop destination
4,52a81772-00cb-4ef0-8935-53ce4cd4c5e2,4372a23a-e506-4918-8006-71238e7eefa4,76311bc2-abbf-4c92-b07c-4716ad41c1ee,cee9eab5-9dc8-4565-b795-b4467059db7d,2023-03-01 07:55:51,1,Drove recklessly. Said he was a race driver. W...,Drove recklessly Said race driver Was very rud...


#### **Extract TF-IDF Features**

In [9]:
vect =TfidfVectorizer() # to play with. min_df,max_df,max_features etc...
vect_text=vect.fit_transform(reviews['clean_custom_comment'])

In [10]:
print(vect_text.shape)
print(vect_text)

(6357, 6230)
  (0, 4135)	0.794955699505046
  (0, 3760)	0.4113800958951702
  (0, 5943)	0.4458832274550409
  (1, 5585)	0.221906304257886
  (1, 4731)	0.22117670187560073
  (1, 4867)	0.4457453683951472
  (1, 1892)	0.12384116805661322
  (1, 5516)	0.20202624202082675
  (1, 3907)	0.441664500578453
  (1, 3035)	0.5293219792910454
  (1, 4113)	0.379153114391543
  (1, 3760)	0.16730784463681878
  (2, 1178)	0.18520334420862716
  (2, 785)	0.18395176875021857
  (2, 2496)	0.19286028029732474
  (2, 1741)	0.23186543914614402
  (2, 2319)	0.37434588619868786
  (2, 2832)	0.3226082420968256
  (2, 116)	0.26841648029117604
  (2, 2467)	0.22411783599712293
  (2, 556)	0.08205350157705403
  (2, 77)	0.37434588619868786
  (2, 1492)	0.23388041581208213
  (2, 5679)	0.27531561843256436
  (2, 3890)	0.20737260167381058
  :	:
  (6356, 4315)	0.17093860582236683
  (6356, 1054)	0.12062695316329385
  (6356, 3300)	0.18551621341403726
  (6356, 5897)	0.13322778402113156
  (6356, 901)	0.12062695316329385
  (6356, 1083)	0.12979447

**K-Means**
#### Function that runs the K-Means algorithm max_k times and returns a dictionary of each k result

In [11]:
from sklearn.cluster import KMeans 

In [12]:
k=10
kmeans_model = KMeans(n_clusters=k, random_state=0)
kmeans_model.fit(vect_text)

In [13]:
reviews['cluster']= kmeans_model.labels_
reviews.head()

Unnamed: 0,order_id,feedback_id,rider_id,driver_id,created_at,stars,custom_comment,clean_custom_comment,cluster
0,0d47c51b-a1df-4901-97eb-58ee193d9655,b89cf504-f2fb-4ad4-a113-1112f6da5265,9d9e0f25-a383-444e-afe7-01ed085d216f,d6c92134-8f65-4da4-b6ab-a2a5c4bbaf9f,2023-03-01 01:49:32,1,very not polite,very not polite,4
1,ae23c8d2-496a-47a8-8fb4-bd48f3e6f342,93748ca6-e77a-46e9-8460-f9c65b3781f6,576e7518-c1af-4697-9182-4251a488cbec,a635ef51-6da4-446a-9161-45d0a48ec2d2,2023-03-01 03:40:14,3,not a pleasant interaction overall. The driver...,not pleasant interaction overall The driver se...,1
2,def53af0-8c66-4759-ac16-7143b9e116d5,7edf5cf5-aa9f-4be6-8ca7-cd03c1aa66c0,e24c35a0-1cbf-43a1-9cae-171a89d1e7c1,57c5de49-2a44-46f8-800c-71a39490ea92,2023-03-01 06:46:58,1,Driver is a theif. Our travel cost is 154 and ...,Driver theif Our travel cost 154 and gave 200....,2
3,365c9450-344e-4608-8abb-a3e8b6ac7019,32e24028-cb9e-4499-bb48-142a0920b61b,b106a159-2d28-4019-bc7f-1feb31d0536e,254a79db-0421-4c3a-afae-3d93fd5fd5ac,2023-03-01 07:44:43,1,he was rude and did not drop me to my destination,rude and did not drop destination,1
4,52a81772-00cb-4ef0-8935-53ce4cd4c5e2,4372a23a-e506-4918-8006-71238e7eefa4,76311bc2-abbf-4c92-b07c-4716ad41c1ee,cee9eab5-9dc8-4565-b795-b4467059db7d,2023-03-01 07:55:51,1,Drove recklessly. Said he was a race driver. W...,Drove recklessly Said race driver Was very rud...,2


In [14]:
# creating csv file with clusters
reviews.to_csv('../datasets/reviews_taxi_data-clustered.csv', index=False, encoding='utf-8')   

In [15]:
# output the result to a text file.

clusters = reviews.groupby('cluster')    

# Reorder the DataFrame based on the numeric column within each group
reviews_sorted = pd.concat([clusters.get_group(x) for x in clusters.groups])
reviews_sorted.to_csv('../datasets/cluster_reorder.csv', index=False, encoding='utf-8')


In [16]:
# loading dataset
clustered_reviews = pd.read_csv('../datasets/cluster_reorder.csv', encoding='utf-8')
clustered_reviews.head(5)

Unnamed: 0,order_id,feedback_id,rider_id,driver_id,created_at,stars,custom_comment,clean_custom_comment,cluster
0,18491c39-be20-4870-9ecd-fa1067b4177f,d11c40de-03f7-4e11-8e0d-ded9873e16b10,a2d80ba0-e3fc-4ca0-9758-b65ffef020bc,d5424622-4cca-4887-b12a-5479e648965a,2023-03-01 16:09:04,1,Advertising other services,Advertising other service,0
1,f22e9d1c-e0fd-47b1-83f8-435a915469b2,5f083470-cb3e-4da3-8a8d-81ca467a725d,2e202b30-9611-44f1-82f9-d804f07e4a09,808f6bdf-df81-4849-bd94-027ce22bec46,2023-03-02 05:07:40,1,Advertising other services,Advertising other service,0
2,0d47c51b-a1df-4901-97eb-58ee193d9660,3d2f77a0-c280-47cf-8d9e-c8324ad73cc12,948bbfde-11fa-419e-b417-829d07e54ad8,b74b3945-4b6a-4277-9d51-22ff2bb8dbe9,2023-03-03 00:06:35,1,Advertising other services,Advertising other service,0
3,52a81772-00cb-4ef0-8935-53ce4cd4c5e7,0422465c-7758-4f39-bcfb-3b019f31be9b,19450c20-2c4e-496e-a57f-cdca3e9ab379,d848d4aa-8312-4812-93d0-2fe9d74f81ad,2023-03-03 11:56:13,2,Advertising other services,Advertising other service,0
4,1a99c30a-7302-4603-bd6b-d6625de64914,4ada99a0-10c5-4c05-ac63-c6e249bebfe3,a908365a-8515-48de-951e-0d55ae2971c9,f815240b-ba8c-44ac-86a5-6a71f13f5bde,2023-03-04 11:24:57,3,Advertising other services,Advertising other service,0


In [18]:
print("Cluster centroids: \n")
order_centroids = kmeans_model.cluster_centers_.argsort()[:, ::-1]
terms = vect.get_feature_names_out()

for i in range(k):
    print("Cluster %d:" % i)
    for j in order_centroids[i, :10]: #print out 10 feature terms of each cluster
        print (' %s' % terms[j])
    print('------------')

Cluster centroids: 

Cluster 0:
 advertising
 other
 service
 ﾟヮﾟ
 europe
 etap
 etc
 ethic
 ethical
 ethically
------------
Cluster 1:
 not
 and
 utaxi
 did
 driver
 time
 ride
 the
 trip
 use
------------
Cluster 2:
 driver
 and
 didn
 driving
 the
 trip
 late
 money
 perfect
 him
------------
Cluster 3:
 best
 the
 driver
 one
 trip
 person
 city
 utaxi
 service
 ride
------------
Cluster 4:
 very
 bad
 nice
 driver
 and
 behavior
 behaviour
 friendly
 polite
 man
------------
Cluster 5:
 great
 service
 thanks
 driver
 guy
 trip
 and
 music
 car
 very
------------
Cluster 6:
 rude
 driver
 very
 and
 the
 was
 racist
 arrogant
 shouting
 angry
------------
Cluster 7:
 good
 nice
 very
 driver
 and
 really
 music
 english
 trip
 car
------------
Cluster 8:
 wrong
 address
 location
 place
 dropped
 and
 went
 arrived
 took
 driver
------------
Cluster 9:
 car
 smell
 driver
 and
 the
 bad
 inside
 clean
 nice
 very
------------
