In [2]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
!nvidia-smi

Mon Jan 15 19:03:56 2024       
+---------------------------------------------------------------------------------------+
| NVIDIA-SMI 535.104.05             Driver Version: 535.104.05   CUDA Version: 12.2     |
|-----------------------------------------+----------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |         Memory-Usage | GPU-Util  Compute M. |
|                                         |                      |               MIG M. |
|   0  Tesla T4                       Off | 00000000:00:04.0 Off |                    0 |
| N/A   42C    P8              11W /  70W |      0MiB / 15360MiB |      0%      Default |
|                                         |                      |                  N/A |
+-----------------------------------------+----------------------+----------------------+
                                                                    

In [4]:
# installing dependencies
!pip install transformers requests beautifulsoup4 pandas numpy tensorflow tqdm



In [5]:
# importing libraries
import pandas as pd
import numpy as np
from tqdm.auto import tqdm
import tensorflow as tf

In [6]:
# importing dataset
df = pd.read_csv('/content/drive/MyDrive/d/consumer_reviews.csv', encoding='latin-1')

---
#Data Preprocessing
---

In [7]:
df.head(10)

Unnamed: 0,reviewer_id,store_location,latitude,longitude,date,month,year,title,review,review-label
0,0.0,US,37.09024,-95.712891,2023,6,2015 00:00:00,Great help with lost order,I had an order that was lost in transit. When ...,5
1,1.0,US,37.09024,-95.712891,2023,6,2024 00:00:00,I ordered the wrong size tee and hadï¿½ï¿½ï¿½,I ordered the wrong size tee and had difficult...,5
2,2.0,US,37.09024,-95.712891,2023,6,2017 00:00:00,These guys offer the best customerï¿½ï¿½ï¿½,These guys offer the best customer service in ...,5
3,3.0,US,37.09024,-95.712891,2023,6,2024 00:00:00,Good Stuff,Looked for an obscure phrase on a shirt. Teepu...,5
4,4.0,CA,56.130366,-106.346771,2023,6,2023 00:00:00,My order arrived in a good timelyï¿½ï¿½ï¿½,My order arrived in a good timely fashion & th...,4
5,5.0,US,37.09024,-95.712891,2023,6,2015 00:00:00,Always top notch,Always top notch customer service. Never have ...,5
6,6.0,US,37.09024,-95.712891,2023,6,2019 00:00:00,Recent review,I have messaged sellers and get no response at...,4
7,7.0,US,37.09024,-95.712891,2023,6,2023 00:00:00,Great communication,Great communication. They let me know it was a...,5
8,8.0,CA,56.130366,-106.346771,2023,6,2021 00:00:00,Awesome,"Very impressed with the quality, I had a hard ...",5
9,9.0,US,37.09024,-95.712891,2023,6,2014 00:00:00,Wonderful quality T-shirts for anï¿½ï¿½ï¿½,Wonderful quality T-shirts for an amazing pric...,5


In [8]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 278100 entries, 0 to 278099
Data columns (total 10 columns):
 #   Column          Non-Null Count   Dtype  
---  ------          --------------   -----  
 0   reviewer_id     278099 non-null  float64
 1   store_location  278100 non-null  object 
 2   latitude        278100 non-null  float64
 3   longitude       278100 non-null  float64
 4   date            278100 non-null  int64  
 5   month           278100 non-null  int64  
 6   year            278100 non-null  object 
 7   title           278091 non-null  object 
 8   review          247597 non-null  object 
 9   review-label    278100 non-null  int64  
dtypes: float64(3), int64(3), object(4)
memory usage: 21.2+ MB


In [9]:
# columns to drop:
# reviewer_id
# store_location
# latitude
# longitude
# date
# month
# year
# title

In [10]:
# dropping columns
columns_to_drop = ['reviewer_id', 'store_location', 'latitude', 'longitude', 'date', 'month', 'year', 'title']
df.drop(columns=columns_to_drop, inplace=True)

In [11]:
# renaming the column 'review-label' to 'sentiment_score'
df.rename(columns={'review-label': 'sentiment_score'}, inplace=True)

In [12]:
df.head(3)

Unnamed: 0,review,sentiment_score
0,I had an order that was lost in transit. When ...,5
1,I ordered the wrong size tee and had difficult...,5
2,These guys offer the best customer service in ...,5


In [13]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 278100 entries, 0 to 278099
Data columns (total 2 columns):
 #   Column           Non-Null Count   Dtype 
---  ------           --------------   ----- 
 0   review           247597 non-null  object
 1   sentiment_score  278100 non-null  int64 
dtypes: int64(1), object(1)
memory usage: 4.2+ MB


In [14]:
df['review'].value_counts()

Great service                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                              27
Great quality                                                                                                                                                                                                                                                                                                                                                                                                                                                                             

In [15]:
df['review'].unique()

array(['I had an order that was lost in transit. When I called for help the customer service representative got a new order placed right away. Iï¿½ï¿½ï¿½ï¿½ï¿½ï¿½ï¿½ï¿½ï¿½ve received the new order. This w',
       'I ordered the wrong size tee and had difficulties returning it on the website. I contacted Tee Public and they responded almost immediately. Resolved my issue and sent me a new tee shirt. High praise for the excellent customer service!',
       "These guys offer the best customer service in all of retail! Their products are off high quality and there's a fantastic selection of design and product options.",
       ..., 'Dudes rock. Seriously.',
       'Shipping was fast the T-shirt was just right good company for suicide boys',
       'Print of t shirt was blurry and appeared faded. Thegraphic was not scaled well for the shirt and might have been aligned dead center but optically did not look centered. The sizing of the shirt was not great. Overall I was not happy with the sh

In [16]:
# handling null values of reviews
df['review'].isnull().sum()

30503

In [17]:
df.dropna(subset=['review'], inplace=True)

In [18]:
df['review'].isnull().sum()

0

In [19]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 247597 entries, 0 to 278099
Data columns (total 2 columns):
 #   Column           Non-Null Count   Dtype 
---  ------           --------------   ----- 
 0   review           247597 non-null  object
 1   sentiment_score  247597 non-null  int64 
dtypes: int64(1), object(1)
memory usage: 5.7+ MB


In [20]:
df['sentiment_score'].value_counts()

5    181431
4     22946
1     16746
3     15743
2     10731
Name: sentiment_score, dtype: int64

In [21]:
df['sentiment_score'].isnull().sum()

0

In [22]:
# re-balancing the dataset
df[df['sentiment_score'] == 1]['review']

23        I did not receive my purchase and when I reach...
73        What started out with the prospects of being a...
125       2 out of the 4 T Shirts I received were the wr...
277       The print on the was not straight and the qual...
289       No returns accepted - tshirt color and size no...
                                ...                        
278072    They printed the image on the stomach of the s...
278075    T-shirt was too small and arrived late despite...
278077    Both shirts did not fit true to size and was v...
278083    I bought 4 t-shirts from teepublic. They haven...
278092    got no product that i buy till now, no shipmen...
Name: review, Length: 16746, dtype: object

In [23]:
df[df['sentiment_score'] == 2]['review']

47        The tee that I originally ordered was at least...
60        Subpar communication after a horrible quality ...
133       To long of wait to get shirt shirt not of comf...
189       1out of 2tshirts looks good the other doesnï¿½...
197       The t-shirt print yellowed after 1 day being i...
                                ...                        
278054    I recently reviewed TeePublic about my order b...
278079    The advertised image and the product did not r...
278081    The shirt Is soft and fits well but the print ...
278093    These shirts literally started peeling after i...
278099    Print of t shirt was blurry and appeared faded...
Name: review, Length: 10731, dtype: object

In [24]:
df[df['sentiment_score'] == 3]['review']

10        I love the shirts. I wanted womens. I got mens...
16        The timeliness of turnaround is great with Tee...
31        The white fabric part of this shirt is quite s...
39        The screen prints look good however 2 out of t...
41        Fast shipping, great quality, but the sizing w...
                                ...                        
278019    Shipping cost is expensive and it took quite s...
278021    Product was great but sending it the cheapest ...
278037    My order arrived quickly, but the hoodie and t...
278049                  not there long enough to fully rate
278050    The orders take the maximum amount of days pro...
Name: review, Length: 15743, dtype: object

In [25]:
df[df['sentiment_score'] == 4]['review']

4         My order arrived in a good timely fashion & th...
6         I have messaged sellers and get no response at...
13        Customer service was first rate but the sizes ...
15        I loved the product! The T-shirtï¿½ï¿½ï¿½ï¿½ï¿...
18        Overall the merchandise was great but on one o...
                                ...                        
278062    Pretty good, but the interface could be a lot ...
278067             Great shirts! Very comfortable and soft!
278082               Good service and items arrived on time
278087    Great company and awesome the fact you support...
278088    Like the print quality of the design. However,...
Name: review, Length: 22946, dtype: object

In [26]:
df[df['sentiment_score'] == 5]['review']

0         I had an order that was lost in transit. When ...
1         I ordered the wrong size tee and had difficult...
2         These guys offer the best customer service in ...
3         Looked for an obscure phrase on a shirt. Teepu...
5         Always top notch customer service. Never have ...
                                ...                        
278094    Great site. Supports independent artists while...
278095    Delivery is quick, arrives when promised. Shir...
278096       Great quality, fast delivery, would recommend!
278097                               Dudes rock. Seriously.
278098    Shipping was fast the T-shirt was just right g...
Name: review, Length: 181431, dtype: object

In [27]:
# dropping encoding error rows
drop_encoding_error_rows = df[df['review'].str.contains('½ï¿½ï¿', na=False)]
df.drop(drop_encoding_error_rows.index, inplace=True)
df.reset_index(drop=True, inplace=True)

In [28]:
df['review'].unique()

array(['I ordered the wrong size tee and had difficulties returning it on the website. I contacted Tee Public and they responded almost immediately. Resolved my issue and sent me a new tee shirt. High praise for the excellent customer service!',
       "These guys offer the best customer service in all of retail! Their products are off high quality and there's a fantastic selection of design and product options.",
       "Looked for an obscure phrase on a shirt. Teepublic had it. The process was easy, and the quality is better than expected. Sizes are really true, which is so very rare anymore. Don't hesitate.",
       ..., 'Great quality, fast delivery, would recommend!',
       'Dudes rock. Seriously.',
       'Shipping was fast the T-shirt was just right good company for suicide boys'],
      dtype=object)

In [29]:
# resampling all values equals to 12000
from sklearn.utils import resample

df_5 = df[df['sentiment_score'] == 5]
df_4 = df[df['sentiment_score'] == 4]
df_3 = df[df['sentiment_score'] == 3]
df_2 = df[df['sentiment_score'] == 2]
df_1 = df[df['sentiment_score'] == 1]

df_5_downsampled = resample(df_5, replace=False, n_samples=8000, random_state=42)
df_4_downsampled = resample(df_4, replace=False, n_samples=8000, random_state=42)
df_3_downsampled = resample(df_3, replace=False, n_samples=8000, random_state=42)
df_2_downsampled = resample(df_2, replace=True, n_samples=8000, random_state=42)
df_1_downsampled = resample(df_1, replace=False, n_samples=8000, random_state=42)

df = pd.concat([df_5_downsampled, df_4_downsampled, df_3_downsampled, df_2_downsampled, df_1_downsampled])

In [30]:
df['sentiment_score'].value_counts()

5    8000
4    8000
3    8000
2    8000
1    8000
Name: sentiment_score, dtype: int64

In [31]:
df.head()

Unnamed: 0,review,sentiment_score
164714,Fantastic experience! From time ordered to rec...,5
137607,Super cozy hoodie. Love the design. Get compli...,5
163916,Every shirt I have bought has been great and t...,5
155547,"I ordered 3 items, very quickly received a con...",5
84792,I bought 2 shirts from TeePublic. They are bot...,5


In [32]:
df.tail

<bound method NDFrame.tail of                                                    review  sentiment_score
164714  Fantastic experience! From time ordered to rec...                5
137607  Super cozy hoodie. Love the design. Get compli...                5
163916  Every shirt I have bought has been great and t...                5
155547  I ordered 3 items, very quickly received a con...                5
84792   I bought 2 shirts from TeePublic. They are bot...                5
...                                                   ...              ...
94507   Worst quality ever. Don't waist your money her...                1
118174  T-shirt is very thin and the design is crooked...                1
111031  Designers are victim from sales and discounts....                1
112296  The shirt I ordered does not look like the ori...                1
159454  Loved the design, ordered it for a birthday gi...                1

[40000 rows x 2 columns]>

In [33]:
df.sample(n=10, random_state=42)

Unnamed: 0,review,sentiment_score
88287,well we thought we were ordering ernie ball gu...,1
208474,Poor quality fabric; size is pretty large and ...,3
159857,The t-shirt was poor quality and the Adult Lar...,2
120181,Amazing you guys did everything smooth and fas...,5
134872,Material of shirt is very poorly made neck is ...,2
184087,"Super low quality t-shirt, extra thin, returni...",1
140453,The products are well printed. The shipment an...,4
134371,Never got a copy of my order until I had to em...,2
133717,I ordered 3 t-shirts.2 of them were supposed t...,3
42092,"Banner was pixilated and blurry, very poor qua...",2


In [34]:
df['sentiment_score'].value_counts()

5    8000
4    8000
3    8000
2    8000
1    8000
Name: sentiment_score, dtype: int64

In [35]:
df['review'].value_counts()

Poor quality masks which are far to small and dont cover the face correctly                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                            6
The sizing was extremely small. And the artwork was just average or slightly below. I will be returning it after Christmas.                                                                                                                                                                                                                                                                                    

---
#Model Fine-Tuning
---

In [36]:
# importing libraries
from transformers import BertTokenizer, TFBertForSequenceClassification

In [37]:
# Tokenization
tokenizer = BertTokenizer.from_pretrained('nlptown/bert-base-multilingual-uncased-sentiment')

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/39.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/872k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/953 [00:00<?, ?B/s]

In [38]:
# Encoding text data
max_length = 128

In [39]:
# Encoding text data
encoded_data = tokenizer(df['review'].tolist(), truncation=True, padding=True, max_length=max_length, return_tensors='tf')
labels = df['sentiment_score']

In [40]:
dataset = tf.data.Dataset.from_tensor_slices((
    {'input_ids': encoded_data['input_ids'], 'attention_mask': encoded_data['attention_mask']},
    labels
))

In [41]:
# Define the dataset size and split ratio
dataset_size = len(labels)
train_size = int(0.8 * dataset_size)
val_size = dataset_size - train_size

# Split the dataset into training and validation sets
train_dataset = dataset.take(train_size)
val_dataset = dataset.skip(train_size)

In [42]:
# Batch the datasets
batch_size = 32
train_dataset = train_dataset.shuffle(buffer_size=train_size).batch(batch_size)
val_dataset = val_dataset.batch(batch_size)

In [43]:
# Define the BERT model for sequence classification
model = TFBertForSequenceClassification.from_pretrained('nlptown/bert-base-multilingual-uncased-sentiment')

tf_model.h5:   0%|          | 0.00/670M [00:00<?, ?B/s]

All model checkpoint layers were used when initializing TFBertForSequenceClassification.

All the layers of TFBertForSequenceClassification were initialized from the model checkpoint at nlptown/bert-base-multilingual-uncased-sentiment.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertForSequenceClassification for predictions without further training.


In [44]:
# Fine-tuning the model
optimizer = tf.keras.optimizers.Adam(learning_rate=2e-5)
loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
metric = tf.keras.metrics.SparseCategoricalAccuracy('accuracy')

In [45]:
model.compile(optimizer=optimizer, loss=loss, metrics=[metric])

In [48]:
history = model.fit(
    train_dataset,
    validation_data=val_dataset,
    epochs=7
)

Epoch 1/7
Epoch 2/7
Epoch 3/7
Epoch 4/7
Epoch 5/7
Epoch 6/7
Epoch 7/7


In [49]:
# Evaluate the model
evaluation = model.evaluate(val_dataset)



In [50]:
# Save the fine-tuned model
model.save_pretrained('fb_sentimental_bert_model')

In [51]:
model.save_pretrained('/content/drive/MyDrive/d/model')
# Save the model to a local directory
# model.save_pretrained(r'C:\Users\KW\Downloads\fb_sentimental_bert_model')

In [52]:
!zip -r /fb_sentimental_bert_model.zip -i /fb_sentimental_bert_model


zip error: Invalid command arguments (nothing to select from)


----------------------------------
# Training & Testing Model
----------------------------------

In [53]:
from transformers import BertTokenizer, TFBertForSequenceClassification
import tensorflow as tf

In [54]:
# Load the fine-tuned BERT model from the saved directory
model_path = 'fb_sentimental_bert_model'
loaded_model = TFBertForSequenceClassification.from_pretrained(model_path)

Some layers from the model checkpoint at fb_sentimental_bert_model were not used when initializing TFBertForSequenceClassification: ['dropout_37']
- This IS expected if you are initializing TFBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
All the layers of TFBertForSequenceClassification were initialized from the model checkpoint at fb_sentimental_bert_model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertForSequenceClassification for predictions without further training.


In [55]:
# Tokenize the test text
tokenizer = BertTokenizer.from_pretrained('nlptown/bert-base-multilingual-uncased-sentiment')
max_length = 128

In [56]:
# test_text = "love like awesome good best nice"
test_array = ["Very cute coffee shop and restaurant. They have a lovely outdoor seating area and several tables inside.  It was fairly busy on a Tuesday morning but we were to grab the last open table. The server was so enjoyable, she chatted and joked with us and provided fast service with our ordering, drinks and meals. The food was very good. We ordered a wide variety and every meal was good to delicious. The sweet potato fries on the Chicken Burger plate were absolutely delicious, some of the best I've ever had. I definitely enjoyed this cafe, the outdoor seating, the service and the food!!",
 "Six of us met here for breakfast before our walk to Manly. We were enjoying visiting with each other so much that I apologize for not taking any photos. We all enjoyed our food, as well as our coffee and tea drinks.We were greeted immediately by a friendly server asking if we would like to sit inside or out. We said we would like inside, but weren't exactly sure how many were joining us yet- at least 4. We were told this was no problem, the more the merrier. A few minutes later when 4 more joined our party and we explained to the server we had 6, he just quickly switched our table. I really enjoyed my serenity tea, just what I needed after a long flight in from Sfo that morning. Everyone else were more interested in the lattes for expresso drinks. All said they were hot and delicious. 2 of us ordered the avo on toast. So yummy with the beetroot... I will start adding this to mine now at home, and have fond memories for my trip to Sydney. 2 friends ordered the salmon Benedict- saying it was delicious, and their go to every time they come here. 2 friends had a breakfast sandwich- I'm not sure of the name. It did look delicious. Adorable cafe, friendly staff, clean restroomsVery popular with the locals. I plan to come back the next time I'm in Sydney",
 "Great service, lovely location, and really amazing food. Words don't do justice. We had the mushroom parm bruschetta and the loaded double double. Both were so tasty. Also love the Aussie black tea and a flat white. Wish I had more mornings in Sydney to eat breakfast there. Highly recommend.",
 'Great place with delicious food and friendly staff. It is small but has outdoor seating and a relaxed ambiance. Perfect place to enjoy a cup of coffee. I am visiting Sydney for the first time but this place seems like is a local favorite.',
 'Some of the best Milkshakes me and my daughter ever tasted. MMMMMM HMMMMMMMM.',
 'Great food amazing coffee and tea. Short walk from the harbor. Staff was very friendly',
 "It was ok. Had coffee with my friends. I'm new in the area, still need to discover new places.",
 "Ricotta hot cakes! These were so yummy. I ate them pretty fast and didn't share with anyone because they were that good ;). I ordered a green smoothie to balance it all out. Smoothie was a nice way to end my brekkie at this restaurant. Others with me ordered the salmon Benedict and the smoked salmon flatbread. They were all delicious and all plates were empty. Cheers!",
 "We came for brunch twice in our week-long visit to Sydney. Everything on the menu not only sounds delicious, but is really tasty. It really gave us a sour taste of how bad breaky is in America with what's so readily available in Sydney!  Both days we went were Saturdays and there was a bit of a wait to be seated, the cafe is extremely busy for both dine-in and take-away. Service is fairly quick and servers are all friendly. The location is in Surrey Hills a couple blocks away from the bustling touristy Darling Harbor.The green smoothie is very tasty and refreshing. We tried the smoked salmon salad, the soft shell crab tacos, ricotta hotcakes, and the breaky sandwich. All were delicious, well seasoned, and a solid amount of food for the price. A definite recommend for anyone's trip into Sydney!",
 'Great staff and food.  Must try is the pan fried Gnocchi!  The staff were really friendly and the coffee was good as well']

In [57]:
predictions = []

for text in test_array:
    # Tokenize the text
    encoded_text = tokenizer.encode_plus(
        text,
        add_special_tokens=True,
        max_length=max_length,
        padding='max_length',
        truncation=True,
        return_attention_mask=True,
        return_tensors='tf'
    )

    # Get predictions for the current text sample
    prediction = loaded_model(encoded_text)
    predicted_class = tf.argmax(prediction.logits, axis=1).numpy()[0]
    predictions.append(predicted_class)

In [58]:
# Interpret the predicted class based on your sentiment score mapping
sentiment_mapping = {
    0: 'Negative',
    1: 'Negative',
    2: 'Neutral',
    3: 'Positive',
    4: 'Very positive'
}

# Map predictions to sentiments
predicted_sentiments = [sentiment_mapping[prediction] for prediction in predictions]

# Print predicted sentiments for each text sample
for idx, sentiment in enumerate(predicted_sentiments):
    print(f"{idx+1} - {sentiment}")

1 - Negative
2 - Negative
3 - Negative
4 - Negative
5 - Negative
6 - Negative
7 - Negative
8 - Negative
9 - Negative
10 - Negative


In [59]:
from collections import Counter
import json

# Count occurrences of each sentiment
sentiment_counts = Counter(predicted_sentiments)

# Convert to a JSON-like format
sentiment_json = {sentiment: count for sentiment, count in sentiment_counts.items()}

# Print the JSON representation
print(json.dumps(sentiment_json, indent=2))


{
  "Negative": 10
}
