<a href="https://colab.research.google.com/github/omid-sakaki-ghazvini/Projects/blob/main/Classification_with_FineTuning_BERT.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# 1. Install Dependencies and Setup

<div style="direction:rtl">
<font color='green' size="5px">
 کتابخانه های مورد نیاز را نصب میکنیم
    </font>
</div>

In [None]:
!pip install transformers

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import itertools
from collections import Counter
import re, string

import tensorflow as tf
from transformers import BertTokenizer, AutoTokenizer, TFAutoModel

import warnings

# Silence all warnings to keep the cell output readable.
warnings.filterwarnings('ignore')

In [None]:
# Print the NVIDIA driver / GPU status to confirm that a GPU is attached to this runtime
!nvidia-smi

<div style="direction:rtl">
<font color='green' size="5px">
    از لینک زیر دیتاست را دانلود کرده و در پوشه هم مسیر همین ژوپیتر نوت بوک قرار دهید یا خط فرمان زیر را اجرا نمایید
    </font>
</div>

In [None]:
import kagglehub

# Source: https://www.kaggle.com/datasets/jagathratchakan/indian-airlines-customer-reviews
DATASET_HANDLE = "jagathratchakan/indian-airlines-customer-reviews"

# Fetch the latest version of the dataset and report where the files were stored.
path = kagglehub.dataset_download(DATASET_HANDLE)
print("Path to dataset files:", path)

# 2. Load Data

<div style="direction:rtl">
<font color='green' size="5px">
توسط خط فرمان زیر، دیتا را فراخوانی میکنیم
    </font>
</div>

In [None]:
# Read the CSV from the folder returned by kagglehub (`path`, set in the download
# cell) instead of a hard-coded /kaggle/input location, which only exists on Kaggle.
df = pd.read_csv(f"{path}/Indian_Domestic_Airline.csv")
df.head(10)

In [None]:
# Summarize the columns: dtype and non-null count of each one
df.info()

# 3. Data Analysis

In [None]:
# Number of missing entries in every column
missing_values_count = df.isna().sum()
missing_values_count

In [None]:
# Share of missing cells across the whole table, in percent.
# `df.size` (rows * columns) replaces `np.product(df.shape)`: `np.product` was
# deprecated in NumPy 1.25 and removed in NumPy 2.0.
total_cells = df.size
total_missing = missing_values_count.sum()

percent_missing = (total_missing / total_cells) * 100
print(percent_missing)

In [None]:
# Discard the reviews that have no rating
df = df.dropna(subset=['Rating - 10'])

In [None]:
def pos_neg(x):
    """Convert a numeric rating into a sentiment label.

    Args:
        x: Numeric rating; it is compared against the threshold 5.

    Returns:
        "Positive" when ``x >= 5``, otherwise "Negative".
    """
    # The label was previously misspelled "Postive"; the string is shown in the
    # pie chart and is later concatenated into the model's input text.
    return "Positive" if x >= 5 else "Negative"

# Replace every numeric rating with its sentiment label
df["Rating - 10"] = df["Rating - 10"].map(pos_neg)

# Count the reviews per label
cnt = dict(Counter(df["Rating - 10"]))
print(cnt)

labels = list(cnt)
sizes = list(cnt.values())
colors = ['#3fba36', '#66b3ff','#ffcc99','#ff9999', '#d44444']

# Donut chart: a pie chart with a white disc drawn over its centre
fig1, ax1 = plt.subplots()
ax1.pie(sizes, labels=labels, colors=colors, autopct='%1.1f%%', startangle=90)
centre_circle = plt.Circle((0, 0), 0.70, fc='white')
ax1.add_artist(centre_circle)
ax1.axis('equal')
fig1.tight_layout()
plt.show()

In [None]:
# One donut chart per airline showing how its 'Recommond' answers are distributed
colors = ['#3fba36', '#66b3ff','#ffcc99','#ff9999', '#d44444']

for airline in df['AirLine_Name'].unique():
    airline_reviews = df[df['AirLine_Name'] == airline]

    # Count the reviews per 'Recommond' value for this airline
    cnt = dict(Counter(airline_reviews["Recommond"]))
    print('Recommond: ',cnt)

    labels = list(cnt)
    sizes = list(cnt.values())

    fig1, ax1 = plt.subplots()
    ax1.pie(sizes, labels=labels, colors=colors, autopct='%1.1f%%', startangle=90)
    # A white disc over the centre turns the pie into a donut
    ax1.add_artist(plt.Circle((0, 0), 0.70, fc='white'))
    ax1.axis('equal')
    ax1.set_title(airline)
    fig1.tight_layout()
    plt.show()

# 4. Fine-Tuning BERT

## 4.1. Data preparation

In [None]:
# Strip the double quotes from the titles and the verification banner from the
# reviews. `regex=False` forces every pattern to be matched as a literal string:
# under regex matching (the pandas default before 2.0) the "|" in the banner is
# an alternation operator, so the pipe character itself would be left behind.
df['Title'] = df['Title'].str.replace('"', '', regex=False)

df['Review'] = df['Review'].str.replace("✅ Trip Verified |", "", regex=False)
df['Review'] = df['Review'].str.replace("not verified |", "", regex=False)
# NOTE(review): the match is case-sensitive; if the raw data spells the second
# banner "Not Verified |" it is not removed here — confirm against the CSV.

In [None]:
def wordopt(text):
    """Normalize a raw text string before tokenization.

    Steps, in order: lowercase; drop "[...]" spans; drop URLs; drop "<...>"
    tags; replace every remaining non-word character with a space; remove
    underscores (the only punctuation that counts as a word character);
    remove every token that contains a digit.

    Args:
        text: Raw text. Non-string values (for example NaN from a missing
            cell) are treated as empty text.

    Returns:
        The cleaned, lowercased string. Runs of whitespace are not collapsed.
    """
    if not isinstance(text, str):
        return ''

    text = text.lower()
    text = re.sub(r'\[.*?\]', '', text)
    # URLs and tags must be removed while their punctuation is still present.
    # Previously the \W substitution ran first and destroyed "://", "." and
    # "<>", so these two patterns could never match.
    text = re.sub(r'https?://\S+|www\.\S+', '', text)
    text = re.sub(r'<.*?>+', '', text)
    # Newlines are non-word characters too, so they become spaces here and no
    # separate newline step is needed.
    text = re.sub(r'\W', ' ', text)
    text = re.sub('[%s]' % re.escape(string.punctuation), '', text)
    text = re.sub(r'\w*\d\w*', '', text)
    return text

# Join the case-folded airline name, rating label, title and review into one
# space-separated string per row, then clean it with wordopt.
text_parts = [df[column].str.casefold()
              for column in ('AirLine_Name', 'Rating - 10', 'Title', 'Review')]

combined_text = text_parts[0]
for part in text_parts[1:]:
    combined_text = combined_text + ' ' + part

df['text'] = combined_text.apply(wordopt)

In [None]:
# Columns that are no longer needed once their content lives in 'text'
unused_columns = ['AirLine_Name','Rating - 10','Title', 'Name', 'Date', 'Review']
# Numeric codes for the target column
recommend_codes = {'yes': 1, 'no': 0}

df = df.drop(unused_columns, axis=1)
df['Recommond'] = df['Recommond'].map(recommend_codes)
df.head(10)

In [None]:
# Number of whitespace-separated words in each cleaned text
textlen = df['text'].str.split().str.len()

# Histogram of the text lengths
fig, ax = plt.subplots(figsize=(8, 5))
ax.hist(textlen, edgecolor="black");

In [None]:
# Fixed token length of every encoded text: passed to the tokenizer as `max_length`
# (texts are padded/truncated to it) and used as the shape of the model's input layers.
SEQ_LEN = 30

## 4.2. Encoding the text for the BERT model

In [None]:
# Load the tokenizer of the 'bert-base-uncased' checkpoint; do_lower_case=True asks it to lowercase its input.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True)

In [None]:
# Tokenizer settings: every text is padded/truncated to SEQ_LEN tokens and the
# result is returned as TensorFlow tensors without token type ids.
tokenizer_options = dict(
    add_special_tokens=True,
    padding='max_length',
    truncation=True,
    max_length=SEQ_LEN,
    return_token_type_ids=False,
    return_tensors='tf',
)

train_encoded_inputs = tokenizer(df['text'].tolist(), **tokenizer_options)

## 4.3. Creating the TensorFlow dataset

In [None]:
# Pair the encoded inputs with the 'Recommond' labels and slice them into one dataset element per text
train_dataset = tf.data.Dataset.from_tensor_slices((train_encoded_inputs, df['Recommond'].values))

In [None]:
def map_bert(inputs, labels):
    """Keep only the entries the model consumes and pass the labels through.

    Args:
        inputs: Mapping that provides 'input_ids' and 'attention_mask'.
        labels: Target value(s) that belong to ``inputs``.

    Returns:
        A ``(features, labels)`` tuple in which ``features`` is a dict with
        exactly the keys 'input_ids' and 'attention_mask'.
    """
    features = {key: inputs[key] for key in ('input_ids', 'attention_mask')}
    return features, labels

In [None]:
# Apply map_bert to every element so each one becomes an (inputs dict, label) pair
train_dataset = train_dataset.map(map_bert)

In [None]:
# Print the first two dataset elements as a sanity check
for t in train_dataset.take(2):
    print (t)

## 4.4. Creating the train and validation split

In [None]:
# Shuffle once, then group into batches of 64.
# `reshuffle_each_iteration=False` keeps the element order identical on every pass
# over `dataset`. With the default (True) the data is reshuffled on each epoch, so
# the take()/skip() split that follows would select different rows for training and
# validation every epoch and validation samples would leak into training.
# The fixed seed makes the split reproducible across runs.
dataset = train_dataset.shuffle(100000, seed=42, reshuffle_each_iteration=False).batch(64)

In [None]:
# Length of the batched dataset, i.e. the number of batches
DS_LEN = len(dataset)
DS_LEN

In [None]:
# Fraction of the batches used for training; the remaining batches are for validation
SPLIT = 0.8
n_train_batches = round(DS_LEN * SPLIT)

train_ds = dataset.take(n_train_batches)
val_ds = dataset.skip(n_train_batches)

print(train_ds)

# 5. Modeling

In [None]:
# Load the pretrained 'bert-base-uncased' weights as a TensorFlow model
bert = TFAutoModel.from_pretrained('bert-base-uncased')

In [None]:
# Two integer inputs of length SEQ_LEN: the token ids and the attention mask
input_ids = tf.keras.layers.Input(shape=(SEQ_LEN,), dtype='int32', name='input_ids')
mask = tf.keras.layers.Input(shape=(SEQ_LEN,), dtype='int32', name='attention_mask')

# First output of BERT, reduced to the vector at sequence position 0
embeddings = bert([input_ids, mask])[0]
cls_token = embeddings[:, 0, :]

# Classification head, applied layer by layer to the position-0 vector
head_layers = [
    tf.keras.layers.BatchNormalization(),
    tf.keras.layers.Dense(128, activation='relu'),
    tf.keras.layers.Dropout(0.2),
    tf.keras.layers.Dense(32, activation='relu'),
]
x = cls_token
for head_layer in head_layers:
    x = head_layer(x)

# Single sigmoid unit as the output
y = tf.keras.layers.Dense(1, activation='sigmoid')(x)

# Assemble the end-to-end model
model = tf.keras.Model(inputs=[input_ids, mask], outputs=y)

In [None]:
# Show the layers and parameter counts of the assembled model
model.summary()

In [None]:
# Freeze the pretrained BERT weights so that only the layers added on top are trained.
# The layer is addressed through `bert` itself rather than `model.layers[2]`, whose
# position depends on the order in which Keras lists the model's layers.
bert.trainable = False
model.summary()

# 6. Compile, Train and Evaluate the Model

In [None]:
# Adam optimizer with a learning rate of 5e-4
optimizer = tf.keras.optimizers.Adam(learning_rate=5e-4)
# Binary cross-entropy loss for the single sigmoid output
loss = tf.keras.losses.BinaryCrossentropy()
# Binary accuracy, reported under the name 'accuracy'
metric = tf.keras.metrics.BinaryAccuracy(name='accuracy')

In [None]:
# Attach the optimizer, loss and metric defined above to the model
model.compile(optimizer=optimizer, loss=loss, metrics=[metric])

In [None]:
callbacks = [
    # Multiply the learning rate by 0.1 when val_loss has not improved by at
    # least 0.001 for 2 epochs.
    tf.keras.callbacks.ReduceLROnPlateau(monitor='val_loss', factor=0.1,
                                         patience=2, min_delta=0.001),
    # Stop after 5 epochs without such an improvement. restore_best_weights=True
    # rolls the model back to the epoch with the lowest val_loss; without it the
    # model keeps the weights of the last epoch, 5 epochs past the best one.
    tf.keras.callbacks.EarlyStopping(monitor='val_loss', patience=5,
                                     min_delta=0.001, restore_best_weights=True),
]

In [None]:
# Train for at most 50 epochs with the callbacks defined above; `history` holds the per-epoch training and validation metrics

history = model.fit(train_ds, validation_data = val_ds, epochs = 50, callbacks = callbacks)

In [None]:
# Training curves: accuracy in the left panel, loss in the right panel
epochs = history.epoch
fig, (ax_acc, ax_loss) = plt.subplots(1, 2, figsize=(15, 6))

panels = (
    (ax_acc, 'accuracy', 'val_accuracy', "Accuracy"),
    (ax_loss, 'loss', 'val_loss', "Loss"),
)
for ax, train_key, val_key, title in panels:
    ax.plot(epochs, history.history[train_key], label="Train")
    ax.plot(epochs, history.history[val_key], label="Val")
    ax.legend()
    ax.set_title(title)

plt.show()

In [None]:
# Loss and accuracy of the trained model on the validation batches
val_loss, accuracy = model.evaluate(val_ds)

print('accuracy :', accuracy)