# Step 1: Import Libraries
First, ensure you have the necessary libraries installed. You can install them using pip if you haven't already:


In [3]:
!pip install pandas nltk scikit-learn transformers



# Step 2: Load the Dataset
Load the CSV file into a pandas DataFrame.

In [4]:
import pandas as pd

# Load the dataset
df = pd.read_csv('banking_dispute_dataset_zzu.csv')

# Display the first few rows of the dataset
print(df.head())

  Customer ID Customer Type Account Type Customer Region  \
0     CUS4823           New     Checking         Georgia   
1     CUS1375      Existing      Savings         Florida   
2     CUS2001      Existing     Checking           Texas   
3     CUS7425           New      Savings         Florida   
4     CUS2297      Existing      Savings        New York   

  Customer Loyalty Tier Dispute ID   Dispute Type Dispute Date  \
0                Silver    PIJDMT6  Service Issue   2024-04-06   
1                Silver    T75GRYY   Account Lock   2024-09-25   
2              Platinum    J0E9RYA  Billing Error   2024-09-17   
3              Platinum    MRWDLY1          Fraud   2024-01-10   
4                  Gold    IDERU57  Service Issue   2024-04-06   

   Dispute Resolution Time       Dispute Status  ... Transaction Type  \
0                        3  Under Investigation  ...  Online Purchase   
1                        7              Pending  ...       Withdrawal   
2                      

# Step 3: Preprocess the Text Data
Preprocess the feedback comments to clean and normalize the text.

In [5]:
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
import string

nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('punkt_tab')

def preprocess_text(text):
    # Convert text to lowercase
    text = text.lower()
    # Remove punctuation
    text = ''.join([char for char in text if char not in string.punctuation])
    # Tokenize the text
    tokens = word_tokenize(text)
    # Remove stopwords
    stop_words = set(stopwords.words('english'))
    tokens = [word for word in tokens if word not in stop_words]
    # Lemmatize the tokens
    lemmatizer = WordNetLemmatizer()
    tokens = [lemmatizer.lemmatize(token) for token in tokens]
    # Join tokens back into a single string
    return ' '.join(tokens)

# Apply preprocessing to the feedback comments
df['Feedback Comments'] = df['Feedback Comments'].apply(preprocess_text)

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


Step 4: Feature Extraction
Convert the preprocessed text into numerical features using TF-IDF Vectorizer.

In [6]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Initialize the TF-IDF Vectorizer
vectorizer = TfidfVectorizer(max_features=1000)

# Fit and transform the feedback comments
X = vectorizer.fit_transform(df['Feedback Comments']).toarray()

# Display the feature names
print(vectorizer.get_feature_names_out())

['better' 'could' 'dissatisfied' 'neutral' 'recommend' 'satisfied']


Step 5: Label Encoding
Encode the dispute types and resolution methods for modeling.

In [7]:
from sklearn.preprocessing import LabelEncoder

# Encode the dispute types
label_encoder_dispute = LabelEncoder()
df['Dispute Type'] = label_encoder_dispute.fit_transform(df['Dispute Type'])

# Encode the resolution methods
label_encoder_resolution = LabelEncoder()
df['Dispute Resolution Method'] = label_encoder_resolution.fit_transform(df['Dispute Resolution Method'])

Step 6: Train-Test Split
Split the dataset into training and testing sets.

In [8]:
from sklearn.model_selection import train_test_split

# Define the target variable (e.g., customer satisfaction score)
y = df['Customer Satisfaction Score']

# Split the data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

Step 7: Model Training
Train a machine learning model to predict customer satisfaction based on feedback.

In [9]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score

# Initialize the model
model = RandomForestClassifier(n_estimators=100, random_state=42)

# Train the model
model.fit(X_train, y_train)

# Make predictions
y_pred = model.predict(X_test)

# Evaluate the model
print("Accuracy:", accuracy_score(y_test, y_pred))
print("Classification Report:\n", classification_report(y_test, y_pred))

Accuracy: 0.13
Classification Report:
               precision    recall  f1-score   support

           1       0.08      0.04      0.05        25
           2       0.16      0.44      0.24        18
           3       0.12      0.14      0.13        22
           4       0.00      0.00      0.00        22
           5       0.08      0.08      0.08        13

    accuracy                           0.13       100
   macro avg       0.09      0.14      0.10       100
weighted avg       0.09      0.13      0.09       100



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Step 8: Advanced NLP (Optional)
For more advanced NLP tasks, you can use transformers like BERT for better text classification.

In [11]:
!pip install datasets

Collecting datasets
  Downloading datasets-3.3.2-py3-none-any.whl.metadata (19 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py311-none-any.whl.metadata (7.2 kB)
Downloading datasets-3.3.2-py3-none-any.whl (485 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m485.4/485.4 kB[0m [31m14.6 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m11.5 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading multiprocess-0.70.16-py311-none-any.whl (143 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m143.5/143.5 kB[0m [31m13.4 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading

In [15]:
from transformers import BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments
from datasets import Dataset
import torch
import pandas as pd

# Load the dataset
file_path = 'banking_dispute_dataset_zzu.csv'
df = pd.read_csv(file_path)

# Map Customer Satisfaction Score to integer labels (if needed)
def map_satisfaction_score(example):
    # Map the satisfaction score to an integer (1 to 5 scale)
    example['labels'] = example['Customer Satisfaction Score'] - 1  # Making the score range from 0 to 4
    return example

# Apply the label mapping
df = df.apply(map_satisfaction_score, axis=1)

# Load pre-trained BERT tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Tokenize the feedback comments
def tokenize_function(examples):
    return tokenizer(examples['Feedback Comments'], padding="max_length", truncation=True)

# Convert the dataset to a format suitable for transformers
dataset = Dataset.from_pandas(df)

# Apply the tokenization function to the dataset
tokenized_datasets = dataset.map(tokenize_function, batched=True)

# Define training arguments
training_args = TrainingArguments(
    output_dir='./results',
    evaluation_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=64,
    num_train_epochs=3,
    weight_decay=0.01,
    logging_dir='./logs',
    logging_steps=200,
)

# Load pre-trained BERT model for sequence classification
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=5)  # Assuming 5 satisfaction levels (0-4)

# Initialize the Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets,
    eval_dataset=tokenized_datasets,
)

# Train the model
trainer.train()


Map:   0%|          | 0/500 [00:00<?, ? examples/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss
1,No log,1.599221
2,No log,1.598098
3,No log,1.596956


TrainOutput(global_step=96, training_loss=1.6211144129435222, metrics={'train_runtime': 197.9844, 'train_samples_per_second': 7.576, 'train_steps_per_second': 0.485, 'total_flos': 394677213696000.0, 'train_loss': 1.6211144129435222, 'epoch': 3.0})

Step 9: Save the Model
Save the trained model for future use.

In [16]:
import joblib

# Save the model
joblib.dump(model, 'customer_satisfaction_model.pkl')

['customer_satisfaction_model.pkl']

In [22]:
# Save the model weights (state_dict) and architecture
torch.save(model.state_dict(), 'customer_satisfaction_model.pth')


Step 10: Load and Use the Model
Load the model and make predictions on new data.

In [23]:
import torch
from transformers import BertTokenizer, BertForSequenceClassification

# Load the tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Load the pre-trained model architecture
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=5)

# Load the model weights from the .pkl file
# model.load_state_dict(torch.load('customer_satisfaction_model.pkl'))
model.load_state_dict(torch.load('customer_satisfaction_model.pth'))


# Example feedback
new_feedback = ["Very satisfied with the resolution"]

# Preprocess and tokenize the feedback
inputs = tokenizer(new_feedback, padding=True, truncation=True, return_tensors="pt")

# Perform inference (forward pass)
with torch.no_grad():
    outputs = model(**inputs)

# Get the logits (raw predictions) from the model output
logits = outputs.logits

# Get the predicted class (index of highest logit)
predicted_class = torch.argmax(logits, dim=-1).item()

# Print the predicted customer satisfaction score (class)
print("Predicted Customer Satisfaction Score:", predicted_class)


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  model.load_state_dict(torch.load('customer_satisfaction_model.pth'))


Predicted Customer Satisfaction Score: 0


In [None]:
from google.colab import drive
drive.mount('/content/drive')