# Step 1: Import Libraries
First, ensure you have the necessary libraries installed. You can install them using pip if you haven't already:


In [1]:
!pip install pandas nltk scikit-learn transformers

Collecting nltk
  Downloading nltk-3.9.1-py3-none-any.whl.metadata (2.9 kB)
Collecting transformers
  Downloading transformers-4.49.0-py3-none-any.whl.metadata (44 kB)
Collecting regex>=2021.8.3 (from nltk)
  Downloading regex-2024.11.6-cp312-cp312-win_amd64.whl.metadata (41 kB)
Collecting huggingface-hub<1.0,>=0.26.0 (from transformers)
  Downloading huggingface_hub-0.29.1-py3-none-any.whl.metadata (13 kB)
Collecting tokenizers<0.22,>=0.21 (from transformers)
  Downloading tokenizers-0.21.0-cp39-abi3-win_amd64.whl.metadata (6.9 kB)
Collecting safetensors>=0.4.1 (from transformers)
  Downloading safetensors-0.5.2-cp38-abi3-win_amd64.whl.metadata (3.9 kB)
Downloading nltk-3.9.1-py3-none-any.whl (1.5 MB)
   ---------------------------------------- 0.0/1.5 MB ? eta -:--:--
   ------ --------------------------------- 0.3/1.5 MB ? eta -:--:--
   ---------------------------------------- 1.5/1.5 MB 4.7 MB/s eta 0:00:00
Downloading transformers-4.49.0-py3-none-any.whl (10.0 MB)
   ------------


[notice] A new release of pip is available: 24.3.1 -> 25.0.1
[notice] To update, run: python.exe -m pip install --upgrade pip


# Step 2: Load the Dataset
Load the CSV file into a pandas DataFrame.

In [2]:
import pandas as pd

# Load the dataset
df = pd.read_csv('banking_dispute_dataset_zzu.csv')

# Display the first few rows of the dataset
print(df.head())

  Customer ID Customer Type Account Type Customer Region  \
0     CUS4823           New     Checking         Georgia   
1     CUS1375      Existing      Savings         Florida   
2     CUS2001      Existing     Checking           Texas   
3     CUS7425           New      Savings         Florida   
4     CUS2297      Existing      Savings        New York   

  Customer Loyalty Tier Dispute ID   Dispute Type Dispute Date  \
0                Silver    PIJDMT6  Service Issue   2024-04-06   
1                Silver    T75GRYY   Account Lock   2024-09-25   
2              Platinum    J0E9RYA  Billing Error   2024-09-17   
3              Platinum    MRWDLY1          Fraud   2024-01-10   
4                  Gold    IDERU57  Service Issue   2024-04-06   

   Dispute Resolution Time       Dispute Status  ... Transaction Type  \
0                        3  Under Investigation  ...  Online Purchase   
1                        7              Pending  ...       Withdrawal   
2                      

# Step 3: Preprocess the Text Data
Preprocess the feedback comments to clean and normalize the text.

In [6]:
import nltk

# Download the required NLTK data packages
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\hamim\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\hamim\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\hamim\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [7]:
# Test the installation by accessing the Brown Corpus
from nltk.corpus import brown
print(brown.words()[:10])  # Print the first 10 words of the Brown Corpus

LookupError: 
**********************************************************************
  Resource [93mbrown[0m not found.
  Please use the NLTK Downloader to obtain the resource:

  [31m>>> import nltk
  >>> nltk.download('brown')
  [0m
  For more information see: https://www.nltk.org/data.html

  Attempted to load [93mcorpora/brown[0m

  Searched in:
    - 'C:\\Users\\hamim/nltk_data'
    - 'C:\\Users\\hamim\\AppData\\Local\\Programs\\Python\\Python312\\nltk_data'
    - 'C:\\Users\\hamim\\AppData\\Local\\Programs\\Python\\Python312\\share\\nltk_data'
    - 'C:\\Users\\hamim\\AppData\\Local\\Programs\\Python\\Python312\\lib\\nltk_data'
    - 'C:\\Users\\hamim\\AppData\\Roaming\\nltk_data'
    - 'C:\\nltk_data'
    - 'D:\\nltk_data'
    - 'E:\\nltk_data'
**********************************************************************


Step 4: Feature Extraction
Convert the preprocessed text into numerical features using TF-IDF Vectorizer.

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Initialize the TF-IDF Vectorizer
vectorizer = TfidfVectorizer(max_features=1000)

# Fit and transform the feedback comments
X = vectorizer.fit_transform(df['Feedback Comments']).toarray()

# Display the feature names
print(vectorizer.get_feature_names_out())

Step 5: Label Encoding
Encode the dispute types and resolution methods for modeling.

In [None]:
from sklearn.preprocessing import LabelEncoder

# Encode the dispute types
label_encoder_dispute = LabelEncoder()
df['Dispute Type'] = label_encoder_dispute.fit_transform(df['Dispute Type'])

# Encode the resolution methods
label_encoder_resolution = LabelEncoder()
df['Dispute Resolution Method'] = label_encoder_resolution.fit_transform(df['Dispute Resolution Method'])

Step 6: Train-Test Split
Split the dataset into training and testing sets.

In [None]:
from sklearn.model_selection import train_test_split

# Define the target variable (e.g., customer satisfaction score)
y = df['Customer Satisfaction Score']

# Split the data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

Step 7: Model Training
Train a machine learning model to predict customer satisfaction based on feedback.

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score

# Initialize the model
model = RandomForestClassifier(n_estimators=100, random_state=42)

# Train the model
model.fit(X_train, y_train)

# Make predictions
y_pred = model.predict(X_test)

# Evaluate the model
print("Accuracy:", accuracy_score(y_test, y_pred))
print("Classification Report:\n", classification_report(y_test, y_pred))

Step 8: Advanced NLP (Optional)
For more advanced NLP tasks, you can use transformers like BERT for better text classification.

In [None]:
from transformers import BertTokenizer, BertForSequenceClassification
from transformers import Trainer, TrainingArguments
import torch

# Load pre-trained BERT tokenizer and model
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=6)

# Tokenize the feedback comments
def tokenize_function(examples):
    return tokenizer(examples['Feedback Comments'], padding="max_length", truncation=True)

# Convert the dataset to a format suitable for transformers
from datasets import Dataset

dataset = Dataset.from_pandas(df)
tokenized_datasets = dataset.map(tokenize_function, batched=True)

# Define training arguments
training_args = TrainingArguments(
    output_dir='./results',
    evaluation_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=64,
    num_train_epochs=3,
    weight_decay=0.01,
)

# Initialize the Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets,
    eval_dataset=tokenized_datasets,
)

# Train the model
trainer.train()

Step 9: Save the Model
Save the trained model for future use.

In [None]:
import joblib

# Save the model
joblib.dump(model, 'customer_satisfaction_model.pkl')

Step 10: Load and Use the Model
Load the model and make predictions on new data.

In [None]:
# Load the model
model = joblib.load('customer_satisfaction_model.pkl')

# Example prediction
new_feedback = ["Very satisfied with the resolution"]
new_feedback_preprocessed = preprocess_text(new_feedback[0])
new_feedback_vectorized = vectorizer.transform([new_feedback_preprocessed])
prediction = model.predict(new_feedback_vectorized)
print("Predicted Customer Satisfaction Score:", prediction)