In [1]:
# Import Libraries
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder, OneHotEncoder, StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.corpus import stopwords
import nltk
import re
import joblib

In [2]:
# Make sure the NLTK English stopword corpus is available locally; it is read
# later by clean_text via stopwords.words('english').
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/nehadhananju/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [3]:
# Read the raw dataset from disk into a DataFrame
raw_csv_path = '/Users/nehadhananju/Desktop/DeepCSAT/data/raw/data.csv'
data = pd.read_csv(raw_csv_path)

In [4]:
# Quick sanity check: overall dimensions and per-column null counts
missing_per_column = data.isnull().sum()
print("Initial Shape:", data.shape)
print("\nMissing Values:\n", missing_per_column)


Initial Shape: (85907, 20)

Missing Values:
 Unique id                      0
channel_name                   0
category                       0
Sub-category                   0
Customer Remarks           57165
Order_id                   18232
order_date_time            68693
Issue_reported at              0
issue_responded                0
Survey_response_Date           0
Customer_City              68828
Product_category           68711
Item_price                 68701
connected_handling_time    85665
Agent_name                     0
Supervisor                     0
Manager                        0
Tenure Bucket                  0
Agent Shift                    0
CSAT Score                     0
dtype: int64


## Handling Missing Values

In [6]:
# Drop columns treated as low-value for modelling.
# errors='ignore' keeps this cell re-runnable once the columns are already gone.
unused_columns = ['connected_handling_time', 'Manager', 'Supervisor', 'Order_id', 'order_date_time']
data = data.drop(columns=unused_columns, errors='ignore')

In [8]:
# Impute missing fields

# Text columns: substitute an explicit placeholder label
placeholder_by_column = {
    'Customer Remarks': 'No Remarks',
    'Customer_City': 'Unknown',
    'Product_category': 'Unknown',
}
for column_name, placeholder in placeholder_by_column.items():
    data[column_name] = data[column_name].fillna(placeholder)

# Numeric column: substitute the column median
median_item_price = data['Item_price'].median()
data['Item_price'] = data['Item_price'].fillna(median_item_price)



## Feature Engineering

In [14]:
# Feature Engineering - Time Features
# Parse both timestamp columns day-first; values that cannot be parsed become NaT.
for timestamp_column in ('Issue_reported at', 'issue_responded'):
    data[timestamp_column] = pd.to_datetime(data[timestamp_column], dayfirst=True, errors='coerce')



In [16]:
# Response time in hours = responded - reported; missing results take the median
elapsed = data['issue_responded'] - data['Issue_reported at']
elapsed_hours = elapsed.dt.total_seconds() / 3600
data['response_time_hours'] = elapsed_hours.fillna(elapsed_hours.median())


In [17]:
# Remove the datetime columns that are not used as features
raw_datetime_columns = ['Issue_reported at', 'issue_responded', 'Survey_response_Date']
data = data.drop(columns=raw_datetime_columns, errors='ignore')


## Text Cleaning

In [18]:
#  Text Cleaning - Customer Remarks
def _english_stopwords():
    """Return NLTK's English stopwords as a frozenset, built once and then reused."""
    cached = getattr(_english_stopwords, "_cache", None)
    if cached is None:
        # Relies on the notebook-level `from nltk.corpus import stopwords`.
        cached = frozenset(stopwords.words('english'))
        _english_stopwords._cache = cached
    return cached


def clean_text(text, stop_words=None):
    """Normalise a remark: lowercase it, keep only a-z and whitespace, drop stopwords.

    Parameters
    ----------
    text : any
        Value to clean; it is converted with str() first.
    stop_words : collection of str, optional
        Words to remove. When omitted, NLTK's English stopword list is used.

    Returns
    -------
    str
        The remaining words joined by single spaces.
    """
    if stop_words is None:
        # Loading the corpus list per word (as a list lookup) is very slow on a
        # large column; fetch it once and test membership against a set instead.
        stop_words = _english_stopwords()
    text = str(text).lower()
    text = re.sub(r'[^a-z\s]', '', text)
    return ' '.join(word for word in text.split() if word not in stop_words)

# Run the cleaner over every customer remark
remarks_raw = data['Customer Remarks']
data['cleaned_remarks'] = remarks_raw.apply(clean_text)


In [19]:
# TF-IDF vectorization for remarks (vocabulary capped via max_features=100).
# NOTE(review): the vectorizer is fitted on the full dataset before the
# train/test split, so test-set text influences the vocabulary and IDF weights.
vectorizer = TfidfVectorizer(max_features=100)
remarks_tfidf = vectorizer.fit_transform(data['cleaned_remarks']).toarray()
# Reuse data's index so the later column-wise pd.concat lines rows up one-to-one
# even if `data` stops having a default 0..n-1 index (e.g. after filtering rows).
remarks_df = pd.DataFrame(
    remarks_tfidf,
    index=data.index,
    columns=[f'tfidf_{i}' for i in range(remarks_tfidf.shape[1])],
)



## Encoding Categorical Variables

In [22]:
cat_cols = [ 'channel_name', 'category', 'Sub-category',  'Customer_City', 'Product_category',  'Agent_name', 'Tenure Bucket', 'Agent Shift']
# Use LabelEncoder for simplicity.
# One encoder per column: a single shared encoder is overwritten by every
# fit_transform call, so only the last column's mapping would survive and the
# other columns could not be encoded consistently on new data.
label_encoders = {}
for col in cat_cols:
    label_encoders[col] = LabelEncoder()
    data[col] = label_encoders[col].fit_transform(data[col])

# Persist every column's mapping next to the other preprocessing artifacts.
joblib.dump(label_encoders, "../models/label_encoders.pkl")

# `le` stays bound to the encoder of the last column, exactly as before, so code
# that still refers to `le` keeps working unchanged.
le = label_encoders[cat_cols[-1]]


## Scaling Numerical Features

In [25]:
# Standardise the numeric features with a StandardScaler fitted on these columns
num_cols = ['Item_price', 'response_time_hours']
scaler = StandardScaler()
scaler.fit(data[num_cols])
data[num_cols] = scaler.transform(data[num_cols])

## Combining all features

In [26]:
# Combine All Features
# Feature matrix = encoded categoricals + scaled numerics + TF-IDF columns
feature_columns = cat_cols + num_cols
X = pd.concat([data[feature_columns], remarks_df], axis=1)
y = data['CSAT Score']

print("Feature matrix shape:", X.shape)
print("Target variable distribution:\n", y.value_counts())


Feature matrix shape: (85907, 110)
Target variable distribution:
 CSAT Score
5    59617
1    11230
4    11219
3     2558
2     1283
Name: count, dtype: int64


## Train-Test Split

In [28]:
# Stratified split on the target (20% held out) with a fixed random_state
split_frames = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)
X_train, X_test, y_train, y_test = split_frames
print("Train shape:", X_train.shape, "Test shape:", X_test.shape)

# Save Preprocessed Data and Objects
csv_targets = [
    (X_train, "/Users/nehadhananju/Desktop/DeepCSAT/data/processed/X_train.csv"),
    (X_test, "/Users/nehadhananju/Desktop/DeepCSAT/data/processed/X_test.csv"),
    (y_train, "/Users/nehadhananju/Desktop/DeepCSAT/data/processed/y_train.csv"),
    (y_test, "/Users/nehadhananju/Desktop/DeepCSAT/data/processed/y_test.csv"),
]
for frame, destination in csv_targets:
    frame.to_csv(destination, index=False)


Train shape: (68725, 110) Test shape: (17182, 110)


In [29]:
# Save the fitted scaler, TF-IDF vectorizer and label encoder with joblib.
# NOTE(review): `le` holds the fitted state for only one column (the last entry
# of cat_cols), so this pickle cannot encode/decode the other categorical columns.
joblib.dump(scaler, "../models/scaler.pkl")
joblib.dump(vectorizer, "../models/vectorizer.pkl")
joblib.dump(le,"../models/label_encoder.pkl")

['../models/label_encoder.pkl']