In [1]:

import pandas as pd
import numpy as np
from sklearn.pipeline import Pipeline
from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.model_selection import train_test_split

In [2]:

# Customer-call transcriptions used for a binary text-classification task:
# given the transcribed text of a call, predict whether the call was made
# pre-purchase or post-purchase (the two values of the `label` column).
TRANSCRIPTIONS_CSV = 'data/customer_call_transcriptions.csv'
calls_df = pd.read_csv(TRANSCRIPTIONS_CSV)

In [13]:
# Preview the first five rows to sanity-check the loaded label/text columns.
calls_df.head(n=5)

Unnamed: 0,label,text
0,pre_purchase,how's it going Arthur I just placed an order w...
1,post_purchase,yeah hello I'm just wondering if I can speak t...
2,post_purchase,hey I receive my order but it's the wrong size...
3,pre_purchase,hi David I just placed an order online and I w...
4,post_purchase,hey I bought something from your website the o...


In [14]:
# Preview the last five rows to confirm the file was read through to the end.
calls_df.tail(n=5)

Unnamed: 0,label,text
97,post_purchase,yeah hello I'm just wondering if I can speak t...
98,pre_purchase,hi I recently ordered a new phone and I'm just...
99,pre_purchase,just looking to get some more information on t...
100,pre_purchase,hi I just realised I ordered the wrong compute...
101,post_purchase,hey mate how you doing I'm just calling in reg...


In [15]:
# Summarise column names, non-null counts and dtypes to check for missing values.
calls_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 102 entries, 0 to 101
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   label   102 non-null    object
 1   text    102 non-null    object
dtypes: object(2)
memory usage: 1.7+ KB


In [3]:

# Split data into 70% training and 30% testing sets.
# A fixed random_state makes the split (and every metric derived from it)
# reproducible across Restart & Run All; stratifying on the label keeps the
# pre/post-purchase ratio the same in both parts of this small dataset.
# NOTE(review): some transcripts look near-identical in the head/tail previews
# (e.g. rows 1 and 97). If the CSV contains duplicates they can land on both
# sides of the split and inflate test accuracy; confirm, and de-duplicate
# before splitting if so.
RANDOM_STATE = 42
X_train, X_test, y_train, y_test = train_test_split(
    calls_df['text'],
    calls_df['label'],
    test_size=0.3,
    random_state=RANDOM_STATE,
    stratify=calls_df['label'],
)

In [4]:

# Classification pipeline, applied in order:
#   1. 'vectorizer' - turn raw transcript text into token-count vectors
#   2. 'tfidf'      - re-weight the counts as TF-IDF features
#   3. 'clf'        - multinomial Naive Bayes classifier on those features
pipeline_steps = [
    ('vectorizer', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('clf', MultinomialNB()),
]
clf_pipeline = Pipeline(steps=pipeline_steps)

In [5]:

# Fit every pipeline stage on the training split only, so the vocabulary and
# TF-IDF weights are learned without seeing the held-out test transcripts.
clf_pipeline.fit(X=X_train, y=y_train)

Pipeline(steps=[('vectorizer', CountVectorizer()),
                ('tfidf', TfidfTransformer()), ('clf', MultinomialNB())])

In [6]:

# Predict a label for each held-out test transcript with the fitted pipeline.
y_pred = clf_pipeline.predict(X=X_test)

In [18]:

# Evaluate model performance: percentage of test transcripts whose predicted
# label equals the true label.
# NOTE(review): the holdout is only 30% of a 102-row dataset, so this figure is
# a coarse estimate; treat a perfect score with suspicion and confirm there is
# no train/test overlap.
acc = np.mean(y_pred == y_test) * 100
# One-decimal formatting keeps the message readable when the accuracy is not a
# round number (an unformatted float would print e.g. 96.7741935483871).
print(f"The model is {acc:.1f}% accurate.")

The model is 100.0% accurate.
