## Load data

In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline

import warnings
# Silence every warning for the rest of the session: no category, message or
# module filter is given, so this also hides deprecation and convergence notices.
warnings.filterwarnings('ignore')

In [2]:
# Load the labelled training data.
df = pd.read_csv("public_data.csv")
# DataFrame.info() writes its summary straight to stdout and returns None,
# so it is called bare; wrapping it in print() would emit a stray "None" line.
df.info()
print(df.head())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 11623 entries, 0 to 11622
Data columns (total 3 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   id       11623 non-null  int64 
 1   message  11623 non-null  object
 2   label    11623 non-null  object
dtypes: int64(1), object(2)
memory usage: 272.5+ KB
None
      id                                            message  \
0   8793    hi i want change my address from my credit card   
1   3083  i need 4 fruit maple oatmeal 3 cold brew froze...   
2   5932        i wish to travel next month domestic airway   
3  12077                   i need reimbursement my expenses   
4   6608              i need a copy of insurance for my car   

                 label  
0        updateaddress  
1     orderdrinkintent  
2           bookflight  
3        expensereport  
4  getproofofinsurance  


In [3]:
# Features are the `message` column; targets are the `label` column.
X, y = df['message'], df['label']

# Hold out 20% of the rows for validation; the fixed seed keeps the split repeatable.
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Report the shape of each partition: train X, train y, validation X, validation y.
for partition in (X_train, y_train, X_val, y_val):
    print(partition.shape)

(9298,)
(9298,)
(2325,)
(2325,)


In [4]:
import nltk
from nltk.corpus import stopwords
from nltk import word_tokenize
from nltk import word_tokenize, sent_tokenize
from nltk.corpus import stopwords
from nltk.stem import LancasterStemmer, WordNetLemmatizer
import unidecode
import re

In [5]:
# Preprocessing functions
def remove_unwanted_chars(df):
    """Run ``unidecode.unidecode`` over every element of ``df``.

    ``df`` is expected to support ``.apply`` element-wise (a pandas Series of
    strings in this notebook). The transformed object returned by ``apply`` is
    handed back unchanged.
    """
    return df.apply(unidecode.unidecode)

def remove_numbers(df):
    """Delete every run of digits from each element of ``df``.

    Each element is passed through a regex substitution that replaces one or
    more consecutive digits with the empty string; surrounding whitespace is
    left as-is. Returns the result of the element-wise ``apply``.
    """
    digit_run = re.compile(r'\d+')
    return df.apply(lambda text: digit_run.sub('', text))

def tokenize_message(df):
    """Tokenize each element of ``df`` with ``word_tokenize``.

    Returns the result of the element-wise ``apply``, i.e. one token sequence
    (whatever ``word_tokenize`` returns) per input element.
    """
    return df.apply(word_tokenize)

def remove_stopwords(df):
    """Drop English stopwords from each token list in ``df``.

    Parameters
    ----------
    df : object supporting element-wise ``.apply``
        Each element is an iterable of tokens (the output of
        ``tokenize_message`` in this notebook).

    Returns
    -------
    The result of ``df.apply``: for every element, a list holding the tokens
    that are not in ``stopwords.words('english')``, in their original order.
    """
    # Load the stopword list once per call instead of once per token, and keep
    # it in a frozenset so each membership test is a hash lookup rather than a
    # scan of the whole list.
    english_stopwords = frozenset(stopwords.words('english'))
    return df.apply(lambda tokens: [word for word in tokens if word not in english_stopwords])

def lemmatize_message(df):
    """Lemmatize each token list in ``df`` and join it back into one string.

    A single ``WordNetLemmatizer`` is created per call. Every token of every
    element is lemmatized with it, and the results are joined with single
    spaces, so each output element is a string again.
    """
    lemmatize = WordNetLemmatizer().lemmatize

    def _rejoin(tokens):
        # One lemma per token, in the original order, separated by single spaces.
        return ' '.join(map(lemmatize, tokens))

    return df.apply(_rejoin)

In [6]:
# Wrap each custom preprocessing function in a FunctionTransformer and chain
# them, in order, into a single scikit-learn Pipeline.
from sklearn.preprocessing import FunctionTransformer

preprocess_pipe = Pipeline(steps=[
    (step_name, FunctionTransformer(step_func))
    for step_name, step_func in (
        ('removeunwanted', remove_unwanted_chars),
        ('removenumbers', remove_numbers),
        ('tokenize', tokenize_message),
        ('removestopwords', remove_stopwords),
        ('lemmatize', lemmatize_message),
    )
])

## CatBoost

In [7]:
# Keyword arguments unpacked into CatBoostClassifier when the model is built.
catboost_params = dict(
    iterations=3000,
    learning_rate=0.01,
    verbose=500,
    random_state=42,
)

In [9]:
from catboost import CatBoostClassifier

# TF-IDF features over unigrams only. ngram_range is passed as a tuple, the
# type scikit-learn documents for it; recent releases validate constructor
# parameters at fit time and reject a list here.
vectorizer = TfidfVectorizer(min_df=.01, max_df=.8, ngram_range=(1, 1), max_features=300, stop_words='english')

# Full model: custom text preprocessing -> TF-IDF -> CatBoost classifier.
pipe = Pipeline([('preprocess', preprocess_pipe),
                 ('vec', vectorizer),
                 ('clf', CatBoostClassifier(**catboost_params))])

# Fit every stage on the training split; the fitted pipeline is the cell's displayed value.
pipe.fit(X_train, y_train)

0:	learn: 3.5065087	total: 428ms	remaining: 21m 24s
500:	learn: 0.9104903	total: 1m 51s	remaining: 9m 16s
1000:	learn: 0.6580553	total: 3m 42s	remaining: 7m 25s
1500:	learn: 0.5620362	total: 5m 27s	remaining: 5m 27s
2000:	learn: 0.5191066	total: 7m 12s	remaining: 3m 35s
2500:	learn: 0.4936887	total: 8m 59s	remaining: 1m 47s
2999:	learn: 0.4758789	total: 10m 42s	remaining: 0us


Pipeline(steps=[('preprocess',
                 Pipeline(steps=[('removeunwanted',
                                  FunctionTransformer(func=<function remove_unwanted_chars at 0x0000027046CABD30>)),
                                 ('removenumbers',
                                  FunctionTransformer(func=<function remove_numbers at 0x0000027046CABCA0>)),
                                 ('tokenize',
                                  FunctionTransformer(func=<function tokenize_message at 0x0000027046CABEE0>)),
                                 ('removestopwords',
                                  FunctionTransformer(func=<function remove_stopwords at 0x0000027046CABF70>)),
                                 ('lemmatize',
                                  FunctionTransformer(func=<function lemmatize_message at 0x0000027046CBD040>))])),
                ('vec',
                 TfidfVectorizer(max_df=0.8, max_features=300, min_df=0.01,
                                 ngram_range=[1, 1], 

### Model evaluation

In [10]:
from sklearn.metrics import confusion_matrix, classification_report

# Predict on the hold-out split and flatten the result to 1-D in one step.
pred_val = pipe.predict(X_val).reshape(-1)

# Print the confusion matrix first, then the per-class report.
for report_fn in (confusion_matrix, classification_report):
    print(report_fn(y_val, pred_val))

[[ 41   0   0 ...   0   0   0]
 [  0   1   0 ...   0   0   0]
 [  0   0 182 ...   0   0   1]
 ...
 [  0   0   0 ...  69   0   0]
 [  0   0   1 ...   0   0   0]
 [  0   0   0 ...   0   0   7]]
                       precision    recall  f1-score   support

           bookflight       0.95      0.98      0.96        42
          changeorder       0.50      0.20      0.29         5
 changeseatassignment       0.88      0.95      0.91       192
         checkbalance       0.98      0.98      0.98        50
     checkclaimstatus       1.00      0.96      0.98        90
checkoffereligibility       0.55      1.00      0.71         6
    checkserverstatus       0.90      0.90      0.90        30
         closeaccount       0.93      0.70      0.80        20
        disputecharge       0.95      0.43      0.59        42
        expensereport       0.97      0.96      0.97        77
      getboardingpass       1.00      0.99      1.00       114
 getinformationintent       0.78      0.78      0.7

### Cross-validation

In [11]:
from sklearn.model_selection import cross_val_score

# 5-fold cross-validation of the whole pipeline on the training split,
# scored with adjusted mutual information.
ami_scores = cross_val_score(
    estimator=pipe,
    X=X_train,
    y=y_train,
    scoring='adjusted_mutual_info_score',
    cv=5,
)

# Summary statistics across the folds.
avg_ami, stddev_ami = ami_scores.mean(), ami_scores.std()

# Report the per-fold scores, their mean, and a two-standard-deviation band around it.
print("Scores:", [round(fold_score, 4) for fold_score in ami_scores])
print(f"Mean score: {round(avg_ami, 4)}")
print(f"+/-2 std. dev. range within mean: ({avg_ami - 2*stddev_ami:.4f}, {avg_ami + 2*stddev_ami:.4f})")

0:	learn: 3.5077941	total: 187ms	remaining: 9m 20s
500:	learn: 0.9316720	total: 1m 28s	remaining: 7m 19s
1000:	learn: 0.6751859	total: 2m 54s	remaining: 5m 47s
1500:	learn: 0.5736263	total: 4m 20s	remaining: 4m 19s
2000:	learn: 0.5272155	total: 5m 50s	remaining: 2m 54s
2500:	learn: 0.4999535	total: 7m 20s	remaining: 1m 27s
2999:	learn: 0.4817012	total: 8m 49s	remaining: 0us
0:	learn: 3.5118956	total: 182ms	remaining: 9m 6s
500:	learn: 0.9303100	total: 1m 29s	remaining: 7m 25s
1000:	learn: 0.6732082	total: 2m 58s	remaining: 5m 56s
1500:	learn: 0.5721931	total: 4m 31s	remaining: 4m 30s
2000:	learn: 0.5286861	total: 6m 8s	remaining: 3m 4s
2500:	learn: 0.5031695	total: 7m 41s	remaining: 1m 32s
2999:	learn: 0.4840802	total: 9m 22s	remaining: 0us
0:	learn: 3.5094488	total: 193ms	remaining: 9m 40s
500:	learn: 0.9325688	total: 1m 28s	remaining: 7m 21s
1000:	learn: 0.6768367	total: 3m 2s	remaining: 6m 4s
1500:	learn: 0.5762421	total: 4m 43s	remaining: 4m 42s
2000:	learn: 0.5300739	total: 6m 35s

In [12]:
from sklearn.metrics.cluster import adjusted_rand_score, adjusted_mutual_info_score

# Agreement between true and predicted validation labels, via two
# chance-adjusted scores.
ari = adjusted_rand_score(labels_true=y_val, labels_pred=pred_val)
ami = adjusted_mutual_info_score(
    labels_true=y_val,
    labels_pred=pred_val,
    average_method='arithmetic',
)

# One metric per line, ARI first.
print("ARI: {}".format(ari), "AMI: {}".format(ami), sep="\n")

ARI: 0.8204243276265076
AMI: 0.8583743543840325


## Predictions on test data

In [13]:
# Read the messages that need predictions.
df_test = pd.read_csv(filepath_or_buffer='input_data.csv')
# Print the column summary, then leave the first five rows as the cell's displayed value.
df_test.info()
df_test.head(n=5)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2906 entries, 0 to 2905
Data columns (total 2 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   id       2906 non-null   int64 
 1   message  2906 non-null   object
dtypes: int64(1), object(1)
memory usage: 45.5+ KB


Unnamed: 0,id,message
0,12123,i have problem in excel
1,244,i need \t pesto drizzle over grilled chicken c...
2,8221,need to help order a new card as the old one e...
3,12856,i need internet plan
4,12108,my are report travel


In [14]:
# Run the fitted pipeline over the raw test messages.
pred_test = pipe.predict(df_test.loc[:, 'message'])

In [15]:
# Flatten the CatBoost predictions to a 1-D array so they can be used as a single column.
pred_test = pred_test.reshape((-1,))

In [16]:
# Pair each test id with its predicted label, then preview the first rows.
my_submission = pd.DataFrame(data={'Id': df_test['id'], 'label': pred_test})
print(my_submission.head(n=5))

      Id               label
0  12123    orderdrinkintent
1    244   orderburgerintent
2   8221         replacecard
3  12856  startserviceintent
4  12108       expensereport


In [17]:
# Write the submission as a zip archive whose single member is the CSV file.
compression_opts = {'method': 'zip', 'archive_name': 'coda_submission.csv'}
my_submission.to_csv(
    path_or_buf='coda_submission.zip',
    index=False,
    compression=compression_opts,
)