## Load data

In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline

import warnings
warnings.filterwarnings('ignore')

In [2]:
# Load the labelled intent dataset.
df = pd.read_csv("public_data.csv")
# DataFrame.info() prints its summary itself and returns None, so it is called
# directly; wrapping it in print() would emit an extra "None" line.
df.info()
print(df.head())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 11623 entries, 0 to 11622
Data columns (total 3 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   id       11623 non-null  int64 
 1   message  11623 non-null  object
 2   label    11623 non-null  object
dtypes: int64(1), object(2)
memory usage: 272.5+ KB
None
      id                                            message  \
0   8793    hi i want change my address from my credit card   
1   3083  i need 4 fruit maple oatmeal 3 cold brew froze...   
2   5932        i wish to travel next month domestic airway   
3  12077                   i need reimbursement my expenses   
4   6608              i need a copy of insurance for my car   

                 label  
0        updateaddress  
1     orderdrinkintent  
2           bookflight  
3        expensereport  
4  getproofofinsurance  


In [3]:
# Features are the raw message text; the target is the intent label.
X, y = df['message'], df['label']

# Hold out 20% of the rows for validation; random_state is fixed at 42.
X_train, X_val, y_train, y_val = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Print the shape of each split: train X, train y, validation X, validation y.
for split in (X_train, y_train, X_val, y_val):
    print(split.shape)

(9298,)
(9298,)
(2325,)
(2325,)


In [4]:
import nltk
from nltk.corpus import stopwords
from nltk import word_tokenize
from nltk import word_tokenize, sent_tokenize
from nltk.corpus import stopwords
from nltk.stem import LancasterStemmer, WordNetLemmatizer
import unidecode
import re

In [5]:
# Preprocessing functions
def remove_unwanted_chars(df):
    """Run ``unidecode.unidecode`` over every element of ``df``.

    Args:
        df: Object exposing ``.apply``; each element is passed to
            ``unidecode.unidecode`` as its only argument.

    Returns:
        The result of ``df.apply`` with ``unidecode.unidecode`` applied
        element by element.
    """
    return df.apply(unidecode.unidecode)

def remove_numbers(df):
    r"""Delete every run of digits from each element of ``df``.

    Args:
        df: Object exposing ``.apply`` whose elements are strings.

    Returns:
        The result of ``df.apply`` where every match of ``\d+`` in each
        string has been replaced by the empty string. Whitespace around a
        removed number is left as-is, so doubled spaces can remain.
    """
    digit_run = re.compile(r'\d+')

    def strip_digits(text):
        return digit_run.sub('', text)

    return df.apply(strip_digits)

def tokenize_message(df):
    """Tokenize each element of ``df`` with NLTK's ``word_tokenize``.

    Args:
        df: Object exposing ``.apply``; each element is passed to
            ``word_tokenize`` as its only argument.

    Returns:
        The result of ``df.apply`` holding, per element, whatever
        ``word_tokenize`` returned for it.
    """
    return df.apply(word_tokenize)

def remove_stopwords(df):
    """Drop English stopwords from each token sequence in ``df``.

    Args:
        df: Object exposing ``.apply`` whose elements are iterables of
            string tokens.

    Returns:
        The result of ``df.apply`` where each element is a list of the
        tokens that are not in NLTK's English stopword list. The comparison
        is an exact match, so it is case-sensitive.
    """
    # Build the stopword collection once, as a set. Calling
    # stopwords.words('english') inside the comprehension would rebuild the
    # list for every single token and scan it linearly each time.
    english_stopwords = set(stopwords.words('english'))
    return df.apply(
        lambda tokens: [word for word in tokens if word not in english_stopwords]
    )

def lemmatize_message(df):
    """Lemmatize each token and join the tokens back into one string.

    Args:
        df: Object exposing ``.apply`` whose elements are iterables of
            string tokens.

    Returns:
        The result of ``df.apply`` where each element is a single string:
        the ``WordNetLemmatizer.lemmatize`` output for every token, joined
        with single spaces.
    """
    wordnet = WordNetLemmatizer()

    def rejoin_lemmas(tokens):
        return ' '.join(wordnet.lemmatize(token) for token in tokens)

    return df.apply(rejoin_lemmas)

In [6]:
# Make FunctionTransformers for custom preprocessing functions
from sklearn.preprocessing import FunctionTransformer

# Ordered text-cleaning stages. Each plain function is wrapped in a
# FunctionTransformer so that it can serve as a Pipeline step.
preprocess_pipe = Pipeline(steps=[
    (step_name, FunctionTransformer(step_func))
    for step_name, step_func in (
        ('removeunwanted', remove_unwanted_chars),
        ('removenumbers', remove_numbers),
        ('tokenize', tokenize_message),
        ('removestopwords', remove_stopwords),
        ('lemmatize', lemmatize_message),
    )
])

## Classification model building

In [7]:
def change_to_float64(x):
    """Cast ``x`` to the ``float64`` dtype.

    Args:
        x: Any object with an ``astype`` method.

    Returns:
        Whatever ``x.astype('float64')`` returns.
    """
    as_float64 = x.astype('float64')
    return as_float64

In [49]:
# TF-IDF over unigrams only, capped at 300 features. ngram_range is passed as a
# tuple: scikit-learn documents it as a (min_n, max_n) tuple, and releases that
# validate constructor parameters reject a list here.
vectorizer = TfidfVectorizer(
    min_df=.01,
    max_df=.8,
    ngram_range=(1, 1),
    max_features=300,
    stop_words='english',
)

# Full feature pipeline: text cleaning -> TF-IDF -> cast to float64.
pipe = Pipeline([('preprocess', preprocess_pipe),
                 ('vec', vectorizer),
                 ('changetofloat64', FunctionTransformer(change_to_float64))])

# Fit the pipeline on the training messages and keep the transformed features.
X_train_transformed = pipe.fit_transform(X_train)

In [50]:
from sklearn import preprocessing

# Encode the string intent labels as class ids, then store them as float32.
le = preprocessing.LabelEncoder()
y_train_transformed = le.fit_transform(y_train).astype('float32')

### FLAML

In [59]:
# AutoML model search (fit on the training data)
from flaml import AutoML
automl_clf = AutoML()

# Search settings, kept together so they are easy to tune.
automl_settings = dict(
    task="classification",
    estimator_list=['lrl1'],
    time_budget=1000,
    verbose=3,
)
automl_clf.fit(X_train_transformed, y_train_transformed, **automl_settings)


[flaml.automl: 04-10 19:55:18] {1926} INFO - task = classification
[flaml.automl: 04-10 19:55:18] {1928} INFO - Data split method: stratified
[flaml.automl: 04-10 19:55:18] {1932} INFO - Evaluation method: cv
[flaml.automl: 04-10 19:55:18] {973} INFO - class 12.0 augmented from 7 to 21
[flaml.automl: 04-10 19:55:18] {973} INFO - class 24.0 augmented from 12 to 24
[flaml.automl: 04-10 19:55:18] {973} INFO - class 32.0 augmented from 7 to 21
[flaml.automl: 04-10 19:55:18] {973} INFO - class 35.0 augmented from 7 to 21
[flaml.automl: 04-10 19:55:18] {1999} INFO - Minimizing error metric: log_loss
[flaml.automl: 04-10 19:55:18] {2051} INFO - List of ML learners in AutoML Run: ['lrl1']
[flaml.automl: 04-10 19:55:18] {2291} INFO - iteration 0, current learner lrl1
[flaml.automl: 04-10 19:55:33] {2404} INFO - Estimated sufficient time budget=145636s. Estimated necessary time budget=146s.
[flaml.automl: 04-10 19:55:33] {2479} INFO -  at 14.6s,	estimator lrl1's best error=0.4875,	best estimator

[flaml.automl: 04-10 20:06:13] {2291} INFO - iteration 35, current learner lrl1
[flaml.automl: 04-10 20:06:29] {2479} INFO -  at 670.7s,	estimator lrl1's best error=0.4601,	best estimator lrl1's best error=0.4601
[flaml.automl: 04-10 20:06:29] {2291} INFO - iteration 36, current learner lrl1
[flaml.automl: 04-10 20:06:46] {2479} INFO -  at 687.1s,	estimator lrl1's best error=0.4601,	best estimator lrl1's best error=0.4601
[flaml.automl: 04-10 20:06:46] {2291} INFO - iteration 37, current learner lrl1
[flaml.automl: 04-10 20:07:06] {2479} INFO -  at 707.7s,	estimator lrl1's best error=0.4601,	best estimator lrl1's best error=0.4601
[flaml.automl: 04-10 20:07:06] {2291} INFO - iteration 38, current learner lrl1
[flaml.automl: 04-10 20:07:27] {2479} INFO -  at 728.2s,	estimator lrl1's best error=0.4601,	best estimator lrl1's best error=0.4601
[flaml.automl: 04-10 20:07:27] {2291} INFO - iteration 39, current learner lrl1
[flaml.automl: 04-10 20:07:43] {2479} INFO -  at 744.4s,	estimator l

### Evaluate model

In [60]:
# Reuse the already-fitted feature pipeline and label encoder (transform only).
X_val_transformed = pipe.transform(X_val)
y_val_transformed = le.transform(y_val).astype('float32')

In [61]:
# Predict class ids for the validation features with the fitted AutoML object.
y_pred = automl_clf.predict(X_val_transformed)

In [62]:
# Cast the predicted ids to int32 and map them back to the original string labels.
y_pred_original = le.inverse_transform(y_pred.astype('int32'))

In [63]:
from sklearn.metrics import confusion_matrix, classification_report

# Compare the decoded predictions against the original string labels in y_val.
print(confusion_matrix(y_val, y_pred_original))
print(classification_report(y_val, y_pred_original))

[[ 41   0   0 ...   0   0   0]
 [  0   1   0 ...   0   0   0]
 [  0   0 181 ...   2   0   1]
 ...
 [  0   0   0 ...  69   0   0]
 [  0   0   0 ...   1   1   0]
 [  0   0   0 ...   0   0   4]]
                       precision    recall  f1-score   support

           bookflight       0.95      0.98      0.96        42
          changeorder       0.20      0.20      0.20         5
 changeseatassignment       0.90      0.94      0.92       192
         checkbalance       0.96      0.98      0.97        50
     checkclaimstatus       1.00      0.98      0.99        90
checkoffereligibility       0.55      1.00      0.71         6
    checkserverstatus       0.88      0.93      0.90        30
         closeaccount       0.82      0.70      0.76        20
        disputecharge       0.96      0.52      0.68        42
        expensereport       0.97      0.96      0.97        77
      getboardingpass       1.00      1.00      1.00       114
 getinformationintent       0.86      0.75      0.8

### Cross-validation with best model found by AutoML

In [64]:
# Pull the estimator object out of the AutoML result so it can be passed to
# scikit-learn utilities such as cross_val_score.
best_clf = automl_clf.model.estimator

In [65]:
from sklearn.model_selection import cross_val_score

# 5-fold cross-validation of the selected estimator on the training features,
# scored with adjusted mutual information.
ami_scores = cross_val_score(
    best_clf,
    X_train_transformed,
    y_train_transformed,
    scoring='adjusted_mutual_info_score',
    cv=5,
)

# Summary statistics and the mean +/- 2 standard deviations interval.
avg_ami, stddev_ami = ami_scores.mean(), ami_scores.std()
lower_bound = avg_ami - 2*stddev_ami
upper_bound = avg_ami + 2*stddev_ami

# Report the per-fold scores and the summary.
rounded_scores = [round(score, 4) for score in ami_scores]
print("Scores:", rounded_scores)
print(f"Mean score: {round(avg_ami, 4)}")
print(f"+/-2 std. dev. range within mean: ({lower_bound:.4f}, {upper_bound:.4f})")

Scores: [0.8508, 0.8516, 0.8396, 0.8521, 0.8435]
Mean score: 0.8475
+/-2 std. dev. range within mean: (0.8374, 0.8576)


In [66]:
from sklearn.metrics.cluster import adjusted_rand_score, adjusted_mutual_info_score

# Agreement between the true and predicted validation labels, measured with the
# adjusted Rand index and adjusted mutual information (arithmetic averaging).
ari = adjusted_rand_score(y_val, y_pred_original)
ami = adjusted_mutual_info_score(y_val, y_pred_original, average_method='arithmetic')

print("ARI: {}".format(ari))
print("AMI: {}".format(ami))

ARI: 0.8304811668049914
AMI: 0.8585691119849064


## Predictions on test data

In [67]:
# Load the test messages to be scored and show a summary plus the first rows.
df_test = pd.read_csv('input_data.csv')
df_test.info()
df_test.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2906 entries, 0 to 2905
Data columns (total 2 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   id       2906 non-null   int64 
 1   message  2906 non-null   object
dtypes: int64(1), object(1)
memory usage: 45.5+ KB


Unnamed: 0,id,message
0,12123,i have problem in excel
1,244,i need \t pesto drizzle over grilled chicken c...
2,8221,need to help order a new card as the old one e...
3,12856,i need internet plan
4,12108,my are report travel


In [68]:
# Run the test messages through the already-fitted feature pipeline (transform only).
X_test_transformed = pipe.transform(df_test['message'])

In [69]:
# Predict through the AutoML object, exactly as was done for the validation set,
# so the submitted predictions follow the same prediction path (including any
# label handling done inside AutoML) as the one whose metrics were reported.
pred_test = automl_clf.predict(X_test_transformed)

In [70]:
# Cast the predicted ids to int32 and map them back to the original string labels.
test_pred = le.inverse_transform(pred_test.astype('int32'))

In [71]:
# Pair each test row's id with its predicted label and preview the result.
my_submission = pd.DataFrame({'Id': df_test['id'], 'label': test_pred})
print(my_submission.head())

      Id               label
0  12123    orderdrinkintent
1    244   orderburgerintent
2   8221         replacecard
3  12856  startserviceintent
4  12108       expensereport


In [72]:
# Write the submission as coda_submission.csv inside the zip archive
# coda_submission.zip, leaving out the DataFrame index.
compression_opts = dict(method='zip', archive_name='coda_submission.csv')
my_submission.to_csv('coda_submission.zip', index=False, compression=compression_opts)