## Load data

In [46]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.pipeline import Pipeline

import warnings
warnings.filterwarnings('ignore')

In [2]:
# Load the labelled training data (columns: id, message, label).
df = pd.read_csv("public_data.csv")
# DataFrame.info() writes its summary to stdout itself and returns None, so it
# must not be wrapped in print() -- doing so emitted a stray "None" line.
df.info()
print(df.head())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 11623 entries, 0 to 11622
Data columns (total 3 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   id       11623 non-null  int64 
 1   message  11623 non-null  object
 2   label    11623 non-null  object
dtypes: int64(1), object(2)
memory usage: 272.5+ KB
None
      id                                            message  \
0   8793    hi i want change my address from my credit card   
1   3083  i need 4 fruit maple oatmeal 3 cold brew froze...   
2   5932        i wish to travel next month domestic airway   
3  12077                   i need reimbursement my expenses   
4   6608              i need a copy of insurance for my car   

                 label  
0        updateaddress  
1     orderdrinkintent  
2           bookflight  
3        expensereport  
4  getproofofinsurance  


In [3]:
# Features are the raw message text; targets are the intent labels.
X, y = df['message'], df['label']

# Hold out 20% of the rows for validation; the fixed seed keeps the split reproducible.
X_train, X_val, y_train, y_val = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Report the size of each split, in the order train X, train y, val X, val y.
for part in (X_train, y_train, X_val, y_val):
    print(part.shape)

(9298,)
(9298,)
(2325,)
(2325,)


In [4]:
import nltk
from nltk.corpus import stopwords
from nltk import word_tokenize
from nltk import word_tokenize, sent_tokenize
from nltk.corpus import stopwords
from nltk.stem import LancasterStemmer, WordNetLemmatizer
import unidecode
import re

In [5]:
# Preprocessing functions
def remove_unwanted_chars(df):
    """Apply ``unidecode.unidecode`` to every message.

    Parameters
    ----------
    df : pandas object exposing ``apply`` (a Series of strings in this notebook)

    Returns
    -------
    The result of ``df.apply`` with each element passed through ``unidecode``.
    """
    return df.apply(unidecode.unidecode)

def remove_numbers(df):
    """Delete every run of digits from each message.

    Parameters
    ----------
    df : pandas object exposing ``apply`` whose elements are strings

    Returns
    -------
    The result of ``df.apply`` with all ``\\d+`` matches replaced by ''.
    Surrounding whitespace is left untouched, so removed numbers can leave
    double spaces behind.
    """
    digit_run = re.compile(r'\d+')

    def _strip_digits(text):
        return digit_run.sub('', text)

    return df.apply(_strip_digits)

def tokenize_message(df):
    """Split each message into tokens with NLTK's ``word_tokenize``.

    Parameters
    ----------
    df : pandas object exposing ``apply`` whose elements are strings

    Returns
    -------
    The result of ``df.apply`` where each element is the ``word_tokenize``
    output for the corresponding message.
    """
    return df.apply(word_tokenize)

def remove_stopwords(df):
    """Drop English stopwords from each tokenised message.

    Parameters
    ----------
    df : pandas object exposing ``apply`` whose elements are iterables of tokens

    Returns
    -------
    The result of ``df.apply`` where each element is a list of the tokens
    that are not in NLTK's English stopword list. Matching is exact
    (case-sensitive), as in the original implementation.
    """
    # Build the stopword collection once per call instead of calling
    # stopwords.words('english') for every single token, and use a set so
    # each membership test is O(1) rather than a scan of the whole list.
    english_stopwords = set(stopwords.words('english'))
    return df.apply(lambda tokens: [word for word in tokens if word not in english_stopwords])

def lemmatize_message(df):
    """Lemmatize every token of each message and join them back into a string.

    Parameters
    ----------
    df : pandas object exposing ``apply`` whose elements are iterables of tokens

    Returns
    -------
    The result of ``df.apply`` where each element is the space-joined string
    of ``WordNetLemmatizer().lemmatize(token)`` results.
    """
    wnl = WordNetLemmatizer()

    def _lemmatize_and_join(tokens):
        # One lemmatizer instance is shared across all rows of this call.
        return ' '.join(map(wnl.lemmatize, tokens))

    return df.apply(_lemmatize_and_join)

In [6]:
# Make FunctionTransformers for custom preprocessing functions
from sklearn.preprocessing import FunctionTransformer

# Text clean-up stages in the order they must run: the first two operate on
# raw strings, tokenisation turns strings into token lists, and the last two
# consume token lists (lemmatisation joins them back into a string).
preprocess_steps = [
    ('removeunwanted', remove_unwanted_chars),
    ('removenumbers', remove_numbers),
    ('tokenize', tokenize_message),
    ('removestopwords', remove_stopwords),
    ('lemmatize', lemmatize_message),
]

# Wrap each function in a FunctionTransformer so it can be a Pipeline step.
preprocess_pipe = Pipeline(
    [(step_name, FunctionTransformer(step_func)) for step_name, step_func in preprocess_steps]
)

## Classification model building

### XGBoost

In [7]:
from xgboost import XGBClassifier

# Bag-of-words features: unigrams only, at most 300 terms, ignoring terms that
# appear in fewer than 1% or more than 80% of the training messages.
# ngram_range is documented as a tuple; recent scikit-learn releases validate
# constructor parameters and reject a list here, so pass (1, 1).
vectorizer = CountVectorizer(min_df=.01, max_df=.8, ngram_range=(1, 1), max_features=300, stop_words='english')

# Full model: text clean-up -> token counts -> XGBoost classifier.
pipe = Pipeline([('preprocess', preprocess_pipe),
                 ('vec', vectorizer),
                 ('clf', XGBClassifier(random_state=42))])

pipe.fit(X_train, y_train)



Pipeline(steps=[('preprocess',
                 Pipeline(steps=[('removeunwanted',
                                  FunctionTransformer(func=<function remove_unwanted_chars at 0x0000018F187E9CA0>)),
                                 ('removenumbers',
                                  FunctionTransformer(func=<function remove_numbers at 0x0000018F187E9C10>)),
                                 ('tokenize',
                                  FunctionTransformer(func=<function tokenize_message at 0x0000018F187E9E50>)),
                                 ('removestopwords',
                                  FunctionTrans...
                               importance_type='gain',
                               interaction_constraints='',
                               learning_rate=0.300000012, max_delta_step=0,
                               max_depth=6, min_child_weight=1, missing=nan,
                               monotone_constraints='()', n_estimators=100,
                               n_j

### Model evaluation

In [8]:
from sklearn.metrics import confusion_matrix, classification_report

# Predict intents for the held-out messages, then summarise the errors.
pred_val = pipe.predict(X_val)

val_confusion = confusion_matrix(y_val, pred_val)
val_report = classification_report(y_val, pred_val)

print(val_confusion)
print(val_report)

[[ 41   0   0 ...   0   0   0]
 [  0   1   0 ...   0   0   0]
 [  0   0 181 ...   0   0   0]
 ...
 [  0   0   0 ...  68   0   0]
 [  0   0   1 ...   0   0   0]
 [  0   0   0 ...   0   0   7]]
                       precision    recall  f1-score   support

           bookflight       0.95      0.98      0.96        42
          changeorder       0.50      0.20      0.29         5
 changeseatassignment       0.89      0.94      0.91       192
         checkbalance       1.00      0.98      0.99        50
     checkclaimstatus       1.00      0.98      0.99        90
checkoffereligibility       0.50      1.00      0.67         6
    checkserverstatus       0.84      0.90      0.87        30
         closeaccount       0.82      0.70      0.76        20
        disputecharge       0.96      0.55      0.70        42
        expensereport       0.95      0.97      0.96        77
      getboardingpass       1.00      1.00      1.00       114
 getinformationintent       0.86      0.78      0.8

### Cross-validation

In [9]:
from sklearn.model_selection import cross_val_score

# 5-fold cross-validation of the whole pipeline on the training split,
# scored with adjusted mutual information.
ami_scores = cross_val_score(
    pipe, X_train, y_train,
    cv=5,
    scoring='adjusted_mutual_info_score',
)

# Summary statistics across the folds
avg_ami, stddev_ami = ami_scores.mean(), ami_scores.std()

# Report per-fold scores, their mean, and a mean +/- 2 std. dev. band
rounded_scores = [round(score, 4) for score in ami_scores]
print("Scores:", rounded_scores)
print(f"Mean score: {round(avg_ami, 4)}")
print(f"+/-2 std. dev. range within mean: ({avg_ami - 2*stddev_ami:.4f}, {avg_ami + 2*stddev_ami:.4f})")

Scores: [0.8466, 0.8487, 0.8393, 0.8455, 0.8405]
Mean score: 0.8441
+/-2 std. dev. range within mean: (0.8369, 0.8514)


In [10]:
from sklearn.metrics.cluster import adjusted_rand_score, adjusted_mutual_info_score

# Agreement between true and predicted validation labels, measured with the
# adjusted Rand index and adjusted mutual information.
ari = adjusted_rand_score(y_val, pred_val)
ami = adjusted_mutual_info_score(y_val, pred_val, average_method='arithmetic')

for template, value in (("ARI: {}", ari), ("AMI: {}", ami)):
    print(template.format(value))

ARI: 0.8311191485746243
AMI: 0.8575256181594058


### CatBoost

In [23]:
# Keyword arguments forwarded to CatBoostClassifier: many boosting rounds at a
# small learning rate, a progress line every 500 iterations, and a fixed seed.
catboost_params = dict(
    iterations=3000,
    learning_rate=0.01,
    verbose=500,
    random_state=42,
)

In [24]:
from catboost import CatBoostClassifier

# Bag-of-words features: unigrams only, at most 300 terms, ignoring terms that
# appear in fewer than 1% or more than 80% of the training messages.
# ngram_range is documented as a tuple; recent scikit-learn releases validate
# constructor parameters and reject a list here, so pass (1, 1).
vectorizer = CountVectorizer(min_df=.01, max_df=.8, ngram_range=(1, 1), max_features=300, stop_words='english')

# Full model: text clean-up -> token counts -> CatBoost classifier.
pipe = Pipeline([('preprocess', preprocess_pipe),
                 ('vec', vectorizer),
                 ('clf', CatBoostClassifier(**catboost_params))])

pipe.fit(X_train, y_train)

0:	learn: 3.5009771	total: 58.9ms	remaining: 2m 56s
500:	learn: 0.8793580	total: 29.5s	remaining: 2m 27s
1000:	learn: 0.6219616	total: 1m	remaining: 2m 1s
1500:	learn: 0.5250558	total: 1m 29s	remaining: 1m 29s
2000:	learn: 0.4803688	total: 1m 58s	remaining: 59.4s
2500:	learn: 0.4542069	total: 2m 28s	remaining: 29.6s
2999:	learn: 0.4368748	total: 2m 57s	remaining: 0us


Pipeline(steps=[('preprocess',
                 Pipeline(steps=[('removeunwanted',
                                  FunctionTransformer(func=<function remove_unwanted_chars at 0x0000018F187E9CA0>)),
                                 ('removenumbers',
                                  FunctionTransformer(func=<function remove_numbers at 0x0000018F187E9C10>)),
                                 ('tokenize',
                                  FunctionTransformer(func=<function tokenize_message at 0x0000018F187E9E50>)),
                                 ('removestopwords',
                                  FunctionTransformer(func=<function remove_stopwords at 0x0000018F187E9EE0>)),
                                 ('lemmatize',
                                  FunctionTransformer(func=<function lemmatize_message at 0x0000018F187E9F70>))])),
                ('vec',
                 CountVectorizer(max_df=0.8, max_features=300, min_df=0.01,
                                 ngram_range=[1, 1], 

### Model evaluation

In [42]:
from sklearn.metrics import confusion_matrix, classification_report

# Predict intents for the held-out messages and flatten the result to 1-D
# before handing it to the metric functions.
pred_val = pipe.predict(X_val).reshape(-1)

val_confusion = confusion_matrix(y_val, pred_val)
val_report = classification_report(y_val, pred_val)

print(val_confusion)
print(val_report)

[[ 41   0   0 ...   0   0   0]
 [  0   1   0 ...   0   0   0]
 [  0   0 180 ...   2   0   2]
 ...
 [  0   0   0 ...  68   0   0]
 [  0   0   0 ...   1   0   0]
 [  0   0   0 ...   0   0   4]]
                       precision    recall  f1-score   support

           bookflight       0.95      0.98      0.96        42
          changeorder       0.50      0.20      0.29         5
 changeseatassignment       0.89      0.94      0.91       192
         checkbalance       0.94      0.98      0.96        50
     checkclaimstatus       1.00      0.98      0.99        90
checkoffereligibility       0.55      1.00      0.71         6
    checkserverstatus       0.87      0.90      0.89        30
         closeaccount       0.88      0.70      0.78        20
        disputecharge       0.95      0.45      0.61        42
        expensereport       0.97      0.96      0.97        77
      getboardingpass       1.00      1.00      1.00       114
 getinformationintent       0.89      0.75      0.8

### Cross-validation

In [26]:
from sklearn.model_selection import cross_val_score

# 5-fold cross-validation of the whole pipeline on the training split,
# scored with adjusted mutual information. Each fold refits the CatBoost
# model, so its training log is printed once per fold.
ami_scores = cross_val_score(
    pipe, X_train, y_train,
    cv=5,
    scoring='adjusted_mutual_info_score',
)

# Summary statistics across the folds
avg_ami, stddev_ami = ami_scores.mean(), ami_scores.std()

# Report per-fold scores, their mean, and a mean +/- 2 std. dev. band
rounded_scores = [round(score, 4) for score in ami_scores]
print("Scores:", rounded_scores)
print(f"Mean score: {round(avg_ami, 4)}")
print(f"+/-2 std. dev. range within mean: ({avg_ami - 2*stddev_ami:.4f}, {avg_ami + 2*stddev_ami:.4f})")

0:	learn: 3.5026492	total: 44.4ms	remaining: 2m 13s
500:	learn: 0.8959569	total: 22.9s	remaining: 1m 54s
1000:	learn: 0.6337431	total: 45.5s	remaining: 1m 30s
1500:	learn: 0.5356966	total: 1m 8s	remaining: 1m 7s
2000:	learn: 0.4888768	total: 1m 30s	remaining: 45s
2500:	learn: 0.4619300	total: 1m 52s	remaining: 22.5s
2999:	learn: 0.4432763	total: 2m 15s	remaining: 0us
0:	learn: 3.5030591	total: 47.6ms	remaining: 2m 22s
500:	learn: 0.8922016	total: 23.5s	remaining: 1m 57s
1000:	learn: 0.6283730	total: 46.7s	remaining: 1m 33s
1500:	learn: 0.5339347	total: 1m 9s	remaining: 1m 9s
2000:	learn: 0.4890037	total: 1m 33s	remaining: 46.4s
2500:	learn: 0.4620658	total: 1m 56s	remaining: 23.3s
2999:	learn: 0.4432200	total: 2m 19s	remaining: 0us
0:	learn: 3.5022505	total: 44.9ms	remaining: 2m 14s
500:	learn: 0.8918655	total: 22.2s	remaining: 1m 50s
1000:	learn: 0.6354472	total: 44.3s	remaining: 1m 28s
1500:	learn: 0.5392623	total: 1m 6s	remaining: 1m 6s
2000:	learn: 0.4929177	total: 1m 28s	remaining

In [43]:
from sklearn.metrics.cluster import adjusted_rand_score, adjusted_mutual_info_score

# Agreement between true and predicted validation labels, measured with the
# adjusted Rand index and adjusted mutual information.
ari = adjusted_rand_score(y_val, pred_val)
ami = adjusted_mutual_info_score(y_val, pred_val, average_method='arithmetic')

for template, value in (("ARI: {}", ari), ("AMI: {}", ami)):
    print(template.format(value))

ARI: 0.809362104906277
AMI: 0.8562792951515682


### LGBM

https://stackoverflow.com/questions/50250432/python-lightgbm-text-classicication-with-tfidf

In [7]:
from sklearn import preprocessing

# Encode the string intent labels as class indices and store them as float32.
# `le` is kept so predictions can later be mapped back with inverse_transform.
le = preprocessing.LabelEncoder()
y_train_transformed = le.fit_transform(y_train).astype('float32')

In [21]:
def change_to_float64(x):
    """Return ``x`` converted with ``x.astype('float64')``.

    Used as a pipeline step between the vectorizer and the LGBM classifier.

    Parameters
    ----------
    x : any object exposing ``astype`` (e.g. a NumPy array)

    Returns
    -------
    The value returned by ``x.astype('float64')``.
    """
    as_float64 = x.astype('float64')
    return as_float64

In [39]:
from lightgbm import LGBMClassifier

# Bag-of-words features: unigrams only, at most 300 terms, ignoring terms that
# appear in fewer than 1% or more than 80% of the training messages.
# ngram_range is documented as a tuple; recent scikit-learn releases validate
# constructor parameters and reject a list here, so pass (1, 1).
vectorizer = CountVectorizer(min_df=.01, max_df=.8, ngram_range=(1, 1), max_features=300, stop_words='english')

# Full model: text clean-up -> token counts -> cast to float64 -> LightGBM.
pipe = Pipeline([('preprocess', preprocess_pipe),
                 ('vec', vectorizer),
                 ('changetofloat64', FunctionTransformer(change_to_float64)),
                 ('clf', LGBMClassifier())])

# Unlike the other models, this pipeline is fitted on the label-encoded
# targets, so its predictions must be decoded with `le.inverse_transform`.
pipe.fit(X_train, y_train_transformed)

Pipeline(steps=[('preprocess',
                 Pipeline(steps=[('removeunwanted',
                                  FunctionTransformer(func=<function remove_unwanted_chars at 0x000002064579CDC0>)),
                                 ('removenumbers',
                                  FunctionTransformer(func=<function remove_numbers at 0x000002064579CD30>)),
                                 ('tokenize',
                                  FunctionTransformer(func=<function tokenize_message at 0x000002064579CF70>)),
                                 ('removestopwords',
                                  FunctionTrans...e_stopwords at 0x00000206457AE040>)),
                                 ('lemmatize',
                                  FunctionTransformer(func=<function lemmatize_message at 0x00000206457AE0D0>))])),
                ('vec',
                 CountVectorizer(max_df=0.8, max_features=300, min_df=0.01,
                                 ngram_range=[1, 1], stop_words='english')),


### Model evaluation

In [40]:
from sklearn.metrics import confusion_matrix, classification_report

# Convert the encoded LGBM predictions back to their text labels:
# cast to int32 first, then map the indices through the fitted encoder.
encoded_pred_val = pipe.predict(X_val).astype('int32')
pred_val = le.inverse_transform(encoded_pred_val)

val_confusion = confusion_matrix(y_val, pred_val)
val_report = classification_report(y_val, pred_val)

print(val_confusion)
print(val_report)

[[0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 ...
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 1]]
                       precision    recall  f1-score   support

           bookflight       0.00      0.00      0.00        42
          changeorder       0.00      0.00      0.00         5
 changeseatassignment       0.00      0.00      0.00       192
         checkbalance       0.00      0.00      0.00        50
     checkclaimstatus       0.17      0.04      0.07        90
checkoffereligibility       0.00      0.00      0.00         6
    checkserverstatus       0.00      0.00      0.00        30
         closeaccount       0.00      0.00      0.00        20
        disputecharge       0.00      0.00      0.00        42
        expensereport       0.00      0.00      0.00        77
      getboardingpass       0.00      0.00      0.00       114
 getinformationintent       0.00      0.00      0.00        32
        getpromotions       0.00      0.00      0.00        

### Cross-validation

In [41]:
from sklearn.model_selection import cross_val_score

# 5-fold cross-validation of the whole pipeline on the training split,
# scored with adjusted mutual information.
# NOTE(review): this passes the string labels in y_train, whereas the
# pipeline was fitted on y_train_transformed -- confirm which is intended.
ami_scores = cross_val_score(
    pipe, X_train, y_train,
    cv=5,
    scoring='adjusted_mutual_info_score',
)

# Summary statistics across the folds
avg_ami, stddev_ami = ami_scores.mean(), ami_scores.std()

# Report per-fold scores, their mean, and a mean +/- 2 std. dev. band
rounded_scores = [round(score, 4) for score in ami_scores]
print("Scores:", rounded_scores)
print(f"Mean score: {round(avg_ami, 4)}")
print(f"+/-2 std. dev. range within mean: ({avg_ami - 2*stddev_ami:.4f}, {avg_ami + 2*stddev_ami:.4f})")

Scores: [0.1068, 0.2075, 0.8298, 0.8394, 0.8396]
Mean score: 0.5646
+/-2 std. dev. range within mean: (-0.1039, 1.2331)


In [42]:
from sklearn.metrics.cluster import adjusted_rand_score, adjusted_mutual_info_score

# Agreement between true and predicted validation labels, measured with the
# adjusted Rand index and adjusted mutual information.
ari = adjusted_rand_score(y_val, pred_val)
ami = adjusted_mutual_info_score(y_val, pred_val, average_method='arithmetic')

for template, value in (("ARI: {}", ari), ("AMI: {}", ami)):
    print(template.format(value))

ARI: 0.0009016941096306044
AMI: 0.019671861169584


## Predictions on test data

In [34]:
# Read the unlabelled test set, print its schema, and display the first rows.
test_csv_path = 'input_data.csv'
df_test = pd.read_csv(test_csv_path)
df_test.info()
df_test.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2906 entries, 0 to 2905
Data columns (total 2 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   id       2906 non-null   int64 
 1   message  2906 non-null   object
dtypes: int64(1), object(1)
memory usage: 45.5+ KB


Unnamed: 0,id,message
0,12123,i have problem in excel
1,244,i need \t pesto drizzle over grilled chicken c...
2,8221,need to help order a new card as the old one e...
3,12856,i need internet plan
4,12108,my are report travel


In [43]:
# Predict intents for the test messages with whichever model `pipe`
# currently refers to (the most recently fitted pipeline).
test_messages = df_test['message']
pred_test = pipe.predict(test_messages)

In [44]:
# Convert the encoded LGBM predictions back to their text labels
# (only applicable when `pipe` is the LGBM pipeline fitted on encoded targets).
pred_test = le.inverse_transform(pred_test.astype('int32'))

In [38]:
# Flatten the predictions to a 1-D array. Intended for the CatBoost pipeline
# (presumably its predict() output is not 1-D -- confirm).
pred_test = pred_test.reshape(-1) # reshape to 1D for predictions with Catboost model

In [45]:
# Assemble the submission table: test ids alongside the predicted labels.
submission_columns = {'Id': df_test['id'], 'label': pred_test}
my_submission = pd.DataFrame(submission_columns)
print(my_submission.head())

      Id             label
0  12123  orderpizzaintent
1    244  orderpizzaintent
2   8221  orderpizzaintent
3  12856  orderpizzaintent
4  12108  orderpizzaintent


In [None]:
# Write the submission as a zip archive holding a single CSV member,
# without the DataFrame index column.
compression_opts = {'method': 'zip', 'archive_name': 'coda_submission.csv'}
my_submission.to_csv('coda_submission.zip', compression=compression_opts, index=False)