## Load data

In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.pipeline import Pipeline

In [2]:
# Load the labelled messages (columns: id, message, label).
df = pd.read_csv("public_data.csv")
# DataFrame.info() prints its summary itself and returns None, so wrapping it
# in print() only appended a stray "None" line to the output.
df.info()
print(df.head())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 11623 entries, 0 to 11622
Data columns (total 3 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   id       11623 non-null  int64 
 1   message  11623 non-null  object
 2   label    11623 non-null  object
dtypes: int64(1), object(2)
memory usage: 272.5+ KB
None
      id                                            message  \
0   8793    hi i want change my address from my credit card   
1   3083  i need 4 fruit maple oatmeal 3 cold brew froze...   
2   5932        i wish to travel next month domestic airway   
3  12077                   i need reimbursement my expenses   
4   6608              i need a copy of insurance for my car   

                 label  
0        updateaddress  
1     orderdrinkintent  
2           bookflight  
3        expensereport  
4  getproofofinsurance  


In [3]:
# Features are the message text; targets are the values of the label column.
X, y = df['message'], df['label']

# Hold out 20% of the rows for validation; the fixed seed makes the split repeatable.
X_train, X_val, y_train, y_val = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Report the size of every split, in the order train X, train y, val X, val y.
for split in (X_train, y_train, X_val, y_val):
    print(split.shape)

(9298,)
(9298,)
(2325,)
(2325,)


## Text cleaning and preprocessing

In [4]:
import nltk
from nltk.corpus import stopwords
from nltk import word_tokenize
from nltk import word_tokenize, sent_tokenize
from nltk.corpus import stopwords
from nltk.stem import LancasterStemmer, WordNetLemmatizer
import unidecode
import re

In [13]:
# Pass every training message through unidecode to replace non-ASCII characters.
import unidecode
X_train = X_train.apply(unidecode.unidecode)

In [14]:
# Delete every run of digits from the training messages.
import re
X_train = X_train.apply(lambda message: re.sub(r'\d+', '', message))

In [15]:
# Split each training message into a list of tokens with NLTK's word_tokenize.
# Uncomment the download below if the 'punkt' resource is not installed yet.
#nltk.download('punkt')
X_train_tokens = X_train.apply(word_tokenize)

In [16]:
# remove stopwords
#nltk.download('stopwords')

# Load the stop-word list once and keep it in a set: the previous version
# called stopwords.words('english') again for every single token, which
# re-read the corpus each time and then scanned a list.
english_stopwords = set(stopwords.words('english'))
X_train_tokens = X_train_tokens.apply(
    lambda tokens: [word for word in tokens if word not in english_stopwords]
)

In [17]:
# Lemmatize every token and glue the tokens back into one space-separated string.
# Uncomment the download below if the 'wordnet' resource is not installed yet.
#nltk.download('wordnet')

lemmatizer = WordNetLemmatizer()
X_train = X_train_tokens.apply(
    lambda tokens: ' '.join(map(lemmatizer.lemmatize, tokens))
)

In [5]:
# Preprocessing functions
def remove_unwanted_chars(df):
    """Apply ``unidecode.unidecode`` to every element of ``df``.

    Parameters
    ----------
    df : object with an element-wise ``.apply`` (in this notebook a pandas
        Series of message strings).

    Returns
    -------
    The result of ``df.apply``: one converted string per input element.
    """
    return df.apply(unidecode.unidecode)

def remove_numbers(df):
    """Delete every run of digits from each string in ``df``.

    Parameters
    ----------
    df : object with an element-wise ``.apply`` (in this notebook a pandas
        Series of message strings).

    Returns
    -------
    The result of ``df.apply``: each string with all ``\\d+`` matches removed.
    Surrounding whitespace is left untouched, so double spaces can remain.
    """
    def _strip_digits(message):
        return re.sub(r'\d+', '', message)

    return df.apply(_strip_digits)

def tokenize_message(df):
    """Tokenize every string in ``df`` with NLTK's ``word_tokenize``.

    Parameters
    ----------
    df : object with an element-wise ``.apply`` (in this notebook a pandas
        Series of message strings).

    Returns
    -------
    The result of ``df.apply``: one list of tokens per input element.
    """
    return df.apply(word_tokenize)

def remove_stopwords(df):
    """Drop English stop words from every token list in ``df``.

    Parameters
    ----------
    df : object with an element-wise ``.apply`` whose elements are iterables
        of string tokens (in this notebook the output of ``tokenize_message``).

    Returns
    -------
    The result of ``df.apply``: for each element, a list of the tokens that
    are not in NLTK's English stop-word list, in their original order.
    """
    # Fetch the stop-word list a single time and store it in a set. The
    # previous version called stopwords.words('english') for every token,
    # re-reading the corpus each time and then scanning a list.
    english_stopwords = frozenset(stopwords.words('english'))
    return df.apply(
        lambda tokens: [word for word in tokens if word not in english_stopwords]
    )

def lemmatize_message(df):
    """Lemmatize each token list in ``df`` and join it back into one string.

    Parameters
    ----------
    df : object with an element-wise ``.apply`` whose elements are iterables
        of string tokens.

    Returns
    -------
    The result of ``df.apply``: for each element, the lemmatized tokens
    joined by single spaces (an empty token list yields an empty string).
    """
    lemmatizer = WordNetLemmatizer()

    def _lemmatize_and_join(tokens):
        return ' '.join(map(lemmatizer.lemmatize, tokens))

    return df.apply(_lemmatize_and_join)

In [6]:
# Make FunctionTransformers for custom preprocessing functions
from sklearn.preprocessing import FunctionTransformer

# Wrap each cleaning function in its own FunctionTransformer, listed in the
# order the functions are meant to run.
(
    remove_unwanted_transformer,
    remove_numbers_transformer,
    tokenize_transformer,
    remove_stopwords_transformer,
    lemmatize_transformer,
) = (
    FunctionTransformer(cleaning_func)
    for cleaning_func in (
        remove_unwanted_chars,
        remove_numbers,
        tokenize_message,
        remove_stopwords,
        lemmatize_message,
    )
)

In [7]:
# Cleaning steps in execution order: string-level fixes first, then
# tokenization, then the two token-level steps (the last one joins the tokens
# back into a single string).
preprocess_steps = [
    ('removeunwanted', remove_unwanted_transformer),
    ('removenumbers', remove_numbers_transformer),
    ('tokenize', tokenize_transformer),
    ('removestopwords', remove_stopwords_transformer),
    ('lemmatize', lemmatize_transformer),
]
preprocess_pipe = Pipeline(steps=preprocess_steps)

In [10]:
# Run the whole message column (X, i.e. every row of df) through the cleaning pipeline.
transformed_X = preprocess_pipe.transform(X)

In [11]:
# Display the cleaned messages produced by the preprocessing pipeline.
transformed_X

0                      hi want change address credit card
1        need fruit maple oatmeal cold brew frozen coffee
2                  wish travel next month domestic airway
3                              need reimbursement expense
4                                 need copy insurance car
                               ...                       
11618        boarding pas sent email address phone number
11619                                  need ticker bombay
11620                              want musical equipment
11621                                    need window seat
11622                       please money transfer account
Name: message, Length: 11623, dtype: object

In [18]:
# Display the current contents of the training messages.
X_train

3315     credit card missed please close card
11372                       need boarding pas
7985                 need proof insurance car
4075                      want change address
8127            hi want check seat assignemnt
                         ...                 
11284                        want new service
5191                         check cable bill
5390                           want know seat
860                         need agua frescas
7270                         need change seat
Name: message, Length: 9298, dtype: object

In [8]:
# Run the training messages through the cleaning pipeline and display the result.
# NOTE(review): the cell-by-cell cleaning cells earlier in the file overwrite
# X_train; if they ran first, this call cleans the text a second time.
# Confirm the intended execution order.
transformed_X_train = preprocess_pipe.transform(X_train)
transformed_X_train

3315     credit card missed please close card
11372                       need boarding pas
7985                 need proof insurance car
4075                      want change address
8127            hi want check seat assignemnt
                         ...                 
11284                        want new service
5191                         check cable bill
5390                           want know seat
860                         need agua frescas
7270                         need change seat
Name: message, Length: 9298, dtype: object

In [19]:
# Spot-check a single cleaned message: the entry stored under index 860.
transformed_X[860]

'need agua frescas'

In [12]:
# Display the message column X taken from df.
X

0          hi i want change my address from my credit card
1        i need 4 fruit maple oatmeal 3 cold brew froze...
2              i wish to travel next month domestic airway
3                         i need reimbursement my expenses
4                    i need a copy of insurance for my car
                               ...                        
11618    boarding pass to be sent to your email address...
11619                             i need ticker for bombay
11620                             i want musical equipment
11621                                   i need window seat
11622                please money transfer from my account
Name: message, Length: 11623, dtype: object

## Classification model building

In [11]:
from sklearn.ensemble import RandomForestClassifier

# Bag-of-words features: unigrams only, terms kept when they occur in at
# least 1% and at most 80% of the documents, capped at 300 features.
# ngram_range must be a tuple: recent scikit-learn releases validate
# constructor parameters at fit time and reject the list form [1, 1].
vectorizer = CountVectorizer(min_df=.01, max_df=.8, ngram_range=(1, 1), max_features=300, stop_words='english')

# Full model: text cleaning -> token counts -> random forest.
pipe = Pipeline([('preprocess', preprocess_pipe),
                 ('vec', vectorizer),
                 ('clf', RandomForestClassifier(n_estimators=200, n_jobs=-1, random_state=42))])

# NOTE(review): this pipeline cleans its input itself. If the cell-by-cell
# cleaning cells have already overwritten X_train, the training text is
# cleaned twice here — confirm which variant is intended.
pipe.fit(X_train, y_train)

Pipeline(steps=[('preprocess',
                 Pipeline(steps=[('removeunwanted',
                                  FunctionTransformer(func=<function remove_unwanted_chars at 0x0000027C0FCB7D30>)),
                                 ('removenumbers',
                                  FunctionTransformer(func=<function remove_numbers at 0x0000027C0FCB7CA0>)),
                                 ('tokenize',
                                  FunctionTransformer(func=<function tokenize_message at 0x0000027C0FCB7EE0>)),
                                 ('removestopwords',
                                  FunctionTransformer(func=<function remove_stopwords at 0x0000027C0FCB7F70>)),
                                 ('lemmatize',
                                  FunctionTransformer(func=<function lemmatize_message at 0x0000027C0FCC3040>))])),
                ('vec',
                 CountVectorizer(max_df=0.8, max_features=300, min_df=0.01,
                                 ngram_range=[1, 1], 

In [15]:
from sklearn.ensemble import RandomForestClassifier

# Bag-of-words features: unigrams only, terms kept when they occur in at
# least 1% and at most 80% of the documents, capped at 300 features.
# ngram_range must be a tuple: recent scikit-learn releases validate
# constructor parameters at fit time and reject the list form [1, 1].
vectorizer = CountVectorizer(min_df=.01, max_df=.8, ngram_range=(1, 1), max_features=300, stop_words='english')

# Model without a cleaning step: token counts -> random forest. Any text
# passed to fit/predict is used exactly as given.
pipe = Pipeline([('vec', vectorizer),
                 ('clf', RandomForestClassifier(n_estimators=200, n_jobs=-1, random_state=42))])

pipe.fit(X_train, y_train)

Pipeline(steps=[('vec',
                 CountVectorizer(max_df=0.8, max_features=300, min_df=0.01,
                                 ngram_range=[1, 1], stop_words='english')),
                ('clf',
                 RandomForestClassifier(n_estimators=200, n_jobs=-1,
                                        random_state=42))])

### Model evaluation

In [11]:
# Clean the validation messages with the same pipeline object that defines
# the cleaning steps, instead of repeating the five steps by hand. This keeps
# the steps in one place, no longer relies on a `lemmatizer` variable left
# over from an earlier cell, and avoids reloading the stop-word list for
# every token.
X_val = preprocess_pipe.transform(X_val)

In [12]:
from sklearn.metrics import confusion_matrix, classification_report

# Predict the validation labels and summarise the errors.
pred_val = pipe.predict(X_val)
print(confusion_matrix(y_val, pred_val))
# zero_division=0 reports 0.0 for a metric whose denominator is zero (for
# example a label that is never predicted). That is the same value the
# default "warn" setting reports, but without the repeated warning lines.
print(classification_report(y_val, pred_val, zero_division=0))

[[ 41   0   0 ...   0   0   0]
 [  0   1   0 ...   0   0   0]
 [  0   0 180 ...   0   0   0]
 ...
 [  0   0   0 ...  69   0   0]
 [  1   0   1 ...   0   0   0]
 [  0   0   0 ...   0   0   7]]
                       precision    recall  f1-score   support

           bookflight       0.91      0.98      0.94        42
          changeorder       0.50      0.20      0.29         5
 changeseatassignment       0.90      0.94      0.92       192
         checkbalance       0.92      0.98      0.95        50
     checkclaimstatus       1.00      0.98      0.99        90
checkoffereligibility       0.50      1.00      0.67         6
    checkserverstatus       0.84      0.90      0.87        30
         closeaccount       0.88      0.75      0.81        20
        disputecharge       0.90      0.45      0.60        42
        expensereport       0.97      0.97      0.97        77
      getboardingpass       1.00      1.00      1.00       114
 getinformationintent       0.86      0.75      0.8

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


### Cross-validation

In [17]:
from sklearn.model_selection import cross_val_score

# Five-fold cross-validation of `pipe` on the training split, scored with
# adjusted mutual information.
ami_scores = cross_val_score(pipe, X_train, y_train, cv=5, scoring='adjusted_mutual_info_score')

# Summary statistics across the folds.
avg_ami, stddev_ami = ami_scores.mean(), ami_scores.std()
lower_bound = avg_ami - 2*stddev_ami
upper_bound = avg_ami + 2*stddev_ami

# Per-fold scores, their mean, and the mean +/- two standard deviations.
print("Scores:", [round(fold_score, 4) for fold_score in ami_scores])
print(f"Mean score: {round(avg_ami, 4)}")
print(f"+/-2 std. dev. range within mean: ({lower_bound:.4f}, {upper_bound:.4f})")

Scores: [0.8463, 0.8465, 0.8352, 0.8366, 0.8455]
Mean score: 0.842
+/-2 std. dev. range within mean: (0.8320, 0.8521)


In [13]:
from sklearn.model_selection import cross_val_score

# Same five-fold cross-validation as the previous cell, run again on
# whichever `pipe` is currently defined.
ami_scores = cross_val_score(
    pipe,
    X_train,
    y_train,
    scoring='adjusted_mutual_info_score',
    cv=5,
)

# Mean and spread of the fold scores.
avg_ami = ami_scores.mean()
stddev_ami = ami_scores.std()
range_low, range_high = avg_ami - 2*stddev_ami, avg_ami + 2*stddev_ami

# Report the individual folds and the aggregate.
rounded_scores = [round(fold_score, 4) for fold_score in ami_scores]
print("Scores:", rounded_scores)
print(f"Mean score: {round(avg_ami, 4)}")
print(f"+/-2 std. dev. range within mean: ({range_low:.4f}, {range_high:.4f})")

Scores: [0.8463, 0.8465, 0.8352, 0.8366, 0.8455]
Mean score: 0.842
+/-2 std. dev. range within mean: (0.8320, 0.8521)


In [14]:
from sklearn.metrics.cluster import adjusted_rand_score, adjusted_mutual_info_score

# Compare the validation predictions with the true labels using the adjusted
# Rand index and adjusted mutual information (arithmetic-mean normalisation).
ari = adjusted_rand_score(y_val, pred_val)
ami = adjusted_mutual_info_score(y_val, pred_val, average_method='arithmetic')

print(f"ARI: {ari}")
print(f"AMI: {ami}")

ARI: 0.8379420744646501
AMI: 0.8559183800359201


## Predictions on test data

In [15]:
# Load the test messages to predict on (columns: id, message; no label column).
df_test = pd.read_csv('input_data.csv')
# info() prints the column summary; head() is shown as the cell's output.
df_test.info()
df_test.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2906 entries, 0 to 2905
Data columns (total 2 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   id       2906 non-null   int64 
 1   message  2906 non-null   object
dtypes: int64(1), object(1)
memory usage: 45.5+ KB


Unnamed: 0,id,message
0,12123,i have problem in excel
1,244,i need \t pesto drizzle over grilled chicken c...
2,8221,need to help order a new card as the old one e...
3,12856,i need internet plan
4,12108,my are report travel


In [16]:
# The model must see test text in the same form as its training text. When
# the active `pipe` has no 'preprocess' step it was fitted on already-cleaned
# messages, so clean the test messages first; when the step is present the
# pipeline cleans the raw text itself.
X_test = df_test['message']
if 'preprocess' not in pipe.named_steps:
    X_test = preprocess_pipe.transform(X_test)
pred_test = pipe.predict(X_test)

In [17]:
# Pair every test id with its predicted label; the column names are 'Id' and 'label'.
submission_columns = {'Id': df_test['id'], 'label': pred_test}
my_submission = pd.DataFrame(submission_columns)
print(my_submission.head())

      Id               label
0  12123    orderdrinkintent
1    244   orderburgerintent
2   8221         replacecard
3  12856  startserviceintent
4  12108       expensereport


In [18]:
# Write the predictions to coda_submission.csv, leaving out the DataFrame index.
my_submission.to_csv('coda_submission.csv', index=False)
# REMEMBER TO ZIP csv file BEFORE SUBMITTING!!!

In [19]:
from zipfile import ZipFile

# Pack the submission CSV into a zip archive. The context manager closes the
# archive even if write() raises, so no open or half-written handle is left.
with ZipFile('coda_submission.zip', 'w') as zipObj:
    zipObj.write('coda_submission.csv')