In [359]:
import warnings
warnings.filterwarnings('ignore')

In [360]:
import pandas as pd
import numpy as np
import re

import nltk
from nltk.corpus import stopwords

from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split

from sklearn.manifold import TSNE
from matplotlib import pyplot as plt

from sklearn.metrics import accuracy_score

## Setup

In [361]:
df = pd.read_csv("./data/SMS.tsv", delimiter='\t')
df["class"] = LabelEncoder().fit_transform(df["class"])
df.head()

Unnamed: 0,class,text
0,0,"Go until jurong point, crazy.. Available only ..."
1,0,Ok lar... Joking wif u oni...
2,1,Free entry in 2 a wkly comp to win FA Cup fina...
3,0,U dun say so early hor... U c already then say...
4,0,"Nah I don't think he goes to usf, he lives aro..."


In [362]:
def process(text):
    text = re.sub(r'[^\w\s]', '', text.lower())
    text = re.sub(r'\s\s+', ' ', text)
    text = re.sub(r'[\d_]', '', text)

    return text.strip()

In [363]:
X = df.text.map(process)
y = df["class"]

In [364]:
nltk.download("stopwords", quiet=True)
tfidf = TfidfVectorizer(max_features=1000, stop_words=stopwords.words("english"), strip_accents="unicode")
X = pd.DataFrame(data=tfidf.fit_transform(X).toarray(), columns=tfidf.get_feature_names_out())
feats = tfidf.get_feature_names_out()

In [365]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42)

## Embedding

In [366]:
from sklearn.ensemble import RandomForestClassifier

imp = RandomForestClassifier().fit(X, y).feature_importances_
embed_res = list(map(lambda i_f: i_f[1], sorted(filter(lambda i_f: i_f[0] > np.mean(imp), zip(imp, feats)))[-30:][::-1]))

## Filter

In [367]:
filter_res = list(map(lambda x: x[0], sorted([(col, np.abs(np.corrcoef(X[col], y))[0][1]) for col in X.columns], key=lambda x: x[1])[-30:][::-1]))

## Wrapper

In [368]:
from sklearn.linear_model import LogisticRegression

classifier = LogisticRegression()

wrap_res = []
while len(wrap_res) < 30:
    best_res = (0, "")
    for feat in X_train:
        if feat in wrap_res: continue

        best_so_far_train = pd.DataFrame(X_train, columns=wrap_res)
        best_so_far_test = pd.DataFrame(X_test, columns=wrap_res)

        best_so_far_train[feat] = X_train[feat]
        best_so_far_test[feat] = X_test[feat]

        classifier.fit(best_so_far_train.values, y_train)

        best_res = max(best_res, (accuracy_score(y_test, classifier.predict(best_so_far_test.values)), feat), key=lambda x: x[0])

    wrap_res.append(best_res[1])

## Lib methods

In [369]:
from sklearn.feature_selection import SelectKBest, mutual_info_classif

mutin = SelectKBest(mutual_info_classif, k=30)
mutin.fit(X_train, y_train)
lib_mutin_res = feats[mutin.get_support()]

In [370]:
from sklearn.feature_selection import f_classif

fclass = SelectKBest(f_classif, k=30)
fclass.fit(X_train, y_train)
lib_fclass_res = feats[fclass.get_support()]

In [371]:
from sklearn.feature_selection import chi2

chitwo = SelectKBest(chi2, k=30)
chitwo.fit(X_train, y_train)
lib_chitwo_res = feats[chitwo.get_support()]

## Display

In [372]:
print(f"{'Embedding': >16}   {'Filter': >16}   {'Wrapper': >16}   {'Chi2': >16}   {'F Classification': >16}   {'Mut. Inf. Classif': >16}")
for i in range(30): print(f"{embed_res[i]: >16} | {filter_res[i]: >16} | {wrap_res[i]: >16} | {lib_chitwo_res[i]: >16} | {lib_fclass_res[i]: >16} | {lib_mutin_res[i]: >16}")

       Embedding             Filter            Wrapper               Chi2   F Classification   Mut. Inf. Classif
            call |              txt |            claim |            award |          awarded |              box
             txt |            claim |              txt |          awarded |              box |             call
          mobile |             free |             call |              box |             call |             cash
            free |            prize |           mobile |             call |           camera |            claim
           claim |           mobile |          service |           camera |             cash |  congratulations
           prize |             call |             ltgt |             cash |            claim |               cs
            text |           urgent |           reward |            claim |       collection |         customer
            stop |             stop |            entry |          contact |          contact |         

## Classifier Test

In [373]:
def test_classifier(clsf):
    print(f"Testing {clsf.__class__.__name__}")
    for feats, name in [(None, 'Default'), (embed_res, 'Embedding'), (filter_res, 'Filter'), (wrap_res, 'Wrapper'), (lib_chitwo_res, 'Chi 2'), (lib_fclass_res, 'F Classif'), (lib_mutin_res, 'Mutual Inf')]:
        if feats is None:
            clsf.fit(X_train, y_train)
            print(f"{name: >10}: {accuracy_score(y_test, clsf.predict(X_test))*100:.3f}%")
            continue
            
        clsf.fit(pd.DataFrame(X_train, columns=feats).values, y_train)
        print(f"{name: >10}: {accuracy_score(y_test, clsf.predict(pd.DataFrame(X_test, columns=feats).values))*100:.3f}%")
    print()


In [375]:
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier

test_classifier(KNeighborsClassifier())
test_classifier(GaussianNB())
test_classifier(RandomForestClassifier())
test_classifier(DecisionTreeClassifier())

Testing KNeighborsClassifier
   Default: 93.108%
 Embedding: 95.980%
    Filter: 95.334%
   Wrapper: 94.688%
     Chi 2: 95.334%
 F Classif: 95.047%
Mutual Inf: 94.544%

Testing GaussianNB
   Default: 79.971%
 Embedding: 94.903%
    Filter: 94.185%
   Wrapper: 22.685%
     Chi 2: 94.257%
 F Classif: 94.113%
Mutual Inf: 93.037%

Testing RandomForestClassifier
   Default: 98.062%
 Embedding: 96.554%
    Filter: 95.765%
   Wrapper: 95.693%
     Chi 2: 95.549%
 F Classif: 95.980%
Mutual Inf: 95.406%

Testing DecisionTreeClassifier
   Default: 95.477%
 Embedding: 95.118%
    Filter: 94.401%
   Wrapper: 95.334%
     Chi 2: 94.472%
 F Classif: 94.616%
Mutual Inf: 94.688%

