# <div style="text-align: center; background-color: #0C6B44; font-family:newtimeroman; color: white; padding: 14px; line-height: 1;border-radius:20px">📊**Email Classification on the Data Science Emails Prediction Dataset**</div>


# Library imports

In [1]:
# importing needed packages here

import os
import re
import spacy
import hashlib
import numpy as np
import pandas as pd
import nltk
from tqdm import tqdm
from collections import Counter
from spacy.matcher import Matcher
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score
from nltk.tokenize import WordPunctTokenizer
from nltk.stem.snowball import SnowballStemmer
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.model_selection import train_test_split
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB 
from tqdm import tqdm 
# Number of CPUs reported by the OS. os.cpu_count() returns None when the
# count cannot be determined, in which case we fall back to 4 workers.
cpu_count = os.cpu_count() or 4

# EDA

In [2]:
# Lookup table that is left-merged into the email dataset in the next cell.
# Presumably maps each fine-grained `category` to a coarser `class` — TODO confirm its columns.
mapping = pd.read_csv('data/mapping.csv')


In [3]:
# Load the raw emails, keep only the English ones, attach the mapping table
# (left join on the columns the two frames share), and drop every row that
# did not receive a `class` from the mapping.
dataset = (
    pd.read_csv('data/dataset.csv')
    .loc[lambda frame: frame["language"] == 'en']
    .merge(mapping, how='left')
    .loc[lambda frame: frame["class"].notna()]
)
dataset

Unnamed: 0,text,date,category,language,class
0,"Hello, Does it matter iff I use Visa or Master...",7-6-2022,visa_or_mastercard,en,card
1,"Good afternoon, I just got refunded for my pur...",16-11-2022,reverted_card_payment?,en,card
2,"Hello, I got billed ann extra pound! Thanks",4-12-2022,extra_charge_on_statement,en,others
3,"Hi, How long does it take for a transfer to sh...",23-11-2022,transfer_timing,en,transfer
4,"hi, When can I use money sent to my accountt? ...",17-4-2022,transfer_timing,en,transfer
...,...,...,...,...,...
9068,"good afternoon, I think someone may be using m...",22-6-2022,compromised_card,en,card
9069,"good morning, Help, I need to top up my accoun...",7-4-2022,top_up_by_cash_or_cheque,en,cash
9070,"hi, I made an international purchasee, but the...",7-12-2022,card_payment_wrong_exchange_rate,en,card
9071,"hi, Why is my card not working anymore? Thanks",1-11-2022,card_not_working,en,card


# Defining preprocessing helpers

In [4]:
# Shared NLP resources used by the preprocessing helpers and the feature
# engineering cells further down.
tokenizer = WordPunctTokenizer()
nlp = spacy.load('en_core_web_sm')

# Parse every raw (un-cleaned) email once with spaCy; tqdm reports progress
# while the lazy nlp.pipe generator is consumed.
docs = [parsed for parsed in tqdm(nlp.pipe(dataset['text']), total=len(dataset['text']))]

# English stop words from NLTK, stored as a set for fast membership tests.
# NOTE(review): assumes the NLTK `stopwords` corpus is already downloaded.
en_stopwords = set(nltk.corpus.stopwords.words('english'))

def remove_punctuation(text):
    """Delete punctuation from ``text`` and return the result in lower case.

    Every character that is not a word character, a digit, or whitespace is
    removed (underscores count as word characters and are therefore kept).
    """
    stripped = re.sub(r"[^\w\d\s]", "", text)
    lowered = stripped.lower()
    return lowered

def remove_stopwords(text, stopwords):
    """Return ``text`` with every token found in ``stopwords`` removed.

    The text is split with the module-level ``tokenizer``; the surviving
    tokens are re-joined with single spaces. Membership is tested on the
    tokens exactly as they appear (no case folding is done here).
    """
    kept_tokens = []
    for token in tokenizer.tokenize(text):
        if token not in stopwords:
            kept_tokens.append(token)
    return " ".join(kept_tokens)
def apply_stemmer(text, stemmer):
    """Stem every whitespace-separated word of ``text``.

    ``stemmer`` must expose a ``stem(word)`` method. The stemmed words are
    joined back together with single spaces.
    """
    return ' '.join(map(stemmer.stem, text.split()))
def regex(text):
    """Strip the leading greeting and the trailing sign-off from an email.

    Parameters
    ----------
    text : str
        Raw email body.

    Returns
    -------
    str
        ``text`` without its opening "<greeting>," prefix and without a
        closing "Kind regards" / "Best regards" / "Thanks". Whitespace that
        surrounded the removed parts inside the message is left untouched.
    """
    # Greeting: from the start of the text up to and including the first
    # comma, as long as only word characters / whitespace precede that comma.
    # NOTE(review): an email with no greeting whose first clause ends in a
    # comma loses that clause too — confirm every email starts with a greeting.
    without_greeting = re.sub(r'^[\w\s]+,', '', text)

    # Sign-off: matched case-insensitively (so "Kind regards" and "THANKS"
    # are covered as well) and allowed to be followed by trailing whitespace.
    return re.sub(
        r'(kind regards|best regards|thanks)\s*$',
        '',
        without_greeting,
        flags=re.IGNORECASE,
    )

100%|██████████████████████████████████████████████████████████████████████████████| 9038/9038 [00:23<00:00, 384.88it/s]


In [5]:
def preprocess_text(df):
    """Return a copy of ``df`` whose ``text`` column has been cleaned.

    Cleaning runs ``regex`` (greeting / sign-off removal) followed by
    ``remove_punctuation`` (punctuation removal + lower-casing). The input
    frame is not modified. Stop-word removal and stemming are intentionally
    not applied here.
    """
    df_processed = df.copy()
    cleaned_text = df_processed["text"].apply(regex).apply(remove_punctuation)
    df_processed["text"] = cleaned_text
    return df_processed

# Clean the text and keep only the columns used for modelling
# (everything except `category`, `language` and `date`).
dataset_processed = preprocess_text(dataset).drop(columns=['category', 'language', 'date'])
dataset_processed

Unnamed: 0,text,class
0,does it matter iff i use visa or mastercard,card
1,i just got refunded for my purchase over two ...,card
2,i got billed ann extra pound,others
3,how long does it take for a transfer to show ...,transfer
4,when can i use money sent to my accountt,transfer
...,...,...
9068,i think someone may be using my card,card
9069,help i need to top up my account where do i s...,cash
9070,i made an international purchasee but the exc...,card
9071,why is my card not working anymore,card


# Best model 

In [6]:
def baseline_with_tfidf(X_train, X_test, y_train, y_test):
    """Fit a TF-IDF + SVC baseline and score it on the test split.

    Parameters
    ----------
    X_train, X_test : iterable of str
        Training / test documents fed to ``TfidfVectorizer``.
    y_train, y_test : array-like
        Class labels matching ``X_train`` / ``X_test``.

    Returns
    -------
    pipe : sklearn.pipeline.Pipeline
        The fitted ``tfidf`` -> ``classifier`` pipeline.
    f1s : float
        Macro-averaged F1 score of the predictions on ``X_test``.
    """
    pipe = Pipeline([('tfidf', TfidfVectorizer()),
                     ('classifier', SVC(C=5))])

    pipe.fit(X_train, y_train)
    y_pred = pipe.predict(X_test)

    # Ground truth first, predictions second — the argument order that
    # scikit-learn's f1_score(y_true, y_pred) expects.
    f1s = f1_score(y_test, y_pred, average='macro')
    return pipe, f1s

In [7]:
# Stratified 80/20 split on the cleaned text only, with a fixed seed so the
# split is reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    dataset_processed["text"],
    dataset_processed["class"],
    test_size=0.2,
    random_state=42,
    stratify=dataset_processed["class"],
)

# Fit the TF-IDF + SVC baseline and keep both the model and its macro-F1.
baseline_model, baseline_f1s = baseline_with_tfidf(X_train, X_test, y_train, y_test)

# Evaluation

In [8]:
# Macro-averaged F1 of the baseline pipeline on the held-out test split.
baseline_f1s

0.9398225033267688

# Feature Engineering

In [9]:
# Token-level spaCy matcher with one single-token pattern per part of speech
# of interest (adjectives and adverbs).
matcher = Matcher(nlp.vocab)
for _pos in ('ADJ', 'ADV'):
    matcher.add(_pos, [[{"POS": _pos}]])

# Number of adjective/adverb matches in each parsed email, in `docs` order.
nb_adj_adv = [len(matcher(parsed)) for parsed in docs]

In [10]:
# Hand-crafted numeric features computed from the cleaned text.
# Number of whitespace-separated words.
dataset_processed["nb_words"] = dataset_processed['text'].str.split().str.len()
# Number of characters.
dataset_processed["doc_length"] = dataset_processed['text'].str.len()
# Adjective/adverb counts from spaCy, assigned by position.
dataset_processed["nb_adj_adv"] = nb_adj_adv
# Mean word length; 0 when the text contains no words.
dataset_processed["avg_word_length"] = dataset_processed['text'].apply(
    lambda sentence: np.mean([len(word) for word in sentence.split()]) if sentence.split() else 0
)

In [11]:
# Same stratified 80/20 split as before (same seed), but the feature side
# now keeps every column except the target so the engineered features are
# available to downstream pipelines.
X_train, X_test, y_train, y_test = train_test_split(
    dataset_processed.drop(columns=["class"]),
    dataset_processed["class"],
    test_size=0.2,
    random_state=42,
    stratify=dataset_processed["class"],
)

# Building the feature union

In [12]:
class Selector(BaseEstimator, TransformerMixin):
    """
    Base transformer that remembers which dataframe column to pick.

    Subclasses implement ``transform`` to return that column in the shape
    the next pipeline step needs.
    """

    def __init__(self, key):
        # Name of the column to select.
        self.key = key

    def fit(self, X, y=None):
        """Nothing is learned from the data; returns the transformer itself."""
        return self
    

class TextSelector(Selector):
    """
    Picks the column named ``key`` with single-bracket indexing.

    Intended for text columns.
    """

    def transform(self, X):
        """Return ``X[self.key]``."""
        selected = X[self.key]
        return selected
    
    
class NumberSelector(Selector):
    """
    Picks the column named ``key`` with list indexing, i.e. ``X[[key]]``.

    Intended for numeric columns.
    """

    def transform(self, X):
        """Return ``X[[self.key]]``."""
        columns = [self.key]
        return X[columns]
    
    

# Evaluation 

In [13]:
# TF-IDF representation of the cleaned email text.
text_pipe = Pipeline([
    ('selector', TextSelector("text")),
    ('tfidf', TfidfVectorizer()),
])

# Standard-scaled numeric features, one pipeline per engineered column.
nb_words_pipe = Pipeline([
    ('selector', NumberSelector("nb_words")),
    ('standard', StandardScaler()),
])
doc_length_pipe = Pipeline([
    ('selector', NumberSelector("doc_length")),
    ('standard', StandardScaler()),
])
avg_word_length_pipe = Pipeline([
    ('selector', NumberSelector("avg_word_length")),
    ('standard', StandardScaler()),
])

# NOTE(review): only the text pipeline is part of the union. The three
# numeric pipelines above are built but never used, and `nb_adj_adv` has no
# pipeline at all — confirm whether leaving them out is intentional.
feats = FeatureUnion([
    ('text', text_pipe),
])

# Union of features followed by an SVC with C=20.
combined_pipe = Pipeline([
    ('feats', feats),
    ('clf', SVC(C=20)),
])
combined_pipe.fit(X_train, y_train)

# Macro-averaged F1 on the held-out split.
pred = combined_pipe.predict(X_test)
score = f1_score(y_test, pred, average='macro')
score

0.9388193593474707