<p style="color:#153462; 
          font-weight: bold; 
          font-size: 30px; 
          font-family: Gill Sans, sans-serif; 
          text-align: center;">
          Implementation of Gradient Boosting</p>

### Importing Required Packages

In [35]:
import nltk
import re
import string
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import precision_recall_fscore_support as score
from sklearn.model_selection import train_test_split, GridSearchCV

### Loading and Preparing the Data

In [7]:
nltk.data.path.append(r"D:\Artificial_Intelligence\nat_lang_proc\nltk_data")
stopwords = nltk.corpus.stopwords.words("english")
ps = nltk.PorterStemmer()
data_df = pd.read_csv(r"D:/Artificial_Intelligence/nat_lang_proc/data/SMSSpamCollection.tsv", 
                      delimiter="\t", 
                      header=None)
data_df.columns = ["labels", "body_text"]

In [8]:
# Adding two new features
def count_punct(text):
    count = sum([1 for char in text if char in string.punctuation])
    return round(count/(len(text)-text.count(" ")), 3)*100

data_df["body_len"] = data_df["body_text"].apply(lambda x:len(x) - x.count(" "))
data_df["punct%"] = data_df["body_text"].apply(lambda x: count_punct(x))
data_df.head()

Unnamed: 0,labels,body_text,body_len,punct%
0,ham,I've been searching for the right words to tha...,160,2.5
1,spam,Free entry in 2 a wkly comp to win FA Cup fina...,128,4.7
2,ham,"Nah I don't think he goes to usf, he lives aro...",49,4.1
3,ham,Even my brother is not like to speak with me. ...,62,3.2
4,ham,I HAVE A DATE ON SUNDAY WITH WILL!!,28,7.1


In [9]:
# Cleaning and Vectorizing the data using TF-IDF vectorizer
def clean_data(text):
    without_punc = "".join([char.lower() for char in text if char not in string.punctuation])
    tokenzied_text = re.findall("\w+", without_punc)
    stemmed_tokens = [ps.stem(word) for word in tokenzied_text if word not in stopwords]
    return stemmed_tokens
tfidf_vect = TfidfVectorizer(analyzer=clean_data)
X_tfidf = tfidf_vect.fit_transform(data_df["body_text"])
X_dfidf_df = pd.DataFrame(X_tfidf.toarray())
X_dfidf_df.columns = tfidf_vect.get_feature_names_out()
X_features = pd.concat([data_df["body_len"], data_df["punct%"], X_dfidf_df], axis=1)
X_features.head()

Unnamed: 0,body_len,punct%,0,008704050406,0089mi,0121,01223585236,01223585334,0125698789,02,...,zindgi,zoe,zogtoriu,zoom,zouk,zyada,é,ü,üll,〨ud
0,160,2.5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,128,4.7,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,49,4.1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,62,3.2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,28,7.1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


### Model Training

In [17]:
result_df = pd.DataFrame(columns=["estimators", 
                                  "max_depth",
                                  "learning_rate", 
                                  "precision", 
                                  "recall", 
                                  "accuracy"])
result_df

Unnamed: 0,estimators,max_depth,learning_rate,precision,recall,accuracy


In [12]:
X_train, X_test, y_train, y_test = train_test_split(X_features, data_df["labels"], test_size=0.2)

In [27]:
def train_GB(est: int, max_depth: int, lr: float | int) -> None:
    """
    Trains the Gradient Booster model and prints the results to the console.
    
    :param est: Number of estimators(number of decision trees)
    :type  est: int
    
    :param max_depth: Maximum depth of the tree
    :type  max_depth: int
    
    :param lr: Learning rate of the model
    :type  lr: float or int
    
    :returns: None
    :rtype  : None
    """
    gb = GradientBoostingClassifier(n_estimators=est, max_depth=max_depth, learning_rate=lr)
    gb_model = gb.fit(X_train, y_train)
    y_pred = gb_model.predict(X_test)
    precision, recall, fscore, support = score(y_test, y_pred, pos_label="spam", average="binary")
    accuracy = round((y_pred==y_test).sum()/len(y_pred), 3)
    result_df.loc[len(result_df)] = [est, max_depth, lr, round(precision, 3), round(recall, 3),  accuracy]

In [28]:
for n_est in [50, 100, 150]:
    for max_depth in [3, 7, 11, 15]:
        for lr in [0.01, 0.1, 1]:
            train_GB(n_est, max_depth, lr)

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [31]:
result_df.sort_values(by=["accuracy", "precision", "recall"], ascending=False)

Unnamed: 0,estimators,max_depth,learning_rate,precision,recall,accuracy
34,150.0,15.0,0.1,0.958,0.895,0.98
31,150.0,11.0,0.1,0.951,0.901,0.98
22,100.0,15.0,0.1,0.951,0.895,0.979
23,100.0,15.0,1.0,0.977,0.855,0.978
35,150.0,15.0,1.0,0.97,0.862,0.978
28,150.0,7.0,0.1,0.957,0.875,0.978
19,100.0,11.0,0.1,0.944,0.895,0.978
20,100.0,11.0,1.0,0.97,0.855,0.977
7,50.0,11.0,0.1,0.95,0.875,0.977
10,50.0,15.0,0.1,0.95,0.875,0.977


### Gradient Boosting with GridSearchCV

In [36]:
X_tfidf_features = X_features

count_vect = CountVectorizer(analyzer=clean_data)
X_count_vect = count_vect.fit_transform(data_df["body_text"])
X_count_vect_df = pd.DataFrame(X_count_vect.toarray())
X_count_vect_df.columns = count_vect.get_feature_names_out()
X_count_vect_features = pd.concat([data_df["body_len"], data_df["punct%"], X_count_vect_df], axis=1)
X_count_vect_features.head()

Unnamed: 0,body_len,punct%,0,008704050406,0089mi,0121,01223585236,01223585334,0125698789,02,...,zindgi,zoe,zogtoriu,zoom,zouk,zyada,é,ü,üll,〨ud
0,160,2.5,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,128,4.7,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,49,4.1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,62,3.2,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,28,7.1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [None]:
gb = GradientBoostingClassifier()
params = {
    "n_estimators": [100, 150],
    "max_depth": [7, 11, 15],
    "learning_rate": [0.1]
}

gs = GridSearchCV(gb, params, cv=5, n_jobs=-1)

tfidf_cv_fit = gs.fit(X_tfidf_features, data_df["labels"])
pd.DataFrame(tfidf_cv_fit.cv_results_).sort_values("mean_test_score", ascending=False)[0:5]

In [None]:
# Count vectorizer data check
count_vect_cv_fit = gs.fit(X_count_vect_features, data_df["labels"])
pd.DataFrame(count_vect_cv_fit.cv_results_).sort_values("mean_test_score", ascending=False)[0:5]