In [85]:
import numpy as np
import pandas as pd
from scipy.optimize import minimize
from itertools import chain, product
import plotly.express as px
import plotly.graph_objects as go
from tqdm import tqdm

In [86]:
# Load, format and encode training data
df = pd.read_csv('data.csv')

# Gets n-grams from a word, does not include first and last letter
def get_ngrams(word, n):
      return {f"{n}_" + word[i:i+n] for i in range(1,len(word) - n) if n <= len(word)}

# Gets n-grams from a series
def get_all_ngrams(words, n):
    return sorted(list(set().union(*[get_ngrams(word, n) for word in words])))

# Define feature ranges
ngram_range = range(1,11)
suffix_range = range(1,6)
prefix_range = range(1,6)

# Precomputes n-grams, suffixes, and prefixes appearing in the dataset
ngrams = {n: get_all_ngrams(df["word"], n) 
                for n in ngram_range}
suffixes = {s: sorted(list({f"s{s}_" + word[-s:] for word in df["word"]})) 
                for s in suffix_range}
prefixes = {p: sorted(list({f"p{p}_" + word[ :p] for word in df["word"]})) 
                for p in prefix_range}

# One hot encode features in dataframes
ngram_df = pd.DataFrame(
    [{ngram : 1 if ngram in get_ngrams(word, n) else 0 
      for n in ngram_range for ngram in ngrams[n]}
        for word in df["word"]])
suffix_df = pd.DataFrame(
    [{suffix : 1 if suffix == f"s{n}_" + word[-n:] else 0 
      for n in suffix_range for suffix in suffixes[n]}
        for word in df["word"]])
prefix_df = pd.DataFrame(
    [{prefix : 1 if prefix == f"p{n}_" + word[:n] else 0 
      for n in prefix_range for prefix in prefixes[n]}
        for word in df["word"]])

# Stores feature indices for fast feature selection
ngram_ixs = dict()
suffix_ixs = dict()
prefix_ixs = dict()
start_i = 0

for n in ngram_range:
  ngram_ixs[n] = np.arange(start_i, start_i + len(ngrams[n]))
  start_i += len(ngrams[n])

for s in suffix_range:
    suffix_ixs[s] = np.arange(start_i, start_i + len(suffixes[s]))
    start_i += len(suffixes[s])

for p in prefix_range:
  prefix_ixs[p] = np.arange(start_i, start_i + len(prefixes[p]))
  start_i += len(prefixes[p])

# Format design matrix and labels
X = pd.concat([ngram_df, suffix_df, prefix_df], axis = 1)
y = df["label"].map({"french": 1, "spanish":-1})

In [87]:
n_fren = df[df["label"] == 'french'].shape[0]
n_span = df[df["label"] == 'spanish'].shape[0]
num_features = 1000

# Compute posterior distributions with laplace smoothing
french_post = ((X[y==1].sum(axis=0) + 1) / (n_fren + 2))
spanish_post= ((X[y==-1].sum(axis=0) + 1) / (n_span + 2))

# Find most expressive features
best_feats = np.abs((french_post - spanish_post)
					/ (french_post + spanish_post)
			).sort_values(ascending=False)

# Choose most expressive words for features
features_nb = best_feats[:num_features].index


# Compute entropy of features
X_vals = X.values
y_vals = y.values.reshape(-1,1)

n1 = X_vals.sum(axis=0)
n0 = X_vals.shape[0] - n1

mask1 = X_vals == 1
mask0 = X_vals == 0

p_fren_x1 = (((y_vals * mask1) == 1).sum(axis = 0) + 1) / (n1 + 2)
p_span_x1 = (((y_vals * mask1) == -1).sum(axis = 0) + 1) / (n1 + 2)

p_fren_x0 = (((y_vals * mask0) == 1).sum(axis = 0) + 1) / (n0 + 2)
p_span_x0 = (((y_vals * mask0) == -1).sum(axis = 0) + 1) / (n0 + 2)

S1 = -((p_fren_x1 * np.log(p_fren_x1)) + (p_span_x1 * np.log(p_span_x1)))
S0 = -((p_fren_x0 * np.log(p_fren_x0)) + (p_span_x0 * np.log(p_span_x0)))

S = ((n1 * S1) + (n0 * S0)) / (n1 + n0)
    
# Apply the function to all columns (vectorized)
feature_entropies = pd.Series(S, index = X.columns).sort_values()

features_ent = feature_entropies[:num_features].index

In [88]:
# Load and format test sets
import unicodedata

with open("french.txt", "r") as file:
    french = file.readlines()

with open("spanish.txt", "r") as file:
    spanish = file.readlines()

spanish_words = np.array([array for array in [line.split() for line in spanish] if len(array) == 3])[:,1]
french_words = np.array([array for array in [line.split() for line in french] if len(array) == 3])[:,1]

spanish_words = [word for word in spanish_words if len(word) >= 3]
french_words = [word for word in french_words if len(word) >= 3]

def remove_accents(text):
    return ''.join(
        c for c in unicodedata.normalize('NFD', text)  # Normalize to decomposed form
        if unicodedata.category(c) != 'Mn')

spanish_words = np.array([remove_accents(word) for word in spanish_words])
french_words = np.array([remove_accents(word) for word in french_words])

spanish_df = pd.DataFrame(np.hstack([spanish_words.reshape(-1,1), np.array(["spanish"] * spanish_words.shape[0]).reshape(-1,1)]))
french_df = pd.DataFrame(np.hstack([french_words.reshape(-1,1), np.array(["french"] * french_words.shape[0]).reshape(-1,1)]))

test_large = pd.concat([french_df, spanish_df]).rename(columns={0:"word", 1:"label"}).reset_index()
test_large = test_large[~test_large['word'].duplicated(keep=False)]

test = pd.read_csv('test.csv')
duplicates = set(test["word"]).intersection(df["word"])
test = test[test["word"].apply(lambda x: x not in duplicates)]

In [89]:
class NBLanguageClassifier():

    def __init__(self, substr_lens = range(1,7), 
                 suffix_lens = range(1,5), 
                 prefix_lens = range(1,5),
                 num_top_words = np.inf):
        
        self.substrs = set()
        self.suffixes = set()
        self.prefixes = set()

        self.substr_lens = substr_lens
        self.suffix_lens = suffix_lens
        self.prefix_lens = prefix_lens
        self.num_top_words = num_top_words

        self.X = None
        self.y = None
        self.french_post = None
        self.french_prior = None
        self.spanish_post = None
        self.spanish_prior = None
       
    # Get all substrings of a word as a set
    def _get_substrings(self, word):
        n = len(word)
        return {word[i:i+k] for k in self.substr_lens if k <= n 
                                for i in range(1, n - k + 2)}
    
    def _get_suffixes(self, word):
        return {word[-i:] for i in self.suffix_lens}
    
    def _get_prefixes(self, word):
        return {word[:i] for i in self.prefix_lens}
    
    # Encode a word
    def _encode_word(self, word):
        cols = list(
              self.substrs.intersection(self._get_substrings(word))
              | self.suffixes.intersection(self._get_suffixes(word))
              | self.prefixes.intersection(self._get_prefixes(word))
        )
        return cols 

    # Predict a single word
    def predict_word(self, word):
        encoding = self._encode_word(word)

        return (
            self.french_post[encoding].sum() + self.french_prior
            > self.spanish_post[encoding].sum() + self.spanish_prior
            )

    # Fit the model with a dataframe of words and labels
    def fit(self, df):
        
        df = df.copy()

        # Re-label french and spanish
        df['label'] = df["label"].map({"french": 1, "spanish": 0})

        # One hot encode substrings
        substring_data = [
            {"word": word, **{sub: 1 for sub in self._get_substrings(word)}}
                for word in df["word"]
        ]
        substr_df = pd.DataFrame(substring_data).fillna(0).drop(columns=["word"])
        self.substrs = set(substr_df.columns.difference(["word"]))

        # One hot encode suffixes
        suffix_data = [
            {"word": word, **{"s_" + suff: 1 for suff in self._get_suffixes(word)}}
                for word in df["word"]
        ]
        suffix_df = pd.DataFrame(suffix_data).fillna(0).drop(columns=["word"])
        self.suffixes = set(suffix_df.columns.difference(["word"]))

        # One hot encode prefixes
        prefix_data = [
            {"word": word, **{"p_" + pref: 1 for pref in self._get_prefixes(word)}}
                for word in df["word"]
        ]
        prefix_df = pd.DataFrame(prefix_data).fillna(0).drop(columns=["word"])
        self.prefixes = set(prefix_df.columns.difference(["word"]))

        # Add all features to the df
        df = pd.concat([df, substr_df, suffix_df, prefix_df], axis = 1)
   
        # Format feature and reponse datastructures
        self.X = df.drop(columns=['word', 'label'])
        self.y = df["label"]

        # Compute priors
        self.french_prior = np.log((self.y==1).sum() / self.y.shape[0])
        self.spanish_prior= np.log((self.y==0).sum() / self.y.shape[0])

        # Compute probabiities of each feature per class
        self.french_post = ((self.X[self.y==1].sum(axis=0) + 1) 
                                   / ((self.y==1).sum() + 2))
        self.spanish_post= ((self.X[self.y==0].sum(axis=0) + 1) 
                                   / ((self.y==0).sum() + 2))

        # Choose the most expressive features
        if self.num_top_words * 2 < self.X.shape[1]:

            # Find most expressive words for each label
            best_subs = ((self.french_post - self.spanish_post)
                         / (self.french_post + self.spanish_post)).sort_values()
            best_spanish = best_subs.head(self.num_top_words)
            best_french = best_subs.tail(self.num_top_words)

            # Choose most expressive words for features
            self.features = pd.concat([best_spanish, best_french]).index

        # Else use all features
        else:
            self.features = self.X.columns

        self.french_post = np.log(self.french_post)
        self.spanish_post = np.log(self.spanish_post)

        # Set the available features
        self.substrs = set(self.features)

        # Compute final posteriors using available features
        self.french_post = self.french_post[self.features]
        self.spanish_post = self.spanish_post[self.features]
        
    # Predict words   
    def predict(self, df):
        return df["word"].apply(self.predict_word)
    
    # Compute score on a dataframe of words and labels
    def score(self, df, method = 'accuracy'):

        if method == 'accuracy':
            labels = df["label"].map({"french": 1, "spanish": 0})
            return (self.predict(df) == labels).sum() / df.shape[0]

In [100]:
class SVM:

    def __init__(self, ngram_lens, suffix_lens, prefix_lens, num_features, C,
                 feature_selection = "entropy"):
        
        """ Initialize the model with the given hyperparameters.

        Args:
            ngram_lens (list): The n-gram lengths to use.
            suffix_lens (list): The suffix lenghts to use.
            prefix_lens (list): The prefix lengths to use.
            num_features (int): The number of features to choose.
            C (int): The slack parameter for SVM.
        """

        self.ngrams_lens = ngram_lens
        self.suffix_lens = suffix_lens
        self.prefix_lens = prefix_lens
        self.num_features = num_features
        self.C = C
        self.feature_selection = feature_selection

        self.columns =  chain.from_iterable(
              [ngram_ixs[n] for n in ngram_lens]
            + [suffix_ixs[s] for s in suffix_lens]
            + [prefix_ixs[p] for p in prefix_lens]
        )

        self.X = None
        self.y = None
        self.w = None
        self.features = None

        self.n = None
        self.d = None



    def select_features(self):
        """Engineers and selects features using Naive Bayes probabilities.
        """

        # If there are less than the max features allowed, use all features
        if self.d <= self.num_features:
            print("Using all features")
            self.features = self.columns


        # Selects features by absolute difference in probability 
        elif self.feature_selection == "probability":

            n_fren = (self.y ==  1).sum()
            n_span = (self.y == -1).sum()

            # Compute posterior distributions with laplace smoothing
            french_post = ((self.X[self.y==1].sum(axis=0) + 1) 
                                    / (n_fren + 2))
            spanish_post= ((self.X[self.y==-1].sum(axis=0) + 1) 
                                    / (n_span + 2))

            # Find most expressive features
            best_feat = np.abs((french_post - spanish_post)
                            / (french_post + spanish_post)
                            ).sort_values(ascending=False)

        # Selects features by absolute minimal entropy in the class labels
        elif self.feature_selection == "entropy":

            # Reformat data
            X_vals = self.X.values
            y_vals = self.y.values.reshape(-1,1)

            # Count of 1's and 0's for each feature
            n1 = X_vals.sum(axis=0)
            n0 = X_vals.shape[0] - n1

            # Masks for where each feature is 1 and 0
            mask1 = X_vals == 1
            mask0 = X_vals == 0

            # Computes probability of labels when features are 1's
            p_fren_x1 = (((y_vals * mask1) == 1).sum(axis = 0) + 1) / (n1 + 2)
            p_span_x1 = (((y_vals * mask1) == -1).sum(axis = 0) + 1) / (n1 + 2)

            # Computes probability of labels when features are 0's
            p_fren_x0 = (((y_vals * mask0) == 1).sum(axis = 0) + 1) / (n0 + 2)
            p_span_x0 = (((y_vals * mask0) == -1).sum(axis = 0) + 1) / (n0 + 2)

            # Compute weighted entropy of the labels given feature values
            S1 = -(
               (p_fren_x1 * np.log(p_fren_x1)) + (p_span_x1 * np.log(p_span_x1))
            )
            S0 = -(
               (p_fren_x0 * np.log(p_fren_x0)) + (p_span_x0 * np.log(p_span_x0))
            )
            S = ((n1 * S1) + (n0 * S0)) / (n1 + n0)

            # Sorts features by minimal entropy 
            best_feat = pd.Series(S, index = self.X.columns).sort_values()

        else:
            print("Invalid feature selection method.")
            return

        # Choose most expressive features
        self.features = best_feat[:self.num_features].index

        self.X = self.X[self.features]
        self.d = self.X.shape[1]

    def fit(self, df):
        """_summary_

        Args:
            df (DataFrame): Dataframe of words and labels. Dataframe should not
            be reindexed from initialization.
        """
        self.X = X.iloc[df.index, self.columns]
        self.y = y.iloc[df.index]
        self.n, self.d = self.X.shape

        self.select_features()

        # Format as matrices
        self.X = np.array(self.X)
        self.y = np.array(self.y).reshape(-1,1)
        w_0 = np.zeros(self.d)
        
        # Optimize using L-BFGS-B (gradient descent is too slow)
        result = minimize(self._loss_fn, w_0, method='L-BFGS-B', 
                          jac=True, options={"maxiter": 10000})

        if not result.success:
            print("Optimize failed:")
            print(result.message)
            
        # Optimized weights
        self.w = result.x

    def predict(self,df, bagging = False):

        # Encode words into feature matrix
        X = self._encode(df["word"])

        # Compute decision values
        decision_values = X @ self.w

        # If not bagging, return decision labels
        if not bagging: 
            
            # Mapping of decision values
            mapping = {1:"french", -1:"spanish", 0:"french"}
            
            return pd.Series(np.sign(decision_values)).map(mapping)
        
        # If bagging, return raw decision values
        else: 

            return pd.Series(decision_values)
        
    # Compute score on a dataframe of words and labels
    def score(self, df, method = 'accuracy'):

        if method == 'accuracy':
            return (self.predict(df) == df["label"].reset_index(drop=True)).mean()
        
    def _encode(self, words):

        self.X_encode = pd.DataFrame(np.zeros((len(words), self.d), np.uint8), columns=self.features)

        word_features = {
            i: self.features.intersection(self._extract_features(word)) 
            for i, word in enumerate(words)
        }

        for index, feature_list in word_features.items():
            self.X_encode.loc[index, feature_list] = 1 

        return self.X_encode.values
             
    # Gets n-grams from a word, does not include first and last letter
    def _extract_features(self, word):

        # Generate n-grams
        n_grams = {
            f"{n}_" + word[i:i+n]
            for n in self.ngrams_lens
            for i in range(1, len(word) - n) if n <= len(word)
        }

        # Generate suffixes and prefixes
        suffixes = {f"s{s}_" + word[-s:] for s in self.suffix_lens}
        prefixes = {f"p{p}_" + word[:p] for p in self.suffix_lens}

        # Return as a set
        return n_grams | suffixes | prefixes

    # Get empirical risk with hinge loss
    def _get_risk(self, w):
        w = w.reshape(-1, 1)
        return ((w.T @ w) 
                + (self.C * np.maximum(0, 1 - (self.y * (self.X @ w))).sum())
                )[0,0]
    
    # Get subgradient with hinge loss
    def _get_gradient(self, w):
        w = w.reshape(-1, 1)

        # Get indices contributing to the sub gradient
        ixs =  (self.y * (self.X @ w) < 1).flatten()

        # Compute and return gradient
        return ((2 * w) 
                + (self.C * 
                   (-self.y[ixs] * self.X[ixs]).sum(axis = 0)).reshape(-1,1))
    
    # Defines loss function for L-BFGS-B optimization
    def _loss_fn(self, w):
        return self._get_risk(w), self._get_gradient(w)


In [91]:
class SVM_Bagging:

    def __init__(self, ngram_lens, suffix_lens, prefix_lens, num_features, C, 
                 n_bags, feature_selection = "entropy"):
        """ Initialize the model with the given hyperparameters.

        Args:
            ngram_lens (list): The n-gram lengths to use.
            suffix_lens (list): The suffix lenghts to use.
            prefix_lens (list): The prefix lengths to use.
            num_features (int): The number of features to choose.
            C (int): The slack parameter for SVM.
        """

        self.ngrams_lens = ngram_lens
        self.suffix_lens = suffix_lens
        self.prefix_lens = prefix_lens
        self.num_features = num_features
        self.C = C
        self.feature_selection = feature_selection
        self.n_bags = n_bags
        

    def fit(self, df):
        """_summary_

        Args:
            df (DataFrame): Dataframe of words and labels. Dataframe should not
            be reindexed from initialization.
        """

        hyperparams = {
            'ngram_lens' : self.ngrams_lens, 
            'suffix_lens': self.suffix_lens, 
            'prefix_lens': self.prefix_lens, 
            'num_features': self.num_features, 
            'C' : self.C,
            'feature_selection' : self.feature_selection
        }

        self.models = [SVM(**hyperparams) for i in range(self.n_bags)]

        for i in range(self.n_bags):
            bootstrap_sample = df.sample(frac=.75, replace=True)
            self.models[i].fit(bootstrap_sample)

    def predict(self, df):

        pedictions = pd.concat(
            [self.models[i].predict(df, bagging=True) for i in range(self.n_bags)],
                axis = 1                  
        )

        # Average the decision values across bags
        decision_values = pedictions.mean(axis=1)

        # Mapping of decision values
        mapping = {1:"french", -1:"spanish", 0:"french"}
            
        return pd.Series(np.sign(decision_values)).map(mapping)
        
   
    
    # Compute score on a dataframe of words and labels
    def score(self, df, method = 'accuracy'):

        if method == 'accuracy':
 
            predictions = self.predict(df).reset_index(drop=True)  # Reset index to match df
            true_labels = df["label"].reset_index(drop=True)

            #predictions, true_labels = predictions.align(true_labels, axis=1, copy=False)

            return (predictions == true_labels).mean()

In [92]:
class KernelSVM:
    
    def __init__(self, ngram_lens, suffix_lens, prefix_lens, num_features, C, gamma, kernel = "rbf"):
        """ Initialize the model with the given hyperparameters.

        Args:
            ngram_lens (list): The n-gram lengths to use.
            suffix_lens (list): The suffix lenghts to use.
            prefix_lens (list): The prefix lengths to use.
            num_features (int): The number of features to choose.
            C (float): The slack parameter for SVM.
            gamma (float): RBF Kernel parameter.
        """

        self.ngrams_lens = ngram_lens
        self.suffix_lens = suffix_lens
        self.prefix_lens = prefix_lens
        self.num_features = num_features
        self.C = C
        self.gamma = gamma
        self.kernel = kernel

        self.columns =  chain.from_iterable(
              [ngram_ixs[n] for n in ngram_lens]
            + [suffix_ixs[s] for s in suffix_lens]
            + [prefix_ixs[p] for p in prefix_lens])

        self.X = None
        self.y = None
        self.w = None
        self.features = None

        self.n = None
        self.d = None
        self.n_fren = None
        self.n_span = None

        self.french_prior = None
        self.spanish_prior = None

        self.french_post = None
        self.spanish_post = None

    def select_features(self):
        """Engineers and selects features using Naive Bayes probabilities.
        """

        # Include the prior in the features selection???
        # Compute prior probabilities
        self.french_prior = np.log(np.mean(self.y == 1))
        self.spanish_prior= np.log(np.mean(self.y == -1))

        # Compute posterior distributions with laplace smoothing
        self.french_post = ((self.X[self.y==1].sum(axis=0) + 1) 
                                   / (self.n_fren + 2))
        self.spanish_post= ((self.X[self.y==-1].sum(axis=0) + 1) 
                                   / (self.n_span + 2))
        
        # Choose the most expressive features
        if self.num_features < self.d:

            # Find most expressive features
            best_feat = np.abs((self.french_post - self.spanish_post)
                              / (self.french_post + self.spanish_post)
                            ).sort_values(ascending=False)

            # Choose most expressive words for features
            self.features = best_feat[:self.num_features].index

        else:
            self.features = self.X.columns

        # Compute log probabilites of the posteriors
        self.french_post = np.log(self.french_post[self.features])
        self.spanish_post = np.log(self.spanish_post[self.features])

        self.X = self.X[self.features]
        self.d = self.X.shape[1]

    def fit(self, df):
        """_summary_

        Args:
            df (DataFrame): Dataframe of words and labels. Dataframe should not
            be reindexed from initialization.
        """
        self.X = X.iloc[df.index, self.columns]
        self.y = y.iloc[df.index]
        self.n, self.d = self.X.shape
        self.n_fren = (self.y ==  1).sum()
        self.n_span = (self.y == -1).sum()
   
        self.select_features()

        # Format as matrices
        self.X = np.array(self.X)
        self.y = np.array(self.y).reshape(-1,1)
        alpha_0 = np.zeros(self.n)

        # Compute Kernel matrix
        self.K = self.get_kernel_matrix()

        # Optimize α using L-BFGS-B
        result = minimize(
            self._loss_fn, 
            alpha_0, 
            method="L-BFGS-B", 
            jac=True, 
            bounds=[(0, self.C) for _ in range(self.n)],  # 0 ≤ α ≤ C
            options={"maxiter": 10000}
        )

        if not result.success:
            print("Optimization failed: ", result.message)

        # Store optimized α values
        self.alpha = result.x

        # Store support vectors (where α > 0)
        self.support_vector_ixs = np.where(self.alpha > 1e-5)[0]
        self.support_vectors = self.X[self.support_vector_ixs]
        self.support_alpha = self.alpha[self.support_vector_ixs]
        self.support_y = self.y[self.support_vector_ixs]    
    
    def predict(self, df):
        """Predicts labels for new data using the trained kernel SVM.
        
        Args:
            X_new (ndarray): New feature matrix (num_samples, num_features)
        
        Returns:
            y_pred (ndarray): Predicted labels (-1 or 1)
        """
        # Encode words into feature matrix
        X = self._encode(df["word"])

        # Compute Kernel matrix between new samples and support vectors
        K_new = self.eval_kernel_matrix(np.array(X), np.array(self.support_vectors))

        # Compute decision function: f(x) = sum(α_i * y_i * K(x_i, x))
        decision_values = np.sign(
            np.dot(self.support_alpha * self.support_y.flatten(), K_new.T)
            )

        # Mapping of decision values
        mapping = {1:"french", -1:"spanish", 0:"french"}
        
        return pd.Series(decision_values).map(mapping)
    
    # Compute score on a dataframe of words and labels
    def score(self, df, method = 'accuracy'):

        if method == 'accuracy':
            pred = self.predict(df).reset_index(drop=True) 
            actual = df["label"].reset_index(drop=True)

            return (pred == actual).mean()
    
    def get_kernel_matrix(self):

        if self.kernel == "rbf":

            # Intermediate squares matrix
            X2 = np.sum(self.X**2, axis=1)

            # Compute squared distances
            sq_dists = (X2 + X2.reshape(-1, 1) - 2 * np.dot(self.X, self.X.T))
            
            # Return Gaussian RBF
            return np.exp(-self.gamma * sq_dists)
        
        elif self.kernel == "hamming":

            # Compute Hamming distance matrix (normalized by number of features)
            D = np.sum(self.X[:, None, :] != self.X[None, :, :], axis=2)

            # Apply exponential transformation
            return np.exp(-self.gamma * D)
        
        elif self.kernel == "polynomial":

            # Compute K matrix using dot product
            K = np.dot(self.X, self.X.T)
            
            # Apply polynomial transformation
            return (K + 1) ** 2   
        
        else: print("Invalid Kernel")

    def eval_kernel_matrix(self, X1, X2):
        
        if self.kernel == "rbf":

            # Compute squared norms
            X1_sq = np.sum(X1**2, axis=1)[:, None]  # Shape: (num_samples, 1)
            X2_sq = np.sum(X2**2, axis=1)[None, :]  # Shape: (1, num_support_vectors)
            
            # Compute squared distances
            sq_dists = X1_sq + X2_sq - 2 * np.dot(X1, X2.T)

            # Return Gaussian RBF
            return np.exp(-self.gamma * sq_dists)

        
        elif self.kernel == "hamming":

            # Compute Hamming distance matrix
            D = np.sum(X1[:, None, :] != X2[None, :, :], axis=2)

            # Apply exponential transformation
            return np.exp(-self.gamma * D)
        
        elif self.kernel == "polynomial":

            # Compute dot product
            K = np.dot(X1, X2.T)

            # Apply polynomial transformation
            return (K + 1) ** 2

    
    def _loss_fn(self, alpha):
        """Computes the dual objective function and its gradient.

        Args:
            alpha (ndarray): Lagrange multipliers (shape: n_samples,)

        Returns:
            tuple: (dual loss, gradient)
        """
        # Compute dual loss: 0.5 * α^T K α - sum(α)
        loss = 0.5 * np.dot(alpha, self.K @ alpha) - np.sum(alpha)

        # Compute gradient: K α - 1
        grad = self.K @ alpha - np.ones(self.n)

        return loss, grad
    
    def _encode(self, words):

        self.X_encode = pd.DataFrame(np.zeros((len(words), self.d), np.uint8), columns=self.features)

        word_features = {
            i: self.features.intersection(self._extract_features(word)) 
            for i, word in enumerate(words)
        }

        for index, feature_list in word_features.items():
            self.X_encode.loc[index, feature_list] = 1 

        return self.X_encode.values
    
    # Gets n-grams from a word, does not include first and last letter
    def _extract_features(self, word):

        # Generate n-grams
        n_grams = {
            f"{n}_" + word[i:i+n]
            for n in self.ngrams_lens
            for i in range(1, len(word) - n) if n <= len(word)
        }

        # Generate suffixes and prefixes
        suffixes = {f"s{s}_" + word[-s:] for s in self.suffix_lens}
        prefixes = {f"p{p}_" + word[:p] for p in self.suffix_lens}

        # Return as a set
        return n_grams | suffixes | prefixes



In [93]:
class KernelSVM_Bagging:

    def __init__(self, ngram_lens, suffix_lens, prefix_lens, num_features, C, gamma, n_bags=15):
        """ Initialize the model with the given hyperparameters.

        Args:
            ngram_lens (list): The n-gram lengths to use.
            suffix_lens (list): The suffix lenghts to use.
            prefix_lens (list): The prefix lengths to use.
            num_features (int): The number of features to choose.
            C (int): The slack parameter for SVM.
        """

        self.ngrams_lens = ngram_lens
        self.suffix_lens = suffix_lens
        self.prefix_lens = prefix_lens
        self.num_features = num_features
        self.C = C
        self.gamma = gamma
        self.n_bags = n_bags

    def fit(self, df):
        """_summary_

        Args:
            df (DataFrame): Dataframe of words and labels. Dataframe should not
            be reindexed from initialization.
        """

        hyperparams = {
            'ngram_lens' : self.ngrams_lens, 
            'suffix_lens': self.suffix_lens, 
            'prefix_lens': self.prefix_lens, 
            'num_features': self.num_features, 
            'C' : self.C,
            'gamma': self.gamma
        }

        self.models = [KernelSVM(**hyperparams) for i in range(self.n_bags)]

        for i in range(self.n_bags):
            bootstrap_sample = df.sample(frac=1.0, replace=True)
            self.models[i].fit(bootstrap_sample)

    def predict(self, df):

        pedictions = pd.concat(
            [self.models[i].predict(df) for i in range(self.n_bags)],
                axis = 1                  
        )
        return pedictions.mode(axis=1)[0]
    
    # Compute score on a dataframe of words and labels
    def score(self, df, method = 'accuracy'):

        if method == 'accuracy':
 
            predictions = self.predict(df).reset_index(drop=True)  # Reset index to match df
            true_labels = df["label"].reset_index(drop=True)

            #predictions, true_labels = predictions.align(true_labels, axis=1, copy=False)

            return (predictions == true_labels).mean()

In [94]:
class NB_SVM:

    def __init__(self, ngram_lens, suffix_lens, prefix_lens, num_features, C):
        """ Initialize the model with the given hyperparameters.

        Args:
            ngram_lens (list): The n-gram lengths to use.
            suffix_lens (list): The suffix lenghts to use.
            prefix_lens (list): The prefix lengths to use.
            num_features (int): The number of features to choose.
            C (int): The slack parameter for SVM.
        """

        self.ngrams_lens = ngram_lens
        self.suffix_lens = suffix_lens
        self.prefix_lens = prefix_lens
        self.num_features = num_features
        self.C = C

        self.columns =  chain.from_iterable(
              [ngram_ixs[n] for n in ngram_lens]
            + [suffix_ixs[s] for s in suffix_lens]
            + [prefix_ixs[p] for p in prefix_lens])

        self.X = None
        self.y = None
        self.w = None
        self.features = None

        self.n = None
        self.d = None
        self.n_fren = None
        self.n_span = None

        self.french_prior = None
        self.spanish_prior = None

        self.french_post = None
        self.spanish_post = None
        self.importance_map = None


    def select_features(self):
        """Engineers and selects features using Naive Bayes probabilities.
        """

        # Include the prior in the features selection???
        # Compute prior probabilities
        self.french_prior = np.log(np.mean(self.y == 1))
        self.spanish_prior= np.log(np.mean(self.y == -1))

        # Compute posterior distributions with laplace smoothing
        self.french_post = ((self.X[self.y==1].sum(axis=0) + 1) 
                                   / (self.n_fren + 2))
        self.spanish_post= ((self.X[self.y==-1].sum(axis=0) + 1) 
                                   / (self.n_span + 2))
        
        # Choose the most expressive features
        if self.num_features < self.d:

            # Find most expressive features
            best_feat = np.abs((self.french_post - self.spanish_post)
                              / (self.french_post + self.spanish_post)
                            ).sort_values(ascending=False)

            # Choose most expressive words for features
            self.features = best_feat[:self.num_features].index

        else:
            self.features = self.X.columns

        # Compute the importance of each feature
        self.importance_map = np.abs(
            (self.french_post[self.features] - self.spanish_post[self.features])
          / (self.french_post[self.features] + self.spanish_post[self.features])
        )

        # Compute log probabilites of the posteriors
        self.french_post = np.log(self.french_post[self.features])
        self.spanish_post = np.log(self.spanish_post[self.features])
        
        self.X = self.X[self.features].multiply(self.importance_map, axis = 1)
        self.d = self.X.shape[1]

    def fit(self, df):
        """_summary_

        Args:
            df (DataFrame): Dataframe of words and labels. Dataframe should not
            be reindexed from initialization.
        """
        self.X = X.iloc[df.index, self.columns]
        self.y = y.iloc[df.index]
        self.n, self.d = self.X.shape
        self.n_fren = (self.y ==  1).sum()
        self.n_span = (self.y == -1).sum()
   
        self.select_features()

        # Format as matrices
        self.X = np.array(self.X)
        self.y = np.array(self.y).reshape(-1,1)
        w_0 = np.zeros(self.d)
        
        # Optimize using L-BFGS-B (gradient descent is too slow)
        result = minimize(self._loss_fn, w_0, method='L-BFGS-B', jac=True, options={"maxiter": 10000})

        if not result.success:
            print("Optimize failed:")
            print(result.message)
            
        # Optimized weights
        self.w = result.x

    def predict(self,df):

        # Encode words into feature matrix
        X = self._encode(df["word"])

        # Compute decision values
        decision_values = np.sign(X @ self.w)  # Matrix-vector multiplication

        # Mapping of decision values
        mapping = {1:"french", -1:"spanish", 0:"french"}
        
        return pd.Series(decision_values).map(mapping)
    
    # Compute score on a dataframe of words and labels
    def score(self, df, method = 'accuracy'):

        if method == 'accuracy':
            return (self.predict(df) == df["label"].reset_index(drop=True)).mean()
        
    def _encode(self, words):

        self.X_encode = pd.DataFrame(np.zeros((len(words), self.d), np.uint8), columns=self.features)

        word_features = {
            i: self.features.intersection(self._extract_features(word)) 
            for i, word in enumerate(words)
        }

        for index, feature_list in word_features.items():
            self.X_encode.loc[index, feature_list] = 1

        importance_map = self.importance_map[self.X_encode.columns] 
        self.X_encode.multiply(importance_map, axis = 1)

        return self.X_encode.values
            
        
    # Gets n-grams from a word, does not include first and last letter
    def _extract_features(self, word):

        # Generate n-grams
        n_grams = {
            f"{n}_" + word[i:i+n]
            for n in self.ngrams_lens
            for i in range(1, len(word) - n) if n <= len(word)
        }

        # Generate suffixes and prefixes
        suffixes = {f"s{s}_" + word[-s:] for s in self.suffix_lens}
        prefixes = {f"p{p}_" + word[:p] for p in self.suffix_lens}

        # Return as a set
        return n_grams | suffixes | prefixes

    # Get empirical risk with hinge loss
    def _get_risk(self, w):
        w = w.reshape(-1, 1)
        return ((w.T @ w) 
                + (self.C * np.maximum(0, 1 - (self.y * (self.X @ w))).sum())
                )[0,0]
    
    # Get subgradient with hinge loss
    def _get_gradient(self, w):
        w = w.reshape(-1, 1)

        # Get indices contributing to the sub gradient
        ixs =  (self.y * (self.X @ w) < 1).flatten()

        # Compute and return gradient
        return ((2 * w) 
                + (self.C * 
                   (-self.y[ixs] * self.X[ixs]).sum(axis = 0)).reshape(-1,1))
    
    # Defines loss function for L-BFGS-B optimization
    def _loss_fn(self, w):
        return self._get_risk(w), self._get_gradient(w)


In [95]:
class DecisionTree:

    class Node:

        def __init__(self, tree, sorts, depth):
            
            self.tree = tree
            self.sorts = sorts
            self.depth = depth
            self.y = self.tree.y[sorts[:,0]]
            self.n = self.y.shape[0]

            self.threshold = None
            self.feature = None

            self.left = None
            self.right = None
            
            if (depth < self.tree.max_depth and np.unique(self.y).shape[0] > 1):

                self.is_splitable = True

            else: 
                self.is_splitable = False
                y_counts = np.unique(self.y, return_counts=True)
                self.predict = y_counts[0][np.argmax(y_counts[1])]
        
        def compute_uncertainty(self, labels1, labels2):      
            
            if self.tree.criterion == 'gini': 

                counts1 = np.unique(labels1.values, return_counts=True)
                counts2 = np.unique(labels2.values, return_counts=True)

                n1 = counts1[1].sum()
                n2 = counts2[1].sum()

                p1 = counts1[1] / n1
                p2 = counts2[1] / n2

                G1 = 1 - (p1 ** 2).sum()
                G2 = 1 - (p2 ** 2).sum()

                G = ((n1 * G1) + (n2 * G2)) / (n1 + n2)

                return G

            elif self.tree.criterion == 'entropy':

                counts1 = np.unique(labels1, return_counts=True)
                counts2 = np.unique(labels2, return_counts=True)

                n1 = counts1[1].sum()
                n2 = counts2[1].sum()

                p1 = counts1[1] / n1
                p2 = counts2[1] / n2

                S1 = -(p1 * np.log(p1)).sum()
                S2 = -(p2 * np.log(p2)).sum()

                S = ((n1 * S1) + (n2 * S2)) / (n1 + n2)
                
                return S
            
            else:
                print("Invalid uncertainty selection.")

        def split(self):

            entropies = self.X.apply(lambda col: 
                                 self.compute_uncertainty(self.y[col == 1],
                                                          self.y[col == -1]))            
            print(f"Entropies: {entropies}")
            X_sorted = self.tree.X[self.sorts, range(self.tree.d)]
            dec_bounds = (X_sorted[:-1] + X_sorted[1:]) / 2
        
            uncertainties = np.empty_like(dec_bounds)

            for j in np.arange(self.tree.d):
                for i in np.arange(self.n - 1):

                        uncertainty = self.compute_uncertainty(
                                self.tree.y[self.sorts[:i+1, j]],
                                self.tree.y[self.sorts[i+1:, j]])
                        
                        uncertainties[i,j] = uncertainty

            i_best, j_best = np.unravel_index(
                np.argmin(uncertainties), uncertainties.shape
                )
            
            self.threshold = dec_bounds[i_best,j_best]
            self.feature = j_best

            left_idxs = self.sorts[:i_best + 1, self.feature]
            right_idxs = self.sorts[i_best +1:, self.feature]

            lookup = np.zeros(X.shape[0], dtype= bool)
            lookup[left_idxs] = True

            mask_left = lookup[self.sorts]
            mask_right = ~mask_left

            sorts_left = (
                self.sorts.T[mask_left.T]
                .reshape(len(left_idxs), -1, order='F')
            )
            
            sorts_right = (
                self.sorts.T[mask_right.T]
                .reshape(len(right_idxs), -1, order='F')
            )

            self.left = self.tree.Node(self.tree, sorts_left, self.depth + 1)
            self.right = self.tree.Node(self.tree, sorts_right, self.depth +1)
                    
    def __init__(self, ngram_lens, suffix_lens, prefix_lens, num_features, 
                 feature_selection = 'entropy', max_depth = 1000, 
                 criterion = 'entropy'):

        self.ngrams_lens = ngram_lens
        self.suffix_lens = suffix_lens
        self.prefix_lens = prefix_lens
        self.num_features = num_features
        self.feature_selection = feature_selection
        
        self.max_depth = max_depth
        self.criterion = criterion

        self.X = None
        self.y = None
        self.n = None
        self.d = None
        self.features = None

        self.sorts = None
        self.root = None

        self.columns =  chain.from_iterable(
              [ngram_ixs[n] for n in ngram_lens]
            + [suffix_ixs[s] for s in suffix_lens]
            + [prefix_ixs[p] for p in prefix_lens]
        )
    
    def select_features(self):
        """Engineers and selects features using Naive Bayes probabilities.
        """

        # If there are less than the max features allowed, use all features
        if self.d <= self.num_features:
            print("Using all features")
            self.features = self.columns


        # Selects features by absolute difference in probability 
        elif self.feature_selection == "probability":

            n_fren = (self.y ==  1).sum()
            n_span = (self.y == -1).sum()

            # Compute posterior distributions with laplace smoothing
            french_post = ((self.X[self.y==1].sum(axis=0) + 1) 
                                    / (n_fren + 2))
            spanish_post= ((self.X[self.y==-1].sum(axis=0) + 1) 
                                    / (n_span + 2))

            # Find most expressive features
            best_feat = np.abs((french_post - spanish_post)
                            / (french_post + spanish_post)
                            ).sort_values(ascending=False)

        # Selects features by absolute minimal entropy in the class labels
        elif self.feature_selection == "entropy":

            # Reformat data
            X_vals = self.X.values
            y_vals = self.y.values.reshape(-1,1)

            # Count of 1's and 0's for each feature
            n1 = X_vals.sum(axis=0)
            n0 = X_vals.shape[0] - n1

            # Masks for where each feature is 1 and 0
            mask1 = X_vals == 1
            mask0 = X_vals == 0

            # Computes probability of labels when features are 1's
            p_fren_x1 = (((y_vals * mask1) == 1).sum(axis = 0) + 1) / (n1 + 2)
            p_span_x1 = (((y_vals * mask1) == -1).sum(axis = 0) + 1) / (n1 + 2)

            # Computes probability of labels when features are 0's
            p_fren_x0 = (((y_vals * mask0) == 1).sum(axis = 0) + 1) / (n0 + 2)
            p_span_x0 = (((y_vals * mask0) == -1).sum(axis = 0) + 1) / (n0 + 2)

            # Compute weighted entropy of the labels given feature values
            S1 = -(
               (p_fren_x1 * np.log(p_fren_x1)) + (p_span_x1 * np.log(p_span_x1))
            )
            S0 = -(
               (p_fren_x0 * np.log(p_fren_x0)) + (p_span_x0 * np.log(p_span_x0))
            )
            S = ((n1 * S1) + (n0 * S0)) / (n1 + n0)

            # Sorts features by minimal entropy 
            best_feat = pd.Series(S, index = self.X.columns).sort_values()

        else:
            print("Invalid feature selection method.")
            return

        # Choose most expressive features
        self.features = best_feat[:self.num_features].index

        self.X = self.X[self.features]
        self.d = self.X.shape[1]
    
    def fit(self, df):

        self.X = X.iloc[df.index, self.columns]
        self.y = y.iloc[df.index]
        self.n, self.d = self.X.shape

        self.select_features()

        # Reformat data and labels as matrices
        self.X = np.array(self.X)
        self.y = np.array(self.y).reshape(-1,1)

        self.sorts = np.argsort(self.X, axis = 0)
        self.root = self.Node(self, self.sorts, 0)
        
        self.build_tree(self.root)

    def build_tree(self, node):

        if not node.is_splitable:

            return
        
        else: 

            node.split()
            self.build_tree(node.left)
            self.build_tree(node.right)

        return
    
    def predict(self, df):

        X = self._encode(df["word"])

        curr_node = self.root

        while curr_node.is_splitable:
            ...

            
        

    def _encode(self, words):

        self.X_encode = pd.DataFrame(np.zeros((len(words), self.d), np.uint8), columns=self.features)

        word_features = {
            i: self.features.intersection(self._extract_features(word)) 
            for i, word in enumerate(words)
        }

        for index, feature_list in word_features.items():
            self.X_encode.loc[index, feature_list] = 1 

        return self.X_encode.values
        
    

In [96]:
def cross_validate(model, df, k = 10, verbose = True, **kwargs):

    # Stratify samples for equal distribution
    french = df[df["label"]=='french'].sample(frac=1)
    spanish = df[df["label"]=='spanish'].sample(frac=1)
    french_indices = np.array_split(french.index, k)
    span_indices = np.array_split(spanish.index, k)
    
    # Get folds
    folds = [
        pd.concat([french.loc[french_indices[i]], spanish.loc[span_indices[i]]], axis=0)
        for i in range(k)
    ]

    # Get training sets
    train_sets = [
        pd.concat([folds[i] for i in range(k) if i != j]) for j in range(k)
    ]

    # Initialize models
    models = [model(**kwargs) for i in range(k)]

    if verbose: print(f"Starting {k} folds: ", end="")
    # Fit and score folds
    scores = []
    for i in range(k):
        models[i].fit(train_sets[i])
        scores.append(models[i].score(folds[i]))
        if verbose and i < k: print(f"{i+1} ... ", end = "")
    if verbose: print("\r" + " " * 100 + "\r", end="")
    return sum(scores) / k

In [97]:
def grid_search_cv(model, grid, df, num_folds = 10, repeats = 1):
    n = len(grid)

    # Array to hold the scores of each fold for each hyperparam set for each repeat
    scores = np.zeros((repeats,n,num_folds))     

    # Array to hold training set scores
    train_scores = np.zeros(n)

    # For each reapeat
    for r in range(repeats): 
        
        # Stratify samples for equal distribution
        french = df[df["label"]=='french'].sample(frac=1)
        spanish = df[df["label"]=='spanish'].sample(frac=1)
        french_indices = np.array_split(french.index, num_folds)
        span_indices = np.array_split(spanish.index, num_folds)
        
        # Get folds
        folds = [
            pd.concat([french.loc[french_indices[i]], spanish.loc[span_indices[i]]], axis=0)
            for i in range(num_folds)
        ]

        # Get training sets
        train_sets = [
            pd.concat([folds[i] for i in range(num_folds) if i != j]) for j in range(num_folds)
        ]

        # For each set of hyperparameters in the grid
        for i, hyperparams in enumerate(grid):

            if r == 0:
                train_model = model(**hyperparams)
                train_model.fit(df)
                train_scores[i] = np.round(train_model.score(df) * 100, 3)

            print(f"Iteration: {((i+1) + (n * r))} of {(n * repeats)}:")
            print(hyperparams)

            # Initialize k models
            models = [model(**hyperparams) for k in range(num_folds)]

            print(f"Starting {num_folds} folds: ", end="")
            # For each fold
            for k in range(num_folds):

                # Fit and score
                models[k].fit(train_sets[k])
                scores[r,i,k] = models[k].score(folds[k])
                print(f"{k+1} ... ", end = "")

            # Store this repeat's score for the given hyperparameter
            cv_score = scores[r,i,:].mean()
            print("\r" + " " * 100 + "\r", end="")
            print(f"cv score: {float(np.round(cv_score * 100, 3))}")
            print()

    mean_scores = pd.Series(np.round(np.mean(scores*100, axis=(0, 2)), 3), name = 'cv_mean')
    std_scores = pd.Series(np.round(np.std(scores*100, axis=(0, 2), ddof=1), 5), name='cv_std')

    results = pd.concat([pd.Series(train_scores).rename("train"),
                         mean_scores, std_scores,
                         pd.DataFrame(grid),
                         pd.Series(grid).rename("hyperparams")], axis = 1)

    return results

In [98]:
def test_model(model, hyperparams, k = 5):
    model_t = model(**hyperparams)
    model_t.fit(df)
    print(f"trainer score : {round(model_t.score(df) * 100, 3)}")
    print(f"test S score  : {round(model_t.score(test)* 100 ,3)}")
    print(f"test L score  : {round(model_t.score(test_large)* 100 ,3)}")
    print(f"cross-v score : {float(round(cross_validate(model, df, k=k, **hyperparams) * 100, 3))}")

In [84]:
hyperparams = {
    'ngram_lens' : ngram_range, 
    'suffix_lens': suffix_range, 
    'prefix_lens': prefix_range, 
    'num_features': 100, 
}
model = DecisionTree(**hyperparams)
model.fit(df)

AttributeError: 'Node' object has no attribute 'X'

In [69]:
model.root.feature

np.int64(74)

In [None]:
hyperparams = {
    'ngram_lens' : ngram_range, 
    'suffix_lens': suffix_range, 
    'prefix_lens': prefix_range, 
    'num_features': 250, 
    'C' : .35,
    'n_bags': 30
}

test_model(SVM_Bagging, hyperparams, k = 10)

trainer score : 93.417
test S score  : 90.857
test L score  : 89.007
cross-v score : 86.083                                                                              


In [103]:
hyperparams = {
    'ngram_lens' : ngram_range, 
    'suffix_lens': suffix_range, 
    'prefix_lens': prefix_range, 
    'num_features': 250, 
    'C' : .35,
    'feature_selection': 'entropy'
}

test_model(SVM, hyperparams, k = 5)

trainer score : 89.333
test S score  : 91.429
test L score  : 87.475
cross-v score : 87.25                                                                               


In [64]:
hyperparams = {
    'ngram_lens' : ngram_range, 
    'suffix_lens': suffix_range, 
    'prefix_lens': prefix_range, 
    'num_features': 700, 
    'C' : 100,
    'gamma': 0,
    'kernel': "rbf"
}

test_model(KernelSVM, hyperparams, k = 10)

trainer score : 50.0
test S score  : 50.286
test L score  : 50.766
cross-v score : 50.0                                                                                


In [658]:
hyperparams = {
    'ngram_lens' : ngram_range, 
    'suffix_lens': suffix_range, 
    'prefix_lens': prefix_range, 
    'num_features': 300, 
    'C' : 1,
    'gamma': 2,
    'n_bags': 31
}

test_model(KernelSVM_Bagging, hyperparams, k = 10)

trainer score : 89.25
test S score  : 91.429
test L score  : 87.209
Starting 10 folds: 

KeyboardInterrupt: 

In [14]:
hyperparams = {
    'ngram_lens' : ngram_range, 
    'suffix_lens': suffix_range, 
    'prefix_lens': prefix_range, 
    'num_features': 400, 
    'C' : 50
}

test_model(NB_SVM, hyperparams, k = 10)

NameError: name 'test_model' is not defined

##### Grid Search 1

In [442]:
num_features = [200, 400, 600, 800, 1000]
Cs = [ 0.001, 0.005, 0.01, 0.05, 0.1, 0.5, 1, 5, 10, 50, 100]

grid = [{
    'num_features': n,
    'C' : c,
    'ngram_lens' : ngram_range, 
    'suffix_lens': suffix_range, 
    'prefix_lens': prefix_range
    } for n in num_features for c in Cs]

grid_search_results = grid_search_cv(SVM, grid, df, num_folds = 5, repeats = 45)

Iteration: 1 of 2475:
{'num_features': 200, 'C': 0.001, 'ngram_lens': range(1, 11), 'suffix_lens': range(1, 6), 'prefix_lens': range(1, 6)}
cv score: 85.167                                  

Iteration: 2 of 2475:
{'num_features': 200, 'C': 0.005, 'ngram_lens': range(1, 11), 'suffix_lens': range(1, 6), 'prefix_lens': range(1, 6)}
cv score: 85.083                                  

Iteration: 3 of 2475:
{'num_features': 200, 'C': 0.01, 'ngram_lens': range(1, 11), 'suffix_lens': range(1, 6), 'prefix_lens': range(1, 6)}
cv score: 85.0                                    

Iteration: 4 of 2475:
{'num_features': 200, 'C': 0.05, 'ngram_lens': range(1, 11), 'suffix_lens': range(1, 6), 'prefix_lens': range(1, 6)}
cv score: 85.583                                  

Iteration: 5 of 2475:
{'num_features': 200, 'C': 0.1, 'ngram_lens': range(1, 11), 'suffix_lens': range(1, 6), 'prefix_lens': range(1, 6)}
cv score: 85.75                                   

Iteration: 6 of 2475:
{'num_features': 200, 

In [443]:
grid_df = pd.DataFrame(grid_search_results)
grid_df = (grid_df
           .drop(columns=["ngram_lens", "suffix_lens","prefix_lens"])
           .sort_values(by="cv", ascending= False))
grid_df


Unnamed: 0,cv,num_features,C,hyperparams
49,86.313,1000,0.5,"{'num_features': 1000, 'C': 0.5, 'ngram_lens':..."
16,86.28,400,0.5,"{'num_features': 400, 'C': 0.5, 'ngram_lens': ..."
48,86.259,1000,0.1,"{'num_features': 1000, 'C': 0.1, 'ngram_lens':..."
38,86.254,800,0.5,"{'num_features': 800, 'C': 0.5, 'ngram_lens': ..."
27,86.228,600,0.5,"{'num_features': 600, 'C': 0.5, 'ngram_lens': ..."
17,86.18,400,1.0,"{'num_features': 400, 'C': 1, 'ngram_lens': (1..."
37,86.178,800,0.1,"{'num_features': 800, 'C': 0.1, 'ngram_lens': ..."
26,86.178,600,0.1,"{'num_features': 600, 'C': 0.1, 'ngram_lens': ..."
28,86.093,600,1.0,"{'num_features': 600, 'C': 1, 'ngram_lens': (1..."
50,86.063,1000,1.0,"{'num_features': 1000, 'C': 1, 'ngram_lens': (..."


In [453]:
gsr = grid_search_results.copy()
gsr["C"] = grid_search_results["C"].astype(str)
gsr["num_features"] = grid_search_results["num_features"]
heatmap_data = gsr.pivot(index="num_features", columns="C", values="cv")
fig = px.imshow(
    heatmap_data, 
    labels=dict(x="C (Regularization)", y="Number of Features", color="CV Score"),
    color_continuous_scale="Viridis",  # Color options: 'Blues', 'Inferno', 'Magma'
    aspect="auto"
)

# Add titles and format layout
fig.update_layout(
    title="Hyperparameter Heatmap (C vs. Num Features)",
    xaxis_title="C (Regularization Parameter)",
    yaxis_title="Number of Features",
)

fig.show()


##### Grid Search 2

In [66]:
num_features = [500]
Cs = [ 0.1, 0.5]

grid = [{
    'num_features': n,
    'C' : c,
    'ngram_lens' : ngram_range, 
    'suffix_lens': suffix_range, 
    'prefix_lens': prefix_range
    } for n in num_features for c in Cs]

grid_search_results = grid_search_cv(SVM, grid, df, num_folds = 5, repeats = 1)

Iteration: 1 of 2:
{'num_features': 500, 'C': 0.1, 'ngram_lens': range(1, 11), 'suffix_lens': range(1, 6), 'prefix_lens': range(1, 6)}
cv score: 86.917                                  

Iteration: 2 of 2:
{'num_features': 500, 'C': 0.5, 'ngram_lens': range(1, 11), 'suffix_lens': range(1, 6), 'prefix_lens': range(1, 6)}
cv score: 86.583                                  



In [457]:
grid_df = pd.DataFrame(grid_search_results)
grid_df = (grid_df
           .drop(columns=["ngram_lens", "suffix_lens","prefix_lens"])
           .sort_values(by="cv", ascending= False))
grid_df


Unnamed: 0,cv,num_features,C,hyperparams
1,86.231,1200,0.5,"{'num_features': 1200, 'C': 0.5, 'ngram_lens':..."
0,86.211,1200,0.1,"{'num_features': 1200, 'C': 0.1, 'ngram_lens':..."


##### Grid Search 3

In [458]:
num_features = [900,1000,1100]
Cs = [0.25 ,0.5, 0.75]

grid = [{
    'num_features': n,
    'C' : c,
    'ngram_lens' : ngram_range, 
    'suffix_lens': suffix_range, 
    'prefix_lens': prefix_range
    } for n in num_features for c in Cs]

grid_search_results = grid_search_cv(SVM, grid, df, num_folds = 10, repeats = 30)

Iteration: 1 of 270:
{'num_features': 900, 'C': 0.25, 'ngram_lens': range(1, 11), 'suffix_lens': range(1, 6), 'prefix_lens': range(1, 6)}
cv score: 87.167                                   ... 7 ... 8 ... 9 ... 10 ... 

Iteration: 2 of 270:
{'num_features': 900, 'C': 0.5, 'ngram_lens': range(1, 11), 'suffix_lens': range(1, 6), 'prefix_lens': range(1, 6)}
cv score: 87.333                                   ... 7 ... 8 ... 9 ... 10 ... 

Iteration: 3 of 270:
{'num_features': 900, 'C': 0.75, 'ngram_lens': range(1, 11), 'suffix_lens': range(1, 6), 'prefix_lens': range(1, 6)}
cv score: 87.083                                   ... 7 ... 8 ... 9 ... 10 ... 

Iteration: 4 of 270:
{'num_features': 1000, 'C': 0.25, 'ngram_lens': range(1, 11), 'suffix_lens': range(1, 6), 'prefix_lens': range(1, 6)}
cv score: 87.333                                   ... 7 ... 8 ... 9 ... 10 ... 

Iteration: 5 of 270:
{'num_features': 1000, 'C': 0.5, 'ngram_lens': range(1, 11), 'suffix_lens': range(1, 6), 'prefix_le

In [460]:
grid_df = pd.DataFrame(grid_search_results)
grid_df = (grid_df
           .drop(columns=["ngram_lens", "suffix_lens","prefix_lens"])
           .sort_values(by="cv", ascending= False))
grid_df

Unnamed: 0,cv,num_features,C,hyperparams
6,86.428,1100,0.25,"{'num_features': 1100, 'C': 0.25, 'ngram_lens'..."
7,86.428,1100,0.5,"{'num_features': 1100, 'C': 0.5, 'ngram_lens':..."
3,86.419,1000,0.25,"{'num_features': 1000, 'C': 0.25, 'ngram_lens'..."
4,86.378,1000,0.5,"{'num_features': 1000, 'C': 0.5, 'ngram_lens':..."
0,86.361,900,0.25,"{'num_features': 900, 'C': 0.25, 'ngram_lens':..."
1,86.317,900,0.5,"{'num_features': 900, 'C': 0.5, 'ngram_lens': ..."
2,86.272,900,0.75,"{'num_features': 900, 'C': 0.75, 'ngram_lens':..."
5,86.233,1000,0.75,"{'num_features': 1000, 'C': 0.75, 'ngram_lens'..."
8,86.217,1100,0.75,"{'num_features': 1100, 'C': 0.75, 'ngram_lens'..."


In [525]:
with open("gsr.txt", "r") as file:
    lines = file.readlines()
cv_scores = [float(line[10:16]) for line in lines if line[:2] == "cv"]
all_ser = pd.concat([pd.Series([cv_scores[j+i] for i in range(0,len(cv_scores)-j, 9)]) for j in range(9)], axis = 1)
vars = all_ser.std(axis = 0)

In [528]:
grid_search_results

Unnamed: 0,cv,num_features,C,ngram_lens,suffix_lens,prefix_lens,hyperparams
0,86.361,900,0.25,"(1, 2, 3, 4, 5, 6, 7, 8, 9, 10)","(1, 2, 3, 4, 5)","(1, 2, 3, 4, 5)","{'num_features': 900, 'C': 0.25, 'ngram_lens':..."
1,86.317,900,0.5,"(1, 2, 3, 4, 5, 6, 7, 8, 9, 10)","(1, 2, 3, 4, 5)","(1, 2, 3, 4, 5)","{'num_features': 900, 'C': 0.5, 'ngram_lens': ..."
2,86.272,900,0.75,"(1, 2, 3, 4, 5, 6, 7, 8, 9, 10)","(1, 2, 3, 4, 5)","(1, 2, 3, 4, 5)","{'num_features': 900, 'C': 0.75, 'ngram_lens':..."
3,86.419,1000,0.25,"(1, 2, 3, 4, 5, 6, 7, 8, 9, 10)","(1, 2, 3, 4, 5)","(1, 2, 3, 4, 5)","{'num_features': 1000, 'C': 0.25, 'ngram_lens'..."
4,86.378,1000,0.5,"(1, 2, 3, 4, 5, 6, 7, 8, 9, 10)","(1, 2, 3, 4, 5)","(1, 2, 3, 4, 5)","{'num_features': 1000, 'C': 0.5, 'ngram_lens':..."
5,86.233,1000,0.75,"(1, 2, 3, 4, 5, 6, 7, 8, 9, 10)","(1, 2, 3, 4, 5)","(1, 2, 3, 4, 5)","{'num_features': 1000, 'C': 0.75, 'ngram_lens'..."
6,86.428,1100,0.25,"(1, 2, 3, 4, 5, 6, 7, 8, 9, 10)","(1, 2, 3, 4, 5)","(1, 2, 3, 4, 5)","{'num_features': 1100, 'C': 0.25, 'ngram_lens'..."
7,86.428,1100,0.5,"(1, 2, 3, 4, 5, 6, 7, 8, 9, 10)","(1, 2, 3, 4, 5)","(1, 2, 3, 4, 5)","{'num_features': 1100, 'C': 0.5, 'ngram_lens':..."
8,86.217,1100,0.75,"(1, 2, 3, 4, 5, 6, 7, 8, 9, 10)","(1, 2, 3, 4, 5)","(1, 2, 3, 4, 5)","{'num_features': 1100, 'C': 0.75, 'ngram_lens'..."


In [526]:
gsr = grid_search_results.copy()
gsr["cv"] = gsr["cv"] - vars
gsr["C"] = grid_search_results["C"].astype(str)
gsr["num_features"] = grid_search_results["num_features"]
heatmap_data = gsr.pivot(index="num_features", columns="C", values="cv")
fig = px.imshow(
    heatmap_data, 
    labels=dict(x="C (Regularization)", y="Number of Features", color="CV Score"),
    color_continuous_scale="Viridis",  # Color options: 'Blues', 'Inferno', 'Magma'
    aspect="auto"
)

# Add titles and format layout
fig.update_layout(
    title="Hyperparameter Heatmap (C vs. Num Features)",
    xaxis_title="C (Regularization Parameter)",
    yaxis_title="Number of Features",
)

fig.show()

### Grid Search 5

In [122]:
num_features = [200, 300, 400, 500, 600, 700, 800]
Cs = [0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8]

grid = [{
    'num_features': n,
    'C' : c,
    'ngram_lens' : ngram_range, 
    'suffix_lens': suffix_range, 
    'prefix_lens': prefix_range
    } for n in num_features for c in Cs]

grid_search_results = grid_search_cv(SVM, grid, df, num_folds = 10, repeats = 30)

Iteration: 1 of 1470:
{'num_features': 200, 'C': 0.2, 'ngram_lens': range(1, 11), 'suffix_lens': range(1, 6), 'prefix_lens': range(1, 6)}
cv score: 85.75                                    ... 7 ... 8 ... 9 ... 10 ... 

Iteration: 2 of 1470:
{'num_features': 200, 'C': 0.3, 'ngram_lens': range(1, 11), 'suffix_lens': range(1, 6), 'prefix_lens': range(1, 6)}
cv score: 85.75                                    ... 7 ... 8 ... 9 ... 10 ... 

Iteration: 3 of 1470:
{'num_features': 200, 'C': 0.4, 'ngram_lens': range(1, 11), 'suffix_lens': range(1, 6), 'prefix_lens': range(1, 6)}
cv score: 86.0                                     ... 7 ... 8 ... 9 ... 10 ... 

Iteration: 4 of 1470:
{'num_features': 200, 'C': 0.5, 'ngram_lens': range(1, 11), 'suffix_lens': range(1, 6), 'prefix_lens': range(1, 6)}
cv score: 85.75                                    ... 7 ... 8 ... 9 ... 10 ... 

Iteration: 5 of 1470:
{'num_features': 200, 'C': 0.6, 'ngram_lens': range(1, 11), 'suffix_lens': range(1, 6), 'prefix_le

In [123]:
# Save Grid Search
grid_df = pd.DataFrame(grid_search_results)
grid_df.to_csv('grid_search_results_4.csv')

In [136]:
grid_search_results.sort_values(by='cv_mean', ascending = False)[["train", "cv_mean", "cv_std", "C", "num_features"]]

Unnamed: 0,train,cv_mean,cv_std,C,num_features
44,91.833,86.564,2.79288,0.4,800
42,91.5,86.542,2.79363,0.2,800
46,92.25,86.531,2.82864,0.6,800
45,92.0,86.519,2.81738,0.5,800
47,92.25,86.506,2.83757,0.7,800
48,92.25,86.467,2.87818,0.8,800
35,91.25,86.456,2.79965,0.2,700
27,90.75,86.453,2.80317,0.8,500
43,91.75,86.447,2.81597,0.3,800
20,89.917,86.414,2.74716,0.8,400


In [141]:
gsr = grid_search_results.copy()
gsr["score"] = gsr["cv_mean"] - 1 * gsr["cv_std"]
gsr["C"] = grid_search_results["C"].astype(str)
gsr["num_features"] = grid_search_results["num_features"]
heatmap_data = gsr.pivot(index="num_features", columns="C", values="score")
fig = px.imshow(
    heatmap_data, 
    labels=dict(x="C (Regularization)", y="Number of Features", color="CV Score"),
    color_continuous_scale="Viridis",  # Color options: 'Blues', 'Inferno', 'Magma'
    aspect="auto"
)

# Add titles and format layout
fig.update_layout(
    title="Hyperparameter Heatmap (C vs. Num Features)",
    xaxis_title="C (Regularization Parameter)",
    yaxis_title="Number of Features",
)

fig.show()

### Grid Search 5

In [22]:
num_features = [300, 400, 500, 600, 700, 800, 900]
Cs = [0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9]

grid = [{
    'num_features': n,
    'C' : c,
    'ngram_lens' : ngram_range, 
    'suffix_lens': suffix_range, 
    'prefix_lens': prefix_range,
    'feature_selection': 'entropy'
    } for n in num_features for c in Cs]

grid_search_results = grid_search_cv(SVM, grid, df, num_folds = 10, repeats = 15)

grid_df = pd.DataFrame(grid_search_results)
grid_df.to_csv('grid_search_results_5.csv')

Iteration: 1 of 840:
{'num_features': 300, 'C': 0.2, 'ngram_lens': range(1, 11), 'suffix_lens': range(1, 6), 'prefix_lens': range(1, 6), 'feature_selection': 'entropy'}
cv score: 87.583                                                                                    

Iteration: 2 of 840:
{'num_features': 300, 'C': 0.3, 'ngram_lens': range(1, 11), 'suffix_lens': range(1, 6), 'prefix_lens': range(1, 6), 'feature_selection': 'entropy'}
cv score: 87.583                                                                                    

Iteration: 3 of 840:
{'num_features': 300, 'C': 0.4, 'ngram_lens': range(1, 11), 'suffix_lens': range(1, 6), 'prefix_lens': range(1, 6), 'feature_selection': 'entropy'}
cv score: 87.667                                                                                    

Iteration: 4 of 840:
{'num_features': 300, 'C': 0.5, 'ngram_lens': range(1, 11), 'suffix_lens': range(1, 6), 'prefix_lens': range(1, 6), 'feature_selection': 'entropy'}
cv score: 87.5    

In [31]:
grid_search_results.sort_values(by='cv_mean', ascending = False)[["train", "cv_mean", "cv_std", "C", "num_features"]]

Unnamed: 0,train,cv_mean,cv_std,C,num_features
1,89.417,87.206,3.07157,0.3,300
2,89.583,87.1,3.01735,0.4,300
8,89.917,87.056,2.89749,0.2,400
3,90.0,87.022,2.92739,0.5,300
16,90.0,87.0,2.99608,0.2,500
0,89.0,86.978,2.97821,0.2,300
9,90.167,86.95,3.02369,0.3,400
4,90.333,86.911,2.91334,0.6,300
17,90.5,86.9,2.98692,0.3,500
10,90.667,86.772,3.06269,0.4,400


In [None]:
gsr = grid_search_results.copy()
gsr["score"] = gsr["cv_mean"]
gsr["C"] = grid_search_results["C"].astype(str)
gsr["num_features"] = grid_search_results["num_features"]
heatmap_data = gsr.pivot(index="num_features", columns="C", values="score")
fig = px.imshow(
    heatmap_data, 
    labels=dict(x="C (Regularization)", y="Number of Features", color="CV Score"),
    color_continuous_scale="Viridis",  # Color options: 'Blues', 'Inferno', 'Magma'
    aspect="auto"
)

# Add titles and format layout
fig.update_layout(
    title="Hyperparameter Heatmap (C vs. Num Features)",
    xaxis_title="C (Regularization Parameter)",
    yaxis_title="Number of Features",
)

fig.show()

### Grid Search 6

In [56]:
num_features = [200, 250, 275, 300, 325, 350]
Cs = [0.2, 0.25, 0.3, 0.35, 0.4]

grid = [{
    'num_features': n,
    'C' : c,
    'ngram_lens' : ngram_range, 
    'suffix_lens': suffix_range, 
    'prefix_lens': prefix_range,
    'feature_selection': 'entropy'
    } for n in num_features for c in Cs]

grid_search_results = grid_search_cv(SVM, grid, df, num_folds = 10, repeats = 30)

grid_df = pd.DataFrame(grid_search_results)
grid_df.to_csv('grid_search_results_6.csv')

Iteration: 1 of 900:
{'num_features': 200, 'C': 0.2, 'ngram_lens': range(1, 11), 'suffix_lens': range(1, 6), 'prefix_lens': range(1, 6), 'feature_selection': 'entropy'}
cv score: 86.583                                                                                    

Iteration: 2 of 900:
{'num_features': 200, 'C': 0.25, 'ngram_lens': range(1, 11), 'suffix_lens': range(1, 6), 'prefix_lens': range(1, 6), 'feature_selection': 'entropy'}
cv score: 86.75                                                                                     

Iteration: 3 of 900:
{'num_features': 200, 'C': 0.3, 'ngram_lens': range(1, 11), 'suffix_lens': range(1, 6), 'prefix_lens': range(1, 6), 'feature_selection': 'entropy'}
cv score: 86.667                                                                                    

Iteration: 4 of 900:
{'num_features': 200, 'C': 0.35, 'ngram_lens': range(1, 11), 'suffix_lens': range(1, 6), 'prefix_lens': range(1, 6), 'feature_selection': 'entropy'}
cv score: 86.667

In [57]:
grid_search_results.sort_values(by='cv_mean', ascending = False)[["train", "cv_mean", "cv_std", "C", "num_features"]]

Unnamed: 0,train,cv_mean,cv_std,C,num_features
8,89.333,87.117,2.85297,0.35,250
7,89.167,87.094,2.90318,0.3,250
12,89.25,87.089,2.92233,0.3,275
17,89.417,87.086,2.93106,0.3,300
13,89.417,87.081,2.94056,0.35,275
18,89.5,87.078,2.95866,0.35,300
9,89.5,87.078,2.85642,0.4,250
6,89.083,87.072,2.95079,0.25,250
16,89.417,87.056,2.96168,0.25,300
14,89.583,87.044,2.9474,0.4,275


In [62]:
gsr = grid_search_results.copy()
gsr["score"] = gsr["cv_mean"]
gsr["C"] = grid_search_results["C"].astype(str)
gsr["num_features"] = grid_search_results["num_features"]
heatmap_data = gsr.pivot(index="num_features", columns="C", values="score")
fig = px.imshow(
    heatmap_data, 
    labels=dict(x="C (Regularization)", y="Number of Features", color="CV Score"),
    color_continuous_scale="Viridis",  # Color options: 'Blues', 'Inferno', 'Magma'
    aspect="auto"
)

# Add titles and format layout
fig.update_layout(
    title="Hyperparameter Heatmap (C vs. Num Features)",
    xaxis_title="C (Regularization Parameter)",
    yaxis_title="Number of Features",
    height = 600, width = 700
)

fig.show()

In [66]:
num_features = [250]
Cs = [0.35]

grid = [{
    'num_features': n,
    'C' : c,
    'ngram_lens' : ngram_range, 
    'suffix_lens': suffix_range, 
    'prefix_lens': prefix_range,
    'feature_selection': 'entropy'
    } for n in num_features for c in Cs]

grid_search_results = grid_search_cv(SVM, grid, df, num_folds = 10, repeats = 30)

grid_df = pd.DataFrame(grid_search_results)

Iteration: 1 of 30:
{'num_features': 250, 'C': 0.35, 'ngram_lens': range(1, 11), 'suffix_lens': range(1, 6), 'prefix_lens': range(1, 6), 'feature_selection': 'entropy'}
cv score: 87.583                                                                                    

Iteration: 2 of 30:
{'num_features': 250, 'C': 0.35, 'ngram_lens': range(1, 11), 'suffix_lens': range(1, 6), 'prefix_lens': range(1, 6), 'feature_selection': 'entropy'}
cv score: 87.0                                                                                      

Iteration: 3 of 30:
{'num_features': 250, 'C': 0.35, 'ngram_lens': range(1, 11), 'suffix_lens': range(1, 6), 'prefix_lens': range(1, 6), 'feature_selection': 'entropy'}
cv score: 87.5                                                                                      

Iteration: 4 of 30:
{'num_features': 250, 'C': 0.35, 'ngram_lens': range(1, 11), 'suffix_lens': range(1, 6), 'prefix_lens': range(1, 6), 'feature_selection': 'entropy'}
cv score: 87.083  

In [67]:
grid_df

Unnamed: 0,train,cv_mean,cv_std,num_features,C,ngram_lens,suffix_lens,prefix_lens,feature_selection,hyperparams
0,89.333,87.075,2.67241,250,0.35,"(1, 2, 3, 4, 5, 6, 7, 8, 9, 10)","(1, 2, 3, 4, 5)","(1, 2, 3, 4, 5)",entropy,"{'num_features': 250, 'C': 0.35, 'ngram_lens':..."


In [19]:
num_features = [250]
Cs = [0.35]

grid = [{
    'num_features': n,
    'C' : c,
    'ngram_lens' : ngram_range, 
    'suffix_lens': suffix_range, 
    'prefix_lens': prefix_range,
    'feature_selection': 'entropy',
    } for n in num_features for c in Cs]

grid_search_results = grid_search_cv(SVM, grid, df, num_folds = 10, repeats = 10)

grid_df = pd.DataFrame(grid_search_results)

Iteration: 1 of 10:
{'num_features': 250, 'C': 0.35, 'ngram_lens': range(1, 11), 'suffix_lens': range(1, 6), 'prefix_lens': range(1, 6), 'feature_selection': 'entropy'}
cv score: 87.0                                                                                      

Iteration: 2 of 10:
{'num_features': 250, 'C': 0.35, 'ngram_lens': range(1, 11), 'suffix_lens': range(1, 6), 'prefix_lens': range(1, 6), 'feature_selection': 'entropy'}
cv score: 87.25                                                                                     

Iteration: 3 of 10:
{'num_features': 250, 'C': 0.35, 'ngram_lens': range(1, 11), 'suffix_lens': range(1, 6), 'prefix_lens': range(1, 6), 'feature_selection': 'entropy'}
cv score: 86.75                                                                                     

Iteration: 4 of 10:
{'num_features': 250, 'C': 0.35, 'ngram_lens': range(1, 11), 'suffix_lens': range(1, 6), 'prefix_lens': range(1, 6), 'feature_selection': 'entropy'}
cv score: 87.25   

In [18]:
grid_df

Unnamed: 0,train,cv_mean,cv_std,num_features,C,ngram_lens,suffix_lens,prefix_lens,feature_selection,n_bags,hyperparams
0,89.833,86.375,3.10954,250,0.35,"(1, 2, 3, 4, 5, 6, 7, 8, 9, 10)","(1, 2, 3, 4, 5)","(1, 2, 3, 4, 5)",entropy,30,"{'num_features': 250, 'C': 0.35, 'ngram_lens':..."


In [132]:
# Explore joint distributions
from numba import njit, prange
from numba.typed import List

@njit(parallel=True)
def find_max_min_joint_pair(X):
    n_rows, n_cols = X_subset.shape

    # Shared arrays to store results across parallel threads
    max_vals = np.zeros(n_cols, dtype=np.float64)
    max_i = np.zeros(n_cols, dtype=np.int64)
    max_j = np.zeros(n_cols, dtype=np.int64)

    for i in prange(n_cols):
        local_max_val = -1.0
        local_max_j = -1
        for j in range(i + 1, n_cols):
            counts = np.zeros((2, 2), dtype=np.int32)
            
            for k in range(n_rows):
                xi = X_subset[k, i]
                xj = X_subset[k, j]
                counts[xi, xj] += 1

            # Normalized joint probability
            norm_counts = counts / n_rows
            min_val = np.min(norm_counts)

            if min_val > local_max_val:
                local_max_val = min_val
                local_max_j = j

        max_vals[i] = local_max_val
        max_i[i] = i
        max_j[i] = local_max_j

    # Now find global maximum across all threads
    global_max_idx = np.argmax(max_vals)
    return (max_i[global_max_idx], max_j[global_max_idx], max_vals[global_max_idx])

X_subset = model.X.iloc[:, :].to_numpy().astype(np.int32)

i, j, max_val = find_max_min_joint_pair(X_subset)

print(f"The pair (i={i}, j={j}) has the maximum minimum joint probability of {max_val:.4f}")

pd.crosstab(model.X.iloc[:600,19], model.X.iloc[:600,20], normalize = True)

NameError: name 'model' is not defined

#### NB Grid Search 1

In [None]:
substr_start = range(1,4)
substr_end = range(4,9)
suffix_start = range(1,4)
suffix_end = range(3,6)
prefix_start = range(1,4)
prefix_end = range(3,6)
num_top_words = [np.inf, 5000, 1000, 500]

grid = []
for num_top_words, substr_start, substr_end, suffix_start, suffix_end, prefix_start, prefix_end in product(num_top_words, substr_start, substr_end, suffix_start, suffix_end, prefix_start, prefix_end):

    if substr_start < substr_end and prefix_start < prefix_end and suffix_start < suffix_end:

        grid.append({
            'substr_lens': range(substr_start,substr_end), 
            'suffix_lens': range(prefix_start,prefix_end), 
            'prefix_lens': range(suffix_start,suffix_end), 
            'num_top_words' : num_top_words
        })

grid_search_results = grid_search_cv(NBLanguageClassifier, grid, df, test, k = 10)

interation: 0/3840:
{'substr_lens': range(1, 4), 'suffix_lens': range(1, 3), 'prefix_lens': range(1, 3), 'num_top_words': inf}
train score : 89.25
test score  : 82.286
cv_score: 72.417

interation: 1/3840:
{'substr_lens': range(1, 4), 'suffix_lens': range(1, 4), 'prefix_lens': range(1, 3), 'num_top_words': inf}
train score : 89.25
test score  : 82.286
cv_score: 67.0

interation: 2/3840:
{'substr_lens': range(1, 4), 'suffix_lens': range(1, 5), 'prefix_lens': range(1, 3), 'num_top_words': inf}
train score : 89.25
test score  : 82.286
cv_score: 70.0

interation: 3/3840:
{'substr_lens': range(1, 4), 'suffix_lens': range(2, 3), 'prefix_lens': range(1, 3), 'num_top_words': inf}
train score : 89.25
test score  : 82.286
cv_score: 71.333

interation: 4/3840:
{'substr_lens': range(1, 4), 'suffix_lens': range(2, 4), 'prefix_lens': range(1, 3), 'num_top_words': inf}
train score : 89.25
test score  : 82.286
cv_score: 71.083

interation: 5/3840:
{'substr_lens': range(1, 4), 'suffix_lens': range(2, 5

#### NB Grid Search 2

In [141]:
substr_start = range(1,4)
substr_end = range(6,9)
suffix_start = range(1,4)
suffix_end = range(4,7)
prefix_start = range(1,3)
prefix_end = range(4,7)
num_top_words = [7000, 6000, 5000, 4000, 3000]

grid = []
for num_top_words, substr_start, substr_end, suffix_start, suffix_end, prefix_start, prefix_end in product(num_top_words, substr_start, substr_end, suffix_start, suffix_end, prefix_start, prefix_end):

    if substr_start < substr_end and prefix_start < prefix_end and suffix_start < suffix_end:

        grid.append({
            'substr_lens': range(substr_start,substr_end), 
            'suffix_lens': range(prefix_start,prefix_end), 
            'prefix_lens': range(suffix_start,suffix_end), 
            'num_top_words' : num_top_words
        })

grid_search_results = grid_search_cv(NBLanguageClassifier, grid, df, test, k = 10)

interation: 0/2430:
{'substr_lens': range(1, 6), 'suffix_lens': range(1, 4), 'prefix_lens': range(1, 4), 'num_top_words': 7000}
train score : 93.25
test score  : 83.429
test large score: 81.306
cv_score: 67.917

interation: 1/2430:
{'substr_lens': range(1, 6), 'suffix_lens': range(1, 5), 'prefix_lens': range(1, 4), 'num_top_words': 7000}
train score : 93.25
test score  : 83.429
test large score: 81.306
cv_score: 68.083

interation: 2/2430:
{'substr_lens': range(1, 6), 'suffix_lens': range(1, 6), 'prefix_lens': range(1, 4), 'num_top_words': 7000}
train score : 93.25
test score  : 83.429
test large score: 81.306
cv_score: 69.0

interation: 3/2430:
{'substr_lens': range(1, 6), 'suffix_lens': range(2, 4), 'prefix_lens': range(1, 4), 'num_top_words': 7000}
train score : 93.25
test score  : 83.429
test large score: 81.306
cv_score: 73.5

interation: 4/2430:
{'substr_lens': range(1, 6), 'suffix_lens': range(2, 5), 'prefix_lens': range(1, 4), 'num_top_words': 7000}
train score : 93.25
test sco

In [None]:
# Save Grid Search
grid_df = pd.DataFrame(grid_search_results)
grid_df["test large score"] = grid_df["test large score"].apply(lambda x: next(iter(x)))
grid_df = grid_df.rename(columns={"test large score": "test_large"})
grid_df.to_csv('grid_search_results_2.csv')

In [None]:
# Load Grid Search Results
#grid_df = pd.DataFrame(grid_search_results)
grid_df = pd.read_csv('grid_search_results.csv').drop(columns=["Unnamed: 0"])
inf_replaced = [string.replace('inf', 'np.inf') for string in grid_df["hyperparams"]]
grid_df["hyperparams"] = grid_df["hyperparams"].str.replace("inf", "np.inf")
grid_df = pd.concat([grid_df, pd.DataFrame([eval(hp) for hp in inf_replaced])], axis = 1)

In [None]:
# Load Grid Search Results
#grid_df = pd.DataFrame(grid_search_results)
grid_df = pd.read_csv('grid_search_results_2.csv').drop(columns=["Unnamed: 0"])
inf_replaced = [string.replace('inf', 'np.inf') for string in grid_df["hyperparams"]]
grid_df["hyperparams"] = grid_df["hyperparams"].str.replace("inf", "np.inf")
grid_df = pd.concat([grid_df, pd.DataFrame([eval(hp) for hp in inf_replaced])], axis = 1)