In [1]:
# Import Necessary Modules for Data Preprocessing

# Used for loading in training data [Function #1 - load_raw_training_data()]
import pandas as pd
# Adjust column width settings to see all of the 'original_text' column
pd.set_option('max_colwidth', 400)
#Adjust notebook to display all rows if output is of a large dataframe
pd.set_option('display.max_rows', None)

# Used for replacing '-LRB-' and '-RRB-' with left and right parentheses in original text repectively [Function #2 - replace_LRB_and_RRB()]
import re

# Used for label value changing in preprocessing training data [Function #6 - preprocessing_training_data()]
import numpy as np

# Used for tokenization when creating score values against extraneous resourses [Function #8 - extraneous_score_calculation()]
import nltk
nltk.download('punkt')
nltk.download('stopwords')
from nltk.tokenize import word_tokenize 

# Used for Parts-of-Speech tagging [Function #14 - POS_preprocessing()]
from nltk.tag import pos_tag

# Used for Lemmatization [Function #15 - lemma_preprocessing()]
from nltk.stem import WordNetLemmatizer
nltk.download('punkt')
nltk.download('omw-1.4')
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('averaged_perceptron_tagger')

# Used for vectorization [Function # ]
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

[nltk_data] Downloading package punkt to /home/nruloff/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /home/nruloff/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /home/nruloff/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /home/nruloff/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /home/nruloff/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /home/nruloff/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /home/nruloff/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


In [2]:
# Load Raw WikiLarge Training Data from GitHub Repository
def load_raw_training_data():
    # WikiLarge Training Data is very large and was split into three CSV files load each of them in.
    textData_1 = pd.read_csv('https://raw.githubusercontent.com/nruloff/Difficulty_Classification_of_Textual_Passages/main/Data/WikiLarge_Train_part_1.csv')
    textData_2 = pd.read_csv('https://raw.githubusercontent.com/nruloff/Difficulty_Classification_of_Textual_Passages/main/Data/WikiLarge_Train_part_2.csv')
    textData_3 = pd.read_csv('https://raw.githubusercontent.com/nruloff/Difficulty_Classification_of_Textual_Passages/main/Data/WikiLarge_Train_part_3.csv')
    
    # Concatenate each of the parts together to get the original data in one dataframe
    text_data = pd.concat([textData_1, textData_2, textData_3], ignore_index=True)
    
    # Return concatenated dataframe
    return text_data

In [3]:
# After searching for the first few 'original_text' entries presented in text_data.head() - it was determined
# that '-LRB-' and '-RRB-' are left and right parentheses respectively. This function replaces those
# text strings with their respective symbols.
def replace_LRB_and_RRB(text):
  # Replace the substring "-LRB-" with "(" in input string
  new_string = re.sub("-LRB-", "(", text)

  # Replace the substring "-RRB-" with ")" in new_string
  second_string = re.sub("-RRB-", ")", new_string)

  # Return the output of the second replacement
  return second_string

In [4]:
# Based on the finding of '-LRB-' and '-RRB-', this function replaces every character in a string that is not a parentheses with no text
def find_parentheses(text):
    punctuation_string = obtain_non_Alphanumeric(text)
    new_string = re.sub("[^()]", "", punctuation_string)
    return new_string

In [5]:
# Function to find any uneven parentheses within the 'original_text' column
def determine_uneven_parentheses(text):
  # Use previously generated function to find all parentheses within the 'original_text' column
  parentheses_only = find_parentheses(text)

  # Generate a list containing a single string of a closed parentheses
  parentheses_string_list = ['()']

  # While any closed parentheses exist in the parentheses column
  while any(x in parentheses_only for x in parentheses_string_list):
    # Replace the closed parentheses with no text
    for paren in parentheses_string_list:
        parentheses_only = parentheses_only.replace(paren, "")

  # Output result as a boolean to determine if string parentheses_only has been reduce to an empty string
  result = not parentheses_only

  # Return boolean value as 0 or 1 - 0 indicating that the 'original_text' column has closed parentheses
  if result == False:
    return 1
  else:
    return 0

In [6]:
# Function to obtain any non-alphanumeric characters
def obtain_non_Alphanumeric(text):
  # replace all non-alphanumeric characters of an input string with no text to a new output called 'new_string'
  new_string, number_of_subs = re.subn("[a-zA-Z0-9]", "", text)

  # replace all white space characters of 'new_string' with no text to the output 'second_string'
  second_string, second_subs = re.subn("\s", "", new_string)

  # return 'second_string'
  return second_string

In [7]:
# Combined previous functions into large preprocessing function

# Additionally - find duplicate 'original_text' entries, find the mean of their label values - as some of them have
# opposing label values - then remove all duplicates except for one with the mean label value adjusted to 0 or 1
# based on rounding
def preprocessing_training_data():
    text_data = load_raw_training_data()
    
    # Convert binary 0 labels to -1 (this helps with keeping some of the duplicate entries by mean value calculation)
    text_data['label'] = np.where(text_data['label'] < 1, -1, 1)
    
    # Separate out the duplicate entries from the individual entries - All duplicate entries are taken to "duplicate_texts" dataframe
    duplicate_texts = text_data[text_data.original_text.duplicated(keep=False)].copy()
    
    # Individual entries can be used directly in the final version of the preprocessed dataframe
    individual_texts = text_data[~text_data.original_text.duplicated(keep=False)].copy()
    
    # Group the duplicate text entries by the original text and find the mean value. 
    # If the mean value is negative, then most of the entries have been labeled as -1
    # If the mean value is positive, then most of the entries have been labeled as 1
    dup_group = duplicate_texts.groupby(['original_text'], as_index=False).mean()
    
    # Convert all positive values to 1 and all negative values to -1
    dup_group['label'] = np.where(dup_group['label'] > 0, 1, dup_group['label'])
    dup_group['label'] = np.where(dup_group['label'] < 0, -1, dup_group['label'])
    
    # If the label mean is 0, then it is an even split, and the text data cannot be used for classification
    # Identify all rows with mean groupby label values of 0
    zero_mean = dup_group[dup_group['label'] == 0]
    
    # Identify all rows with positive groupby label values
    pos_mean = dup_group[dup_group['label'] > 0].copy()
    
    # Perform the same for all rows with negative groupby label values
    neg_mean = dup_group[dup_group['label'] < 0].copy()
    
    # Recombine the acceptable duplicate entries with the original individual entries
    new_text_data = pd.concat([pos_mean, neg_mean, individual_texts], ignore_index=True)
    
    # Convert the labels of -1 back to 0 as in the original training data
    new_text_data['label'] = np.where(new_text_data['label'] < 0, 0, 1)
    
    # Replace "-LRB-" and "-RRB-" with left and right parentheses
    new_text_data['original_text'] = new_text_data.original_text.apply(lambda x: replace_LRB_and_RRB(x))
    
    # Create a Column of only the punctuation using previously made function
    new_text_data['punctuation'] = new_text_data.original_text.apply(lambda x: obtain_non_Alphanumeric(x))
    
    # Determine if a text entry has closed parentheses or not
    new_text_data['closed_parentheses'] = new_text_data.original_text.apply(lambda x: determine_uneven_parentheses(x))
    
    return new_text_data

In [8]:
# Function which loads external resouce data provided with the WikiLarge data - those resources are:
# 1) The Dale Chall 3000 Word List, which is one definition of words that are considered "basic" English.
# 2) "Age of Acquisition" (AoA) estimates for about 51k English words, which refers to the approximate age (in years) when a word was learned. Early words, being more basic, have lower average AoA.
# 3) Brysbaert et al Concreteness Ratings for 40 thousand English lemma words gathered via 
#    Amazon Mechanical Turk. The ratings come from a larger list of 63 thousand words and represent all English words known to 85% of the raters.

def load_external_resource_data():
    # Load Dale Chall word list as a list of strings
    dale_chall = pd.read_csv('https://raw.githubusercontent.com/nruloff/Difficulty_Classification_of_Textual_Passages/main/Data/dale_chall.txt', header=None)
    d_c_df = dale_chall.rename(columns={0:'words'})
    d_c_list = d_c_df['words'].to_list()
    
    # Load AoA estimates for about 51 thousand English words, and return it as a dictionary
    AoA = pd.read_csv('https://raw.githubusercontent.com/nruloff/Difficulty_Classification_of_Textual_Passages/main/Data/AoA_51715_words.csv', encoding='unicode_escape')
    # Reduce the dataframe to the word and the AoA_Kup_lem score
    AoA = AoA[['Word', 'AoA_Kup_lem']]
    # Drop any rows where the AoA_Kup_lem score is not a value
    AoA = AoA[AoA['AoA_Kup_lem'].notna()]
    # Set the index of the dataframe to the words
    AoA = AoA.set_index('Word')
    # Take the AoA_Kup_lem score series out as a dictionary
    AoA_dict = AoA['AoA_Kup_lem'].to_dict()
    
    # Load Brysbaert Concreteness ratings, and return it as a dictionary
    Brysbaert = pd.read_csv('https://raw.githubusercontent.com/nruloff/Difficulty_Classification_of_Textual_Passages/main/Data/Concreteness_ratings_Brysbaert_et_al_BRM.txt', delimiter='\t')
    # Reduce the dataframe to the word and Concreteness rating
    Brysbaert = Brysbaert[['Word','Conc.M']]
    # Remove any words that do not have a concreteness rating
    Brysbaert = Brysbaert[Brysbaert['Conc.M'].notna()]
    # Set the dataframe index to the word
    Brysbaert = Brysbaert.set_index('Word')
    # Take the Concreteness rating series out as a dictionary
    Brysbaert_dict = Brysbaert['Conc.M'].to_dict()
    
    return d_c_list, AoA_dict, Brysbaert_dict

In [9]:
# Define a function to calculate a score based on values from an external resource
def extraneous_score_calculation(text, extraneous_dict):
    # Replace all non-alphanumeric characters with a space, then make the letters lowercase, and 
    # subsequently tokenize the words
    tokens = nltk.word_tokenize((re.sub("[^a-zA-Z0-9 ]", " ", text)).lower())
    # Create an empty array to add score values into
    score_array = []
    
    # For each token in the tokenize 'original_text'
    for tok in tokens:
        # Try to find the token in the extraneous dictionary and append its score to the array
        try:
            ind_score = extraneous_dict[tok]
            score_array.append(ind_score)
        # If unable to find the token, append a value of 0 to the array
        except:
            score_array.append(0)
        # Return a normalized score for the 'original_text' column by summing the scores together and dividing by 
        # the total number of tokens.
        return np.sum(score_array)/len(score_array)

In [10]:
# Use extraneous_score_calculation function to calculate AoA and Brysbaert Concreteness Scores
def get_AoA_Brysbaert_features(new_text_data):
    # Calculate AoA Score using extraneous_score_calculation function
    new_text_data['AoA_score'] = new_text_data.original_text.apply(lambda x: extraneous_score_calculation(x, AoA_dict))
    # Calculate Brysbaert Score using extraneous_score_calculation function
    new_text_data['Brysbaert_score'] = new_text_data.original_text.apply(lambda x: extraneous_score_calculation(x, Brysbaert_dict))
    
    # Convert NaN in Both 'AoA_score' and 'Brysbaert_score' columns
    new_text_data['AoA_score'] = new_text_data['AoA_score'].fillna(0)
    new_text_data['Brysbaert_score'] = new_text_data['Brysbaert_score'].fillna(0)
    
    return new_text_data

In [11]:
# Acquire additional features such as:
# 1) Normalized proportion of word tokens from Dale Chall list in 'original_text' column
# 2) Number of tokens in 'original_text' column
# 3) Average length of each word token in 'original_text' column
# 4) Largest length of a word token in 'original_text' column
# 5) Normalized proportion of non-alphanumeric characters in 'original_text' column
# 6) Normalized proportion of decimal digit characters in 'original_text' column
def get_more_features(list_of_docs, easy_word_list):
    # List for number of word tokens in text passage
    num_toks_l = []
    # List for number of dale_chall terms in text passage, normalized to length of text passage
    d_c_norm_l = []
    # Value of average word length for a textual passage
    avg_tok_len_l = []
    # Value of max word length for a textual passage
    max_tok_len_l = []
    # Number of Non-alphanumeric characters
    non_alpha_char_l =[]
    # Number of Characters total
    numbers_norm_l = [] #add ratio of number charicters to total
    # Generate a set of words based on the second input of the function (a list of words)
    s2=set(easy_word_list)
    
    
    for doc in list_of_docs:
        # Convert all letters to lowercase
        doc = doc.lower()
        #-------------------
        chars = re.findall('[^a-zA-Z0-9 ]', doc) # Find all non-alphanumeric characters (except whitespace)
        non_alpha = len(chars)/len(doc) # Calculate a Normalized Ratio of the number of non-alphanumeric characters to the length of the entire text passage
        non_alpha_char_l.append(non_alpha) # Append this ratio to the previously generated list
        #-------------------------------
        num_chars = re.findall('\d', doc) # Find all decimal digit characters
        numbers_norm = len(num_chars)/len(doc) # Calculate the normalized ratio to the length of the entire text passage
        numbers_norm_l.append(numbers_norm) # Append the calculated ratio to previously generated list
        #------------------------------
        toks = nltk.word_tokenize(doc) # Generate word tokens for each text passage using nltk.tokenize.word_tokenize
        num_toks = len(toks) # Count the number of tokens
        num_toks_l.append(num_toks) # Append the token count to previously generated list
        #------------------------------
        temp_list = [] # Create an empty temporary list
        # For each token created from word_tokenize
        for tok in toks:
            # Determine the length of the token, and append that length to the temporary list
            temp_list.append(len(tok))
            
        # Find the average token length
        avg_tok = sum(temp_list)/len(temp_list)
        # Append the average token length to previously generated list
        avg_tok_len_l.append(avg_tok)
        # Find the maximum token length
        max_t = max(temp_list)
        # Append the maximum token length to previously generated list
        max_tok_len_l.append(max_t)
        #------------------------------
        s1= set(toks) # Generate a set of tokens from previously made list of tokens
        num_d_c = len(s1.intersection(s2)) # Calculate the number of words that are also contained in the set of 'simple words' made previously
        d_c_norm = num_d_c/num_toks # Normalize the value to the total number of tokens
        d_c_norm_l.append(d_c_norm) # Append that normalized value to previously generated list
        #------------------------------
        
    # Generate an array of new features which can be added to the dataframe
    new_features = np.vstack(( np.asarray(d_c_norm_l), np.asarray(num_toks_l),  np.asarray(avg_tok_len_l), np.asarray(max_tok_len_l), 
                   np.asarray(numbers_norm_l), np.asarray(non_alpha_char_l) )).T

    return new_features

In [12]:
# Function to count the number of a specific character within an 'original_text' column
# This function designed to help with identifying non-alphanumeric characters as special characters
# can have issue when using regex to search for them.
def count_num_of_specific_char(text, char_of_interest):
    # Reformat non-alphanumeric character as a set contained in brackets
    reformat_char = '[' + char_of_interest + ']'
    # Find all occurences of the character in a text, and count the total number of them
    num_specific_char = len(re.findall(reformat_char, text))
    # Return the total count of the non-alphanumeric characters
    return num_specific_char

In [13]:
# Function to count the total number of non-whitespace characters
def count_num_of_non_ws(text):
    # Replace all whitespace characters with no text
    new_string = re.sub("\s", "", text)
    
    # Return the count the length of the new non-whitespace string
    return len(new_string)

In [14]:
# Make Individual feature columns for the normalized proportion of the punctuation/non-alphanumeric characters
def get_punctuation_features(text_data):
    # Acquire all of the non-alphanumeric characters in a set
    all_punctuation = set(text_data.punctuation.sum())
    
    # Create a new empty list to track all columns added to dataframe
    new_columns = []
    
    # For each non-alphanumeric character
    for punc_mark in all_punctuation:
        # Create a new string for a potential column name
        new_col_name = 'norm_' + punc_mark
        
        # Try to count the number of entries of the specific character, and if so, add the name of the column to the list of new column names
        try:
            text_data[new_col_name] = text_data.punctuation.apply(lambda x: count_num_of_specific_char(x, punc_mark))
            text_data[new_col_name] = text_data[new_col_name] / text_data['num_non_ws_char']
            new_columns.append(new_col_name)
        # If there are issues, then continue to the next non-alphanumeric character
        except:
            continue
    
    # Return the dataframe and a list of these new columns
    return text_data, new_columns

In [15]:
# Function to transform a text into an array of Parts-of-Speech (POS)
def POS_preprocessing(text):
    # Replace all non-alphanumeric or common punctuation with no text and output into a new string
    new_string = re.sub('[^a-zA-Z0-9 ,.!;:?()]', '', text)
    # Tokenize the new string
    word_tokens = nltk.word_tokenize(new_string)
    # Obtain the parts of speech tags for each of the words and put them into a list
    pos_tag_tokens = [pair[1] for pair in nltk.pos_tag(word_tokens)]
    # Concatenate each POS tag together into a single string
    pos_tag_tokens = " ".join(pos_tag_tokens)
    # Return the single string of POS tags
    return pos_tag_tokens

In [16]:
# Lemmatize and tokenize text
def lemma_preprocessing(text):
    lemmatizer = nltk.WordNetLemmatizer()
    text = re.sub('[^a-zA-Z]', ' ', text)
    text = text.lower()
    text = nltk.word_tokenize(text)
    text = [lemmatizer.lemmatize(word) for word in text]
    text = " ".join(text)
    return text

In [17]:
new_text_data = preprocessing_training_data()
d_c_list, AoA_dict, Brysbaert_dict = load_external_resource_data()
new_text_data = get_AoA_Brysbaert_features(new_text_data)
new_features = get_more_features(new_text_data['original_text'], d_c_list)
new_text_data[['d_c_norm_1', 'num_toks_1', 'avg_tok_len_1', 'max_tok_len_1', 'num_char_norm_1', 'non_alphanumeric_1']] = new_features
new_text_data['num_non_ws_char'] = new_text_data['original_text'].apply(lambda x: count_num_of_non_ws(x))
new_text_data, punc_cols = get_punctuation_features(new_text_data)
new_text_data['pos_tag_tokens'] = new_text_data['original_text'].apply(lambda x: POS_preprocessing(x))
new_text_data['num_pos_tokens'] = new_text_data['pos_tag_tokens'].apply(lambda x: len(x.split()))
new_text_data['lemma_text'] = new_text_data['original_text'].apply(lambda x: lemma_preprocessing(x))
new_text_data = new_text_data.drop(columns=['punctuation'])
#new_text_data.head()

  num_specific_char = len(re.findall(reformat_char, text))
  text_data[new_col_name] = text_data.punctuation.apply(lambda x: count_num_of_specific_char(x, punc_mark))
  new_text_data['pos_tag_tokens'] = new_text_data['original_text'].apply(lambda x: POS_preprocessing(x))
  new_text_data['num_pos_tokens'] = new_text_data['pos_tag_tokens'].apply(lambda x: len(x.split()))
  new_text_data['lemma_text'] = new_text_data['original_text'].apply(lambda x: lemma_preprocessing(x))


In [18]:
def determine_X_feat(df):
    all_features = df.columns.to_list()
    X_feat = []
    for feat in all_features:
        if feat != 'label':
            X_feat.append(feat)
    return X_feat

In [19]:
from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import RobustScaler, MinMaxScaler, StandardScaler

def scikit_column_transformer(text_df = new_text_data, text_type = 'original_text', vector_type = 'Count', scaler='Robust', ngrams_value=1, max_features_value=None, sequence_length=500, 
                              test_size=0.2, random_state=21):
    # Reduce the input dataframe to only include either the original_text or lemma_text columns
    if text_type == 'original_text':
        final_text_df = text_df.drop(columns=['lemma_text'])
        final_text_df = final_text_df.rename(columns={'original_text': 'text'})
    elif text_type == 'lemma_text':
        final_text_df = text_df.drop(columns=['original_text'])
        final_text_df = final_text_df.rename(columns={'lemma_text': 'text'})
    else:
        return 'Incorrect input for text_type argument'

    # Perform the Train-Test Split Based on Input Data
    X_feat = determine_X_feat(final_text_df)
    X_train, X_test, y_train, y_test = train_test_split(final_text_df[X_feat], final_text_df['label'], test_size=test_size, random_state=random_state)
  
    # Select Vectors for text data and POS data
    if vector_type == 'Count':
        text_vector = CountVectorizer(ngram_range=(1, ngrams_value),max_features=max_features_value)
        pos_vector = CountVectorizer(ngram_range=(1, ngrams_value), preprocessor=None, token_pattern=r'[^\s]+', lowercase=False)
    elif vector_type == 'Tfidf':
        text_vector = TfidfVectorizer(ngram_range=(1, ngrams_value), max_features=max_features_value)
        pos_vector = TfidfVectorizer(ngram_range=(1, ngrams_value), token_pattern=r'[^\s]+', lowercase=False)
    elif vector_type == 'Binary':
        text_vector = CountVectorizer(binary=True, ngram_range=(1, ngrams_value),max_features=max_features_value)
        pos_vector = CountVectorizer(binary=True, ngram_range=(1, ngrams_value), preprocessor=None, token_pattern=r'[^\s]+', lowercase=False)
    else:
        return 'Incorrect input for vector_type argument'

    # Select the desired scaler based on input string
    dict_of_scalers = {'Robust': RobustScaler(), 'MinMax': MinMaxScaler() , 'Standard': StandardScaler()}
    try:
        selected_feature_scaler = dict_of_scalers[scaler]
    except:
        return 'Incorrect input for scaler argument - must be either Count, MinMax or Standard'
  
    # Use Scikit-Learn Column Transformer to vectorize the text data and the POS data, and transform the additional features by selected scaler
    column_trans = ColumnTransformer([('vector_text', text_vector, 'text'), 
                                      ('vector_pos_tags', pos_vector, 'pos_tag_tokens')], 
                                     remainder = selected_feature_scaler)
    # Perform Fit_Transform on X_train and transform on X_test
    X_train_matrix = column_trans.fit_transform(X_train)
    X_test_matrix = column_trans.transform(X_test)

    return column_trans, X_train_matrix, y_train, X_test_matrix, y_test

In [20]:
tfidf_trans, X_train_tfidf, y_train_tfidf, X_test_tfidf, y_test_tfidf = scikit_column_transformer(vector_type='Tfidf')
binary_trans, X_train_bin, y_train_bin, X_test_bin, y_test_bin = scikit_column_transformer(vector_type='Binary')
count_trans, X_train_count, y_train_count, X_test_count, y_test_count = scikit_column_transformer(scaler='MinMax')

In [21]:
#ML_results_columns = ['train_acc', 'train_precision', 'train_recall', 'train_f1', 'train_roc_auc',
#                      'test_acc',  'test_precision', 'test_recall', 'test_f1', 'test_roc_auc']

#ML_params_columns = ['model_ID', 'alpha', 'C', 'solver']

#ML_res_and_params = ML_params_columns + ML_results_columns

In [22]:
from sklearn import metrics

def obtain_comparison_metrics(y_true, y_pred):
    calc_accuracy = metrics.accuracy_score(y_true, y_pred)
    calc_precision = metrics.precision_score(y_true, y_pred)
    calc_recall = metrics.recall_score(y_true, y_pred)
    calc_f1 = metrics.f1_score(y_true, y_pred)
    
    return calc_accuracy, calc_precision, calc_recall, calc_f1

def obtain_train_and_test_metrics(y_train_true, y_train_pred, y_test_true, y_test_pred):
    (calc_train_accuracy, calc_train_precision, 
     calc_train_recall, calc_train_f1) = obtain_comparison_metrics(y_train_true, y_train_pred)
    (calc_test_accuracy, calc_test_precision, 
     calc_test_recall, calc_test_f1) = obtain_comparison_metrics(y_test_true, y_test_pred)
    output_dict = {'train_acc': calc_train_accuracy, 
                   'train_precision': calc_train_precision, 
                   'train_recall': calc_train_recall, 
                   'train_f1': calc_train_f1, 
                   'test_acc': calc_test_accuracy, 
                   'test_precision': calc_test_precision, 
                   'test_recall': calc_test_recall, 
                   'test_f1': calc_test_f1}
    return output_dict

def obtain_roc_auc_score(clf, X, y_true):
    y_score = clf.predict_proba(X)[:, 1]
    roc_auc_value = metrics.roc_auc_score(y_true.values, y_score)
    return roc_auc_value

## Bernoulli Naive Bayes Classifier

In [23]:
from sklearn.naive_bayes import BernoulliNB

#BernoulliNB_list = []
#index_val = 0

#for alpha_val in [0.01, 0.1, 1, 10]:
#    model_name = 'BernoulliNB_{}'.format(str(index_val))
#    clf = BernoulliNB(alpha=alpha_val)
#    clf.fit(X_train_bin, y_train_bin)
#    y_train_pred = clf.predict(X_train_bin)
#    y_test_pred = clf.predict(X_test_bin)
#    results_dict = obtain_train_and_test_metrics(y_train_bin, y_train_pred, y_test_bin, y_test_pred)
#    results_dict['train_roc_auc'] = obtain_roc_auc_score(clf, X_train_bin, y_train_bin)
#    results_dict['test_roc_auc'] = obtain_roc_auc_score(clf, X_test_bin, y_test_bin)
#    results_dict['model_ID'] = model_name
#    results_dict['alpha'] = alpha_val
#    BernoulliNB_list.append(results_dict)
#    index_val += 1
    
#ML_info_df = pd.DataFrame(data=BernoulliNB_list, columns=ML_res_and_params)

## Multinomial Naive Bayes Classifier

In [24]:
from sklearn.naive_bayes import MultinomialNB

#temp_ML_list = []
#index_val = 0

#for alpha_val in [0.01, 0.1, 1, 10]:
#    model_name = 'MultinominalNB_{}'.format(str(index_val))
#    clf = MultinomialNB(alpha=alpha_val)
#    clf.fit(X_train_count, y_train_count)
#    y_train_pred = clf.predict(X_train_count)
#    y_test_pred = clf.predict(X_test_count)
#    results_dict = obtain_train_and_test_metrics(y_train_count, y_train_pred, y_test_count, y_test_pred)
#    results_dict['train_roc_auc'] = obtain_roc_auc_score(clf, X_train_count, y_train_count)
#    results_dict['test_roc_auc'] = obtain_roc_auc_score(clf, X_test_count, y_test_count)
#    results_dict['model_ID'] = model_name
#    results_dict['alpha'] = alpha_val
#    temp_ML_list.append(results_dict)
#    index_val +=1
    
#temp_ML_info_df = pd.DataFrame(data=temp_ML_list, columns=ML_res_and_params)

#ML_info_df = pd.concat([ML_info_df, temp_ML_info_df], ignore_index=True)
#ML_info_df

## Logistic Regression Classifier

In [25]:
from sklearn.linear_model import LogisticRegression

#temp_ML_list = []
#index_val = 0

#for C_val in [0.01, 0.1, 1, 10]:
#    for solver_type in ['lbfgs', 'sag', 'saga']:
#        model_name = "LogReg_V{}".format(str(index_val))
#        clf = LogisticRegression(C=C_val, solver=solver_type, n_jobs=-1, random_state=0, max_iter=10000)
#        clf.fit(X_train_tfidf, y_train_tfidf)
#        y_train_pred = clf.predict(X_train_tfidf)
#        y_test_pred = clf.predict(X_test_tfidf)
#        results_dict = obtain_train_and_test_metrics(y_train_tfidf, y_train_pred, y_test_tfidf, y_test_pred)
#        results_dict['train_roc_auc'] = obtain_roc_auc_score(clf, X_train_tfidf, y_train_tfidf)
#        results_dict['test_roc_auc'] = obtain_roc_auc_score(clf, X_test_tfidf, y_test_tfidf)
#        results_dict['model_ID'] = model_name
#        results_dict['C'] = C_val
#        results_dict['solver'] = solver_type
#        temp_ML_list.append(results_dict)
#        index_val += 1
        
#temp_ML_info_df = pd.DataFrame(data=temp_ML_list, columns=ML_res_and_params)

#ML_info_df = pd.concat([ML_info_df, temp_ML_info_df], ignore_index=True)
#ML_info_df

In [26]:
#Vectorization_list = ['Binary', 'Binary', 'Binary', 'Binary', 
#                      'Count', 'Count', 'Count', 'Count', 
#                      'Tfidf', 'Tfidf', 'Tfidf', 'Tfidf', 
#                      'Tfidf', 'Tfidf', 'Tfidf', 'Tfidf', 
#                      'Tfidf', 'Tfidf', 'Tfidf', 'Tfidf']

#ML_info_df['Vectorization'] = Vectorization_list
#ML_info_df

In [27]:
#ML_info_df.to_csv('Current_model_information_19Feb2023.csv')

In [28]:
ML_results_columns = ['train_acc', 'train_precision', 'train_recall', 'train_f1', 'train_roc_auc',
                      'test_acc',  'test_precision', 'test_recall', 'test_f1', 'test_roc_auc']

ML_params_columns = ['model_ID', 'alpha', 'C', 'solver', 'n_estimators', 'max_depth']

ML_res_and_params = ML_params_columns + ML_results_columns
#ML_info_df['n_estimators'] = np.nan
#ML_info_df['max_depth'] = np.nan

In [29]:
#ML_info_df = pd.read_csv('Current_model_information_19Feb2023.csv')
#ML_info_df

## Random Forest Classifier

In [30]:
from sklearn.ensemble import RandomForestClassifier
from tqdm import tqdm
#clf = RandomForestClassifier(n_estimators=10, max_depth=10, n_jobs=-1, random_state=0)

In [31]:
#clf.fit(X_train_tfidf, y_train_tfidf)

In [32]:
#temp_ML_list = []
#index_val = 0

#for n_estimators_val in tqdm([10, 100]):
#    for max_depth_val in tqdm([10, 100]):
#        model_name = "Random_Forest_{}".format(str(index_val))
#        clf = RandomForestClassifier(n_estimators=n_estimators_val, max_depth=max_depth_val, n_jobs=-1, random_state=0)
#        clf.fit(X_train_tfidf, y_train_tfidf)
#        y_train_pred = clf.predict(X_train_tfidf)
#        y_test_pred = clf.predict(X_test_tfidf)
#        results_dict = obtain_train_and_test_metrics(y_train_tfidf, y_train_pred, y_test_tfidf, y_test_pred)
#        results_dict['train_roc_auc'] = obtain_roc_auc_score(clf, X_train_tfidf, y_train_tfidf)
#        results_dict['test_roc_auc'] = obtain_roc_auc_score(clf, X_test_tfidf, y_test_tfidf)
#        results_dict['model_ID'] = model_name
#        results_dict['n_estimators'] = n_estimators_val
#        results_dict['max_depth'] = max_depth_val
#        results_dict['Vectorization'] = 'Tfidf'
#        temp_ML_list.append(results_dict)
#        index_val += 1
        
#temp_ML_info_df = pd.DataFrame(data=temp_ML_list, columns=ML_res_and_params)

#ML_info_df = pd.concat([ML_info_df, temp_ML_info_df], ignore_index=True)
#ML_info_df

In [33]:
#temp_list = ML_info_df.columns.tolist()
#temp_list

In [34]:
#ML_info_df = ML_info_df.drop(columns=[temp_list[0]])
#ML_info_df

In [35]:
#current_columns = ML_info_df.columns.to_list()
#current_columns.append('learning_rate')
#current_columns

## Gradient Boosted Decision Trees

In [36]:
#from sklearn.ensemble import GradientBoostingClassifier

#temp_ML_list = []
#index_val = 0

#for n_estimators_val in tqdm([10]):
#    for learning_rate_val in tqdm([0.01, 0.1, 1, 10]):
#        model_name = "Gradient_Boosted_{}".format(str(index_val))
#        clf = GradientBoostingClassifier(n_estimators=n_estimators_val, learning_rate=learning_rate_val, random_state=0)
#        clf.fit(X_train_tfidf, y_train_tfidf)
#        y_train_pred = clf.predict(X_train_tfidf)
#        y_test_pred = clf.predict(X_test_tfidf)
#        results_dict = obtain_train_and_test_metrics(y_train_tfidf, y_train_pred, y_test_tfidf, y_test_pred)
#        results_dict['train_roc_auc'] = obtain_roc_auc_score(clf, X_train_tfidf, y_train_tfidf)
#        results_dict['test_roc_auc'] = obtain_roc_auc_score(clf, X_test_tfidf, y_test_tfidf)
#        results_dict['model_ID'] = model_name
#        results_dict['n_estimators'] = n_estimators_val
#        results_dict['learning_rate'] = learning_rate_val
#        results_dict['Vectorization'] = 'Tfidf'
#        temp_ML_list.append(results_dict)
#        index_val += 1
        
#temp_ML_info_df = pd.DataFrame(data=temp_ML_list, columns=current_columns)

#ML_info_df = pd.concat([ML_info_df, temp_ML_info_df], ignore_index=True)
#ML_info_df

In [37]:
#temp_ML_list = []

#for n_estimators_val in tqdm([100]):
#    for learning_rate_val in tqdm([0.01, 0.1, 1]):
#        model_name = "Gradient_Boosted_{}".format(str(index_val))
#        clf = GradientBoostingClassifier(n_estimators=n_estimators_val, learning_rate=learning_rate_val, random_state=0)
#        clf.fit(X_train_tfidf, y_train_tfidf)
#        y_train_pred = clf.predict(X_train_tfidf)
#        y_test_pred = clf.predict(X_test_tfidf)
#        results_dict = obtain_train_and_test_metrics(y_train_tfidf, y_train_pred, y_test_tfidf, y_test_pred)
#        results_dict['train_roc_auc'] = obtain_roc_auc_score(clf, X_train_tfidf, y_train_tfidf)
#        results_dict['test_roc_auc'] = obtain_roc_auc_score(clf, X_test_tfidf, y_test_tfidf)
#        results_dict['model_ID'] = model_name
#        results_dict['n_estimators'] = n_estimators_val
#        results_dict['learning_rate'] = learning_rate_val
#        results_dict['Vectorization'] = 'Tfidf'
#        temp_ML_list.append(results_dict)
#        index_val += 1
        
#temp_ML_info_df = pd.DataFrame(data=temp_ML_list, columns=current_columns)

#ML_info_df = pd.concat([ML_info_df, temp_ML_info_df], ignore_index=True)
#ML_info_df

In [38]:
#temp_ML_list = []

#for n_estimators_val in tqdm([100]):
#    for learning_rate_val in tqdm([1.01, 1.1]):
#        model_name = "Gradient_Boosted_{}".format(str(index_val))
#        clf = GradientBoostingClassifier(n_estimators=n_estimators_val, learning_rate=learning_rate_val, random_state=0)
#        clf.fit(X_train_tfidf, y_train_tfidf)
#        y_train_pred = clf.predict(X_train_tfidf)
#        y_test_pred = clf.predict(X_test_tfidf)
#        results_dict = obtain_train_and_test_metrics(y_train_tfidf, y_train_pred, y_test_tfidf, y_test_pred)
#        results_dict['train_roc_auc'] = obtain_roc_auc_score(clf, X_train_tfidf, y_train_tfidf)
#        results_dict['test_roc_auc'] = obtain_roc_auc_score(clf, X_test_tfidf, y_test_tfidf)
#        results_dict['model_ID'] = model_name
#        results_dict['n_estimators'] = n_estimators_val
#        results_dict['learning_rate'] = learning_rate_val
#        results_dict['Vectorization'] = 'Tfidf'
#        temp_ML_list.append(results_dict)
#        index_val += 1
        
#temp_ML_info_df = pd.DataFrame(data=temp_ML_list, columns=current_columns)

#ML_info_df = pd.concat([ML_info_df, temp_ML_info_df], ignore_index=True)
#ML_info_df

In [39]:
#temp_ML_list = []

#for n_estimators_val in tqdm([100]):
#    for learning_rate_val in tqdm([10]):
#        model_name = "Gradient_Boosted_{}".format(str(index_val))
#        clf = GradientBoostingClassifier(n_estimators=n_estimators_val, learning_rate=learning_rate_val, random_state=0)
#        clf.fit(X_train_tfidf, y_train_tfidf)
#        y_train_pred = clf.predict(X_train_tfidf)
#        y_test_pred = clf.predict(X_test_tfidf)
#        results_dict = obtain_train_and_test_metrics(y_train_tfidf, y_train_pred, y_test_tfidf, y_test_pred)
#        results_dict['train_roc_auc'] = obtain_roc_auc_score(clf, X_train_tfidf, y_train_tfidf)
#        results_dict['test_roc_auc'] = obtain_roc_auc_score(clf, X_test_tfidf, y_test_tfidf)
#        results_dict['model_ID'] = model_name
#        results_dict['n_estimators'] = n_estimators_val
#        results_dict['learning_rate'] = learning_rate_val
#        results_dict['Vectorization'] = 'Tfidf'
#        temp_ML_list.append(results_dict)
#        index_val += 1
        
#temp_ML_info_df = pd.DataFrame(data=temp_ML_list, columns=current_columns)

#ML_info_df = pd.concat([ML_info_df, temp_ML_info_df], ignore_index=True)
#ML_info_df

In [40]:
#temp_ML_list = []

#for n_estimators_val in tqdm([200]):
#    for learning_rate_val in tqdm([0.01, 0.1, 1, 10]):
#        model_name = "Gradient_Boosted_{}".format(str(index_val))
#        clf = GradientBoostingClassifier(n_estimators=n_estimators_val, learning_rate=learning_rate_val, random_state=0)
#        clf.fit(X_train_tfidf, y_train_tfidf)
#        y_train_pred = clf.predict(X_train_tfidf)
#        y_test_pred = clf.predict(X_test_tfidf)
#        results_dict = obtain_train_and_test_metrics(y_train_tfidf, y_train_pred, y_test_tfidf, y_test_pred)
#        results_dict['train_roc_auc'] = obtain_roc_auc_score(clf, X_train_tfidf, y_train_tfidf)
#        results_dict['test_roc_auc'] = obtain_roc_auc_score(clf, X_test_tfidf, y_test_tfidf)
#        results_dict['model_ID'] = model_name
#        results_dict['n_estimators'] = n_estimators_val
#        results_dict['learning_rate'] = learning_rate_val
#        results_dict['Vectorization'] = 'Tfidf'
#        temp_ML_list.append(results_dict)
#        index_val += 1
        
#temp_ML_info_df = pd.DataFrame(data=temp_ML_list, columns=current_columns)

#ML_info_df = pd.concat([ML_info_df, temp_ML_info_df], ignore_index=True)
#ML_info_df

In [41]:
#ML_info_df.to_csv('Current_model_information_20Feb2023.csv')

In [42]:
ML_info_df = pd.read_csv('Current_model_information_20Feb2023.csv')
ML_info_df

Unnamed: 0.1,Unnamed: 0,model_ID,alpha,C,solver,train_acc,train_precision,train_recall,train_f1,train_roc_auc,test_acc,test_precision,test_recall,test_f1,test_roc_auc,Vectorization,n_estimators,max_depth,learning_rate
0,0,BernoulliNB_0,0.01,,,0.684143,0.689154,0.669365,0.679115,0.768589,0.589341,0.586875,0.573144,0.579928,0.619243,Binary,,,
1,1,BernoulliNB_1,0.1,,,0.67061,0.676,0.653611,0.664617,0.751843,0.609047,0.607972,0.589911,0.598805,0.651346,Binary,,,
2,2,BernoulliNB_2,1.0,,,0.65388,0.660473,0.631441,0.645631,0.729765,0.626976,0.627493,0.604837,0.615957,0.68155,Binary,,,
3,3,BernoulliNB_3,10.0,,,0.642958,0.654526,0.603514,0.627986,0.711188,0.635355,0.641825,0.594467,0.617239,0.696557,Binary,,,
4,4,MultinominalNB_0,0.01,,,0.745344,0.767017,0.703787,0.734043,0.826907,0.549178,0.546105,0.524011,0.53483,0.526223,Count,,,
5,5,MultinominalNB_1,0.1,,,0.742246,0.763469,0.700974,0.730888,0.819968,0.561906,0.559498,0.53701,0.548024,0.563574,Count,,,
6,6,MultinominalNB_2,1.0,,,0.726751,0.740882,0.696301,0.7179,0.796255,0.606793,0.606231,0.584857,0.595352,0.637564,Count,,,
7,7,MultinominalNB_3,10.0,,,0.662967,0.614246,0.873781,0.72138,0.764139,0.620012,0.578778,0.851142,0.689021,0.714669,Count,,,
8,8,LogReg_V0,,0.01,lbfgs,0.666398,0.664549,0.670225,0.667375,0.726283,0.662458,0.657518,0.662704,0.660101,0.722717,Tfidf,,,
9,9,LogReg_V1,,0.01,sag,0.666608,0.664908,0.669972,0.66743,0.72632,0.662386,0.657423,0.662704,0.660053,0.722694,Tfidf,,,


In [43]:
tfidf_trans_2, X_train_tfidf_2, y_train_tfidf_2, X_test_tfidf_2, y_test_tfidf_2 = scikit_column_transformer(vector_type='Tfidf', ngrams_value=2)
binary_trans_2, X_train_bin_2, y_train_bin_2, X_test_bin_2, y_test_bin_2 = scikit_column_transformer(vector_type='Binary', ngrams_value=2)
count_trans_2, X_train_count_2, y_train_count_2, X_test_count_2, y_test_count_2 = scikit_column_transformer(scaler='MinMax', ngrams_value=2)

tfidf_trans_3, X_train_tfidf_3, y_train_tfidf_3, X_test_tfidf_3, y_test_tfidf_3 = scikit_column_transformer(vector_type='Tfidf', ngrams_value=3)
binary_trans_3, X_train_bin_3, y_train_bin_3, X_test_bin_3, y_test_bin_3 = scikit_column_transformer(vector_type='Binary', ngrams_value=3)
count_trans_3, X_train_count_3, y_train_count_3, X_test_count_3, y_test_count_3 = scikit_column_transformer(scaler='MinMax', ngrams_value=3)

In [50]:
ML_info_df = pd.read_csv('Current_model_information_20Feb2023.csv')
temp_list = ML_info_df.columns.tolist()
ML_info_df = ML_info_df.drop(columns=[temp_list[0]])
ML_info_df['ngrams_val'] = 1
ML_info_df = ML_info_df.sort_values(['test_acc'], ascending=False)
ML_info_df

Unnamed: 0,model_ID,alpha,C,solver,train_acc,train_precision,train_recall,train_f1,train_roc_auc,test_acc,test_precision,test_recall,test_f1,test_roc_auc,Vectorization,n_estimators,max_depth,learning_rate,ngrams_val
36,Gradient_Boosted_12,,,,0.722984,0.717358,0.734709,0.72593,0.803206,0.697579,0.689662,0.706403,0.697932,0.772152,Tfidf,200.0,,1.0,1
32,Gradient_Boosted_8,,,,0.709703,0.704716,0.720554,0.712547,0.786444,0.696827,0.689548,0.703949,0.696674,0.768681,Tfidf,100.0,,1.1,1
31,Gradient_Boosted_7,,,,0.710028,0.704347,0.722601,0.713357,0.787408,0.696596,0.688434,0.706111,0.697161,0.771293,Tfidf,100.0,,1.01,1
30,Gradient_Boosted_6,,,,0.70914,0.702284,0.724742,0.713336,0.78652,0.696582,0.687157,0.709558,0.698178,0.769949,Tfidf,100.0,,1.0,1
16,LogReg_V8,,1.0,saga,0.717606,0.71754,0.716518,0.717029,0.789683,0.687321,0.684657,0.681837,0.683244,0.755119,Tfidf,,,,1
35,Gradient_Boosted_11,,,,0.690778,0.681064,0.716055,0.698121,0.763517,0.686758,0.674024,0.710054,0.69157,0.759933,Tfidf,200.0,,0.1,1
19,LogReg_V11,,10.0,saga,0.723847,0.724117,0.722058,0.723086,0.797091,0.686209,0.684105,0.679149,0.681618,0.753444,Tfidf,,,,1
11,LogReg_V3,,0.1,lbfgs,0.701317,0.699966,0.703303,0.70163,0.769428,0.68592,0.68143,0.685371,0.683395,0.751441,Tfidf,,,,1
13,LogReg_V5,,0.1,saga,0.698901,0.697756,0.70038,0.699066,0.766192,0.685573,0.68144,0.684028,0.682731,0.750606,Tfidf,,,,1
12,LogReg_V4,,0.1,sag,0.700111,0.698899,0.701755,0.700324,0.7676,0.685559,0.681515,0.683765,0.682638,0.751067,Tfidf,,,,1


In [51]:
all_columns = ML_info_df.columns.to_list()
all_columns

['model_ID',
 'alpha',
 'C',
 'solver',
 'train_acc',
 'train_precision',
 'train_recall',
 'train_f1',
 'train_roc_auc',
 'test_acc',
 'test_precision',
 'test_recall',
 'test_f1',
 'test_roc_auc',
 'Vectorization',
 'n_estimators',
 'max_depth',
 'learning_rate',
 'ngrams_val']

In [54]:
from sklearn.ensemble import GradientBoostingClassifier
temp_ML_list = []
index_val = 14
tfidf_ngram_test_tuples = [(X_train_tfidf_2, y_train_tfidf_2, X_test_tfidf_2, y_test_tfidf_2),
                           (X_train_tfidf_3, y_train_tfidf_3, X_test_tfidf_3, y_test_tfidf_3)]

for n_estimators_val in tqdm([100]):
    for learning_rate_val in tqdm([0.01, 0.1, 1]):
        for ngrams_tuple_val in np.arange(2):
            ngrams_tuple = tfidf_ngram_test_tuples[ngrams_tuple_val]
            X_train_temp = ngrams_tuple[0]
            y_train_temp = ngrams_tuple[1]
            X_test_temp = ngrams_tuple[2]
            y_test_temp = ngrams_tuple[3]
            model_name = "Gradient_Boosted_{}".format(str(index_val))
            clf = GradientBoostingClassifier(n_estimators=n_estimators_val, learning_rate=learning_rate_val, random_state=0)
            clf.fit(X_train_temp, y_train_temp)
            y_train_pred = clf.predict(X_train_temp)
            y_test_pred = clf.predict(X_test_temp)
            results_dict = obtain_train_and_test_metrics(y_train_temp, y_train_pred, y_test_temp, y_test_pred)
            results_dict['train_roc_auc'] = obtain_roc_auc_score(clf, X_train_temp, y_train_temp)
            results_dict['test_roc_auc'] = obtain_roc_auc_score(clf, X_test_temp, y_test_temp)
            results_dict['model_ID'] = model_name
            results_dict['n_estimators'] = n_estimators_val
            results_dict['learning_rate'] = learning_rate_val
            results_dict['Vectorization'] = 'Tfidf'
            results_dict['ngrams_val'] = ngrams_tuple_val
            temp_ML_list.append(results_dict)
            index_val += 1
        
temp_ML_info_df = pd.DataFrame(data=temp_ML_list, columns=all_columns)

ML_info_df = pd.concat([ML_info_df, temp_ML_info_df], ignore_index=True)
ML_info_df

  0%|          | 0/1 [00:00<?, ?it/s]
  0%|          | 0/3 [00:00<?, ?it/s][A
 33%|███▎      | 1/3 [52:58<1:45:56, 3178.34s/it][A
 67%|██████▋   | 2/3 [1:43:56<51:47, 3107.90s/it][A
100%|██████████| 3/3 [2:33:50<00:00, 3076.77s/it][A
100%|██████████| 1/1 [2:33:50<00:00, 9230.30s/it]


Unnamed: 0,model_ID,alpha,C,solver,train_acc,train_precision,train_recall,train_f1,train_roc_auc,test_acc,test_precision,test_recall,test_f1,test_roc_auc,Vectorization,n_estimators,max_depth,learning_rate,ngrams_val
0,Gradient_Boosted_12,,,,0.722984,0.717358,0.734709,0.72593,0.803206,0.697579,0.689662,0.706403,0.697932,0.772152,Tfidf,200.0,,1.0,1
1,Gradient_Boosted_8,,,,0.709703,0.704716,0.720554,0.712547,0.786444,0.696827,0.689548,0.703949,0.696674,0.768681,Tfidf,100.0,,1.1,1
2,Gradient_Boosted_7,,,,0.710028,0.704347,0.722601,0.713357,0.787408,0.696596,0.688434,0.706111,0.697161,0.771293,Tfidf,100.0,,1.01,1
3,Gradient_Boosted_6,,,,0.70914,0.702284,0.724742,0.713336,0.78652,0.696582,0.687157,0.709558,0.698178,0.769949,Tfidf,100.0,,1.0,1
4,LogReg_V8,,1.0,saga,0.717606,0.71754,0.716518,0.717029,0.789683,0.687321,0.684657,0.681837,0.683244,0.755119,Tfidf,,,,1
5,Gradient_Boosted_11,,,,0.690778,0.681064,0.716055,0.698121,0.763517,0.686758,0.674024,0.710054,0.69157,0.759933,Tfidf,200.0,,0.1,1
6,LogReg_V11,,10.0,saga,0.723847,0.724117,0.722058,0.723086,0.797091,0.686209,0.684105,0.679149,0.681618,0.753444,Tfidf,,,,1
7,LogReg_V3,,0.1,lbfgs,0.701317,0.699966,0.703303,0.70163,0.769428,0.68592,0.68143,0.685371,0.683395,0.751441,Tfidf,,,,1
8,LogReg_V5,,0.1,saga,0.698901,0.697756,0.70038,0.699066,0.766192,0.685573,0.68144,0.684028,0.682731,0.750606,Tfidf,,,,1
9,LogReg_V4,,0.1,sag,0.700111,0.698899,0.701755,0.700324,0.7676,0.685559,0.681515,0.683765,0.682638,0.751067,Tfidf,,,,1


In [55]:
ML_info_df = ML_info_df.sort_values(['test_acc'], ascending=False)
ML_info_df

Unnamed: 0,model_ID,alpha,C,solver,train_acc,train_precision,train_recall,train_f1,train_roc_auc,test_acc,test_precision,test_recall,test_f1,test_roc_auc,Vectorization,n_estimators,max_depth,learning_rate,ngrams_val
43,Gradient_Boosted_19,,,,0.716714,0.708277,0.735686,0.721721,0.796982,0.698908,0.692257,0.704329,0.698241,0.774761,Tfidf,100.0,,1.0,1
42,Gradient_Boosted_18,,,,0.714348,0.705792,0.733834,0.71954,0.794199,0.697853,0.689334,0.708302,0.698689,0.774788,Tfidf,100.0,,1.0,0
0,Gradient_Boosted_12,,,,0.722984,0.717358,0.734709,0.72593,0.803206,0.697579,0.689662,0.706403,0.697932,0.772152,Tfidf,200.0,,1.0,1
1,Gradient_Boosted_8,,,,0.709703,0.704716,0.720554,0.712547,0.786444,0.696827,0.689548,0.703949,0.696674,0.768681,Tfidf,100.0,,1.1,1
2,Gradient_Boosted_7,,,,0.710028,0.704347,0.722601,0.713357,0.787408,0.696596,0.688434,0.706111,0.697161,0.771293,Tfidf,100.0,,1.01,1
3,Gradient_Boosted_6,,,,0.70914,0.702284,0.724742,0.713336,0.78652,0.696582,0.687157,0.709558,0.698178,0.769949,Tfidf,100.0,,1.0,1
4,LogReg_V8,,1.0,saga,0.717606,0.71754,0.716518,0.717029,0.789683,0.687321,0.684657,0.681837,0.683244,0.755119,Tfidf,,,,1
5,Gradient_Boosted_11,,,,0.690778,0.681064,0.716055,0.698121,0.763517,0.686758,0.674024,0.710054,0.69157,0.759933,Tfidf,200.0,,0.1,1
6,LogReg_V11,,10.0,saga,0.723847,0.724117,0.722058,0.723086,0.797091,0.686209,0.684105,0.679149,0.681618,0.753444,Tfidf,,,,1
7,LogReg_V3,,0.1,lbfgs,0.701317,0.699966,0.703303,0.70163,0.769428,0.68592,0.68143,0.685371,0.683395,0.751441,Tfidf,,,,1


In [67]:
ML_info_df = ML_info_df.set_index('model_ID')
ML_info_df

Unnamed: 0_level_0,alpha,C,solver,train_acc,train_precision,train_recall,train_f1,train_roc_auc,test_acc,test_precision,test_recall,test_f1,test_roc_auc,Vectorization,n_estimators,max_depth,learning_rate,ngrams_val
model_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1
Gradient_Boosted_19,,,,0.716714,0.708277,0.735686,0.721721,0.796982,0.698908,0.692257,0.704329,0.698241,0.774761,Tfidf,100.0,,1.0,1.0
Gradient_Boosted_18,,,,0.714348,0.705792,0.733834,0.71954,0.794199,0.697853,0.689334,0.708302,0.698689,0.774788,Tfidf,100.0,,1.0,0.0
Gradient_Boosted_12,,,,0.722984,0.717358,0.734709,0.72593,0.803206,0.697579,0.689662,0.706403,0.697932,0.772152,Tfidf,200.0,,1.0,1.0
Gradient_Boosted_8,,,,0.709703,0.704716,0.720554,0.712547,0.786444,0.696827,0.689548,0.703949,0.696674,0.768681,Tfidf,100.0,,1.1,1.0
Gradient_Boosted_7,,,,0.710028,0.704347,0.722601,0.713357,0.787408,0.696596,0.688434,0.706111,0.697161,0.771293,Tfidf,100.0,,1.01,1.0
Gradient_Boosted_6,,,,0.70914,0.702284,0.724742,0.713336,0.78652,0.696582,0.687157,0.709558,0.698178,0.769949,Tfidf,100.0,,1.0,1.0
LogReg_V8,,1.0,saga,0.717606,0.71754,0.716518,0.717029,0.789683,0.687321,0.684657,0.681837,0.683244,0.755119,Tfidf,,,,1.0
Gradient_Boosted_11,,,,0.690778,0.681064,0.716055,0.698121,0.763517,0.686758,0.674024,0.710054,0.69157,0.759933,Tfidf,200.0,,0.1,1.0
LogReg_V11,,10.0,saga,0.723847,0.724117,0.722058,0.723086,0.797091,0.686209,0.684105,0.679149,0.681618,0.753444,Tfidf,,,,1.0
LogReg_V3,,0.1,lbfgs,0.701317,0.699966,0.703303,0.70163,0.769428,0.68592,0.68143,0.685371,0.683395,0.751441,Tfidf,,,,1.0


In [68]:
ML_info_df.at['Gradient_Boosted_14', 'ngrams_val'] = 2
ML_info_df.at['Gradient_Boosted_15', 'ngrams_val'] = 3
ML_info_df.at['Gradient_Boosted_16', 'ngrams_val'] = 2
ML_info_df.at['Gradient_Boosted_17', 'ngrams_val'] = 3
ML_info_df.at['Gradient_Boosted_18', 'ngrams_val'] = 2
ML_info_df.at['Gradient_Boosted_19', 'ngrams_val'] = 3

In [69]:
ML_info_df.head()

Unnamed: 0_level_0,alpha,C,solver,train_acc,train_precision,train_recall,train_f1,train_roc_auc,test_acc,test_precision,test_recall,test_f1,test_roc_auc,Vectorization,n_estimators,max_depth,learning_rate,ngrams_val
model_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1
Gradient_Boosted_19,,,,0.716714,0.708277,0.735686,0.721721,0.796982,0.698908,0.692257,0.704329,0.698241,0.774761,Tfidf,100.0,,1.0,3.0
Gradient_Boosted_18,,,,0.714348,0.705792,0.733834,0.71954,0.794199,0.697853,0.689334,0.708302,0.698689,0.774788,Tfidf,100.0,,1.0,2.0
Gradient_Boosted_12,,,,0.722984,0.717358,0.734709,0.72593,0.803206,0.697579,0.689662,0.706403,0.697932,0.772152,Tfidf,200.0,,1.0,1.0
Gradient_Boosted_8,,,,0.709703,0.704716,0.720554,0.712547,0.786444,0.696827,0.689548,0.703949,0.696674,0.768681,Tfidf,100.0,,1.1,1.0
Gradient_Boosted_7,,,,0.710028,0.704347,0.722601,0.713357,0.787408,0.696596,0.688434,0.706111,0.697161,0.771293,Tfidf,100.0,,1.01,1.0


In [70]:
ML_info_df.to_csv('Current_model_information_20Feb2023_V5.csv')

In [72]:
ML_info_df = ML_info_df.reset_index()
ML_info_df

Unnamed: 0,model_ID,alpha,C,solver,train_acc,train_precision,train_recall,train_f1,train_roc_auc,test_acc,test_precision,test_recall,test_f1,test_roc_auc,Vectorization,n_estimators,max_depth,learning_rate,ngrams_val
0,Gradient_Boosted_19,,,,0.716714,0.708277,0.735686,0.721721,0.796982,0.698908,0.692257,0.704329,0.698241,0.774761,Tfidf,100.0,,1.0,3.0
1,Gradient_Boosted_18,,,,0.714348,0.705792,0.733834,0.71954,0.794199,0.697853,0.689334,0.708302,0.698689,0.774788,Tfidf,100.0,,1.0,2.0
2,Gradient_Boosted_12,,,,0.722984,0.717358,0.734709,0.72593,0.803206,0.697579,0.689662,0.706403,0.697932,0.772152,Tfidf,200.0,,1.0,1.0
3,Gradient_Boosted_8,,,,0.709703,0.704716,0.720554,0.712547,0.786444,0.696827,0.689548,0.703949,0.696674,0.768681,Tfidf,100.0,,1.1,1.0
4,Gradient_Boosted_7,,,,0.710028,0.704347,0.722601,0.713357,0.787408,0.696596,0.688434,0.706111,0.697161,0.771293,Tfidf,100.0,,1.01,1.0
5,Gradient_Boosted_6,,,,0.70914,0.702284,0.724742,0.713336,0.78652,0.696582,0.687157,0.709558,0.698178,0.769949,Tfidf,100.0,,1.0,1.0
6,LogReg_V8,,1.0,saga,0.717606,0.71754,0.716518,0.717029,0.789683,0.687321,0.684657,0.681837,0.683244,0.755119,Tfidf,,,,1.0
7,Gradient_Boosted_11,,,,0.690778,0.681064,0.716055,0.698121,0.763517,0.686758,0.674024,0.710054,0.69157,0.759933,Tfidf,200.0,,0.1,1.0
8,LogReg_V11,,10.0,saga,0.723847,0.724117,0.722058,0.723086,0.797091,0.686209,0.684105,0.679149,0.681618,0.753444,Tfidf,,,,1.0
9,LogReg_V3,,0.1,lbfgs,0.701317,0.699966,0.703303,0.70163,0.769428,0.68592,0.68143,0.685371,0.683395,0.751441,Tfidf,,,,1.0


In [74]:
ML_info_df = ML_info_df.iloc[:-6]
ML_info_df

Unnamed: 0,model_ID,alpha,C,solver,train_acc,train_precision,train_recall,train_f1,train_roc_auc,test_acc,test_precision,test_recall,test_f1,test_roc_auc,Vectorization,n_estimators,max_depth,learning_rate,ngrams_val
0,Gradient_Boosted_19,,,,0.716714,0.708277,0.735686,0.721721,0.796982,0.698908,0.692257,0.704329,0.698241,0.774761,Tfidf,100.0,,1.0,3.0
1,Gradient_Boosted_18,,,,0.714348,0.705792,0.733834,0.71954,0.794199,0.697853,0.689334,0.708302,0.698689,0.774788,Tfidf,100.0,,1.0,2.0
2,Gradient_Boosted_12,,,,0.722984,0.717358,0.734709,0.72593,0.803206,0.697579,0.689662,0.706403,0.697932,0.772152,Tfidf,200.0,,1.0,1.0
3,Gradient_Boosted_8,,,,0.709703,0.704716,0.720554,0.712547,0.786444,0.696827,0.689548,0.703949,0.696674,0.768681,Tfidf,100.0,,1.1,1.0
4,Gradient_Boosted_7,,,,0.710028,0.704347,0.722601,0.713357,0.787408,0.696596,0.688434,0.706111,0.697161,0.771293,Tfidf,100.0,,1.01,1.0
5,Gradient_Boosted_6,,,,0.70914,0.702284,0.724742,0.713336,0.78652,0.696582,0.687157,0.709558,0.698178,0.769949,Tfidf,100.0,,1.0,1.0
6,LogReg_V8,,1.0,saga,0.717606,0.71754,0.716518,0.717029,0.789683,0.687321,0.684657,0.681837,0.683244,0.755119,Tfidf,,,,1.0
7,Gradient_Boosted_11,,,,0.690778,0.681064,0.716055,0.698121,0.763517,0.686758,0.674024,0.710054,0.69157,0.759933,Tfidf,200.0,,0.1,1.0
8,LogReg_V11,,10.0,saga,0.723847,0.724117,0.722058,0.723086,0.797091,0.686209,0.684105,0.679149,0.681618,0.753444,Tfidf,,,,1.0
9,LogReg_V3,,0.1,lbfgs,0.701317,0.699966,0.703303,0.70163,0.769428,0.68592,0.68143,0.685371,0.683395,0.751441,Tfidf,,,,1.0


In [75]:
temp_ML_list = []
index_val = 20
tfidf_ngram_test_tuples = [(X_train_tfidf_2, y_train_tfidf_2, X_test_tfidf_2, y_test_tfidf_2),
                           (X_train_tfidf_3, y_train_tfidf_3, X_test_tfidf_3, y_test_tfidf_3)]

for n_estimators_val in tqdm([200]):
    for learning_rate_val in tqdm([1]):
        for ngrams_tuple_val in np.arange(1):
            ngrams_tuple = tfidf_ngram_test_tuples[ngrams_tuple_val]
            X_train_temp = ngrams_tuple[0]
            y_train_temp = ngrams_tuple[1]
            X_test_temp = ngrams_tuple[2]
            y_test_temp = ngrams_tuple[3]
            model_name = "Gradient_Boosted_{}".format(str(index_val))
            clf = GradientBoostingClassifier(n_estimators=n_estimators_val, learning_rate=learning_rate_val, random_state=0)
            clf.fit(X_train_temp, y_train_temp)
            y_train_pred = clf.predict(X_train_temp)
            y_test_pred = clf.predict(X_test_temp)
            results_dict = obtain_train_and_test_metrics(y_train_temp, y_train_pred, y_test_temp, y_test_pred)
            results_dict['train_roc_auc'] = obtain_roc_auc_score(clf, X_train_temp, y_train_temp)
            results_dict['test_roc_auc'] = obtain_roc_auc_score(clf, X_test_temp, y_test_temp)
            results_dict['model_ID'] = model_name
            results_dict['n_estimators'] = n_estimators_val
            results_dict['learning_rate'] = learning_rate_val
            results_dict['Vectorization'] = 'Tfidf'
            results_dict['ngrams_val'] = (ngrams_tuple_val + 2)
            temp_ML_list.append(results_dict)
            index_val += 1
        
temp_ML_info_df = pd.DataFrame(data=temp_ML_list, columns=all_columns)

ML_info_df = pd.concat([ML_info_df, temp_ML_info_df], ignore_index=True)
ML_info_df

  0%|          | 0/1 [00:00<?, ?it/s]
  0%|          | 0/1 [00:00<?, ?it/s][A
100%|██████████| 1/1 [34:27<00:00, 2067.28s/it][A
100%|██████████| 1/1 [34:27<00:00, 2067.29s/it]


Unnamed: 0,model_ID,alpha,C,solver,train_acc,train_precision,train_recall,train_f1,train_roc_auc,test_acc,test_precision,test_recall,test_f1,test_roc_auc,Vectorization,n_estimators,max_depth,learning_rate,ngrams_val
0,Gradient_Boosted_19,,,,0.716714,0.708277,0.735686,0.721721,0.796982,0.698908,0.692257,0.704329,0.698241,0.774761,Tfidf,100.0,,1.0,3.0
1,Gradient_Boosted_18,,,,0.714348,0.705792,0.733834,0.71954,0.794199,0.697853,0.689334,0.708302,0.698689,0.774788,Tfidf,100.0,,1.0,2.0
2,Gradient_Boosted_12,,,,0.722984,0.717358,0.734709,0.72593,0.803206,0.697579,0.689662,0.706403,0.697932,0.772152,Tfidf,200.0,,1.0,1.0
3,Gradient_Boosted_8,,,,0.709703,0.704716,0.720554,0.712547,0.786444,0.696827,0.689548,0.703949,0.696674,0.768681,Tfidf,100.0,,1.1,1.0
4,Gradient_Boosted_7,,,,0.710028,0.704347,0.722601,0.713357,0.787408,0.696596,0.688434,0.706111,0.697161,0.771293,Tfidf,100.0,,1.01,1.0
5,Gradient_Boosted_6,,,,0.70914,0.702284,0.724742,0.713336,0.78652,0.696582,0.687157,0.709558,0.698178,0.769949,Tfidf,100.0,,1.0,1.0
6,LogReg_V8,,1.0,saga,0.717606,0.71754,0.716518,0.717029,0.789683,0.687321,0.684657,0.681837,0.683244,0.755119,Tfidf,,,,1.0
7,Gradient_Boosted_11,,,,0.690778,0.681064,0.716055,0.698121,0.763517,0.686758,0.674024,0.710054,0.69157,0.759933,Tfidf,200.0,,0.1,1.0
8,LogReg_V11,,10.0,saga,0.723847,0.724117,0.722058,0.723086,0.797091,0.686209,0.684105,0.679149,0.681618,0.753444,Tfidf,,,,1.0
9,LogReg_V3,,0.1,lbfgs,0.701317,0.699966,0.703303,0.70163,0.769428,0.68592,0.68143,0.685371,0.683395,0.751441,Tfidf,,,,1.0


In [76]:
ML_info_df = ML_info_df.sort_values(['test_acc'], ascending=False)
ML_info_df.to_csv('Current_model_information_20Feb2023_V6.csv')

In [77]:
ML_info_df

Unnamed: 0,model_ID,alpha,C,solver,train_acc,train_precision,train_recall,train_f1,train_roc_auc,test_acc,test_precision,test_recall,test_f1,test_roc_auc,Vectorization,n_estimators,max_depth,learning_rate,ngrams_val
44,Gradient_Boosted_20,,,,0.728615,0.721749,0.742919,0.732181,0.811902,0.701711,0.696157,0.704271,0.70019,0.777393,Tfidf,200.0,,1.0,2.0
0,Gradient_Boosted_19,,,,0.716714,0.708277,0.735686,0.721721,0.796982,0.698908,0.692257,0.704329,0.698241,0.774761,Tfidf,100.0,,1.0,3.0
1,Gradient_Boosted_18,,,,0.714348,0.705792,0.733834,0.71954,0.794199,0.697853,0.689334,0.708302,0.698689,0.774788,Tfidf,100.0,,1.0,2.0
2,Gradient_Boosted_12,,,,0.722984,0.717358,0.734709,0.72593,0.803206,0.697579,0.689662,0.706403,0.697932,0.772152,Tfidf,200.0,,1.0,1.0
3,Gradient_Boosted_8,,,,0.709703,0.704716,0.720554,0.712547,0.786444,0.696827,0.689548,0.703949,0.696674,0.768681,Tfidf,100.0,,1.1,1.0
4,Gradient_Boosted_7,,,,0.710028,0.704347,0.722601,0.713357,0.787408,0.696596,0.688434,0.706111,0.697161,0.771293,Tfidf,100.0,,1.01,1.0
5,Gradient_Boosted_6,,,,0.70914,0.702284,0.724742,0.713336,0.78652,0.696582,0.687157,0.709558,0.698178,0.769949,Tfidf,100.0,,1.0,1.0
6,LogReg_V8,,1.0,saga,0.717606,0.71754,0.716518,0.717029,0.789683,0.687321,0.684657,0.681837,0.683244,0.755119,Tfidf,,,,1.0
7,Gradient_Boosted_11,,,,0.690778,0.681064,0.716055,0.698121,0.763517,0.686758,0.674024,0.710054,0.69157,0.759933,Tfidf,200.0,,0.1,1.0
8,LogReg_V11,,10.0,saga,0.723847,0.724117,0.722058,0.723086,0.797091,0.686209,0.684105,0.679149,0.681618,0.753444,Tfidf,,,,1.0
