In [None]:
import pickle

# Pickle files whose contents are merged into one results table further down.
# The names are appended to `base_path` when the files are opened.
feature_ablation_dict_files = [
    'overall_dict_23Feb2023.pkl',
    'overall_dict_23Feb2023_Part_2.pkl',
    'overall_dict_24Feb2023_Part_1.pkl',
    'overall_dict_24Feb2023_Part_2.pkl',
    'overall_dict_25Feb2023_Part_1.pkl',
]

In [None]:
from google.colab import drive
# Mount Google Drive into the Colab filesystem so the pickle files can be read.
drive.mount('/content/drive')

# Drive folder that holds the pickle files. The trailing slash matters: file
# names are appended to this string with plain concatenation.
base_path = '/content/drive/My Drive/'

Mounted at /content/drive


In [None]:
# Import Necessary Modules for Data Preprocessing

# Used for loading in training data [Function #1 - load_raw_training_data()]
import pandas as pd
# Adjust column width settings to see all of the 'original_text' column
pd.set_option('max_colwidth', 400)

# Used for replacing '-LRB-' and '-RRB-' with left and right parentheses in original text respectively [Function #2 - replace_LRB_and_RRB()]
import re

# Used for label value changing in preprocessing training data [Function #6 - preprocessing_training_data()]
import numpy as np

# Used for tokenization when creating score values against extraneous resources [Function #8 - extraneous_score_calculation()]
import nltk
nltk.download('punkt')
nltk.download('stopwords')
from nltk.tokenize import word_tokenize 

# Used for Parts-of-Speech tagging [Function #14 - POS_preprocessing()]
from nltk.tag import pos_tag

# Used for Lemmatization [Function #15 - lemma_preprocessing()]
from nltk.stem import WordNetLemmatizer
nltk.download('punkt')
nltk.download('omw-1.4')
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('averaged_perceptron_tagger')

# Used for vectorization [Function # ]
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.


In [None]:
# Merge the per-key score dictionaries stored in every pickle file into a single
# mapping: key -> list of ten scores (five train scores, then five test scores).
score_keys = ('train_acc', 'train_precision', 'train_recall', 'train_f1',
              'train_roc_auc_value',
              'test_acc', 'test_precision', 'test_recall', 'test_f1',
              'test_roc_auc_value')

recombined_dictionary = {}

for overall_dict_file in feature_ablation_dict_files:
  overall_dict_file = base_path + overall_dict_file
  # NOTE: pickle.load can execute arbitrary code; only open files you created yourself.
  with open(overall_dict_file, 'rb') as handle:
    temp_dict = pickle.load(handle)
    for _key, scores in temp_dict.items():
      # A key that appears in more than one file keeps the scores of the last file read.
      recombined_dictionary[_key] = [scores[name] for name in score_keys]

In [None]:
import pandas as pd

In [None]:
# Column order has to match the order in which the scores were packed into
# each list of `recombined_dictionary`.
metric_columns = ['train_acc', 'train_precision', 'train_recall',
                  'train_f1', 'train_roc_auc_value',
                  'test_acc', 'test_precision', 'test_recall',
                  'test_f1', 'test_roc_auc_value']
feature_ablation_df = pd.DataFrame.from_dict(recombined_dictionary, orient='index',
                                             columns=metric_columns)
# Reference row added by hand, scores given in `metric_columns` order.
# NOTE(review): presumably the scores of a run that kept every column - confirm
# where these numbers come from.
feature_ablation_df.loc['all_columns'] = [0.7086740443237308, 0.7021821858368557,
                                          0.7233819402250962,
                                          0.71262443084246, 0.7873256955384904,
                                          0.6982287844202375, 0.689122548463893, 0.7102588070339428,
                                          0.6995310567047384, 0.7708407927116426]
# Best test accuracy first; ties are broken by test F1.
feature_ablation_df = feature_ablation_df.sort_values(['test_acc', 'test_f1'], ascending=False)
# Show the reference row by label instead of by a hard-coded row position, so
# the cell still shows the right row when the pickled results change. Its
# position in the sorted table is feature_ablation_df.index.get_loc('all_columns').
feature_ablation_df.loc['all_columns']

train_acc              0.708674
train_precision        0.702182
train_recall           0.723382
train_f1               0.712624
train_roc_auc_value    0.787326
test_acc               0.698229
test_precision         0.689123
test_recall            0.710259
test_f1                0.699531
test_roc_auc_value     0.770841
Name: all_columns, dtype: float64

In [None]:
# Row labels of the first 113 rows of the sorted table (positional slice).
feature_ablation_df.index[:113]

Index(['norm_.', 'norm_;', 'norm_/', 'norm__', 'Brysbaert_score',
       'non_alphanumeric_1', 'avg_tok_len_1', 'norm_:', 'num_pos_tokens',
       'norm_)',
       ...
       'norm_ü', 'norm_è', 'norm_œ', 'norm_Š', 'norm_•', 'norm_+', 'norm_#',
       'norm_!', 'norm_Ò', 'norm_Ç'],
      dtype='object', length=113)

In [None]:
# Number of rows selected by the positional slice used for ablation.
feature_ablation_df.iloc[:113].shape[0]

113

In [None]:
# Full path of the pickled DataFrame that holds the text columns, the label and
# the feature columns.
# NOTE(review): this rebinds `base_path` from the Drive folder to a single file
# path. Later cells rely on that (they call pd.read_pickle(base_path)), but the
# earlier loop that appends file names to `base_path` cannot be re-run after
# this cell without re-running the folder assignment first.
base_path = '/content/drive/My Drive/new_text_data.pkl'
new_text_data = pd.read_pickle(base_path)

In [None]:
def determine_X_feat(df):
  """Return the column names of `df`, in their original order, without 'label'."""
  return [column for column in df.columns.to_list() if column != 'label']

In [None]:
def determine_engineered_feat(df):
  """Return the non-label columns of `df` that are not text inputs.

  Takes the column list produced by determine_X_feat and leaves out
  'original_text', 'lemma_text', 'text' and 'pos_tag_tokens'; the order of the
  remaining names is unchanged.
  """
  text_columns = ('original_text', 'lemma_text', 'text', 'pos_tag_tokens')
  return [feat for feat in determine_X_feat(df) if feat not in text_columns]

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import RobustScaler, MinMaxScaler, StandardScaler

def scikit_column_transformer(text_df = new_text_data, text_type = 'original_text', vector_type = 'Count', scaler='Robust', ngrams_value=1, max_features_value=None, sequence_length=500, 
                              test_size=0.2, random_state=21):
  """Split the data into train/test sets and build the model input matrices.

  Parameters:
    text_df: DataFrame that contains 'original_text', 'lemma_text',
      'pos_tag_tokens' and 'label' columns; every other column is passed to
      the selected scaler. NOTE: the default is evaluated once, when this
      `def` runs, so it is whatever `new_text_data` was bound to at that
      moment. Rebinding the global later does not change the default; pass
      the DataFrame explicitly.
    text_type: 'original_text' or 'lemma_text'. The chosen column is renamed
      to 'text' and the other one is dropped.
    vector_type: 'Count', 'Tfidf' or 'Binary' (CountVectorizer with
      binary=True).
    scaler: 'Robust', 'MinMax' or 'Standard'.
    ngrams_value: upper end of the n-gram range (1, ngrams_value) used by both
      vectorizers.
    max_features_value: max_features of the text vectorizer only; the POS
      vectorizer is not limited.
    sequence_length: not used by this function; kept so existing calls keep
      working.
    test_size, random_state: forwarded to train_test_split.

  Returns:
    (column_trans, X_train_matrix, y_train, X_test_matrix, y_test) on success.
    For an unknown text_type, vector_type or scaler an error message string is
    returned instead, so callers that unpack five values fail on bad input.
  """
  # Reduce the input dataframe to only include either the original_text or lemma_text columns
  if text_type == 'original_text':
    final_text_df = text_df.drop(columns=['lemma_text'])
    final_text_df = final_text_df.rename(columns={'original_text': 'text'})
  elif text_type == 'lemma_text':
    final_text_df = text_df.drop(columns=['original_text'])
    final_text_df = final_text_df.rename(columns={'lemma_text': 'text'})
  else:
    return 'Incorrect input for text_type argument'

  # Perform the Train-Test Split Based on Input Data
  X_feat = determine_X_feat(final_text_df)
  X_train, X_test, y_train, y_test = train_test_split(final_text_df[X_feat], final_text_df['label'], test_size=test_size, random_state=random_state)

  # Select Vectors for text data and POS data.
  # The POS vectorizers treat every run of non-whitespace characters as one
  # token and keep the original casing.
  if vector_type == 'Count':
    text_vector = CountVectorizer(ngram_range=(1, ngrams_value),max_features=max_features_value)
    pos_vector = CountVectorizer(ngram_range=(1, ngrams_value), preprocessor=None, token_pattern=r'[^\s]+', lowercase=False)
  elif vector_type == 'Tfidf':
    text_vector = TfidfVectorizer(ngram_range=(1, ngrams_value), max_features=max_features_value)
    pos_vector = TfidfVectorizer(ngram_range=(1, ngrams_value), token_pattern=r'[^\s]+', lowercase=False)
  elif vector_type == 'Binary':
    text_vector = CountVectorizer(binary=True, ngram_range=(1, ngrams_value),max_features=max_features_value)
    pos_vector = CountVectorizer(binary=True, ngram_range=(1, ngrams_value), preprocessor=None, token_pattern=r'[^\s]+', lowercase=False)
  else:
    return 'Incorrect input for vector_type argument'

  # Select the desired scaler based on input string
  dict_of_scalers = {'Robust': RobustScaler(), 'MinMax': MinMaxScaler() , 'Standard': StandardScaler()}
  try:
    selected_feature_scaler = dict_of_scalers[scaler]
  except (KeyError, TypeError):
    # KeyError: unknown name; TypeError: unhashable value. Other errors are no
    # longer swallowed. The message now lists the names that are accepted.
    return 'Incorrect input for scaler argument - must be either Robust, MinMax or Standard'

  # Use Scikit-Learn Column Transformer to vectorize the text data and the POS data, and transform the additional features by selected scaler
  column_trans = ColumnTransformer([('vector_text', text_vector, 'text'), 
                                    ('vector_pos_tags', pos_vector, 'pos_tag_tokens')], 
                                   remainder = selected_feature_scaler)

  # Fit on the training rows only, then apply the fitted transformer to the test rows
  X_train_matrix = column_trans.fit_transform(X_train)
  X_test_matrix = column_trans.transform(X_test)

  return column_trans, X_train_matrix, y_train, X_test_matrix, y_test

In [None]:
from sklearn import metrics

def obtain_comparison_metrics(y_true, y_pred):
    """Score predictions against the true labels.

    Returns a 4-tuple (accuracy, precision, recall, f1), each computed by the
    matching sklearn.metrics function with its default settings.
    """
    return (
        metrics.accuracy_score(y_true, y_pred),
        metrics.precision_score(y_true, y_pred),
        metrics.recall_score(y_true, y_pred),
        metrics.f1_score(y_true, y_pred),
    )

def obtain_train_and_test_metrics(y_train_true, y_train_pred, y_test_true, y_test_pred):
    """Collect accuracy, precision, recall and F1 for the train and test sets.

    Returns a dict whose keys are, in order, 'train_acc', 'train_precision',
    'train_recall', 'train_f1', 'test_acc', 'test_precision', 'test_recall'
    and 'test_f1'.
    """
    train_keys = ('train_acc', 'train_precision', 'train_recall', 'train_f1')
    test_keys = ('test_acc', 'test_precision', 'test_recall', 'test_f1')

    train_scores = obtain_comparison_metrics(y_train_true, y_train_pred)
    test_scores = obtain_comparison_metrics(y_test_true, y_test_pred)

    # Train entries first, then test entries, matching the documented key order.
    output_dict = dict(zip(train_keys, train_scores))
    output_dict.update(zip(test_keys, test_scores))
    return output_dict

def obtain_roc_auc_score(clf, X, y_true):
    """Return the ROC AUC of `clf` on `X`.

    The score passed to roc_auc_score is column 1 of clf.predict_proba(X);
    `y_true` must expose a `.values` attribute (e.g. a pandas Series).
    """
    second_column_proba = clf.predict_proba(X)[:, 1]
    return metrics.roc_auc_score(y_true.values, second_column_proba)

In [None]:
from tqdm import tqdm
from sklearn.ensemble import GradientBoostingClassifier
import time

In [None]:
# Remove the features named by the first 113 rows of the sorted ablation table.
# Because `new_text_data` is rebound to the reduced frame, this cell is one-shot:
# running it again raises a KeyError, since the columns are already gone.
features_to_be_ablated = feature_ablation_df.index[:113].tolist()
new_text_data = new_text_data.drop(columns=features_to_be_ablated)

In [None]:
def info_gathering_repeat(input_text_df, n_estimators=None, learning_rate=None):
  """Train a gradient-boosting classifier on TF-IDF features and collect its scores.

  Parameters:
    input_text_df: DataFrame handed to scikit_column_transformer with
      text_type='original_text' and vector_type='Tfidf'.
    n_estimators: number of boosting stages; when None, the notebook global
      `n_estimators_val` is used.
    learning_rate: learning rate of the classifier; when None, the notebook
      global `learning_rate_val` is used.

  Returns:
    The dict from obtain_train_and_test_metrics, extended with
    'train_roc_auc_value', 'test_roc_auc_value', 'columns_in_data' (all column
    names of `input_text_df`) and 'feature_importances' (the fitted
    classifier's feature_importances_).
  """
  # Fall back to the notebook-level settings so existing one-argument calls
  # behave as before; the globals are only read when no value is passed in.
  if n_estimators is None:
    n_estimators = n_estimators_val
  if learning_rate is None:
    learning_rate = learning_rate_val

  # The fitted transformer itself is not needed here, only the matrices and labels.
  _, X_train_tfidf, y_train_tfidf, X_test_tfidf, y_test_tfidf = scikit_column_transformer(text_df = input_text_df, text_type = 'original_text', vector_type = 'Tfidf')
  clf = GradientBoostingClassifier(n_estimators=n_estimators, learning_rate=learning_rate, random_state=0)
  clf.fit(X_train_tfidf, y_train_tfidf)
  train_preds = clf.predict(X_train_tfidf)
  test_preds = clf.predict(X_test_tfidf)
  output_dict = obtain_train_and_test_metrics(y_train_tfidf, train_preds, y_test_tfidf, test_preds)
  output_dict['train_roc_auc_value'] = obtain_roc_auc_score(clf, X_train_tfidf, y_train_tfidf)
  output_dict['test_roc_auc_value'] = obtain_roc_auc_score(clf, X_test_tfidf, y_test_tfidf)
  output_dict['columns_in_data'] = input_text_df.columns.to_list()
  output_dict['feature_importances'] = clf.feature_importances_
  return output_dict

In [None]:
# Hyperparameters that info_gathering_repeat reads as notebook-level globals
# when it builds the GradientBoostingClassifier; this cell must run before
# that function is called.
n_estimators_val = 100
learning_rate_val = 1
# Names of the columns that are neither the label nor one of the text columns.
engineered_features = determine_engineered_feat(new_text_data)

In [None]:
# Score the model on the DataFrame that remains after the ablated columns were dropped.
info_gathering_repeat(new_text_data)

{'train_acc': 0.7085981970007802,
 'train_precision': 0.7011101562882075,
 'train_recall': 0.7258629169921592,
 'train_f1': 0.7132718517333901,
 'test_acc': 0.6976075587274986,
 'test_precision': 0.6873292590193483,
 'test_recall': 0.7128877723900217,
 'test_f1': 0.6998752527207812,
 'train_roc_auc_value': 0.7861819293888062,
 'test_roc_auc_value': 0.7695065107155004,
 'columns_in_data': ['original_text',
  'label',
  'closed_parentheses',
  'AoA_score',
  'd_c_norm_1',
  'num_toks_1',
  'max_tok_len_1',
  'num_char_norm_1',
  'num_non_ws_char',
  'norm_í',
  'norm_(',
  'norm_=',
  "norm_'",
  'norm_,',
  'norm_-',
  'pos_tag_tokens',
  'lemma_text'],
 'feature_importances': array([0.00000000e+00, 6.16293000e-05, 0.00000000e+00, ...,
        5.22771301e-03, 5.81694491e-03, 3.05458334e-03])}

In [None]:
# Show the ablation table, sorted by test accuracy and then test F1 (descending).
feature_ablation_df

Unnamed: 0,train_acc,train_precision,train_recall,train_f1,train_roc_auc_value,test_acc,test_precision,test_recall,test_f1,test_roc_auc_value
norm_.,0.708652,0.703251,0.720597,0.711819,0.786238,0.699948,0.691684,0.709645,0.700549,0.770535
norm_;,0.710870,0.703861,0.726731,0.715113,0.788354,0.699587,0.689755,0.713530,0.701442,0.772927
norm_/,0.709349,0.703328,0.722818,0.712940,0.787793,0.699428,0.690954,0.709704,0.700203,0.771891
norm__,0.709985,0.703335,0.725002,0.714004,0.788093,0.699327,0.690167,0.711456,0.700650,0.771641
Brysbaert_score,0.709750,0.703046,0.724923,0.713817,0.787536,0.699255,0.690113,0.711340,0.700565,0.771178
...,...,...,...,...,...,...,...,...,...,...
"norm_,",0.709859,0.704179,0.722434,0.713190,0.786901,0.697059,0.688536,0.707542,0.697910,0.770490
num_char_norm_1,0.709353,0.701602,0.727230,0.714186,0.787255,0.696943,0.687103,0.711047,0.698870,0.770077
norm_=,0.709342,0.701974,0.726239,0.713900,0.786468,0.696582,0.686387,0.711690,0.698810,0.769545
norm_-,0.709010,0.703496,0.721219,0.712247,0.786917,0.696538,0.687853,0.707484,0.697530,0.770273


In [None]:
# Reload the full DataFrame from disk and re-score with only 'norm_.' and
# 'norm_;' removed. `base_path` must hold the path of the pickled DataFrame here.
new_text_data = pd.read_pickle(base_path).drop(columns=['norm_.', 'norm_;'])
test = info_gathering_repeat(new_text_data)
test

{'train_acc': 0.7092447051345026,
 'train_precision': 0.702458246273366,
 'train_recall': 0.7246622110349218,
 'train_f1': 0.7133874975967872,
 'test_acc': 0.698618856366841,
 'test_precision': 0.6881877796853452,
 'test_recall': 0.7142606765204182,
 'test_f1': 0.7009818676987026,
 'train_roc_auc_value': 0.7867076714681589,
 'test_roc_auc_value': 0.7700709793592988,
 'columns_in_data': ['original_text',
  'label',
  'closed_parentheses',
  'AoA_score',
  'Brysbaert_score',
  'd_c_norm_1',
  'num_toks_1',
  'avg_tok_len_1',
  'max_tok_len_1',
  'num_char_norm_1',
  'non_alphanumeric_1',
  'num_non_ws_char',
  'norm_<',
  'norm_ß',
  'norm_Ð',
  'norm_º',
  'norm_ù',
  'norm_™',
  'norm_Ò',
  'norm_Ç',
  'norm_ë',
  'norm_å',
  'norm_ø',
  'norm_>',
  'norm_Ö',
  'norm_ž',
  'norm_Õ',
  'norm_Í',
  'norm_ï',
  'norm_Ë',
  'norm_Ã',
  'norm_â',
  'norm_Ì',
  'norm_í',
  'norm_Ô',
  'norm_þ',
  'norm_§',
  'norm_¹',
  'norm_Þ',
  'norm_¨',
  'norm_µ',
  'norm_´',
  'norm_(',
  'norm_á',
  

In [None]:
# Reload the full DataFrame from disk and re-score with only 'norm_.' removed.
# `base_path` must hold the path of the pickled DataFrame here.
new_text_data = pd.read_pickle(base_path).drop(columns=['norm_.'])
test = info_gathering_repeat(new_text_data)
test

{'train_acc': 0.7086523736600306,
 'train_precision': 0.7032514012226285,
 'train_recall': 0.7205971703845152,
 'train_f1': 0.7118186300077881,
 'test_acc': 0.6999479904071195,
 'test_precision': 0.6916835122284543,
 'test_recall': 0.7096453817841911,
 'test_f1': 0.7005493317184999,
 'train_roc_auc_value': 0.7862380046026203,
 'test_roc_auc_value': 0.770534602780415,
 'columns_in_data': ['original_text',
  'label',
  'closed_parentheses',
  'AoA_score',
  'Brysbaert_score',
  'd_c_norm_1',
  'num_toks_1',
  'avg_tok_len_1',
  'max_tok_len_1',
  'num_char_norm_1',
  'non_alphanumeric_1',
  'num_non_ws_char',
  'norm_<',
  'norm_ß',
  'norm_Ð',
  'norm_º',
  'norm_ù',
  'norm_™',
  'norm_Ò',
  'norm_Ç',
  'norm_ë',
  'norm_å',
  'norm_ø',
  'norm_>',
  'norm_Ö',
  'norm_ž',
  'norm_Õ',
  'norm_Í',
  'norm_ï',
  'norm_Ë',
  'norm_Ã',
  'norm_â',
  'norm_Ì',
  'norm_í',
  'norm_Ô',
  'norm_þ',
  'norm_§',
  'norm_¹',
  'norm_Þ',
  'norm_¨',
  'norm_µ',
  'norm_´',
  'norm_(',
  'norm_á',
 

In [None]:
# Save the sorted ablation table (including the hand-added 'all_columns' row) to Drive as CSV.
feature_ablation_df.to_csv('/content/drive/My Drive/feature_ablation_df.csv')