In [1]:
# Colab-only: mount Google Drive so files stored on it are reachable under /content/drive
from google.colab import drive
drive.mount('/content/drive')

# Path of the input pickle file; it is read with pd.read_pickle(base_path) to build `new_text_data`.
# NOTE(review): `base_path` is reassigned to an output file path in the final save cell of this
# notebook, so re-running the load cell after that cell would read the wrong file.
base_path = '/content/drive/My Drive/new_text_data.pkl'

Mounted at /content/drive


In [2]:
# Import Necessary Modules for Data Preprocessing

# Used for loading in training data [Function #1 - load_raw_training_data()]
import pandas as pd
# Adjust column width settings to see all of the 'original_text' column
# (fully qualified option name; the short form 'max_colwidth' resolves to the same option)
pd.set_option('display.max_colwidth', 400)

# Used for replacing '-LRB-' and '-RRB-' with left and right parentheses in original text respectively [Function #2 - replace_LRB_and_RRB()]
import re

# Used for label value changing in preprocessing training data [Function #6 - preprocessing_training_data()]
import numpy as np

# Used for tokenization when creating score values against extraneous resources [Function #8 - extraneous_score_calculation()]
import nltk
from nltk.tokenize import word_tokenize

# Used for Parts-of-Speech tagging [Function #14 - POS_preprocessing()]
from nltk.tag import pos_tag

# Used for Lemmatization [Function #15 - lemma_preprocessing()]
from nltk.stem import WordNetLemmatizer

# Download every NLTK resource the notebook needs exactly once
# (previously 'punkt' and 'stopwords' were each requested twice).
for nltk_resource in ('punkt', 'stopwords', 'omw-1.4', 'wordnet', 'averaged_perceptron_tagger'):
  nltk.download(nltk_resource)

# Used for vectorization [scikit_column_transformer()]
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.


In [3]:
def determine_X_feat(df):
  """Return the column names of `df`, in their existing order, excluding 'label'."""
  return [column_name for column_name in df.columns.to_list() if column_name != 'label']

In [4]:
def determine_engineered_feat(df):
  """Return the feature columns of `df` that are not text or POS-token columns.

  Starts from determine_X_feat(df) (every column except 'label') and removes
  'original_text', 'lemma_text', 'text' and 'pos_tag_tokens', keeping the
  remaining names in their original column order.
  """
  # Columns that hold raw/lemmatized text or POS tokens rather than engineered values
  excluded_columns = ('original_text', 'lemma_text', 'text', 'pos_tag_tokens')
  return [name for name in determine_X_feat(df) if name not in excluded_columns]

In [5]:
# Load the pickled dataset from `base_path`. Later cells call .columns / .drop(columns=...) on it,
# so it is presumably a pandas DataFrame with 'label', 'original_text', 'lemma_text' and
# 'pos_tag_tokens' columns — TODO confirm against the notebook that produced the file.
# NOTE(review): unpickling can execute arbitrary code; only load pickle files from a trusted source.
new_text_data = pd.read_pickle(base_path)

In [6]:
from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import RobustScaler, MinMaxScaler, StandardScaler

def scikit_column_transformer(text_df = new_text_data, text_type = 'original_text', vector_type = 'Count', scaler='Robust', ngrams_value=1, max_features_value=None, sequence_length=500,
                              test_size=0.2, random_state=21):
  """Train/test split `text_df`, then vectorize its text columns and scale the rest.

  Parameters:
    text_df: frame containing 'label', 'original_text', 'lemma_text' and
      'pos_tag_tokens' columns plus any additional feature columns. The default
      is bound to `new_text_data` as it exists when this function is defined.
    text_type: 'original_text' or 'lemma_text'; the chosen column is renamed to
      'text' and the other one is dropped.
    vector_type: 'Count', 'Tfidf' or 'Binary'; selects the vectorizer used for
      both the 'text' and 'pos_tag_tokens' columns.
    scaler: 'Robust', 'MinMax' or 'Standard'; scaler applied to every remaining
      (non-vectorized) column.
    ngrams_value: upper bound of the n-gram range (1, ngrams_value).
    max_features_value: `max_features` for the text vectorizer only; the POS
      vectorizer is not capped.
    sequence_length: accepted for interface compatibility; not used here.
    test_size, random_state: forwarded to train_test_split.

  Returns:
    (column_trans, X_train_matrix, y_train, X_test_matrix, y_test), where
    column_trans is the ColumnTransformer fitted on the training split only.
    If text_type, vector_type or scaler is not one of the accepted values, an
    explanatory message string is returned instead of the tuple (no exception
    is raised), so callers that unpack the result will fail at the unpacking.
  """
  # Reduce the input dataframe to only include either the original_text or lemma_text columns
  if text_type == 'original_text':
    final_text_df = text_df.drop(columns=['lemma_text'])
    final_text_df = final_text_df.rename(columns={'original_text': 'text'})
  elif text_type == 'lemma_text':
    final_text_df = text_df.drop(columns=['original_text'])
    final_text_df = final_text_df.rename(columns={'lemma_text': 'text'})
  else:
    return 'Incorrect input for text_type argument'

  # Perform the Train-Test Split Based on Input Data
  X_feat = determine_X_feat(final_text_df)
  X_train, X_test, y_train, y_test = train_test_split(final_text_df[X_feat], final_text_df['label'], test_size=test_size, random_state=random_state)

  # Select Vectors for text data and POS data.
  # POS tags are whitespace-separated and case-sensitive, hence the custom token_pattern and lowercase=False.
  if vector_type == 'Count':
    text_vector = CountVectorizer(ngram_range=(1, ngrams_value),max_features=max_features_value)
    pos_vector = CountVectorizer(ngram_range=(1, ngrams_value), preprocessor=None, token_pattern=r'[^\s]+', lowercase=False)
  elif vector_type == 'Tfidf':
    text_vector = TfidfVectorizer(ngram_range=(1, ngrams_value), max_features=max_features_value)
    pos_vector = TfidfVectorizer(ngram_range=(1, ngrams_value), token_pattern=r'[^\s]+', lowercase=False)
  elif vector_type == 'Binary':
    text_vector = CountVectorizer(binary=True, ngram_range=(1, ngrams_value),max_features=max_features_value)
    pos_vector = CountVectorizer(binary=True, ngram_range=(1, ngrams_value), preprocessor=None, token_pattern=r'[^\s]+', lowercase=False)
  else:
    return 'Incorrect input for vector_type argument'

  # Select the desired scaler based on input string
  dict_of_scalers = {'Robust': RobustScaler(), 'MinMax': MinMaxScaler() , 'Standard': StandardScaler()}
  try:
    selected_feature_scaler = dict_of_scalers[scaler]
  except (KeyError, TypeError):
    # KeyError: unknown name; TypeError: unhashable value. Anything else should propagate.
    return 'Incorrect input for scaler argument - must be either Robust, MinMax or Standard'

  # Use Scikit-Learn Column Transformer to vectorize the text data and the POS data, and transform the additional features by selected scaler
  column_trans = ColumnTransformer([('vector_text', text_vector, 'text'),
                                    ('vector_pos_tags', pos_vector, 'pos_tag_tokens')],
                                   remainder = selected_feature_scaler)

  # Fit on the training split only, then apply the fitted transformer to the test split
  X_train_matrix = column_trans.fit_transform(X_train)
  X_test_matrix = column_trans.transform(X_test)

  return column_trans, X_train_matrix, y_train, X_test_matrix, y_test

In [7]:
from sklearn import metrics

def obtain_comparison_metrics(y_true, y_pred):
    """Score `y_pred` against `y_true`.

    Returns a 4-tuple (accuracy, precision, recall, f1) computed with the
    corresponding sklearn.metrics *_score functions using their defaults.
    """
    scorers = (metrics.accuracy_score, metrics.precision_score,
               metrics.recall_score, metrics.f1_score)
    return tuple(score_fn(y_true, y_pred) for score_fn in scorers)

def obtain_train_and_test_metrics(y_train_true, y_train_pred, y_test_true, y_test_pred):
    """Collect accuracy, precision, recall and F1 for both splits into one dict.

    Keys are 'train_acc', 'train_precision', 'train_recall', 'train_f1' followed
    by the same four with a 'test_' prefix, in that insertion order.
    """
    train_scores = obtain_comparison_metrics(y_train_true, y_train_pred)
    test_scores = obtain_comparison_metrics(y_test_true, y_test_pred)

    metric_keys = ('train_acc', 'train_precision', 'train_recall', 'train_f1',
                   'test_acc', 'test_precision', 'test_recall', 'test_f1')
    # Each score tuple is (accuracy, precision, recall, f1), matching the key order above
    return dict(zip(metric_keys, (*train_scores, *test_scores)))

def obtain_roc_auc_score(clf, X, y_true):
    """Return the ROC AUC of `clf` on `X`.

    Uses column 1 of clf.predict_proba(X) as the score and `y_true.values` as
    the ground truth, so `y_true` must expose a `.values` attribute.
    """
    second_column_proba = clf.predict_proba(X)[:, 1]
    return metrics.roc_auc_score(y_true.values, second_column_proba)

In [8]:
from tqdm import tqdm
from sklearn.ensemble import GradientBoostingClassifier
import time

In [9]:
# Hyperparameters read as module-level globals by info_gathering_repeat() when it
# constructs the GradientBoostingClassifier (n_estimators and learning_rate respectively)
n_estimators_val = 100
learning_rate_val = 1
# Names of every column in new_text_data other than 'label', 'original_text', 'lemma_text',
# 'text' and 'pos_tag_tokens'; the ablation loop below iterates over a slice of this list
engineered_features = determine_engineered_feat(new_text_data)

In [10]:
def info_gathering_repeat(input_text_df):
  """Fit a gradient-boosting classifier on `input_text_df` and report its scores.

  The frame is split and transformed by scikit_column_transformer using the
  'original_text' column with a Tfidf vectorizer. The classifier is built from
  the module-level `n_estimators_val` and `learning_rate_val` with random_state=0.

  Returns a dict holding the train/test accuracy, precision, recall and F1,
  'train_roc_auc_value', 'test_roc_auc_value', 'columns_in_data' (the column
  names of `input_text_df`) and 'feature_importances' (from the fitted model).
  """
  _column_trans, train_matrix, train_labels, test_matrix, test_labels = scikit_column_transformer(text_df = input_text_df, text_type = 'original_text', vector_type = 'Tfidf')

  booster = GradientBoostingClassifier(n_estimators=n_estimators_val, learning_rate=learning_rate_val, random_state=0)
  booster.fit(train_matrix, train_labels)

  train_predictions = booster.predict(train_matrix)
  test_predictions = booster.predict(test_matrix)

  results = obtain_train_and_test_metrics(train_labels, train_predictions, test_labels, test_predictions)
  results.update({
      'train_roc_auc_value': obtain_roc_auc_score(booster, train_matrix, train_labels),
      'test_roc_auc_value': obtain_roc_auc_score(booster, test_matrix, test_labels),
      'columns_in_data': input_text_df.columns.to_list(),
      'feature_importances': booster.feature_importances_,
  })
  return results

In [11]:
# Leave-one-feature-out ablation: for each engineered feature, retrain the classifier on the
# data with that single column removed and store the resulting metrics under the feature's name.
start_time = time.time()
overall_dict = {}

# Index of the first engineered feature handled by this run.
# NOTE(review): presumably the earlier features were processed in a separate run — confirm.
ABLATION_START_INDEX = 103

# tqdm reports progress; each iteration trains a full model, so the loop runs for hours
for col in tqdm(engineered_features[ABLATION_START_INDEX:]):
  # DataFrame.drop returns a new frame and leaves new_text_data untouched,
  # so an explicit .copy() beforehand would only duplicate the data a second time
  temp_data = new_text_data.drop(columns=[col])
  info_output_dict = info_gathering_repeat(temp_data)
  overall_dict[col] = info_output_dict

end_time = time.time()
delta_time = end_time - start_time
print("total run time: {} mins".format(delta_time/60))

total run time: 215.40765579541525 mins


In [12]:
# Re-mount Drive (a no-op message is printed if it is already mounted) before writing results
from google.colab import drive
drive.mount('/content/drive')

import pickle

# Path of the pickle file the ablation results (`overall_dict`) are written to.
# NOTE(review): this overwrites the `base_path` set in the first cell (the input data path);
# re-running the data-loading cell afterwards would load this results file instead.
base_path = '/content/drive/My Drive/overall_dict_25Feb2023_Part_1.pkl'


# Persist the results dict to Drive
with open(base_path, 'wb') as handle:
  pickle.dump(overall_dict, handle, protocol=pickle.HIGHEST_PROTOCOL)

# Read the file back to check that the dump round-trips
with open(base_path, 'rb') as handle:
    b = pickle.load(handle)

# NOTE(review): this prints the whole nested dict (metrics, column lists and importances for
# every feature), which produces a very large cell output; consider printing a summary instead.
print(b)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
{'norm_ç': {'train_acc': 0.7086740443237308, 'train_precision': 0.7021821858368557, 'train_recall': 0.7233819402250962, 'train_f1': 0.71262443084246, 'test_acc': 0.6982576786385044, 'test_precision': 0.6891508899217775, 'test_recall': 0.7102880177601215, 'test_f1': 0.699559826232055, 'train_roc_auc_value': 0.7873256955384904, 'test_roc_auc_value': 0.7708949223982625, 'columns_in_data': ['original_text', 'label', 'closed_parentheses', 'AoA_score', 'Brysbaert_score', 'd_c_norm_1', 'num_toks_1', 'avg_tok_len_1', 'max_tok_len_1', 'num_char_norm_1', 'non_alphanumeric_1', 'num_non_ws_char', 'norm_<', 'norm_ß', 'norm_Ð', 'norm_º', 'norm_ù', 'norm_™', 'norm_Ò', 'norm_Ç', 'norm_ë', 'norm_å', 'norm_ø', 'norm_>', 'norm_Ö', 'norm_ž', 'norm_Õ', 'norm_Í', 'norm_ï', 'norm_Ë', 'norm_Ã', 'norm_â', 'norm_Ì', 'norm_í', 'norm_Ô', 'norm_þ', 'norm_§', 'norm_¹', 'norm_Þ', 'norm_¨',