In [1]:
!pip install altair
!pip install altair
!pip install nltk
!pip install tensorflow

import numpy as np
import pandas as pd
import altair as alt
from tqdm import tqdm
import time
import re
from scipy.sparse import hstack

import nltk
from nltk.corpus import stopwords, wordnet
from nltk import pos_tag
from nltk.stem import WordNetLemmatizer

from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import RobustScaler, MinMaxScaler, StandardScaler
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import train_test_split, GridSearchCV, cross_validate, KFold
from sklearn.naive_bayes import BernoulliNB, MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier

import tensorflow as tf
import keras
from keras import layers
from keras.layers import TextVectorization

Collecting altair
  Downloading altair-4.2.2-py3-none-any.whl (813 kB)
[K     |████████████████████████████████| 813 kB 9.0 MB/s eta 0:00:01
Collecting toolz
  Downloading toolz-0.12.0-py3-none-any.whl (55 kB)
[K     |████████████████████████████████| 55 kB 12.0 MB/s  eta 0:00:01
Installing collected packages: toolz, altair
Successfully installed altair-4.2.2 toolz-0.12.0
Collecting nltk
  Downloading nltk-3.8.1-py3-none-any.whl (1.5 MB)
[K     |████████████████████████████████| 1.5 MB 7.7 MB/s eta 0:00:01
Collecting regex>=2021.8.3
  Downloading regex-2022.10.31-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (772 kB)
[K     |████████████████████████████████| 772 kB 126.6 MB/s eta 0:00:01
Installing collected packages: regex, nltk
Successfully installed nltk-3.8.1 regex-2022.10.31
Collecting tensorflow
  Downloading tensorflow-2.11.0-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (588.3 MB)
[K     |████████████████████████████████| 588.3 MB 15 kB/s /s eta 0:00:

2023-02-27 17:47:34.407003: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 AVX512F FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2023-02-27 17:47:34.523142: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /opt/oracle/instantclient_12_1:/usr/lib/jvm/java-11-openjdk-amd64/jre/lib/amd64/server:
2023-02-27 17:47:34.523175: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine.
2023-02-27 17:47:35.124492: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic li

In [2]:
nltk.download('punkt')
nltk.download('omw-1.4')
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('averaged_perceptron_tagger')

[nltk_data] Downloading package punkt to /home/ubuntu/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package omw-1.4 to /home/ubuntu/nltk_data...
[nltk_data] Downloading package stopwords to /home/ubuntu/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /home/ubuntu/nltk_data...
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /home/ubuntu/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.


True

# Data Preparation

# Binary/ Count/ TFIDF Vectorization

## 1) Import New Dataset

In [3]:
# Load data from CSV file
new_data = pd.read_csv('../results/new_text_data.csv')

# Drop all rows that contain Null Entries in 'lemma_text' column
new_data = new_data.dropna(subset=['lemma_text'])

## 2) Binary/Count/TFIDF Vectorization

In [4]:
# Function to identify all columns that are not 'label' in a DataFrame
def determine_X_feat(df):
    all_features = df.columns.to_list()
    X_feat = []
    for feat in all_features:
        if feat != 'label':
            X_feat.append(feat)
    return X_feat

In [5]:
def scikit_column_transformer(text_df = new_data, text_type = 'original_text', vector_type = 'Count',
                              scaler='Robust', ngrams_value=1, max_features_value=None, sequence_length=500, 
                              test_size=0.2, random_state=21):
  
    # Reduce the input dataframe to only include either the original_text or lemma_text columns
    if text_type == 'original_text':
        final_text_df = text_df.drop(columns=['lemma_text'])
        final_text_df = final_text_df.rename(columns={'original_text': 'text'})
        
    elif text_type == 'lemma_text':
        final_text_df = text_df.drop(columns=['original_text'])
        final_text_df = final_text_df.rename(columns={'lemma_text': 'text'})
        
    else:
        return 'Incorrect input for text_type argument'


    # Perform the Train-Test Split Based on Input Data
    X_feat = determine_X_feat(final_text_df)
    X_train, X_test, y_train, y_test = train_test_split(final_text_df[X_feat], final_text_df['label'],
                                                      test_size=test_size, random_state=random_state)


    # Select Vectors for text data and POS data
    if vector_type == 'Count':
        text_vector = CountVectorizer(ngram_range=(1, ngrams_value),max_features=max_features_value)
        pos_vector = CountVectorizer(ngram_range=(1, ngrams_value), preprocessor=None,
                                     token_pattern=r'[^\s]+', lowercase=False)
    elif vector_type == 'Tfidf':
        text_vector = TfidfVectorizer(ngram_range=(1, ngrams_value), max_features=max_features_value)
        pos_vector = TfidfVectorizer(ngram_range=(1, ngrams_value),
                                     token_pattern=r'[^\s]+', lowercase=False)
    elif vector_type == 'Binary':
        text_vector = CountVectorizer(binary=True, ngram_range=(1, ngrams_value),max_features=max_features_value)
        pos_vector = CountVectorizer(binary=True, ngram_range=(1, ngrams_value), preprocessor=None,
                                     token_pattern=r'[^\s]+', lowercase=False)
    elif vector_type == 'Indice':
        text_vector = TextVectorization(output_mode="int", max_tokens=max_features_value,
                                   output_sequence_length=sequence_length, ngrams=(1, ngrams_value))
    else:
        return 'Incorrect input for vector_type argument'


    # Select the desired scaler based on input string
    dict_of_scalers = {'Robust': RobustScaler(), 'MinMax': MinMaxScaler(),
                       'Standard': StandardScaler(), 'drop': 'drop'}
    
    try:
        selected_feature_scaler = dict_of_scalers[scaler]
    except:
        return 'Incorrect input for scaler argument - must be either Count, MinMax, Standard or drop'


    # Use Scikit-Learn Column Transformer to vectorize the text data and the POS data,
    # and transform the additional features by selected scaler
    if vector_type != 'Indice':
        column_trans = ColumnTransformer([('vector_text', text_vector, 'text'), 
                                        ('vector_pos_tags', pos_vector, 'pos_tag_tokens')], 
                                      remainder = selected_feature_scaler)

        # Perform Fit_Transform on X_train and transform on X_test  
        X_train_matrix = column_trans.fit_transform(X_train)
        X_test_matrix = column_trans.transform(X_test)

    else:
        column_trans = None

        # Perform Fit_Transform on X_train and transform on X_test
        text_vector.adapt(X_train['text'])
        X_train_matrix = text_vector(X_train['text'])
        X_test_matrix = text_vector(X_test['text'])

    return column_trans, X_train_matrix, y_train, X_test_matrix, y_test

In [6]:
# Generate Vectorized and Scaled DataFrames for Supervised Learning
tfidf_trans, X_train_tfidf, y_train_tfidf, X_test_tfidf, y_test_tfidf =\
    scikit_column_transformer(text_df=new_data, text_type='lemma_text',max_features_value=40000,
                              vector_type='Tfidf', scaler='Robust')

binary_trans, X_train_bin, y_train_bin, X_test_bin, y_test_bin =\
    scikit_column_transformer(text_df=new_data, text_type='lemma_text', max_features_value=40000,
                              vector_type='Binary', scaler='Robust')

count_trans, X_train_count, y_train_count, X_test_count, y_test_count =\
    scikit_column_transformer(text_df=new_data, text_type='lemma_text', max_features_value=40000,
                              vector_type='Count', scaler='MinMax')

# Indice Vectorization

## 1) Import original dataset

In [7]:
old_data_1 = pd.read_csv('https://raw.githubusercontent.com/nruloff/Difficulty_Classification_of_Textual_Passages/main/Data/WikiLarge_Train_part_1.csv')
old_data_2 = pd.read_csv('https://raw.githubusercontent.com/nruloff/Difficulty_Classification_of_Textual_Passages/main/Data/WikiLarge_Train_part_2.csv')
old_data_3 = pd.read_csv('https://raw.githubusercontent.com/nruloff/Difficulty_Classification_of_Textual_Passages/main/Data/WikiLarge_Train_part_3.csv')
old_data = pd.concat([old_data_1, old_data_2, old_data_3], ignore_index=True)

X_train, X_test, y_train_indice, y_test_indice = \
    train_test_split(old_data['original_text'], old_data['label'], test_size=0.2, random_state=21)

## 2) Indice Vectorization

In [8]:
# refer https://towardsdatascience.com/predicting-the-difficulty-of-texts-using-machine-learning-and-getting-a-visual-representation-of-75f5a96b92e5

def preprocessing(text_series):
    text_list = []
    lemmatizer = WordNetLemmatizer()
    
    for text in tqdm(text_series.values):
        text = re.sub('[^a-zA-Z]|LRB|RRB', ' ', text)
        text = text.lower()
        text = nltk.word_tokenize(text)
        #text = [word for word in text if not word in set(stopwords.words("english"))]
        text = [lemmatizer.lemmatize(word) for word in text]
        text = " ".join(text)
        
        text_list.append(text)
    
    return text_list

In [9]:
def indVec(trainText_series, testText_series, ngrams=1, max_features=40000, sequence_length=500):
    trainTextPreprocessed = preprocessing(trainText_series)
    testTextPreprocessed = preprocessing(testText_series)
    
    vector = TextVectorization(output_mode="int", max_tokens=max_features,
                               output_sequence_length=sequence_length, ngrams=(1, ngrams))
    vector.adapt(trainTextPreprocessed)
    trainText_matrix = vector(trainTextPreprocessed)
    testText_matrix = vector(testTextPreprocessed)
    
    return (trainText_matrix, testText_matrix)

In [10]:
X_train_indice, X_test_indice = indVec(X_train, X_test)

100%|██████████| 333414/333414 [00:53<00:00, 6190.30it/s]
100%|██████████| 83354/83354 [00:13<00:00, 6364.83it/s]
2023-02-27 17:49:25.579633: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcuda.so.1'; dlerror: libcuda.so.1: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /opt/oracle/instantclient_12_1:/usr/lib/jvm/java-11-openjdk-amd64/jre/lib/amd64/server:
2023-02-27 17:49:25.579677: W tensorflow/compiler/xla/stream_executor/cuda/cuda_driver.cc:265] failed call to cuInit: UNKNOWN ERROR (303)
2023-02-27 17:49:25.579717: I tensorflow/compiler/xla/stream_executor/cuda/cuda_diagnostics.cc:156] kernel driver does not appear to be running on this host (run-63fceaf10fc863061e25bcc6-v9z6f): /proc/driver/nvidia/version does not exist
2023-02-27 17:49:25.579999: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to us

# Supervised Learning

## 1) Naive Bayes

### a) Bernoulli

In [11]:
start_time = time.time()

# train Bernoulli model
# optimize the hyperparameters
res = pd.DataFrame(columns=['alpha', 'trainScore', 'testScore'])

for alpha in tqdm([0.01, 0.1, 1, 10]):
    clf = BernoulliNB(alpha=alpha)
        
    clf.fit(X_train_bin, y_train_bin)
    trainScore = clf.score(X_train_bin, y_train_bin)
    testScore = clf.score(X_test_bin, y_test_bin)

    res = res.append({'alpha':alpha, 'trainScore':trainScore, 'testScore':testScore},
                    ignore_index=True)

# get run time
end_time = time.time()
delta_time = end_time - start_time
print('\n')
print("total run time: {} mins".format(delta_time/60))

100%|██████████| 4/4 [00:01<00:00,  2.18it/s]



total run time: 0.030609321594238282 mins





In [12]:
# plot the hyperparameters search result

resLong = res.melt(id_vars=['alpha'], value_vars=['trainScore', 'testScore'],
         var_name='group', value_name='score')

Lineplot = alt.Chart(resLong).mark_line().encode(
    x = 'alpha:N',
    y = alt.Y('score', scale=alt.Scale(domain=[0.5,1])),
    color = 'group',
    tooltip = 'score'
).properties(width=100, height=100)

Lineplot

In [13]:
# generate report result
ber = BernoulliNB(alpha=1)
ber_re = cross_validate(clf, X_train_bin, y_train_bin, cv=5, scoring=['accuracy', 'recall'])

In [14]:
pd.DataFrame(ber_re).describe()

Unnamed: 0,fit_time,score_time,test_accuracy,test_recall
count,5.0,5.0,5.0,5.0
mean,0.201058,0.054039,0.634084,0.605773
std,0.000936,0.001264,0.00208,0.002484
min,0.199986,0.053243,0.630864,0.602244
25%,0.200522,0.053445,0.633086,0.60409
50%,0.200917,0.053516,0.635381,0.60708
75%,0.201421,0.053713,0.635508,0.607564
max,0.202443,0.05628,0.63558,0.60789


### b) Multinominal

In [15]:
start_time = time.time()

# train Multinominal model
res = pd.DataFrame(columns=['alpha', 'trainScore', 'testScore'])

for alpha in tqdm([0.1, 1, 10, 100]):
    clf = MultinomialNB(alpha=alpha)

    clf.fit(X_train_count, y_train_count)
    trainScore = clf.score(X_train_count, y_train_count)
    testScore = clf.score(X_test_count, y_test_count)

    res = res.append({'alpha':alpha, 'trainScore':trainScore, 'testScore':testScore},
                    ignore_index=True)

# get run time
end_time = time.time()
delta_time = end_time - start_time
print('\n')
print("total run time: {} mins".format(delta_time/60))

100%|██████████| 4/4 [00:00<00:00,  5.94it/s]



total run time: 0.011260414123535156 mins





In [16]:
resLong = res.melt(id_vars=['alpha'], value_vars=['trainScore', 'testScore'],
         var_name='group', value_name='score')

Lineplot = alt.Chart(resLong).mark_line().encode(
    x = 'alpha:N',
    y = alt.Y('score', scale=alt.Scale(domain=[0.5,1])),
    color = 'group',
    tooltip = 'score'
).properties(width=100, height=100)

Lineplot

In [17]:
# generate report result
mul = MultinomialNB(alpha=10)
mul_re = cross_validate(mul, X_train_count, y_train_count, cv=5, scoring=['accuracy', 'recall'])

In [18]:
pd.DataFrame(mul_re).describe()

Unnamed: 0,fit_time,score_time,test_accuracy,test_recall
count,5.0,5.0,5.0,5.0
mean,0.10082,0.029361,0.656147,0.696581
std,0.00036,0.000263,0.002467,0.00218
min,0.100377,0.029025,0.652891,0.694282
25%,0.100632,0.029227,0.65524,0.695765
50%,0.100737,0.0293,0.655764,0.696164
75%,0.101062,0.02961,0.657318,0.696526
max,0.101292,0.029646,0.659523,0.70017


## 2) Logistic Regression

In [19]:
start_time = time.time()

# train logistic regression model
res = pd.DataFrame(columns=['C', 'trainScore', 'testScore'])

for C in tqdm([0.01, 0.1, 1, 10]):
    clf = LogisticRegression(C=C, solver='lbfgs', n_jobs=-1,
                              random_state=0, max_iter=1000)
    
    clf.fit(X_train_tfidf, y_train_tfidf)
    trainScore = clf.score(X_train_tfidf, y_train_tfidf)
    testScore = clf.score(X_test_tfidf, y_test_tfidf)

    res = res.append({'C':C, 'trainScore':trainScore, 'testScore':testScore},
                ignore_index=True)

# get run time
end_time = time.time()
delta_time = end_time - start_time
print('\n')
print("total run time: {} mins".format(delta_time/60))

100%|██████████| 4/4 [02:36<00:00, 39.19s/it]



total run time: 2.6129480679829915 mins





In [20]:
resLong = res.melt(id_vars=['C'], value_vars=['trainScore', 'testScore'],
         var_name='group', value_name='score')

Lineplot = alt.Chart(resLong).mark_line().encode(
    x = 'C:N',
    y = alt.Y('score', scale=alt.Scale(domain=[0.5,1])),
    color = 'group',
    tooltip = 'score'
).properties(width=100, height=100)

Lineplot

In [21]:
# generate report result
logRe = LogisticRegression(C=0.1, solver='lbfgs', max_iter=1500)
logRe_re = cross_validate(logRe, X_train_tfidf, y_train_tfidf, cv=5, scoring=['accuracy', 'recall'])

In [22]:
pd.DataFrame(logRe_re).describe()

Unnamed: 0,fit_time,score_time,test_accuracy,test_recall
count,5.0,5.0,5.0,5.0
mean,29.887921,0.03875,0.685963,0.688496
std,2.443775,0.003256,0.002444,0.004504
min,25.727132,0.036759,0.682364,0.68089
25%,29.876733,0.036906,0.68493,0.688237
50%,30.917706,0.037333,0.686357,0.690156
75%,30.921098,0.038276,0.687477,0.690807
max,31.996933,0.044478,0.688688,0.692388


## 3) Ensembles

### a) Random Forest

In [23]:
start_time = time.time()

# train random forest model
res = pd.DataFrame(columns=['n_estimators', 'max_depth', 'trainScore', 'testScore'])


for n_estimators in tqdm([10, 100, 200]):
    for max_depth in tqdm([5, 50, 100, 250]):
        clf = RandomForestClassifier(n_estimators=n_estimators, max_depth=max_depth,
                                     n_jobs=-1, random_state=0)
        
        clf.fit(X_train_tfidf, y_train_tfidf)
        trainScore = clf.score(X_train_tfidf, y_train_tfidf)
        testScore = clf.score(X_test_tfidf, y_test_tfidf)

        res = res.append({'n_estimators':n_estimators, 'max_depth':max_depth,
                          'trainScore':trainScore, 'testScore':testScore},
                          ignore_index=True)

# get run time
end_time = time.time()
delta_time = end_time - start_time
print('\n')
print("total run time: {} mins".format(delta_time/60))

  0%|          | 0/3 [00:00<?, ?it/s]
  0%|          | 0/4 [00:00<?, ?it/s][A
 25%|██▌       | 1/4 [00:00<00:01,  1.93it/s][A
 50%|█████     | 2/4 [00:05<00:06,  3.06s/it][A
 75%|███████▌  | 3/4 [00:21<00:08,  8.95s/it][A
100%|██████████| 4/4 [00:46<00:00, 11.53s/it][A
 33%|███▎      | 1/3 [00:46<01:32, 46.13s/it]
  0%|          | 0/4 [00:00<?, ?it/s][A
 25%|██▌       | 1/4 [00:01<00:03,  1.27s/it][A
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the docu



total run time: 14.16754190524419 mins





In [25]:
Heatmap = alt.Chart(res).mark_rect(stroke='white').encode(
    x = 'n_estimators:N',
    y = 'max_depth:N',
    color = 'testScore',
    tooltip = 'testScore'
).properties(width=100, height=100)


resLong = res.melt(id_vars=['n_estimators', 'max_depth'], value_vars=['trainScore', 'testScore'],
         var_name='group', value_name='score')

Lineplot = alt.Chart(resLong).mark_line().encode(
    x = 'n_estimators:N',
    y = alt.Y('score', scale=alt.Scale(domain=[0.5,1])),
    color = 'group',
    column = 'max_depth'
).properties(width=100, height=100)

(Heatmap | Lineplot)

In [26]:
# generate report result
randFr = RandomForestClassifier(n_estimators=100, max_depth=50,
                                     n_jobs=-1, random_state=0)
randFr_re = cross_validate(randFr, X_train_tfidf, y_train_tfidf, cv=5, scoring=['accuracy', 'recall'])

In [27]:
pd.DataFrame(randFr_re).describe()

Unnamed: 0,fit_time,score_time,test_accuracy,test_recall
count,5.0,5.0,5.0,5.0
mean,19.031434,0.136392,0.673509,0.712636
std,1.097752,0.003268,0.0026,0.005767
min,18.315338,0.13312,0.669443,0.703004
25%,18.331895,0.133252,0.672588,0.712704
50%,18.730489,0.137272,0.674286,0.713355
75%,18.828076,0.137412,0.675298,0.716504
max,20.951374,0.140903,0.675931,0.717616


### b) Gradient Boosting

In [28]:
start_time = time.time()

# train Gradient Boosting model
res = pd.DataFrame(columns=['n_estimators', 'leraning_rate', 'trainScore', 'testScore'])

for n_estimators in tqdm([10, 100, 200]):
    for learning_rate in tqdm([0.01, 0.1, 1, 10]):
        clf = GradientBoostingClassifier(n_estimators=n_estimators, learning_rate=learning_rate,
                                     random_state=0)
        
        clf.fit(X_train_tfidf, y_train_tfidf)
        trainScore = clf.score(X_train_tfidf, y_train_tfidf)
        testScore = clf.score(X_test_tfidf, y_test_tfidf)

        res = res.append({'n_estimators':n_estimators, 'learning_rate':learning_rate,
                          'trainScore':trainScore, 'testScore':testScore},
                          ignore_index=True)

# get run time
end_time = time.time()
delta_time = end_time - start_time
print('\n')
print("total run time: {} mins".format(delta_time/60))

  0%|          | 0/3 [00:00<?, ?it/s]
  0%|          | 0/4 [00:00<?, ?it/s][A
 25%|██▌       | 1/4 [00:34<01:43, 34.67s/it][A
 50%|█████     | 2/4 [01:09<01:09, 34.78s/it][A
 75%|███████▌  | 3/4 [01:44<00:34, 34.72s/it][A
100%|██████████| 4/4 [02:18<00:00, 34.74s/it][A
 33%|███▎      | 1/3 [02:18<04:37, 138.95s/it]
  0%|          | 0/4 [00:00<?, ?it/s][A
 25%|██▌       | 1/4 [05:46<17:18, 346.10s/it][A
 50%|█████     | 2/4 [11:30<11:30, 345.22s/it][A
 75%|███████▌  | 3/4 [17:13<05:43, 343.93s/it][A
100%|██████████| 4/4 [22:57<00:00, 344.50s/it][A
 67%|██████▋   | 2/3 [25:16<14:27, 867.80s/it]
  0%|          | 0/4 [00:00<?, ?it/s][A
 25%|██▌       | 1/4 [11:33<34:40, 693.50s/it][A
 50%|█████     | 2/4 [23:00<22:59, 689.82s/it][A
 75%|███████▌  | 3/4 [34:23<11:26, 686.62s/it][A
100%|██████████| 4/4 [45:52<00:00, 688.23s/it][A
100%|██████████| 3/3 [1:11:09<00:00, 1423.29s/it]



total run time: 71.16457213163376 mins





In [29]:
Heatmap = alt.Chart(res).mark_rect(stroke='white').encode(
    x = 'n_estimators:N',
    y = 'learning_rate:N',
    color = 'testScore',
    tooltip = 'testScore'
).properties(width=100, height=100)


resLong = res.melt(id_vars=['n_estimators', 'learning_rate'], value_vars=['trainScore', 'testScore'],
         var_name='group', value_name='score')

Lineplot = alt.Chart(resLong).mark_line().encode(
    x = 'n_estimators:N',
    y = alt.Y('score', scale=alt.Scale(domain=[0.5,1])),
    color = 'group',
    column = 'learning_rate'
).properties(width=100, height=100)

(Heatmap | Lineplot)

In [30]:
# generate report result
gradB = GradientBoostingClassifier(n_estimators=100, learning_rate=1,
                                     random_state=0)
gradB_re = cross_validate(gradB, X_train_tfidf, y_train_tfidf, cv=5, scoring=['accuracy', 'recall'])

In [31]:
pd.DataFrame(gradB_re).describe()

Unnamed: 0,fit_time,score_time,test_accuracy,test_recall
count,5.0,5.0,5.0,5.0
mean,274.188533,0.110713,0.695201,0.712303
std,0.456321,0.002288,0.001687,0.002094
min,273.838341,0.108605,0.692609,0.709917
25%,273.865618,0.108909,0.69485,0.711509
50%,274.050493,0.11057,0.695175,0.711799
75%,274.234958,0.111163,0.696422,0.712704
max,274.953253,0.11432,0.696946,0.715589


## 4) Deep Learning

### a) FFNN

In [32]:
X_train_tfidf.sort_indices()
X_test_tfidf.sort_indices()

In [33]:
start_time = time.time()

layers1 = [20, 100, 200, 500]
layers2 = [20, 50, 100]
layers3 = [5, 20]
input_shape = X_train_tfidf.shape[1]
ep_size = 3
batch_size = 128

res_FFNN = pd.DataFrame(columns=['layer1', 'layer2', 'layer3', 'epoch', 'trainScore', 'testScore'])

for units_1 in tqdm(layers1):
    for units_2 in tqdm(layers2):
        for units_3 in tqdm(layers3):
            
            inputs = keras.Input(shape=(input_shape, ))
            x = layers.Dense(units_1, activation='relu')(inputs)
            x = layers.Dense(units_2, activation='relu')(x)
            x = layers.Dense(units_3, activation='relu')(x)
            outputs = layers.Dense(1, activation='sigmoid')(x)

            model = keras.Model(inputs=inputs, outputs=outputs)
            model.compile(loss='binary_crossentropy', metrics=['accuracy'], optimizer='adam')
            model.fit(x=X_train_tfidf, y=y_train_tfidf.values, validation_data=(X_test_tfidf, y_test_tfidf.values),
                epochs=ep_size, batch_size=batch_size, verbose=1)
            
            history = model.history.history
            
            for ep in np.arange(ep_size):
                res_FFNN = res_FFNN.append({'layer1':units_1, 'layer2':units_2, 'layer3':units_3, 'epoch':ep+1,
                                  'trainScore':history['accuracy'][ep],
                                  'testScore':history['val_accuracy'][ep]}, ignore_index=True)
                
# get run time
end_time = time.time()
delta_time = end_time - start_time
print('\n')
print("total run time: {} mins".format(delta_time/60))

  0%|          | 0/4 [00:00<?, ?it/s]
  0%|          | 0/3 [00:00<?, ?it/s][A

  0%|          | 0/2 [00:00<?, ?it/s][A[A

Epoch 1/3
Epoch 2/3
Epoch 3/3




 50%|█████     | 1/2 [00:29<00:29, 29.21s/it][A[A

Epoch 1/3
Epoch 2/3
Epoch 3/3




100%|██████████| 2/2 [00:58<00:00, 29.20s/it][A[A

 33%|███▎      | 1/3 [00:58<01:56, 58.41s/it][A

  0%|          | 0/2 [00:00<?, ?it/s][A[A

Epoch 1/3
Epoch 2/3
Epoch 3/3




 50%|█████     | 1/2 [00:29<00:29, 29.70s/it][A[A

Epoch 1/3
Epoch 2/3
Epoch 3/3




100%|██████████| 2/2 [00:59<00:00, 29.77s/it][A[A

 67%|██████▋   | 2/3 [01:57<00:59, 59.08s/it][A

  0%|          | 0/2 [00:00<?, ?it/s][A[A

Epoch 1/3
Epoch 2/3
Epoch 3/3




 50%|█████     | 1/2 [00:30<00:30, 30.16s/it][A[A

Epoch 1/3
Epoch 2/3
Epoch 3/3




100%|██████████| 2/2 [01:00<00:00, 30.21s/it][A[A

100%|██████████| 3/3 [02:58<00:00, 59.46s/it][A
 25%|██▌       | 1/4 [02:58<08:55, 178.40s/it]
  0%|          | 0/3 [00:00<?, ?it/s][A

  0%|          | 0/2 [00:00<?, ?it/s][A[A

Epoch 1/3
Epoch 2/3
Epoch 3/3




 50%|█████     | 1/2 [01:51<01:51, 111.26s/it][A[A

Epoch 1/3
Epoch 2/3
Epoch 3/3




100%|██████████| 2/2 [03:46<00:00, 113.36s/it][A[A

 33%|███▎      | 1/3 [03:46<07:33, 226.73s/it][A

  0%|          | 0/2 [00:00<?, ?it/s][A[A

Epoch 1/3
Epoch 2/3
Epoch 3/3




 50%|█████     | 1/2 [01:48<01:48, 108.89s/it][A[A

Epoch 1/3
Epoch 2/3
Epoch 3/3




100%|██████████| 2/2 [03:44<00:00, 112.48s/it][A[A

 67%|██████▋   | 2/3 [07:31<03:45, 225.70s/it][A

  0%|          | 0/2 [00:00<?, ?it/s][A[A

Epoch 1/3
Epoch 2/3
Epoch 3/3




 50%|█████     | 1/2 [01:51<01:51, 111.91s/it][A[A

Epoch 1/3
Epoch 2/3
Epoch 3/3




100%|██████████| 2/2 [03:47<00:00, 113.63s/it][A[A

100%|██████████| 3/3 [11:18<00:00, 226.33s/it][A
 50%|█████     | 2/4 [14:17<15:45, 472.87s/it]
  0%|          | 0/3 [00:00<?, ?it/s][A

  0%|          | 0/2 [00:00<?, ?it/s][A[A

Epoch 1/3
Epoch 2/3
Epoch 3/3




 50%|█████     | 1/2 [03:38<03:38, 218.51s/it][A[A

Epoch 1/3
Epoch 2/3
Epoch 3/3




100%|██████████| 2/2 [07:10<00:00, 215.21s/it][A[A

 33%|███▎      | 1/3 [07:10<14:20, 430.43s/it][A

  0%|          | 0/2 [00:00<?, ?it/s][A[A

Epoch 1/3
Epoch 2/3
Epoch 3/3




 50%|█████     | 1/2 [03:25<03:25, 205.49s/it][A[A

Epoch 1/3
Epoch 2/3
Epoch 3/3




100%|██████████| 2/2 [07:01<00:00, 210.62s/it][A[A

 67%|██████▋   | 2/3 [14:11<07:05, 425.02s/it][A

  0%|          | 0/2 [00:00<?, ?it/s][A[A

Epoch 1/3
Epoch 2/3
Epoch 3/3




 50%|█████     | 1/2 [03:25<03:25, 205.42s/it][A[A

Epoch 1/3
Epoch 2/3
Epoch 3/3




100%|██████████| 2/2 [06:46<00:00, 203.06s/it][A[A

100%|██████████| 3/3 [20:57<00:00, 419.26s/it][A
 75%|███████▌  | 3/4 [35:15<13:51, 831.29s/it]
  0%|          | 0/3 [00:00<?, ?it/s][A

  0%|          | 0/2 [00:00<?, ?it/s][A[A

Epoch 1/3
Epoch 2/3
Epoch 3/3




 50%|█████     | 1/2 [20:15<20:15, 1215.85s/it][A[A

Epoch 1/3
Epoch 2/3
Epoch 3/3




100%|██████████| 2/2 [40:35<00:00, 1217.71s/it][A[A

 33%|███▎      | 1/3 [40:35<1:21:10, 2435.42s/it][A

  0%|          | 0/2 [00:00<?, ?it/s][A[A

Epoch 1/3
Epoch 2/3
Epoch 3/3




 50%|█████     | 1/2 [20:19<20:19, 1219.65s/it][A[A

Epoch 1/3
Epoch 2/3
Epoch 3/3




100%|██████████| 2/2 [40:43<00:00, 1221.95s/it][A[A

 67%|██████▋   | 2/3 [1:21:19<40:40, 2440.41s/it][A

  0%|          | 0/2 [00:00<?, ?it/s][A[A

Epoch 1/3
Epoch 2/3
Epoch 3/3




 50%|█████     | 1/2 [20:26<20:26, 1226.52s/it][A[A

Epoch 1/3
Epoch 2/3
Epoch 3/3




100%|██████████| 2/2 [40:51<00:00, 1225.56s/it][A[A

100%|██████████| 3/3 [2:02:10<00:00, 2443.49s/it][A
100%|██████████| 4/4 [2:37:25<00:00, 2361.42s/it]



total run time: 157.42807729641598 mins





In [34]:
resLong = res_FFNN.melt(id_vars=['layer1', 'layer2', 'layer3', 'epoch'], value_vars=['trainScore', 'testScore'],
         var_name='group', value_name='score')

layer1_20 = alt.Chart(resLong[resLong.layer1==20]).mark_line().encode(
    row = 'layer2',
    column = 'layer3',
    x = 'epoch',
    y = alt.Y('score', scale=alt.Scale(domain=[0.6,0.8])),
    color = 'group',
    tooltip = 'score'
).properties(width=50, height=50)

layer1_100 = alt.Chart(resLong[resLong.layer1==100]).mark_line().encode(
    row = 'layer2',
    column = 'layer3',
    x = 'epoch',
    y = alt.Y('score', scale=alt.Scale(domain=[0.6,0.8])),
    color = 'group',
    tooltip = 'score'
).properties(width=50, height=50)

layer1_200 = alt.Chart(resLong[resLong.layer1==200]).mark_line().encode(
    row = 'layer2',
    column = 'layer3',
    x = 'epoch',
    y = alt.Y('score', scale=alt.Scale(domain=[0.6,0.8])),
    color = 'group',
    tooltip = 'score'
).properties(width=50, height=50)

layer1_500 = alt.Chart(resLong[resLong.layer1==500]).mark_line().encode(
    row = 'layer2',
    column = 'layer3',
    x = 'epoch',
    y = alt.Y('score', scale=alt.Scale(domain=[0.6,0.8])),
    color = 'group',
    tooltip = 'score'
).properties(width=50, height=50)

layer1_20 | layer1_100 | layer1_200 | layer1_500

In [35]:
# generate report result

res_FFNN_report = pd.DataFrame(columns=['rand', 'trainScore_accuracy', 'trainScore_recall',
                           'testScore_accuracy', 'testScore_recall'])

for rand in [10, 20, 30, 40, 50]:
    X_train_, X_test_, y_train_, y_test_ = \
        train_test_split(X_train_tfidf, y_train_tfidf, test_size=0.2, random_state=rand)
    
    input_shape = X_train_.shape[1]
    
    inputs = keras.Input(shape=(input_shape, ))
    x = layers.Dense(20, activation='relu')(inputs)
    x = layers.Dense(50, activation='relu')(x)
    x = layers.Dense(20, activation='relu')(x)
    outputs = layers.Dense(1, activation='sigmoid')(x)

    model = keras.Model(inputs=inputs, outputs=outputs)
    model.compile(loss='binary_crossentropy', metrics=['accuracy', 'Recall'], optimizer='adam')
    model.fit(x=X_train_, y=y_train_.values, validation_data=(X_test_, y_test_.values),
        epochs=2, batch_size=128, verbose=1)

    history = model.history.history

    res_FFNN_report = res_FFNN_report.append({'rand':rand, 'trainScore_accuracy':history['accuracy'][1],
                      'trainScore_recall':history['recall'][1],
                      'testScore_accuracy':history['val_accuracy'][1],
                      'testScore_recall':history['val_recall'][1],}, ignore_index=True)

Epoch 1/2
Epoch 2/2
Epoch 1/2
Epoch 2/2
Epoch 1/2
Epoch 2/2
Epoch 1/2
Epoch 2/2
Epoch 1/2
Epoch 2/2


In [36]:
res.describe()

Unnamed: 0,n_estimators,leraning_rate,trainScore,testScore,learning_rate
count,12.0,0.0,12.0,12.0,12.0
mean,103.333333,,0.637953,0.633609,2.7775
std,81.053667,,0.073196,0.07145,4.374058
min,10.0,,0.522841,0.518786,0.01
25%,10.0,,0.609008,0.608536,0.0775
50%,100.0,,0.65891,0.658937,0.55
75%,200.0,,0.684124,0.681406,3.25
max,200.0,,0.722125,0.699082,10.0


### b) CNN

In [37]:
start_time = time.time()

layers1 = [32, 128]
layers2 = [32, 128]
layers3 = [32, 128]
dropRate = 0.2
convSize = 7
striSize = 3
input_shape = 500
max_features = 40000
embedding_dim = 128
ep_size = 2
batch_size = 1024

res_CNN = pd.DataFrame(columns=['layer1', 'layer2', 'layer3', 'epoch', 'trainScore', 'testScore'])

for units_1 in tqdm(layers1):
    for units_2 in tqdm(layers2):
        for units_3 in tqdm(layers3):
        
            inputs = keras.Input(shape=(input_shape,), dtype="int64")

            x = layers.Embedding(max_features, embedding_dim)(inputs)
            x = layers.Dropout(dropRate)(x)

            x = layers.Conv1D(units_1, convSize, padding="valid", activation="relu", strides=striSize)(x)
            x = layers.Conv1D(units_2, convSize, padding="valid", activation="relu", strides=striSize)(x)
            x = layers.GlobalMaxPooling1D()(x)

            x = layers.Dense(units_3, activation="relu")(x)
            x = layers.Dropout(dropRate)(x)

            outputs = layers.Dense(1, activation="sigmoid")(x)

            model = keras.Model(inputs=inputs, outputs=outputs)
            model.compile(loss='binary_crossentropy', metrics=['accuracy'], optimizer='adam')
            model.fit(x=X_train_indice, y=y_train_indice.values, validation_data=(X_test_indice, y_test_indice.values),
                      epochs=ep_size, batch_size=batch_size, verbose=1)

            history = model.history.history

            for ep in np.arange(ep_size):
                res_CNN = res_CNN.append({'layer1':units_1, 'layer2':units_2, 'layer3':units_3, 'epoch':ep+1,
                                  'trainScore':history['accuracy'][ep],
                                  'testScore':history['val_accuracy'][ep]}, ignore_index=True)
                
                
# get run time
end_time = time.time()
delta_time = end_time - start_time
print('\n')
print("total run time: {} mins".format(delta_time/60))

  0%|          | 0/2 [00:00<?, ?it/s]
  0%|          | 0/2 [00:00<?, ?it/s][A

  0%|          | 0/2 [00:00<?, ?it/s][A[A

Epoch 1/2
Epoch 2/2




 50%|█████     | 1/2 [02:24<02:24, 144.74s/it][A[A

Epoch 1/2
Epoch 2/2




100%|██████████| 2/2 [04:48<00:00, 144.09s/it][A[A

 50%|█████     | 1/2 [04:48<04:48, 288.18s/it][A

  0%|          | 0/2 [00:00<?, ?it/s][A[A

Epoch 1/2
Epoch 2/2




 50%|█████     | 1/2 [02:28<02:28, 148.88s/it][A[A

Epoch 1/2
Epoch 2/2




100%|██████████| 2/2 [04:56<00:00, 148.41s/it][A[A

100%|██████████| 2/2 [09:45<00:00, 292.50s/it][A
 50%|█████     | 1/2 [09:45<09:45, 585.01s/it]
  0%|          | 0/2 [00:00<?, ?it/s][A

  0%|          | 0/2 [00:00<?, ?it/s][A[A

Epoch 1/2
Epoch 2/2




 50%|█████     | 1/2 [02:53<02:53, 173.79s/it][A[A

Epoch 1/2
Epoch 2/2




100%|██████████| 2/2 [05:47<00:00, 173.64s/it][A[A

 50%|█████     | 1/2 [05:47<05:47, 347.28s/it][A

  0%|          | 0/2 [00:00<?, ?it/s][A[A

Epoch 1/2
Epoch 2/2




 50%|█████     | 1/2 [03:02<03:02, 182.35s/it][A[A

Epoch 1/2
Epoch 2/2




100%|██████████| 2/2 [06:05<00:00, 182.58s/it][A[A

100%|██████████| 2/2 [11:52<00:00, 356.22s/it][A
100%|██████████| 2/2 [21:37<00:00, 648.73s/it]



total run time: 21.624282042185467 mins





In [38]:
resLong = res_CNN.melt(id_vars=['layer1', 'layer2', 'layer3', 'epoch'], value_vars=['trainScore', 'testScore'],
         var_name='group', value_name='score')

layer1_32 = alt.Chart(resLong[resLong.layer1==32]).mark_line().encode(
    row = 'layer2',
    column = 'layer3',
    x = 'epoch',
    y = alt.Y('score', scale=alt.Scale(domain=[0.6,0.8])),
    color = 'group',
    tooltip = 'score'
).properties(width=50, height=50)

layer1_128 = alt.Chart(resLong[resLong.layer1==128]).mark_line().encode(
    row = 'layer2',
    column = 'layer3',
    x = 'epoch',
    y = alt.Y('score', scale=alt.Scale(domain=[0.6,0.8])),
    color = 'group',
    tooltip = 'score'
).properties(width=50, height=50)

layer1_32 | layer1_128

In [39]:
# generate report result

res_CNN_report = pd.DataFrame(columns=['rand', 'trainScore_accuracy', 'trainScore_recall',
                           'testScore_accuracy', 'testScore_recall'])

for rand in [10, 20, 30, 40, 50]:
    X_train_, X_test_, y_train_, y_test_ = \
        train_test_split(X_train_indice.numpy(), y_train_indice, test_size=0.2, random_state=rand)
    
    input_shape = X_train_.shape[1]
    
    inputs = keras.Input(shape=(input_shape,), dtype="int64")

    x = layers.Embedding(40000, 128)(inputs)
    x = layers.Dropout(0.2)(x)

    x = layers.Conv1D(128, 7, padding="valid", activation="relu", strides=3)(x)
    x = layers.Conv1D(128, 7, padding="valid", activation="relu", strides=3)(x)
    x = layers.GlobalMaxPooling1D()(x)

    x = layers.Dense(128, activation="relu")(x)
    x = layers.Dropout(0.2)(x)

    outputs = layers.Dense(1, activation="sigmoid")(x)

    model = keras.Model(inputs=inputs, outputs=outputs)
    model.compile(loss='binary_crossentropy', metrics=['accuracy', 'Recall'], optimizer='adam')
    model.fit(x=X_train_, y=y_train_.values, validation_data=(X_test_, y_test_.values),
              epochs=2, batch_size=1024, verbose=1)

    history = model.history.history
    
    res_CNN_report = res_CNN_report.append({'rand':rand, 'trainScore_accuracy':history['accuracy'][1],
                      'trainScore_recall':history['recall'][1],
                      'testScore_accuracy':history['val_accuracy'][1],
                      'testScore_recall':history['val_recall'][1],}, ignore_index=True)

Epoch 1/2
Epoch 2/2
Epoch 1/2
Epoch 2/2
Epoch 1/2
Epoch 2/2
Epoch 1/2
Epoch 2/2
Epoch 1/2
Epoch 2/2


In [None]:
res_CNN_report.describe()

Unnamed: 0,rand,trainScore_accuracy,trainScore_recall,testScore_accuracy,testScore_recall
count,5.0,5.0,5.0,5.0,5.0
mean,30.0,0.754349,0.753173,0.702359,0.65243
std,15.811388,0.001627,0.002677,0.001847,0.030764
min,10.0,0.75262,0.751083,0.699294,0.609005
25%,20.0,0.752788,0.751455,0.702248,0.647069
50%,30.0,0.7547,0.752717,0.702653,0.652369
75%,40.0,0.755214,0.752851,0.703763,0.65844
max,50.0,0.756421,0.757759,0.703838,0.695267


### c) LSTM

In [None]:
start_time = time.time()

layers1 = [16, 64]
layers2 = [16, 64]
input_shape = 500
max_features = 40000
embedding_dim = 128
ep_size = 2
batch_size = 1024

res_LSTM = pd.DataFrame(columns=['layer1', 'layer2', 'epoch', 'trainScore', 'testScore'])

for units_1 in tqdm(layers1):
    for units_2 in tqdm(layers2):
            
        inputs = keras.Input(shape=(input_shape,), dtype="int32")
        
        x = layers.Embedding(max_features, embedding_dim)(inputs)
        x = layers.Bidirectional(layers.LSTM(units_1, return_sequences=True))(x)
        x = layers.Bidirectional(layers.LSTM(units_2))(x)

        outputs = layers.Dense(1, activation="sigmoid")(x)
        
        model = keras.Model(inputs, outputs)
        model.compile(loss='binary_crossentropy', metrics=['accuracy'], optimizer='adam')
        model.fit(x=X_train_indice, y=y_train_indice.values, validation_data=(X_test_indice, y_test_indice.values),
                      epochs=ep_size, batch_size=batch_size, verbose=1)

        history = model.history.history

        for ep in np.arange(ep_size):
            res_LSTM = res_LSTM.append({'layer1':units_1, 'layer2':units_2, 'epoch':ep+1,
                              'trainScore':history['accuracy'][ep],
                              'testScore':history['val_accuracy'][ep]}, ignore_index=True)
            
# get run time
end_time = time.time()
delta_time = end_time - start_time
print('\n')
print("total run time: {} mins".format(delta_time/60))

  0%|          | 0/2 [00:00<?, ?it/s]
  0%|          | 0/2 [00:00<?, ?it/s][A

Epoch 1/2
Epoch 2/2



 50%|█████     | 1/2 [12:08<12:08, 728.38s/it][A

Epoch 1/2
Epoch 2/2



100%|██████████| 2/2 [29:19<00:00, 879.79s/it][A
 50%|█████     | 1/2 [29:19<29:19, 1759.58s/it]
  0%|          | 0/2 [00:00<?, ?it/s][A

Epoch 1/2
Epoch 2/2



 50%|█████     | 1/2 [22:18<22:18, 1338.04s/it][A

Epoch 1/2
Epoch 2/2



100%|██████████| 2/2 [50:14<00:00, 1507.26s/it][A
100%|██████████| 2/2 [1:19:34<00:00, 2387.05s/it]



total run time: 79.56831512451171 mins





In [None]:
resLong = res_LSTM.melt(id_vars=['layer1', 'layer2', 'epoch'], value_vars=['trainScore', 'testScore'],
         var_name='group', value_name='score')

alt.Chart(resLong).mark_line().encode(
    row = 'layer1',
    column = 'layer2',
    x = 'epoch',
    y = alt.Y('score', scale=alt.Scale(domain=[0.65,0.75])),
    color = 'group',
    tooltip = 'score'
).properties(width=50, height=50)

In [None]:
# generate report result

res_LSTM_report = pd.DataFrame(columns=['rand', 'trainScore_accuracy', 'trainScore_recall',
                           'testScore_accuracy', 'testScore_recall'])

for rand in [10, 20, 30, 40, 50]:
    X_train_, X_test_, y_train_, y_test_ = \
        train_test_split(X_train_indice.numpy(), y_train_indice, test_size=0.2, random_state=rand)
    
    input_shape = X_train_.shape[1]
    
    inputs = keras.Input(shape=(input_shape,), dtype="int64")

    x = layers.Embedding(40000, 128)(inputs)
    x = layers.Bidirectional(layers.LSTM(16, return_sequences=True))(x)
    x = layers.Bidirectional(layers.LSTM(16))(x)

    outputs = layers.Dense(1, activation="sigmoid")(x)

    model = keras.Model(inputs, outputs)
    model.compile(loss='binary_crossentropy', metrics=['accuracy', 'Recall'], optimizer='adam')
    model.fit(x=X_train_indice, y=y_train_indice.values, validation_data=(X_test_indice, y_test_indice.values),
                  epochs=2, batch_size=1024, verbose=1)

    history = model.history.history
    
    res_LSTM_report = res_LSTM_report.append({'rand':rand, 'trainScore_accuracy':history['accuracy'][1],
                      'trainScore_recall':history['recall'][1],
                      'testScore_accuracy':history['val_accuracy'][1],
                      'testScore_recall':history['val_recall'][1],}, ignore_index=True)

Epoch 1/2
Epoch 2/2
Epoch 1/2
Epoch 2/2
Epoch 1/2
Epoch 2/2
Epoch 1/2
Epoch 2/2
Epoch 1/2
Epoch 2/2


In [None]:
res_LSTM_report.describe()

Unnamed: 0,rand,trainScore_accuracy,trainScore_recall,testScore_accuracy,testScore_recall
count,5.0,5.0,5.0,5.0,5.0
mean,30.0,0.728206,0.73815,0.702303,0.721706
std,15.811388,0.001119,0.002524,0.001251,0.021069
min,10.0,0.726397,0.734531,0.701034,0.70094
25%,20.0,0.72829,0.736631,0.701154,0.706265
50%,30.0,0.728365,0.739031,0.70251,0.713052
75%,40.0,0.728503,0.739924,0.702762,0.74061
max,50.0,0.729477,0.740632,0.704057,0.747662


### d) Transformer

In [None]:
## Implement a transformer blocker

class TransformerBlock(layers.Layer):
    def __init__(self, embed_dim, num_heads, ff_dim, rate=0.1):
        super().__init__()
        self.att = layers.MultiHeadAttention(num_heads=num_heads, key_dim=embed_dim)
        self.ffn = keras.Sequential(
            [layers.Dense(ff_dim, activation="relu"), layers.Dense(embed_dim),]
        )
        self.layernorm1 = layers.LayerNormalization(epsilon=1e-6)
        self.layernorm2 = layers.LayerNormalization(epsilon=1e-6)
        self.dropout1 = layers.Dropout(rate)
        self.dropout2 = layers.Dropout(rate)

    def call(self, inputs, training):
        attn_output = self.att(inputs, inputs)
        attn_output = self.dropout1(attn_output, training=training)
        out1 = self.layernorm1(inputs + attn_output)
        ffn_output = self.ffn(out1)
        ffn_output = self.dropout2(ffn_output, training=training)
        return self.layernorm2(out1 + ffn_output)

In [None]:
## Implement embedding layer
## two layers, one for token, one for positions

class TokenAndPositionEmbedding(layers.Layer):
    def __init__(self, maxlen, vocab_size, embed_dim):
        super().__init__()
        self.token_emb = layers.Embedding(input_dim=vocab_size, output_dim=embed_dim)
        self.pos_emb = layers.Embedding(input_dim=maxlen, output_dim=embed_dim)

    def call(self, x):
        maxlen = tf.shape(x)[-1]
        positions = tf.range(start=0, limit=maxlen, delta=1)
        positions = self.pos_emb(positions)
        x = self.token_emb(x)
        return x + positions

In [None]:
embed_dim_list = [32]  # Embedding size for each token
num_heads_list = [2]  # Number of attention heads
ff_dim_list = [32]  # Hidden layer size in feed forward network inside transformer
maxlen = 500
vocab_size = 40000
ep_size = 3
batch_size = 1024

res = pd.DataFrame(columns=['embed_dim', 'num_heads', 'ff_dim',
                            'epoch', 'trainScore', 'testScore'])

for embed_dim in tqdm(embed_dim_list):
    for num_heads in tqdm(num_heads_list):
        for ff_dim in tqdm(ff_dim_list):

            inputs = layers.Input(shape=(maxlen,))

            embedding_layer = TokenAndPositionEmbedding(maxlen, vocab_size, embed_dim)
            x = embedding_layer(inputs)
            transformer_block = TransformerBlock(embed_dim, num_heads, ff_dim)
            x = transformer_block(x)
            x = layers.GlobalAveragePooling1D()(x)
            x = layers.Dropout(0.1)(x)
            x = layers.Dense(20, activation="relu")(x)
            x = layers.Dropout(0.1)(x)

            outputs = layers.Dense(1, activation="sigmoid")(x)

            model = keras.Model(inputs=inputs, outputs=outputs)
            model.compile(loss='binary_crossentropy', metrics=['accuracy'],
                          optimizer='adam')
            model.fit(x=X_train_indice, y=y_train_indice.values,
                      validation_data=(X_test_indice, y_test_indice.values),
                      epochs=ep_size, batch_size=batch_size, verbose=1)

            history = model.history.history

            for ep in np.arange(ep_size):
                res = res.append({'embed_dim':embed_dim, 'num_heads':num_heads,
                                  'ff_dim':ff_dim,'epoch':ep+1,
                                  'trainScore':history['accuracy'][ep],
                                  'testScore':history['val_accuracy'][ep]},
                                  ignore_index=True)

  0%|          | 0/1 [00:00<?, ?it/s]
  0%|          | 0/1 [00:00<?, ?it/s][A

  0%|          | 0/1 [00:00<?, ?it/s][A[A

Epoch 1/3
Epoch 2/3
Epoch 3/3




100%|██████████| 1/1 [1:03:32<00:00, 3812.98s/it][A[A

100%|██████████| 1/1 [1:03:32<00:00, 3812.98s/it][A
100%|██████████| 1/1 [1:03:32<00:00, 3812.98s/it]
