In [1]:
import pandas as pd
import os
import numpy as np
import torch
from transformers import BertTokenizer, BertModel
model=BertModel.from_pretrained('bert-base-uncased',output_hidden_states=True)
tokenizer=BertTokenizer.from_pretrained('bert-base-uncased')
import nltk
from nltk.corpus import stopwords
stop=set(stopwords.words('english'))
from nltk.tokenize import word_tokenize
from nltk.tokenize import sent_tokenize
import spacy
from gensim.models.word2vec import Word2Vec
nlp=spacy.load('en_core_web_sm')
import plotly.express as px
# from umap import UMAP
import warnings
import re
warnings.filterwarnings('ignore')
from tqdm import tqdm

  from .autonotebook import tqdm as notebook_tqdm
Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.transform.dense.bias', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight', 'cls.predictions.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [2]:
device=torch.device('mps')
model.to(device)
device

device(type='mps')

In [3]:
book={}
replace_dic={}
idx=-1
for index, val in enumerate(os.listdir('uncle-anti-tom')):
    if val.endswith('.txt'):
        idx+=1
        with open('uncle-anti-tom/'+val, 'r') as f:
            text = f.read()
            book[val]=text
        replace_dic[idx]=val

In [4]:
for tom in os.listdir('uncle-tom'):
    if tom.endswith('txt'):
        with open('uncle-tom/' + tom, 'r') as f:
            text = f.read()
            book[tom]=text
replace_dic[21]='uncle-tom'

In [5]:
df=pd.DataFrame.from_dict(book, orient='index').T
df.reset_index(drop=True, inplace=True)

In [6]:
clean_book={}
for col in tqdm(df.columns):
    sent=sent_tokenize(df[col][0])
    clean_sent= [re.sub(r'#|\n', '', sentence) for sentence in sent]
    alpha_sent = [re.sub(r'[^a-zA-Z\s]', '', sentence) for sentence in clean_sent]
    clean_book[col]=alpha_sent

  0%|          | 0/22 [00:00<?, ?it/s]

100%|██████████| 22/22 [00:07<00:00,  3.00it/s]


In [7]:
def bert_index_tokenizer(sentences:list):
    indexed_tokens_list=[]
    for sentence in sentences:
        marked_sentence="[CLS] "+sentence+" [SEP]"
        tokenized_sentence=tokenizer.tokenize(marked_sentence)
        indexed_tokens=tokenizer.convert_tokens_to_ids(tokenized_sentence)
        indexed_tokens_list.append(indexed_tokens)
    return indexed_tokens_list

In [8]:
def bert_word_tokenizer(sentences:list):
    tokenized_list=[]
    for sentence in sentences:
        marked_text = "[CLS] " + sentence + " [SEP]"
        tokenized_sentence=tokenizer.tokenize(marked_text)
        tokenized_list.append(tokenized_sentence)
    return tokenized_list

In [9]:
def bert_vector(list_of_index_token:list):
    book_vec_dic={}
    for i in range(0, len(list_of_index_token)):
        tokens_tensor = torch.tensor([list_of_index_token[i]]).to(device)
        with torch.no_grad():
            outputs=model(tokens_tensor)
            hidden_states=outputs[2]
            book_vec_dic[i]=torch.stack(hidden_states).sum(0).cpu().numpy()
    return book_vec_dic

In [10]:
#85mins
book_vec={}
for title in tqdm(list(clean_book.keys())):#[:2]: # upto two books
    book_vec[title]=bert_vector(bert_index_tokenizer(clean_book[title]))

100%|██████████| 22/22 [1:16:42<00:00, 209.20s/it]


In [11]:
book_word={}
for title in tqdm(list(clean_book.keys())):#[:2]: #upto two books
    book_word[title]=bert_word_tokenizer(clean_book[title])

100%|██████████| 22/22 [00:18<00:00,  1.19it/s]


In [12]:
entire=[]
for title in tqdm(list(book_vec.keys())): #title
    book_vec_df=[]
    for sentence in range(0, len(book_vec[title])): #sentence
        word_vec_df=[]
        for word in range(0, book_vec[title][sentence].shape[1]):
            df_to_append=pd.DataFrame(book_vec[title][sentence][:, word, :])
            df_to_append['title']=title
            df_to_append['word']=book_word[title][sentence][word]
            word_vec_df.append(df_to_append)
        book_vec_df.append(pd.concat(word_vec_df))
    book_to_append=pd.concat(book_vec_df).reset_index(drop=True)
    value_counts=book_to_append['word'].value_counts()
    value_more_than_ten=value_counts[value_counts>10].index
    book_to_append=book_to_append[book_to_append['word'].isin(value_more_than_ten)]
    entire.append(book_to_append)

  0%|          | 0/22 [00:00<?, ?it/s]

100%|██████████| 22/22 [15:07<00:00, 41.24s/it]


In [13]:
pd.concat(entire).reset_index(drop=True)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,760,761,762,763,764,765,766,767,title,word
0,-7.478304,-4.756771,-5.958851,-5.235272,2.862213,2.393917,-2.295559,0.520085,1.471202,-8.430141,...,-3.867614,-2.004578,5.145257,-5.758963,0.948897,0.619056,-1.807147,7.596557,randolph.txt,[CLS]
1,-0.537280,-0.226843,-0.334886,0.182568,-3.751504,-1.240217,-0.563245,-1.090282,-0.145662,-1.761338,...,0.793860,-0.423727,-0.406164,1.231070,-1.450891,-0.295335,1.091572,-0.884145,randolph.txt,[SEP]
2,3.045434,-3.357082,-3.924592,-3.129404,-2.760748,-0.652092,-1.263986,4.285102,3.265141,-8.454443,...,-3.207771,-3.413462,1.804200,-4.299966,3.043957,-1.140223,1.891386,6.512798,randolph.txt,[CLS]
3,-0.016290,-4.579349,-4.142798,-4.135424,-3.207198,1.268065,1.603544,5.255670,5.241398,-6.211156,...,6.130839,2.072432,1.286653,4.199144,-0.260763,-1.543000,0.816705,-0.013712,randolph.txt,the
4,-13.982450,2.749680,-4.193947,-2.990233,0.257156,2.482200,6.614248,-1.626773,4.822195,-1.589067,...,-2.227118,2.169677,8.157495,7.053337,5.976264,-1.611567,0.497688,3.584454,randolph.txt,of
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2244458,-8.757732,-0.626862,11.322623,0.229458,3.742632,-5.954138,9.657133,-2.302290,1.469036,3.032384,...,-5.699370,8.967522,-4.547657,-5.137463,2.763270,-14.738283,6.207281,1.850708,uncle-tom.txt,##s
2244459,-4.813586,6.151249,4.955924,1.011541,0.078901,-1.188979,8.478818,-0.837381,2.622007,1.865219,...,-1.180259,8.117636,2.415400,-2.124817,-2.632246,-5.830853,0.276635,-1.769075,uncle-tom.txt,the
2244460,-8.994316,10.452461,11.279652,-5.258018,9.092545,-3.832115,21.878525,0.100974,-7.190928,2.664742,...,-1.071562,5.628625,8.744093,3.669634,-2.472392,2.272782,-3.201776,5.471878,uncle-tom.txt,of
2244461,-0.943445,10.737455,-0.102151,-8.319107,8.044221,-9.827887,11.839226,5.585310,-11.499287,-9.764706,...,-15.822217,8.578966,7.323221,-17.004139,2.464071,2.511794,-5.484730,2.834504,uncle-tom.txt,god


In [14]:
from sklearn.decomposition import PCA
pca = PCA(n_components=2)
components = pca.fit_transform(pd.concat(entire).loc[:, 0:767])
# umap_2d=UMAP(n_components=2, random_state=42).fit_transform(pd.concat(entire).loc[:, 0:767])

In [15]:
components_df=pd.DataFrame(components)
components_df.rename(columns={0:'x', 1:'y'}, inplace=True)

In [16]:
reduced_merge=pd.merge(components_df, pd.concat(entire).reset_index(drop=True), left_index=True, right_index=True)

In [17]:
reduced_merge.head(2)

Unnamed: 0,x,y,0,1,2,3,4,5,6,7,...,760,761,762,763,764,765,766,767,title,word
0,-42.078911,107.210312,-7.478304,-4.756771,-5.958851,-5.235272,2.862213,2.393917,-2.295559,0.520085,...,-3.867614,-2.004578,5.145257,-5.758963,0.948897,0.619056,-1.807147,7.596557,randolph.txt,[CLS]
1,63.213482,59.882702,-0.53728,-0.226843,-0.334886,0.182568,-3.751504,-1.240217,-0.563245,-1.090282,...,0.79386,-0.423727,-0.406164,1.23107,-1.450891,-0.295335,1.091572,-0.884145,randolph.txt,[SEP]


In [18]:
no_masking=reduced_merge[(~reduced_merge['word'].str.contains('[CLS]]'))&(~reduced_merge['word'].str.contains('##'))&(~reduced_merge['word'].str.contains('[SEP]'))]

In [19]:
no_masking.head(2)

Unnamed: 0,x,y,0,1,2,3,4,5,6,7,...,760,761,762,763,764,765,766,767,title,word
3,-53.226841,-0.705614,-0.01629,-4.579349,-4.142798,-4.135424,-3.207198,1.268065,1.603544,5.25567,...,6.130839,2.072432,1.286653,4.199144,-0.260763,-1.543,0.816705,-0.013712,randolph.txt,the
4,-39.123138,-19.053034,-13.98245,2.74968,-4.193947,-2.990233,0.257156,2.4822,6.614248,-1.626773,...,-2.227118,2.169677,8.157495,7.053337,5.976264,-1.611567,0.497688,3.584454,randolph.txt,of


In [25]:
no_masking.reset_index(drop=True, inplace=True)
new_column_names = [str(col) for col in no_masking.columns]
no_masking.columns = new_column_names
no_masking.to_feather('anti-tom-no-masking.feather')