<a href="https://colab.research.google.com/github/martinacaramaschi/TPT-PE-thematic-analysis/blob/main/07_Tokenize_MakeBigrams_github.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Display the value of every top-level expression in a cell, not only the last one
# (IPython's default is 'last_expr').
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

# Wider cells. `display` and `HTML` are taken from the public `IPython.display`
# module; importing them from `IPython.core.display` is deprecated.
from IPython.display import display, HTML
display(HTML("<style>.container { width:95% !important; }</style>"))

In [None]:
# Mount Google Drive (Colab only) so that files under /content/drive can be read and written.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Imports
import pickle
import pandas as pd
import numpy as np

import nltk
nltk.download('omw-1.4')
from nltk.corpus import wordnet
from nltk import pos_tag
nltk.download('wordnet',quiet=True)
nltk.download('punkt',quiet=True)   #required by word_tokenize method
nltk.download('averaged_perceptron_tagger',quiet=True) #required by pos_tag method

#Import regular expressions, for data processing
import re

import gensim

[nltk_data] Downloading package omw-1.4 to /root/nltk_data...


True

True

True

True

In [None]:
# Folder on the mounted Drive that holds the inputs and outputs of this notebook.
directory_name = '/content/drive/MyDrive/Colab Notebooks/TPT_PE_review/'

# Load the two filtered per-journal DataFrames (TPT first, then PE).
tpt_df, pe_df = (
    pd.read_pickle(directory_name + pickle_name)
    for pickle_name in ('06_filtered_TPT_V1.pkl', '06_filtered_PE_V1.pkl')
)

In [None]:
# Report the (rows, columns) shape of each journal's DataFrame: TPT first, then PE.
print(tpt_df.shape)
pe_df.shape

(7203, 39)


(6445, 16)

In [None]:
# List the columns available in each source DataFrame (TPT first, then PE).
tpt_df.columns
pe_df.columns

Index(['level_0', 'index', 'filename', 'year', 'title', 'author_list',
       'volume', 'issue', 'processed_len', 'page', 'page_len', 'overlap',
       'pdf2fix', 'pdf_pages', 'overlapnext', 'overlapprev', 'URL',
       'processed', 'raw', 'page_start', 'page_end', 'publisher',
       'filename_orig', 'subtitle', 'authors', 'author', 'editor',
       'reference-count', 'is-referenced-by-count', 'issued', 'link', 'doi',
       'fulltext', 'first_n_words', 'cleaned_fulltext', 'word_count',
       'extracted_text', 'flag_before', 'extracted_text_processedby06'],
      dtype='object')

Index(['index', 'title', 'authors', 'publication_year', 'doi', 'volume',
       'issue', 'fpage', 'lpage', 'pdf_filename', 'zip_filename', 'fulltext',
       'word_count', 'extracted_text', 'flag_before',
       'extracted_text_processedby06'],
      dtype='object')

## Preparing the TPT and PE datasets with a common set of columns

In [None]:
# Create one DataFrame per journal restricted to the same set of columns.
# Column names that differ between the sources are harmonised
# (TPT 'filename' -> 'pdf_filename', PE 'publication_year' -> 'year'),
# and a 'journal' column records where each row came from.
new_tpt_df = (
    tpt_df[['index', 'title', 'authors', 'year', 'doi', 'volume', 'issue',
            'filename', 'fulltext', 'extracted_text', 'extracted_text_processedby06']]
    .rename(columns={'filename': 'pdf_filename'})
    .assign(journal='TPT')
)
new_pe_df = (
    pe_df[['index', 'title', 'authors', 'publication_year', 'doi', 'volume',
           'issue', 'pdf_filename', 'fulltext', 'extracted_text', 'extracted_text_processedby06']]
    .rename(columns={'publication_year': 'year'})
    .assign(journal='PE')
)

In [None]:
# Preview the first rows of the harmonised TPT subset.
new_tpt_df.head()

In [None]:
# Preview the first rows of the harmonised PE subset.
new_pe_df.head()

## Creating the combined dataset

In [None]:
# Stack the TPT and PE subsets into a single frame with a fresh 0..n-1 index.
combined_df = pd.concat([new_tpt_df, new_pe_df], ignore_index=True)

# Shuffle the rows so the two journals are interleaved, then renumber the index.
# The seed is fixed so that a re-run reproduces the same row order: the token
# lists built later in this notebook follow this order, so an unseeded shuffle
# would produce a different (non-comparable) ordering on every run.
SHUFFLE_SEED = 42
combined_pe_tpt_df = (
    combined_df
    .sample(frac=1, random_state=SHUFFLE_SEED)
    .reset_index(drop=True)
)

In [None]:
# Shape check: the row count should equal the TPT rows plus the PE rows.
combined_pe_tpt_df.shape

(13648, 12)

### **Saving combined dataset**

In [None]:
# Persist the shuffled, combined TPT+PE DataFrame next to the input pickles.
combined_pe_tpt_df.to_pickle(directory_name + '06_filtered_combined_V2.pkl')

In [None]:
# Preview the first rows of the shuffled combined dataset.
combined_pe_tpt_df.head()

### **Datasets names**


*   TPT --> new_tpt_df
*   PE --> new_pe_df
*   TPT + PE --> combined_pe_tpt_df



## **Removing stopwords and lemmatizing**

Next, we tokenize each document, remove the stopwords and lemmatize the remaining tokens (WordNet lemmatization, not stemming). This leaves us with a list of documents, each of which is a list of word tokens.

In [None]:
# Print the column index of each dataset under its own heading.
for heading, frame in (
    ("TPT:\n", new_tpt_df),
    ("PE:\n", new_pe_df),
    ("TPT + PE:\n", combined_pe_tpt_df),
):
    print(heading, frame.columns)

TPT:
 Index(['index', 'title', 'authors', 'year', 'doi', 'volume', 'issue',
       'pdf_filename', 'fulltext', 'extracted_text',
       'extracted_text_processedby06', 'journal'],
      dtype='object')
PE:
 Index(['index', 'title', 'authors', 'year', 'doi', 'volume', 'issue',
       'pdf_filename', 'fulltext', 'extracted_text',
       'extracted_text_processedby06', 'journal'],
      dtype='object')
TPT + PE:
 Index(['index', 'title', 'authors', 'year', 'doi', 'volume', 'issue',
       'pdf_filename', 'fulltext', 'extracted_text',
       'extracted_text_processedby06', 'journal'],
      dtype='object')


In [None]:
# Name of the DataFrame column whose text is tokenized in the cells below.
field = 'extracted_text_processedby06'

In [None]:
def sent_to_words(list_sentences):
    """Tokenize every document in ``list_sentences``.

    Each item is converted with ``str()`` and passed to
    ``gensim.utils.simple_preprocess`` with ``deacc=True`` (accent marks are
    removed from the tokens).

    Returns a list with one token list per input item, in input order.
    """
    tokenized_docs = []
    for sentence in list_sentences:
        tokenized_docs.append(gensim.utils.simple_preprocess(str(sentence), deacc=True))
    return tokenized_docs

def remove_stopwords(tokens):
    """Drop stopwords from every document in ``tokens``.

    Each item is converted with ``str()`` and tokenized again with
    ``gensim.utils.simple_preprocess`` (default options) before filtering, so an
    already-tokenized list is turned back into text and split once more.
    Words contained in the module-level ``STOPWORDS`` set are discarded.

    Returns a list with one filtered token list per input item, in input order.
    """
    filtered_docs = []
    for doc in tokens:
        kept_words = [
            word
            for word in gensim.utils.simple_preprocess(str(doc))
            if word not in STOPWORDS
        ]
        filtered_docs.append(kept_words)
    return filtered_docs

def get_wordnet_pos(word):
    """Return the WordNet POS constant to pass to ``lemmatize()`` for ``word``.

    The word is tagged on its own with ``pos_tag``; the upper-cased first letter
    of the resulting tag selects adjective (J), noun (N), verb (V) or adverb (R).
    Any other tag falls back to noun.
    """
    _, tag = pos_tag([word])[0]
    initial = tag[0].upper()
    if initial == "J":
        return wordnet.ADJ
    if initial == "V":
        return wordnet.VERB
    if initial == "R":
        return wordnet.ADV
    # "N" and every unrecognised tag map to NOUN.
    return wordnet.NOUN

import functools

@functools.lru_cache(maxsize=None)
def lemmatize_token(token):
    """Return the WordNet lemma of a single ``token``.

    The part of speech handed to the lemmatizer comes from
    ``get_wordnet_pos(token)``, which tags the token in isolation, so the result
    depends only on the token itself. Results are therefore memoised: a corpus
    repeats the same tokens many times, and caching avoids re-running the tagger
    and the lemmatizer for tokens that were already seen.

    ``token`` must be hashable (a ``str``), as required by ``lru_cache``.
    """
    return nltk.stem.WordNetLemmatizer().lemmatize(token, get_wordnet_pos(token))

def lemmatize(token_list):
    """Lemmatize every token of one document.

    Input example: ["he", "matches", "the", "profile"]

    Returns a new list holding ``lemmatize_token(token)`` for each token, in order.
    """
    lemmas = []
    for token in token_list:
        lemmas.append(lemmatize_token(token))
    return lemmas

In [None]:
import gensim
from gensim.parsing.preprocessing import STOPWORDS
from gensim.corpora import Dictionary
!pip install nltk
import nltk
from nltk.corpus import wordnet
from nltk import pos_tag
nltk.download('wordnet',quiet=True)
#nltk.download('punkt',quiet=True)   #required by word_tokenize method
nltk.download('averaged_perceptron_tagger',quiet=True) #required by pos_tag method

In [None]:
#Tokenize documents
# Only the combined TPT+PE corpus is tokenized here; the per-journal calls below are disabled.
#data_words_tpt = sent_to_words(new_tpt_df[field])
#data_words_pe = sent_to_words(new_pe_df[field])
data_words_combined = sent_to_words(combined_pe_tpt_df[field])

In [None]:
%%time
#Remove stopwords
# STOPWORDS is the set that remove_stopwords() filters against.
from gensim.parsing.preprocessing import STOPWORDS
# Only the combined TPT+PE corpus is processed here; the per-journal calls below are disabled.
#data_words_tpt_nostops = remove_stopwords(data_words_tpt)
#data_words_pe_nostops = remove_stopwords(data_words_pe)
data_words_combined_nostops = remove_stopwords(data_words_combined)

CPU times: user 58.8 s, sys: 879 ms, total: 59.6 s
Wall time: 1min 1s


In [None]:
# Fetch the '_eng' variant of the perceptron tagger resource before lemmatizing.
# NOTE(review): presumably this is the resource name pos_tag looks up on newer NLTK releases — confirm against the installed version.
nltk.download('averaged_perceptron_tagger_eng',quiet=True)

True

### **Lemmatizing**

In [None]:
#TPT
%%time
data_words_tpt_lemmatized = [lemmatize(token_list) for token_list in data_words_tpt_nostops]

CPU times: user 11min 2s, sys: 2.53 s, total: 11min 4s
Wall time: 11min 28s


In [None]:

#PE
%%time
data_words_pe_lemmatized = [lemmatize(token_list) for token_list in data_words_pe_nostops]

CPU times: user 11min 50s, sys: 2.25 s, total: 11min 52s
Wall time: 12min 1s


In [None]:
#combined
%%time
data_words_combined_lemmatized = [lemmatize(token_list) for token_list in data_words_combined_nostops]

CPU times: user 22min 44s, sys: 4.17 s, total: 22min 48s
Wall time: 23min 9s


Checking the number of documents in the lemmatized combined dataset (it should match the number of rows of the combined DataFrame)

In [None]:
#len(data_words_tpt_lemmatized)
#len(data_words_pe_lemmatized)
# Each processing step yields one token list per input document, so the lemmatized
# corpus must still line up row-for-row with the combined DataFrame.
assert len(data_words_combined_lemmatized) == len(combined_pe_tpt_df), \
    "lemmatized corpus and combined DataFrame have different lengths"
len(data_words_combined_lemmatized)

13648

### **Making bigrams**

In [None]:
# TPT
# Fit a gensim Phrases model on the lemmatized TPT documents (min_count=10)
# and wrap it in a Phraser for application.
phrases = gensim.models.phrases.Phrases(data_words_tpt_lemmatized, min_count=10)
bigram = gensim.models.phrases.Phraser(phrases)

def make_bigrams(list_tokenized_docs):
    """Apply the global ``bigram`` model to every tokenized document.

    Input example: [['He','matches','the','profile']]

    ``bigram`` is looked up when the function is called, so the result depends
    on whichever model was most recently assigned to that name.
    """
    merged_docs = []
    for doc in list_tokenized_docs:
        merged_docs.append(bigram[doc])
    return merged_docs

# Form bigrams for the TPT corpus
data_words_bigrams_tpt = make_bigrams(data_words_tpt_lemmatized)

In [None]:
# PE
# Fit a gensim Phrases model on the lemmatized PE documents (min_count=10)
# and wrap it in a Phraser for application.
phrases = gensim.models.phrases.Phrases(data_words_pe_lemmatized, min_count=10)
bigram = gensim.models.phrases.Phraser(phrases)

def make_bigrams(list_tokenized_docs):
    """Apply the global ``bigram`` model to every tokenized document.

    Input example: [['He','matches','the','profile']]

    ``bigram`` is looked up when the function is called, so the result depends
    on whichever model was most recently assigned to that name.
    """
    merged_docs = []
    for doc in list_tokenized_docs:
        merged_docs.append(bigram[doc])
    return merged_docs

# Form bigrams for the PE corpus
data_words_bigrams_pe = make_bigrams(data_words_pe_lemmatized)

In [None]:
# combined
# Fit a gensim Phrases model on the lemmatized TPT+PE documents (min_count=10)
# and wrap it in a Phraser for application.
phrases = gensim.models.phrases.Phrases(data_words_combined_lemmatized, min_count=10)
bigram = gensim.models.phrases.Phraser(phrases)

def make_bigrams(list_tokenized_docs):
    """Apply the global ``bigram`` model to every tokenized document.

    Input example: [['He','matches','the','profile']]

    ``bigram`` is looked up when the function is called, so the result depends
    on whichever model was most recently assigned to that name.
    """
    merged_docs = []
    for doc in list_tokenized_docs:
        merged_docs.append(bigram[doc])
    return merged_docs

# Form bigrams for the combined corpus
data_words_bigrams_combined = make_bigrams(data_words_combined_lemmatized)

### **Saving bigrams**

In [None]:
# Persist the bigram-merged token lists of the combined corpus.
# The per-journal dumps are kept below for reference but stay disabled.
#with open(directory_name+'07_bigrams_TPT_V1.pkl','wb') as output: pickle.dump(data_words_bigrams_tpt,output)
#with open(directory_name+'07_bigrams_PE_V1.pkl','wb') as output: pickle.dump(data_words_bigrams_pe,output)
with open(directory_name+'07_bigrams_combined_V2.pkl','wb') as output:
    pickle.dump(data_words_bigrams_combined, output)