<a href="https://colab.research.google.com/github/martinacaramaschi/TPT-PE-thematic-analysis/blob/main/06_Filtering_github.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Data Filtering for TPT/PE Articles

This notebook takes the scraped article text, from which unwanted articles (such as announcements) and overlapping text have already been removed, and applies additional filtering in preparation for lemmatization and bigram creation.

In [None]:
# Echo the value of every top-level expression in a cell, not only the last one
# (IPython's default is 'last_expr').
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

# Widen the notebook cells. `display` and `HTML` come from the public
# `IPython.display` module; importing `display` from `IPython.core.display`
# is deprecated.
from IPython.display import display, HTML
display(HTML("<style>.container { width:95% !important; }</style>"))

In [None]:
# Mount Google Drive into the Colab filesystem at /content/drive so the
# pickle files used below can be read and written.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Imports
import pickle
import pandas as pd
import numpy as np

#Import regular expressions, for data processing
import re

import gensim
from gensim.parsing.preprocessing import STOPWORDS
from gensim.corpora import Dictionary
import nltk
from nltk.corpus import wordnet
from nltk import pos_tag
nltk.download('wordnet',quiet=True)
#nltk.download('punkt',quiet=True)   #required by word_tokenize method
nltk.download('averaged_perceptron_tagger',quiet=True) #required by pos_tag method

## Reading the datafile

First, we import the pickle files that hold the scraped PDF data (one for TPT and one for PE). Each file is loaded into its own Pandas dataframe.

In [None]:
# Drive folder that holds the input pickle files.
directory_name = '/content/drive/MyDrive/Colab Notebooks/TPT_PE_review/'

# Load the TPT metadata and move its current index into a regular column.
tpt_df = pd.read_pickle(directory_name + 'TPT_metadata_final.pkl').reset_index()

In [None]:
# Load the Physics Education metadata and move its current index into a regular column.
pe_df = pd.read_pickle(directory_name + 'physics_education_metadata_final.pkl').reset_index()


In [None]:
# List the columns available in the TPT dataframe.
tpt_df.columns

Index(['level_0', 'index', 'filename', 'year', 'title', 'author_list',
       'volume', 'issue', 'processed_len', 'page', 'page_len', 'overlap',
       'pdf2fix', 'pdf_pages', 'overlapnext', 'overlapprev', 'URL',
       'processed', 'raw', 'page_start', 'page_end', 'publisher',
       'filename_orig', 'subtitle', 'authors', 'author', 'editor',
       'reference-count', 'is-referenced-by-count', 'issued', 'link', 'doi',
       'fulltext', 'first_n_words', 'cleaned_fulltext', 'word_count',
       'extracted_text', 'flag_before'],
      dtype='object')

In [None]:
# Preview the first rows of the TPT dataframe.
tpt_df.head()

In [None]:
# List the columns available in the PE dataframe.
pe_df.columns

Index(['index', 'title', 'authors', 'publication_year', 'doi', 'volume',
       'issue', 'fpage', 'lpage', 'pdf_filename', 'zip_filename', 'fulltext',
       'word_count', 'extracted_text', 'flag_before'],
      dtype='object')

In [None]:
# Preview the first rows of the PE dataframe.
pe_df.head()

### Start processing

In [None]:
# Name of the dataframe column whose text is cleaned by the cells below.
field='extracted_text'

In [None]:
def ML_process(text):
    """Clean one document's text before tokenization.

    The steps run in this order:

    1. Drop ``(cid:NNN)``-style markers together with the non-word
       character on each side.
    2. Drop a fixed list of all-caps section headings and roman numerals.
    3. Re-join words that were hyphenated across a line break.
    4. Replace whitespace control characters, digits and the ``\\uf0b7``
       bullet with spaces.
    5. Collapse every run of non-alphanumeric characters into one space.
    6. Lower-case the text and repair two known artefacts
       (``tlie`` -> ``the``, ``per cent`` -> ``percent``).

    Parameters
    ----------
    text : str
        The text to clean.

    Returns
    -------
    str
        The cleaned, lower-cased text. Leading or trailing single spaces
        are not stripped.
    """
    filt_text = text

    # Remove 'cid' markers, e.g. "(cid:12)".
    filt_text = re.sub(r'\W(cid:\d{0,3})\W', '', filt_text)

    # Remove some all-caps headings. This only has an effect when the input
    # has not already been lower-cased upstream.
    # "FUTURE RESEARCH" is listed alongside the historical misspelling so
    # that the correctly spelled heading is removed as well.
    filt_text = re.sub(
        r'(?<=\W)(INTRODUCTION|CONCLUSION[S]?|BACKGROUND|ABSTRACT|ANALYSIS|EXPERIMENTAL|METHOD[S]?|METHODOLOGY|MOTIVATION[S]?|PRELIMINARY|RESULTS|APPLICATIONS|CONCLUDING|IMPLEMENTATION|EVALUATION|REMARKS|DISCUSSION[S]?|ACKNOWLEDGEMENTS|FUTURE PLANS|FUTURE WORK|FUTURE RESEARCH|FUTURE REASEARCH|SUMMARY|FIGURE[S]?|FIG|TABLE|I\.|II|III|IV|VI{0,3}|IX|X|XI{0,3})(?=\W)',
        '', filt_text)

    # Re-join words split by a hyphen at a line break ("classifi-\ncation").
    # This must happen before hyphens are turned into spaces below, otherwise
    # there is nothing left to match. It also runs before digits become
    # spaces so that "figure-3 shows" is not glued into "figureshows".
    filt_text = re.sub(r'(?<=[A-Za-z])-\s+(?=[A-Za-z])', '', filt_text)

    # Replace newlines, tabs, etc., digits (\d) and bullet points (\uf0b7)
    # with spaces.
    filt_text = re.sub(r'[\t\n\r\f\v\d\uf0b7]', ' ', filt_text)

    # Collapse every run of characters that are not letters or digits into
    # a single space.
    filt_text = re.sub(r'[^A-Za-z0-9]+', ' ', filt_text)

    # To lower case.
    filt_text = filt_text.lower()

    # tlie -> the
    filt_text = re.sub(' tlie ', ' the ', filt_text)
    # per cent -> percent
    filt_text = re.sub(' per cent ', ' percent ', filt_text)
    return filt_text

Now we apply the text processing to both datasets, storing the processed text in a new `extracted_text_processedby06` column of each dataframe.

In [None]:
# Clean every TPT document and keep the result in a new column.
tpt_df['extracted_text_processedby06'] = tpt_df[field].map(ML_process)

In [None]:
# Clean every PE document and keep the result in a new column.
pe_df['extracted_text_processedby06'] = pe_df[field].map(ML_process)

In [None]:
# Write both dataframes back to the Drive folder as pickle files.
for frame, out_name in ((tpt_df, '06_filtered_TPT_V1.pkl'),
                        (pe_df, '06_filtered_PE_V1.pkl')):
    frame.to_pickle(directory_name + out_name)

### Continue to 07 Tokenize_MakeBigrams.ipynb