In [None]:
# Required Libraries
from nltk import ngrams
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import SnowballStemmer
import re
import numpy as np
import pandas as pd

# English stopwords from the NLTK corpus, materialised once as a set so the
# `not in stop_words` filters in the functions below are plain set-membership tests.
# NOTE(review): presumably requires the NLTK "stopwords" corpus to be available
# locally (e.g. via nltk.download('stopwords')) — confirm for the target environment.
stop_words = set(stopwords.words('english'))

# Method to remove special characters
def remove_special_characters(text):
    """Strip every character that is not an ASCII letter, digit or whitespace.

    Args:
        text: The string to clean.

    Returns:
        ``text`` with all other characters (punctuation, symbols, underscores,
        non-ASCII letters) removed. Whitespace is left untouched.
    """
    # The range must be ``A-Z``: the ASCII span ``A-z`` also covers
    # ``[ \ ] ^ _`` and the backtick, which would then survive the cleanup
    # (and a stray ``_`` collides with the n-gram separator used downstream).
    pattern = r'[^a-zA-Z0-9\s]'
    return re.sub(pattern, '', text)

# Method to tokenize and remove stopwords from the text
def tokenize_and_remove_stopwords(text):
    """Return ``text`` as a space-joined string of lowercased non-stopword tokens.

    The text is first passed through ``remove_special_characters`` and then
    split with NLTK's ``word_tokenize``. Every token is lowercased, and tokens
    whose lowercase form appears in ``stop_words`` are dropped.

    Args:
        text: The raw string to clean.

    Returns:
        The surviving lowercase tokens joined by single spaces.
    """
    kept = []
    for token in word_tokenize(remove_special_characters(text)):
        lowered = token.lower()
        if lowered in stop_words:
            continue
        kept.append(lowered)
    return ' '.join(kept)

# Function to generate bigrams
def generate_bigrams(text):
    """Build the bigrams of ``text`` as one space-separated string.

    The text is stripped of special characters, tokenized with NLTK's
    ``word_tokenize``, lowercased and filtered against ``stop_words``.
    Each pair of consecutive remaining tokens is joined with ``_`` and the
    resulting bigrams are joined with single spaces.

    Args:
        text: The raw string to process.

    Returns:
        A string such as ``"w1_w2 w2_w3"``.
    """
    cleaned = remove_special_characters(text)
    lowered = (token.lower() for token in word_tokenize(cleaned))
    content_words = [token for token in lowered if token not in stop_words]
    return ' '.join('_'.join(pair) for pair in ngrams(content_words, 2))

# Function to generate trigrams
def generate_trigrams(text):
    """Build the trigrams of ``text`` as one space-separated string.

    The text is stripped of special characters, tokenized with NLTK's
    ``word_tokenize``, lowercased and filtered against ``stop_words``.
    Each run of three consecutive remaining tokens is joined with ``_`` and
    the resulting trigrams are joined with single spaces.

    Args:
        text: The raw string to process.

    Returns:
        A string such as ``"w1_w2_w3 w2_w3_w4"``.
    """
    words = []
    for raw in word_tokenize(remove_special_characters(text)):
        word = raw.lower()
        if word not in stop_words:
            words.append(word)
    joined = ['_'.join(triple) for triple in ngrams(words, 3)]
    return ' '.join(joined)

# Preprocess method which calls the above methods and returns cleaned text, bigrams and trigrams
def preprocess_nltk(df):
    """Add cleaned-text, bigram and trigram columns for titles and abstracts.

    Args:
        df: A DataFrame with ``PaperTitle`` and ``Abstract`` columns. It is
            not modified; the work is done on a copy.

    Returns:
        A copy of ``df`` with six extra columns, added in this order:
        ``cleaned_title``, ``title_bigrams``, ``title_trigrams``,
        ``cleaned_abstract``, ``abstract_bigrams``, ``abstract_trigrams``.
        A missing (null) source cell produces ``NaN`` in each derived column.
    """
    df = df.copy()

    def _skip_missing(func):
        # Null cells must never reach the regex/tokenizer code, which only
        # accepts strings; map them to NaN instead. This guard is applied to
        # titles as well as abstracts so both columns behave the same way.
        return lambda value: func(value) if pd.notnull(value) else np.nan

    for source, prefix in (('PaperTitle', 'title'), ('Abstract', 'abstract')):
        column = df[source]
        df['cleaned_' + prefix] = column.map(_skip_missing(tokenize_and_remove_stopwords))
        df[prefix + '_bigrams'] = column.map(_skip_missing(generate_bigrams))
        df[prefix + '_trigrams'] = column.map(_skip_missing(generate_trigrams))

    return df