<a href="https://colab.research.google.com/github/manoghnagobbilla24/NLP-/blob/main/4078_ass_5_3.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

using nltk

In [10]:
import pandas as pd
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

# Download necessary NLTK data
nltk.download('punkt')
nltk.download('punkt_tab')
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [11]:
# Load the dataset
df = pd.read_csv('arxiv_data.csv', engine='python', nrows=1000)
display(df.head())

Unnamed: 0,titles,summaries,terms
0,Survey on Semantic Stereo Matching / Semantic ...,Stereo matching is one of the widely used tech...,"['cs.CV', 'cs.LG']"
1,FUTURE-AI: Guiding Principles and Consensus Re...,The recent advancements in artificial intellig...,"['cs.CV', 'cs.AI', 'cs.LG']"
2,Enforcing Mutual Consistency of Hard Regions f...,"In this paper, we proposed a novel mutual cons...","['cs.CV', 'cs.AI']"
3,Parameter Decoupling Strategy for Semi-supervi...,Consistency training has proven to be an advan...,['cs.CV']
4,Background-Foreground Segmentation for Interio...,"To ensure safety in automated driving, the cor...","['cs.CV', 'cs.LG']"


In [12]:
def preprocess_text(text):
    # Remove URLs
    text = re.sub(r'http\S+|https\S+|www\S+', '', text)
    # Remove HTML tags
    text = re.sub(r'<.*?>', '', text)
    # Remove social media mentions
    text = re.sub(r'@\w+', '', text)
    # Remove hashtags
    text = re.sub(r'#\w+', '', text)
    # Convert to lowercase
    text = text.lower()
    # Remove emojis (comprehensive pattern)
    emoji_pattern = re.compile(
        "[" # Start of character group
        "\U0001F600-\U0001F64F"  # emoticons
        "\U0001F300-\U0001F5FF"  # symbols & pictographs
        "\U0001F680-\U0001F6FF"  # transport & map symbols
        "\U0001F1E0-\U0001F1FF"  # flags (iOS)
        "\U00002702-\U000027B0"
        "\U000024C2-\U0001F251"
        "]+", flags=re.UNICODE
    )
    text = emoji_pattern.sub(r'', text)
    # Remove special characters (keep alphanumeric and spaces)
    text = re.sub(r'[^a-z0-9\s]', '', text)
    # Normalize whitespace
    text = re.sub(r'\s+', ' ', text).strip()
    return text

df['processed_summaries'] = df['summaries'].apply(preprocess_text)
display(df[['summaries', 'processed_summaries']].head())

Unnamed: 0,summaries,processed_summaries
0,Stereo matching is one of the widely used tech...,stereo matching is one of the widely used tech...
1,The recent advancements in artificial intellig...,the recent advancements in artificial intellig...
2,"In this paper, we proposed a novel mutual cons...",in this paper we proposed a novel mutual consi...
3,Consistency training has proven to be an advan...,consistency training has proven to be an advan...
4,"To ensure safety in automated driving, the cor...",to ensure safety in automated driving the corr...


In [13]:
df['tokenized_summaries'] = df['processed_summaries'].apply(nltk.word_tokenize)
display(df[['processed_summaries', 'tokenized_summaries']].head())

Unnamed: 0,processed_summaries,tokenized_summaries
0,stereo matching is one of the widely used tech...,"[stereo, matching, is, one, of, the, widely, u..."
1,the recent advancements in artificial intellig...,"[the, recent, advancements, in, artificial, in..."
2,in this paper we proposed a novel mutual consi...,"[in, this, paper, we, proposed, a, novel, mutu..."
3,consistency training has proven to be an advan...,"[consistency, training, has, proven, to, be, a..."
4,to ensure safety in automated driving the corr...,"[to, ensure, safety, in, automated, driving, t..."


In [14]:
stop_words = set(stopwords.words('english'))

def remove_stopwords(tokens):
    return [word for word in tokens if word not in stop_words]

df['filtered_summaries'] = df['tokenized_summaries'].apply(remove_stopwords)
display(df[['tokenized_summaries', 'filtered_summaries']].head())

Unnamed: 0,tokenized_summaries,filtered_summaries
0,"[stereo, matching, is, one, of, the, widely, u...","[stereo, matching, one, widely, used, techniqu..."
1,"[the, recent, advancements, in, artificial, in...","[recent, advancements, artificial, intelligenc..."
2,"[in, this, paper, we, proposed, a, novel, mutu...","[paper, proposed, novel, mutual, consistency, ..."
3,"[consistency, training, has, proven, to, be, a...","[consistency, training, proven, advanced, semi..."
4,"[to, ensure, safety, in, automated, driving, t...","[ensure, safety, automated, driving, correct, ..."


In [16]:
lemmatizer = WordNetLemmatizer()

def lemmatize_tokens(tokens):
    return [lemmatizer.lemmatize(word) for word in tokens]

df['lemmatized_summaries'] = df['filtered_summaries'].apply(lemmatize_tokens)
df['clean_summaries'] = df['lemmatized_summaries'].apply(lambda tokens: ' '.join(tokens))
display(df[['lemmatized_summaries', 'clean_summaries']].head())

Unnamed: 0,lemmatized_summaries,clean_summaries
0,"[stereo, matching, one, widely, used, techniqu...",stereo matching one widely used technique infe...
1,"[recent, advancement, artificial, intelligence...",recent advancement artificial intelligence ai ...
2,"[paper, proposed, novel, mutual, consistency, ...",paper proposed novel mutual consistency networ...
3,"[consistency, training, proven, advanced, semi...",consistency training proven advanced semisuper...
4,"[ensure, safety, automated, driving, correct, ...",ensure safety automated driving correct percep...


In [17]:
# Ensure all necessary NLTK data are downloaded
nltk.download('punkt', quiet=True)
nltk.download('stopwords', quiet=True)
nltk.download('wordnet', quiet=True)
nltk.download('omw-1.4', quiet=True) # Open Multilingual Wordnet for better lemmatization support

def nltk_preprocessing_pipeline(text):
    # Initial Text Preprocessing
    def preprocess_text(text):
        text = re.sub(r'http\S+|https\S+|www\S+', '', text) # Remove URLs
        text = re.sub(r'<.*?>', '', text) # Remove HTML tags
        text = re.sub(r'@\w+', '', text) # Remove social media mentions
        text = re.sub(r'#\w+', '', text) # Remove hashtags
        text = text.lower() # Convert to lowercase
        emoji_pattern = re.compile(
            "[" # Start of character group
            "\U0001F600-\U0001F64F"  # emoticons
            "\U0001F300-\U0001F5FF"  # symbols & pictographs
            "\U0001F680-\U0001F6FF"  # transport & map symbols
            "\U0001F1E0-\U0001F1FF"  # flags (iOS)
            "\U00002702-\U000027B0"
            "\U000024C2-\U0001F251"
            "]+", flags=re.UNICODE
        )
        text = emoji_pattern.sub(r'', text)
        text = re.sub(r'[^a-z0-9\s]', '', text) # Remove special characters
        text = re.sub(r'\s+', ' ', text).strip() # Normalize whitespace
        return text

    processed_text = preprocess_text(text)

    # Word Tokenization
    tokenized_words = nltk.word_tokenize(processed_text)

    # Stopword Removal
    stop_words = set(stopwords.words('english'))
    filtered_words = [word for word in tokenized_words if word not in stop_words]

    # Lemmatization
    lemmatizer = WordNetLemmatizer()
    lemmatized_words = [lemmatizer.lemmatize(word) for word in filtered_words]

    # Rejoin words
    return ' '.join(lemmatized_words)

# Apply the unified pipeline to the original 'summaries' column
df['clean_summaries_pipeline'] = df['summaries'].apply(nltk_preprocessing_pipeline)

# Display to compare with the step-by-step result
display(df[['summaries', 'clean_summaries', 'clean_summaries_pipeline']].head())

Unnamed: 0,summaries,clean_summaries,clean_summaries_pipeline
0,Stereo matching is one of the widely used tech...,stereo matching one widely used technique infe...,stereo matching one widely used technique infe...
1,The recent advancements in artificial intellig...,recent advancement artificial intelligence ai ...,recent advancement artificial intelligence ai ...
2,"In this paper, we proposed a novel mutual cons...",paper proposed novel mutual consistency networ...,paper proposed novel mutual consistency networ...
3,Consistency training has proven to be an advan...,consistency training proven advanced semisuper...,consistency training proven advanced semisuper...
4,"To ensure safety in automated driving, the cor...",ensure safety automated driving correct percep...,ensure safety automated driving correct percep...


In [18]:
lemmatizer = WordNetLemmatizer()

def lemmatize_words(tokens):
    return [lemmatizer.lemmatize(word) for word in tokens]

df['lemmatized_summaries'] = df['filtered_summaries'].apply(lemmatize_words)
display(df[['filtered_summaries', 'lemmatized_summaries']].head())

Unnamed: 0,filtered_summaries,lemmatized_summaries
0,"[stereo, matching, one, widely, used, techniqu...","[stereo, matching, one, widely, used, techniqu..."
1,"[recent, advancements, artificial, intelligenc...","[recent, advancement, artificial, intelligence..."
2,"[paper, proposed, novel, mutual, consistency, ...","[paper, proposed, novel, mutual, consistency, ..."
3,"[consistency, training, proven, advanced, semi...","[consistency, training, proven, advanced, semi..."
4,"[ensure, safety, automated, driving, correct, ...","[ensure, safety, automated, driving, correct, ..."


using spacy

In [19]:
import pandas as pd
import re
import spacy

# Load the spaCy English model
# If 'en_core_web_sm' is not downloaded, uncomment and run: !python -m spacy download en_core_web_sm
try:
    nlp = spacy.load('en_core_web_sm')
except OSError:
    print("Downloading spaCy model 'en_core_web_sm'...")
    !python -m spacy download en_core_web_sm
    nlp = spacy.load('en_core_web_sm')

print("spaCy model loaded successfully!")

spaCy model loaded successfully!


In [20]:
# Load the dataset
df_spacy = pd.read_csv('arxiv_data.csv', engine='python', nrows=1000)
display(df_spacy.head())

Unnamed: 0,titles,summaries,terms
0,Survey on Semantic Stereo Matching / Semantic ...,Stereo matching is one of the widely used tech...,"['cs.CV', 'cs.LG']"
1,FUTURE-AI: Guiding Principles and Consensus Re...,The recent advancements in artificial intellig...,"['cs.CV', 'cs.AI', 'cs.LG']"
2,Enforcing Mutual Consistency of Hard Regions f...,"In this paper, we proposed a novel mutual cons...","['cs.CV', 'cs.AI']"
3,Parameter Decoupling Strategy for Semi-supervi...,Consistency training has proven to be an advan...,['cs.CV']
4,Background-Foreground Segmentation for Interio...,"To ensure safety in automated driving, the cor...","['cs.CV', 'cs.LG']"


In [21]:
def preprocess_text_spacy(text):
    # Remove URLs
    text = re.sub(r'http\S+|https\S+|www\S+', '', text)
    # Remove HTML tags
    text = re.sub(r'<.*?>', '', text)
    # Remove social media mentions
    text = re.sub(r'@\w+', '', text)
    # Remove hashtags
    text = re.sub(r'#\w+', '', text)
    # Convert to lowercase
    text = text.lower()
    # Remove emojis (comprehensive pattern)
    emoji_pattern = re.compile(
        "[" # Start of character group
        "\U0001F600-\U0001F64F"  # emoticons
        "\U0001F300-\U0001F5FF"  # symbols & pictographs
        "\U0001F680-\U0001F6FF"  # transport & map symbols
        "\U0001F1E0-\U0001F1FF"  # flags (iOS)
        "\U00002702-\U000027B0"
        "\U000024C2-\U0001F251"
        "]+", flags=re.UNICODE
    )
    text = emoji_pattern.sub(r'', text)
    # Remove special characters (keep alphanumeric and spaces)
    text = re.sub(r'[^a-z0-9\s]', '', text)
    # Normalize whitespace
    text = re.sub(r'\s+', ' ', text).strip()
    return text

df_spacy['processed_summaries_spacy'] = df_spacy['summaries'].apply(preprocess_text_spacy)
display(df_spacy[['summaries', 'processed_summaries_spacy']].head())

Unnamed: 0,summaries,processed_summaries_spacy
0,Stereo matching is one of the widely used tech...,stereo matching is one of the widely used tech...
1,The recent advancements in artificial intellig...,the recent advancements in artificial intellig...
2,"In this paper, we proposed a novel mutual cons...",in this paper we proposed a novel mutual consi...
3,Consistency training has proven to be an advan...,consistency training has proven to be an advan...
4,"To ensure safety in automated driving, the cor...",to ensure safety in automated driving the corr...


In [22]:
def spacy_pipeline(text):
    doc = nlp(text)
    # Filter out stopwords, punctuation, and non-alphabetic tokens
    lemmas = [token.lemma_.lower() for token in doc if not token.is_stop and not token.is_punct and token.is_alpha]
    return ' '.join(lemmas)

df_spacy['clean_summaries_spacy'] = df_spacy['processed_summaries_spacy'].apply(spacy_pipeline)
display(df_spacy[['summaries', 'clean_summaries_spacy']].head())

Unnamed: 0,summaries,clean_summaries_spacy
0,Stereo matching is one of the widely used tech...,stereo matching widely technique infer depth s...
1,The recent advancements in artificial intellig...,recent advancement artificial intelligence ai ...
2,"In this paper, we proposed a novel mutual cons...",paper propose novel mutual consistency network...
3,Consistency training has proven to be an advan...,consistency training prove advanced semisuperv...
4,"To ensure safety in automated driving, the cor...",ensure safety automated drive correct percepti...
