# Removing Stop Words

### Fetch Dataset

In [1]:
from sklearn.datasets import fetch_20newsgroups
import pandas as pd

# Load every post of the 20 Newsgroups corpus (subset='all') and wrap the
# raw documents in a single-column DataFrame named 'text'.
newsgroups = fetch_20newsgroups(subset='all')
data = newsgroups.data
df = pd.DataFrame(data, columns=['text'])

### Remove Stopwords

In [3]:
# Small hand-picked list of common English function words.
stopwords = [
    'the', 'of', 'and', 'is', 'to', 'in',
    'a', 'from', 'by', 'that', 'with', 'this',
    'as', 'an', 'are', 'its', 'at', 'for',
]

Alternatively, use NLTK's predefined list of English stopwords (this replaces the hand-written list above):

In [5]:
import nltk
nltk.download('stopwords')

# Import the corpus reader under an alias: binding the resulting set to the
# same name as the reader would shadow it, and any later call such as
# `stopwords.words(...)` would fail with AttributeError on a set.
from nltk.corpus import stopwords as nltk_stopwords

# Set of English stopwords (a set gives constant-time membership tests)
stopwords = set(nltk_stopwords.words('english'))

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [6]:
# Function to remove stopwords from text
def remove_stopwords(text, stop_words=None):
    """Return ``text`` with stopword tokens removed.

    The text is split on whitespace, every token whose lowercase form is
    contained in the stopword collection is dropped, and the remaining
    tokens are joined with single spaces. As a consequence, runs of
    whitespace (including newlines) in the input are collapsed to one space.

    Tokens are only lowercased before the lookup; punctuation attached to a
    word (e.g. ``"the,"``) is not stripped, so such tokens are kept.

    Parameters
    ----------
    text : str
        The document to filter.
    stop_words : collection of str, optional
        Lowercase words to drop. When omitted, the notebook-level
        ``stopwords`` variable is used, looked up at call time.

    Returns
    -------
    str
        The filtered text.
    """
    if stop_words is None:
        # Fall back to the notebook-level collection defined in an earlier cell.
        stop_words = stopwords
    return ' '.join(
        token for token in text.split() if token.lower() not in stop_words
    )

# Filter each document element-wise and store the result in a new column,
# leaving the original 'text' column untouched.
df['text_without_stopwords'] = df['text'].map(remove_stopwords)

### Analyze the Frequency of Remaining Words

In [7]:
from collections import Counter

# Concatenate every filtered document into one space-separated string
all_text = ' '.join(df['text_without_stopwords'])

# Tally whitespace-delimited tokens and keep the 20 most frequent ones
word_counts = Counter()
word_counts.update(all_text.split())
most_common_words = word_counts.most_common(20)

print(most_common_words)

[('>', 46322), ('Subject:', 19317), ('From:', 19176), ('Lines:', 18880), ('Organization:', 18155), ('|', 16131), ('-', 15908), ('would', 13948), ('Re:', 13005), ('--', 12964), ('writes:', 12734), ('article', 10884), ('one', 10664), ('|>', 9908), ('like', 8512), ('get', 7731), ('University', 7610), (':', 7448), ('people', 7380), ('know', 7137)]
