# EPS711: Text and Sentiment Analysis

## Week 3: Assignment

To start this week's assignment, run below and load the dataset. 

In [7]:
from sklearn.datasets import fetch_20newsgroups
import pandas as pd

# Load the full 20 Newsgroups dataset from sklearn
newsgroups = fetch_20newsgroups(subset='train', remove=('headers', 'footers', 'quotes'))

# Create a pandas DataFrame from the dataset
df = pd.DataFrame({'content': newsgroups.data, 'category': newsgroups.target})

# Add category names for easier understanding
df['category_name'] = df['category'].apply(lambda x: newsgroups.target_names[x])

# Display the initial DataFrame
print("Initial DataFrame with all articles:")
print(df.head())

Initial DataFrame with all articles:
                                             content  category  \
0  I was wondering if anyone out there could enli...         7   
1  A fair number of brave souls who upgraded thei...         4   
2  well folks, my mac plus finally gave up the gh...         4   
3  \nDo you have Weitek's address/phone number?  ...         1   
4  From article <C5owCB.n3p@world.std.com>, by to...        14   

           category_name  
0              rec.autos  
1  comp.sys.mac.hardware  
2  comp.sys.mac.hardware  
3          comp.graphics  
4              sci.space  


In [10]:
# Sample 50 articles per category
df_subset = pd.concat(
    [group.sample(n=50, random_state=12) for _, group in df.groupby('category')]
).reset_index(drop=True)

# Display the subset DataFrame
print("\nDataFrame with 50 articles per category:")
print(df_subset.head())
print(df_subset.tail())

print("\nNumber of articles in each category:")
print(df_subset['category_name'].value_counts())



DataFrame with 50 articles per category:
                                             content  category category_name
0  #>So instead of calling it interest on deposit...         0   alt.atheism
1  \n  Actually, my atheism is based on ignorance...         0   alt.atheism
2  \nProbably we would have much the same problem...         0   alt.atheism
3  frank@D012S658.uucp (Frank O'Dwyer) writes ......         0   alt.atheism
4  \nDidn't you hear?  His address has changed.  ...         0   alt.atheism
                                               content  category  \
995  \n#In <mcclaryC5snpq.KB1@netcom.com> mcclary@n...        19   
996  Walter-\n\nI tried several times in the past t...        19   
997  \nCan we assume from this statement that you a...        19   
998  \nJim, please, that's a lame explanation of th...        19   
999  PSA 145:9  The LORD is good to all: and his  t...        19   

          category_name  
995  talk.religion.misc  
996  talk.religion.misc  
997  talk

In [12]:
# Group by 'category' and sample 50 articles from each category
### df_subset = df.groupby('category').apply(lambda x: x.sample(n=50, random_state=12), include_groups=False).reset_index(drop=True) 
### the line above causes the following error due to "include_groups=False": /var/folders/nv/37z9hwt54flbq13cryjqrzm40000gn/T/ipykernel_60088/1030170394.py:2: DeprecationWarning: DataFrameGroupBy.apply operated on the grouping columns. This behavior is deprecated, and in a future version of pandas the grouping columns will be excluded from the operation. Either pass `include_groups=False` to exclude the groupings or explicitly select the grouping columns after groupby to silence this warning.
### alternative 1: 
import pandas as pd
from sklearn.datasets import fetch_20newsgroups

# Load dataset
newsgroups = fetch_20newsgroups(subset='train', remove=('headers','footers','quotes'))
df = pd.DataFrame({
    'content': newsgroups.data,
    'category': newsgroups.target
})
df['category_name'] = df['category'].apply(lambda i: newsgroups.target_names[i])

# Build df_subset with 50 samples per category (stable across pandas versions)
df_subset = pd.concat(
    [g.sample(n=50, random_state=12) for _, g in df.groupby('category')]
).reset_index(drop=True)

# Sanity checks
print(df_subset.head(2))
print(df_subset['category_name'].value_counts())
### alternative 2:
### df_subset = (df.groupby('category', group_keys=False).apply(lambda x: x.sample(n=50, random_state=12)) .reset_index(drop=True)
# Prevents inclusion of grouping columns
 # Sample 50 rows from each group
### warning message again: /var/folders/nv/37z9hwt54flbq13cryjqrzm40000gn/T/ipykernel_60088/1243261900.py:9: DeprecationWarning: DataFrameGroupBy.apply operated on the grouping columns. This behavior is deprecated, and in a future version of pandas the grouping columns will be excluded from the operation. Either pass `include_groups=False` to exclude the groupings or explicitly select the grouping columns after groupby to silence this warning.
# apply(lambda x: x.sample(n=50, random_state=12))  
### alternative 3: 
### df_subset = (df.groupby('category').apply(lambda x: x.sample(n=50, random_state=12)[['content', 'category', 'category_name']]).reset_index(drop=True))

### alternative 4: which works: 
#df_subset = (pd.concat([group.sample(n=50, random_state=12) for _, group in df.groupby('category')]).reset_index(drop=True))

###/var/folders/nv/37z9hwt54flbq13cryjqrzm40000gn/T/ipykernel_60088/2251538078.py:15: DeprecationWarning: DataFrameGroupBy.apply operated on the grouping columns. This behavior is deprecated, and in a future version of pandas the grouping columns will be excluded from the operation. Either pass `include_groups=False` to exclude the groupings or explicitly select the grouping columns after groupby to silence this warning. .apply(lambda x: x.sample(n=50, random_state=12)[['content', 'category', 'category_name']])


                                             content  category category_name
0  #>So instead of calling it interest on deposit...         0   alt.atheism
1  \n  Actually, my atheism is based on ignorance...         0   alt.atheism
category_name
alt.atheism                 50
comp.graphics               50
talk.politics.misc          50
talk.politics.mideast       50
talk.politics.guns          50
soc.religion.christian      50
sci.space                   50
sci.med                     50
sci.electronics             50
sci.crypt                   50
rec.sport.hockey            50
rec.sport.baseball          50
rec.motorcycles             50
rec.autos                   50
misc.forsale                50
comp.windows.x              50
comp.sys.mac.hardware       50
comp.sys.ibm.pc.hardware    50
comp.os.ms-windows.misc     50
talk.religion.misc          50
Name: count, dtype: int64


The dataset above is borrowed from one of the sklearn datasets. You can learn more about the dataset from this document: [https://scikit-learn.org/stable/datasets/real_world.html#newsgroups-dataset](https://scikit-learn.org/stable/datasets/real_world.html#newsgroups-dataset). 

There are 20 categories of news articles in the dataset. Since the dataset is huge, I loaded only 50 articles per each category, so you have 1,000 articles to work with in total. I have already loaded the data in the dataframe `df_subset[]`, and I will create filenames for easier reference and also see how long each article is as below.

In [10]:
# Add filenames to the DataFrame for easier reference
# Include category name in the filename for better identification
# Add content length for analysis purposes
df_subset['filename'] = df_subset.groupby('category_name').cumcount().astype(str).radd(df_subset['category_name'] + '_')
df_subset['content_length'] = df_subset['content'].apply(len)

# Display the updated DataFrame with filenames
print("\nDataFrame with filenames added:")
print(df_subset.head())
print(df_subset.tail())


DataFrame with filenames added:
                                             content  category category_name  \
0  #>So instead of calling it interest on deposit...         0   alt.atheism   
1  \n  Actually, my atheism is based on ignorance...         0   alt.atheism   
2  \nProbably we would have much the same problem...         0   alt.atheism   
3  frank@D012S658.uucp (Frank O'Dwyer) writes ......         0   alt.atheism   
4  \nDidn't you hear?  His address has changed.  ...         0   alt.atheism   

        filename  content_length  
0  alt.atheism_0            1412  
1  alt.atheism_1             372  
2  alt.atheism_2            1359  
3  alt.atheism_3            1893  
4  alt.atheism_4             191  
                                               content  category  \
995  \n#In <mcclaryC5snpq.KB1@netcom.com> mcclary@n...        19   
996  Walter-\n\nI tried several times in the past t...        19   
997  \nCan we assume from this statement that you a...        19   
998 

### Problem 1. Preprocess the data content and display the preprocessed data in the dataframe with the original data.
This subset of 50 articles per category is now ready for further text representation tasks. Let's first do preprocessing by removing unncessary symbols and stop words, lowercasing the letter, and tokenizing. Also, print the results of the preprocessing in the dataframe. 

In [13]:
# Add your code here
import re
# Simple preprocessing:
# - lowercase
# - remove URLs
# - keep only letters/spaces/apostrophes
# - collapse multiple spaces

url_pattern = re.compile(r'https?://\S+|www\.\S+')

def clean_text(s: str) -> str:
    s = s or ""
    s = s.lower()
    s = url_pattern.sub(' ', s)
    s = re.sub(r"[^a-z\s']", " ", s)   # keep letters, spaces, apostrophes
    s = re.sub(r"\s+", " ", s).strip()
    return s

df_subset['clean'] = df_subset['content'].apply(clean_text)

# Show preview
df_subset[['category_name', 'clean']].head(10)


Unnamed: 0,category_name,clean
0,alt.atheism,so instead of calling it interest on deposits ...
1,alt.atheism,actually my atheism is based on ignorance igno...
2,alt.atheism,probably we would have much the same problems ...
3,alt.atheism,frank d s uucp frank o'dwyer writes while i'll...
4,alt.atheism,didn't you hear his address has changed he can...
5,alt.atheism,if a person gives a well balanced reasoned arg...
6,alt.atheism,so that still leaves the door totally open for...
7,alt.atheism,last night while watching the a m rebroadcast ...
8,alt.atheism,most of post deleted there is an easy way out ...
9,alt.atheism,the quotation marks should enclose laws not mu...


### Problem 2. Create a bag of words using `CountVectorizer` and print the results in the form of a dataframe.

In [15]:
# Add your code from 
from sklearn.feature_extraction.text import CountVectorizer
import pandas as pd

# Initialize CountVectorizer
bow_vect = CountVectorizer(stop_words='english', max_features=5000)

# Fit and transform the clean text
X_bow = bow_vect.fit_transform(df_subset['clean'])

# Convert to DataFrame for easier inspection
bow_df = pd.DataFrame(X_bow.toarray(), columns=bow_vect.get_feature_names_out())

# Show first 10 rows
bow_df.head(10)

Unnamed: 0,aa,aan,abandoned,abc,abiding,ability,able,absolute,absolutely,abstract,...,zealand,zelepukin,zero,zijn,zip,zjp,zo,zone,zorg,zs
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,3,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
6,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
7,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
8,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
9,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


### Problem 3. Create bigrams using `CountVectorizer` and print the results in the form of a dataframe.

In [16]:
# Add your code 
from sklearn.feature_extraction.text import CountVectorizer
import pandas as pd

# Initialize CountVectorizer for bigrams
bigram_vect = CountVectorizer(stop_words='english',
                              ngram_range=(2, 2),   # (min_n, max_n) = only bigrams
                              max_features=5000,    # limit vocabulary size
                              min_df=5)             # only keep bigrams that appear in ≥5 docs

# Fit and transform the clean text
X_bigrams = bigram_vect.fit_transform(df_subset['clean'])

# Convert to DataFrame
bigram_df = pd.DataFrame(X_bigrams.toarray(), columns=bigram_vect.get_feature_names_out())

# Show first 10 rows
bigram_df.head(10)

Unnamed: 0,able run,ac uk,alt atheism,anonymous ftp,answer question,anybody know,appreciated thanks,ask questions,available ftp,berkeley edu,...,video card,want know,want use,washington dc,weeks ago,window manager,windows dos,works fine,year old,years ago
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
6,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
7,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
8,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
9,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


### Problem 4-1. Create TF-IDF Representation using `TfidfVectorizer`

In [17]:
# Add your code here
from sklearn.feature_extraction.text import TfidfVectorizer

# Initialize TfidfVectorizer
tfidf_vect = TfidfVectorizer(stop_words='english', max_features=5000)

# Fit and transform the clean text
X_tfidf = tfidf_vect.fit_transform(df_subset['clean'])

# Show the shape: (#documents, #features)
print("TF-IDF Matrix shape:", X_tfidf.shape)

# Convert to DataFrame for easier inspection (optional — large!)
tfidf_df = pd.DataFrame(X_tfidf.toarray(), columns=tfidf_vect.get_feature_names_out())

# Show first 10 rows
tfidf_df.head(10)

TF-IDF Matrix shape: (1000, 5000)


Unnamed: 0,aa,aan,abandoned,abc,abiding,ability,able,absolute,absolutely,abstract,...,zealand,zelepukin,zero,zijn,zip,zjp,zo,zone,zorg,zs
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.222638,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
8,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


### Problem 4-2. Display top 10 words from 10 random, different articles based on the TF-IDF results, and write down your explanation in your language. What are the 10 words and what does each TF-IDF score mean? Also, make a guess about what each document is about based on the TF-IDF scores. 

In [18]:
# Add your code here
import numpy as np

rng = np.random.default_rng(42)  # reproducible sampling
feature_names = np.array(tfidf_vect.get_feature_names_out())

# Pick 10 random, distinct documents
doc_indices = rng.choice(X_tfidf.shape[0], size=10, replace=False)

def top_terms_sparse_row(row_csr, k=10):
    """
    Given a 1 x N sparse CSR row, return top-k (term, score) pairs
    without densifying the whole vector.
    """
    data = row_csr.data
    idxs = row_csr.indices
    if data.size == 0:
        return []
    top_local = np.argsort(data)[-k:][::-1]
    terms = feature_names[idxs[top_local]]
    scores = data[top_local]
    return list(zip(terms, map(float, scores)))

def guess_topic_from_terms(terms, n=5):
    """Very simple one-line guess from top n terms."""
    return " | ".join([t for t, _ in terms[:n]])

for i, doc_id in enumerate(doc_indices, start=1):
    row = X_tfidf[doc_id]  # 1 x N sparse
    top_terms = top_terms_sparse_row(row, k=10)
    topic = df_subset.loc[doc_id, "category_name"]
    preview = df_subset.loc[doc_id, "content"][:120].replace("\n", " ")

    print(f"\n=== Document {i}  (index {doc_id})  |  Category: {topic} ===")
    print(f"Preview: {preview}...")
    print("Top 10 TF-IDF terms:")
    for term, score in top_terms:
        print(f"  {term:<20s} {score:.4f}")
    print("Topic guess:", guess_topic_from_terms(top_terms))


=== Document 1  (index 85)  |  Category: comp.graphics ===
Preview:  ...for very small values of six and nine....
Top 10 TF-IDF terms:
  values               0.8008
  small                0.5989
Topic guess: values | small

=== Document 2  (index 767)  |  Category: soc.religion.christian ===
Preview: : I may be wrong, but wasn't Jeff Fenholt part of Black Sabbath?  He's a : MAJOR brother in Christ now.  He totally chan...
Top 10 TF-IDF terms:
  sabbath              0.4808
  black                0.3492
  jeff                 0.3301
  christ               0.2525
  wasn                 0.2345
  wrong                0.1973
  bands                0.1709
  band                 0.1651
  witnessing           0.1651
  listening            0.1562
Topic guess: sabbath | black | jeff | christ | wasn

=== Document 3  (index 88)  |  Category: comp.graphics ===
Preview: Organization: "A World of Information at your Fingertips" Keywords:     Craig,  You should still consider the Targa+. I ...
Top 10 

Add your writing here