# **Document Clustering**

In [10]:
import pandas as pd
# Remote CSV holding the movie reviews used throughout this notebook
dataset_url = 'https://raw.githubusercontent.com/neeharikasinghsjsu/cmpe255assignments/main/Clustering/dataset/g_document_clustering_movie_reviews.csv'
# Read the CSV straight from the URL into a DataFrame
data = pd.read_csv(filepath_or_buffer=dataset_url)
# Preview the first five rows
print(data.head(n=5))

                                                text  label
0  Very silly movie, filled with stupid one liner...      0
1  As predictable as a Hallmark card, but not wit...      1
2  Only a 9/10 from me, a perfect ten would have ...      1
3  After Watergate, Vietnam and the dark days of ...      0
4  As long as you keep in mind that the productio...      0


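## Data Preprocessing

The `label` column is dropped because clustering is unsupervised. The review text is then lower-cased and stripped of every character that is not a letter, a digit, or whitespace.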

In [11]:
# Drop the label column: only the review text is used from here on.
# errors='ignore' keeps this cell re-runnable once `label` is already gone
# (without it, a second run raises KeyError because `data` is rebound here).
data = data.drop(columns=['label'], errors='ignore')

# Normalize the text: lower-case it, then delete every character that is not
# a lowercase letter, a digit, or whitespace.
# The pattern is a raw string so `\s` reaches the regex engine untouched
# instead of being treated as an (invalid) string escape sequence.
# Note: characters are deleted, not replaced by a space, so "9/10" -> "910".
data['text'] = data['text'].str.lower().str.replace(r'[^a-z0-9\s]', '', regex=True)

# Display the preprocessed data
print(data.head())



                                                text
0  very silly movie filled with stupid one liners...
1  as predictable as a hallmark card but not with...
2  only a 910 from me a perfect ten would have be...
3  after watergate vietnam and the dark days of t...
4  as long as you keep in mind that the productio...


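## Generating Embeddings

Each review is encoded with the pretrained GPT-2 model. The input is truncated to 512 tokens, and the token-level hidden states of the last layer are averaged into a single vector per document.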

In [None]:
from transformers import GPT2Tokenizer, GPT2Model
import torch

# Load the pretrained GPT-2 tokenizer, followed by the matching base model
tokenizer, model = (
    GPT2Tokenizer.from_pretrained('gpt2'),
    GPT2Model.from_pretrained('gpt2'),
)

# Function to generate embeddings
def get_embedding(text):
    """Return a mean-pooled GPT-2 embedding for a single document.

    Parameters
    ----------
    text : str
        Document to embed. It is tokenized with the notebook-level
        ``tokenizer`` and truncated to at most 512 tokens.

    Returns
    -------
    numpy.ndarray
        The model's last hidden state averaged over the token dimension
        (``dim=1``); the leading batch dimension is kept.
    """
    inputs = tokenizer(text, return_tensors='pt', truncation=True, max_length=512)
    # Inference only: with autograd disabled no computation graph is built
    # for each document, which saves memory and time over the whole corpus.
    with torch.no_grad():
        outputs = model(**inputs)
        pooled = outputs.last_hidden_state.mean(dim=1)
    return pooled.numpy()

# Embed every document: run get_embedding on each entry of the text column
text_column = data['text']
embeddings = text_column.apply(get_embedding)


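## Document Clustering

The per-document embeddings are stacked into a single matrix and grouped into five clusters with K-Means. The resulting cluster label is stored alongside each review.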

In [None]:
from sklearn.cluster import KMeans
import numpy as np

# Convert embeddings into a format suitable for clustering:
# stack the per-document vectors row-wise into one 2-D matrix.
# An explicit list is passed because np.vstack expects a sequence of arrays.
embeddings_matrix = np.vstack(list(embeddings))
# The label assignment below needs exactly one row per document.
assert embeddings_matrix.shape[0] == len(data), "expected one embedding row per document"

# Perform clustering
n_clusters = 5
# n_init is pinned explicitly: scikit-learn's default changed between releases
# (and warns about it on some versions), so relying on it makes results
# depend on the installed version. random_state fixes the initialisation.
kmeans = KMeans(n_clusters=n_clusters, n_init=10, random_state=0).fit(embeddings_matrix)

# Assign the cluster labels to the data
data['cluster'] = kmeans.labels_

# Display the clustered data
print(data.head())


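## Visualization and Analysis

The embeddings are projected onto two dimensions with PCA and drawn as a scatter plot, with one series per cluster.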

In [None]:
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA

# Reduce the dimensionality for visualization: keep the first two principal
# components. random_state makes the projection reproducible in case
# scikit-learn selects a randomized SVD solver for this matrix.
pca = PCA(n_components=2, random_state=0)
reduced_embeddings = pca.fit_transform(embeddings_matrix)

# Plain NumPy labels give a positional boolean mask that does not depend on
# the DataFrame's index.
cluster_labels = data['cluster'].to_numpy()

# Plotting the clusters: one scatter series per cluster so each gets its own
# colour and legend entry.
plt.figure(figsize=(10, 8))
for i in range(n_clusters):
    in_cluster = cluster_labels == i  # computed once, reused for both axes
    plt.scatter(reduced_embeddings[in_cluster, 0], reduced_embeddings[in_cluster, 1], label=f'Cluster {i}')
plt.title('Document Clusters')
plt.xlabel('Principal component 1')
plt.ylabel('Principal component 2')
plt.legend()
plt.show()
