# PART 2

In [4]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import re

import nltk
nltk.download('punkt')
nltk.download('punkt_tab')
from nltk.tokenize import word_tokenize

# Import the Porter stemmer
from nltk.stem.porter import *

from nltk.corpus import stopwords
nltk.download('stopwords')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\phuon\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\phuon\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\phuon\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

### a. Initial cleaning
For later usage, skip this part and go to part b

In [3]:
# SKIP THIS PART IF DON'T WANT TO WAIT FOR THE DATA TO LOAD
tw_data = pd.read_csv('Twitter_Jan_Mar.csv')
tw_data.head()

KeyboardInterrupt: 

In [None]:
tw_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 500036 entries, 0 to 500035
Data columns (total 6 columns):
 #   Column         Non-Null Count   Dtype  
---  ------         --------------   -----  
 0   date           500036 non-null  object 
 1   id             500030 non-null  object 
 2   content        500030 non-null  object 
 3   username       500002 non-null  object 
 4   like_count     499974 non-null  float64
 5   retweet_count  499974 non-null  float64
dtypes: float64(2), object(4)
memory usage: 22.9+ MB


In [None]:
# drop missing labels
tw_data = tw_data.dropna()

# remove unused columns
tw_data = tw_data.drop(columns=['id', 'username', 'retweet_count'])

# check if there is any missing data left
tw_data.isnull().sum()

date          0
content       0
like_count    0
dtype: int64

In [None]:
# Code copied from part 1 but made some changes on names/values

# Create a new column in our DF that contains token lists instead of raw text
def tokenize_text(df):
  df['tokens'] =  df["content"].apply(lambda x: x.split())

# Remove any URL tokens
def remove_url(df):
  df['tokens'] = df['tokens'].apply(lambda tokens: [word for word in tokens if not re.match(r'http\S+', word)])

# NEW: Remove any additional tokens related to chatgpt
def remove_chatgpt(df):
  chatgpt_terms = {'chatgpt', 'gpt', 'gpt-4', 'gpt4', 'gpt3', 'openai'}

  df['tokens'] = df['tokens'].apply(lambda tokens: [word for word in tokens if word.lower() not in chatgpt_terms and 
                                                    not re.match(r'^[@#]\S+', word)])


# Remove all punctuation (,.?!;:’") and special characters(@, #, +, &, =, $, etc). Also, convert all tokens to lowercase only. 
def add_cleaned_tokens(df):
    cleaned_tokens = []
    for row in df['tokens']:
      cleaned_tokens.append([re.sub(r'[^a-zA-Z0-9]', '', t).lower() for t in row if re.sub(r'[^a-zA-Z0-9]', '', t)])
    df['cleaned_tokens'] = cleaned_tokens

# Stemm tokens by the Porter stememr
def stem_tokens(df):
  stemmer = PorterStemmer()
  df['stemmed_tokens'] = df['cleaned_tokens'].apply(lambda tokens: [stemmer.stem(t) for t in tokens])

# Remove stopwords in english
def remove_stopwords(df):
  sw = stopwords.words('english')
  tokens_no_sw = []
  for row in df['stemmed_tokens']:
    tokens_no_sw.append([t for t in row if t not in sw])
  df['tokens_no_sw'] = tokens_no_sw


In [None]:
# Process the data by going through all the steps from tokenization -> remove special characters -> stemming -> stopwords
def process_data(df):
  tokenize_text(df)
  remove_url(df)
  remove_chatgpt(df)
  add_cleaned_tokens(df)
  stem_tokens(df)
  remove_stopwords(df)
  return df

tw_data = process_data(tw_data)

tw_data.head()

Unnamed: 0,date,content,like_count,tokens,cleaned_tokens,stemmed_tokens,tokens_no_sw
0,2023-03-29 22:58:21+00:00,"Free AI marketing and automation tools, strate...",0.0,"[Free, AI, marketing, and, automation, tools,,...","[free, ai, marketing, and, automation, tools, ...","[free, ai, market, and, autom, tool, strategi,...","[free, ai, market, autom, tool, strategi, coll..."
1,2023-03-29 22:58:18+00:00,@MecoleHardman4 Chat GPT says it’s 15. 😂,0.0,"[Chat, says, it’s, 15., 😂]","[chat, says, its, 15]","[chat, say, it, 15]","[chat, say, 15]"
2,2023-03-29 22:57:53+00:00,https://t.co/FjJSprt0te - Chat with any PDF!\n...,0.0,"[-, Chat, with, any, PDF!, Check, out, how, th...","[chat, with, any, pdf, check, out, how, this, ...","[chat, with, ani, pdf, check, out, how, thi, n...","[chat, ani, pdf, check, thi, new, ai, quickli,..."
3,2023-03-29 22:57:52+00:00,"AI muses: ""In the court of life, we must all f...",0.0,"[AI, muses:, ""In, the, court, of, life,, we, m...","[ai, muses, in, the, court, of, life, we, must...","[ai, muse, in, the, court, of, life, we, must,...","[ai, muse, court, life, must, face, judg, dest..."
4,2023-03-29 22:57:26+00:00,Most people haven't heard of Chat GPT yet.\nFi...,0.0,"[Most, people, haven't, heard, of, Chat, yet.,...","[most, people, havent, heard, of, chat, yet, f...","[most, peopl, havent, heard, of, chat, yet, fi...","[peopl, havent, heard, chat, yet, first, elit,..."


In [None]:
# clear unused column, renamed the final cleaned column
tw_data['cleaned_content'] = tw_data['tokens_no_sw']
tw_data = tw_data.drop(columns=['tokens', 'cleaned_tokens', 'stemmed_tokens', 'tokens_no_sw'])

# convert to a csv file so that we don't have to run and wait raw data again
tw_data.to_csv("twitter_cleaned.csv", index = False)

### b. Vectorization and Clustering

In [5]:
# Run this instead of part (a) for future attempts
tw_data = pd.read_csv('twitter_cleaned.csv')
tw_data

Unnamed: 0,date,content,like_count,cleaned_content
0,2023-03-29 22:58:21+00:00,"Free AI marketing and automation tools, strate...",0.0,"['free', 'ai', 'market', 'autom', 'tool', 'str..."
1,2023-03-29 22:58:18+00:00,@MecoleHardman4 Chat GPT says it’s 15. 😂,0.0,"['chat', 'say', '15']"
2,2023-03-29 22:57:53+00:00,https://t.co/FjJSprt0te - Chat with any PDF!\n...,0.0,"['chat', 'ani', 'pdf', 'check', 'thi', 'new', ..."
3,2023-03-29 22:57:52+00:00,"AI muses: ""In the court of life, we must all f...",0.0,"['ai', 'muse', 'court', 'life', 'must', 'face'..."
4,2023-03-29 22:57:26+00:00,Most people haven't heard of Chat GPT yet.\nFi...,0.0,"['peopl', 'havent', 'heard', 'chat', 'yet', 'f..."
...,...,...,...,...
499969,2023-01-04 07:18:08+00:00,@GoogleAI #LAMDA Versus @OpenAI #ChatGPT ?! Wh...,1.0,"['versu', 'care', 'lamda', 'isnt', 'avail', 'r..."
499970,2023-01-04 07:17:50+00:00,#ChatGPT \n\nSo much #Censorship.\n\nNever tru...,2.0,"['much', 'never', 'trust', 'system', 'dont', '..."
499971,2023-01-04 07:17:20+00:00,all my twitter feed is about ChatGPT and @Open...,3.0,"['twitter', 'feed', 'lol']"
499972,2023-01-04 07:17:08+00:00,I'm quite amazed by Chat GPT. A really promisi...,1.0,"['im', 'quit', 'amaz', 'chat', 'gpt', 'realli'..."


In [None]:
from sklearn.feature_extraction.text import CountVectorizer

def override_fcn(doc):
  # We expect a list of tokens as input
  return doc

# Count Vectorizer
def count_vectorizer(df):
  X_df = df['cleaned_content'].to_numpy()
  vocab_count = X_df.shape[0]

  print(f"The length of vocabulary is: {vocab_count}")

  count_vec = CountVectorizer(
    analyzer='word',
    tokenizer= override_fcn,
    preprocessor= override_fcn,
    token_pattern= None,
    max_features = vocab_count)

  counts_combined = count_vec.fit_transform(X_df)
  counts = counts_combined[:len(X_df)]  # First part: Training data

  print(f"vec: {counts.toarray()}")
  print(f"vec shape: {counts.shape}")
  return counts

In [None]:
vec = count_vectorizer(tw_data)

The length of vocabulary is: 499974
vec: [[ 9 20  9 ...  0  0  0]
 [ 2  6  2 ...  0  1  0]
 [15 32 15 ...  0  0  0]
 ...
 [ 2  6  2 ...  0  0  0]
 [10 22 10 ...  0  0  1]
 [10 22 10 ...  0  1  0]]
vec shape: (499974, 41)


In [None]:
# # Vectorizing using TF-IDF vectors
# from sklearn.feature_extraction.text import TfidfVectorizer

# # TF-IDF Vectorizer
# vectorizer = TfidfVectorizer(lowercase=True,
#                                 #max_features=100,
#                                 # max_df=0.99,
#                                 # min_df=1,
#                                 ngram_range = (1,3),
#                                 stop_words = "english"
#                             )

# # fit the model to the tweet data
# vectors = vectorizer.fit_transform(tw_data)

# vectors.shape

(4, 4)

In [None]:
from sklearn.feature_extraction.text import TfidfTransformer
tfidf = TfidfTransformer()

# TF-IDF Vectorizer
def tfidf_vectorizer(counts):
    tfidf_data = tfidf.fit_transform(counts)
    print(f"TF-IDF vec shape: {tfidf_data.shape}")

    return tfidf_data

In [None]:
# Apply TF-IDF Vectorizer to the train and test data
tfidf_data = tfidf_vectorizer(vec)

TF-IDF vec shape: (499974, 41)


In [None]:
tfidf_data

<499974x41 sparse matrix of type '<class 'numpy.float64'>'
	with 10982348 stored elements in Compressed Sparse Row format>

## THE CODE BELOW IS NOT WORKING

In [7]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.cluster import KMeans

# Define the tokenizer that just returns the tokens (as they are already tokenized)
def override_fcn(doc):
    return doc  # The doc is already tokenized

# Assuming 'cleaned_content' is a column in your dataframe and contains tokenized words
count_vec = CountVectorizer(
    tokenizer=override_fcn,  # Use the pre-tokenized words as-is
    preprocessor=None,  # No need for preprocessing as it's already cleaned
    token_pattern=None,  # No token pattern, since tokens are already passed
    stop_words='english',  # You can still remove stopwords if needed
    ngram_range=(1, 3)  # Example: unigrams, bigrams, trigrams (you can adjust this)
)

# Transform the data into a count matrix
counts_combined = count_vec.fit_transform(tw_data['cleaned_content'])  # Your tokenized column

# Now apply TF-IDF transformation
tfidf = TfidfTransformer()
tfidf_data = tfidf.fit_transform(counts_combined)

# Clustering
k = 34  # Number of clusters

# Fit the KMeans model
model = KMeans(n_clusters=k, init="k-means++", max_iter=300, n_init=1)
model.fit(tfidf_data)

# Sort the centroid coordinates in descending order (most important words in each cluster)
order_centroids = model.cluster_centers_.argsort()[:, ::-1]
terms = count_vec.get_feature_names_out()  # Get the actual feature names (words)

# Write the results to a file
with open("tweet_topic_chatgpt.txt", "w", encoding="utf-8") as f:
    for i in range(k):
        f.write(f"Cluster {i}\n")
        for ind in order_centroids[i, :10]:  # Top 10 words for each cluster
            f.write(f" {terms[ind]}\n")
        f.write("\n\n")




In [52]:
# Clustering
from sklearn.cluster import KMeans

# number of clusters
k = 34

# fit the k means model
model = KMeans(n_clusters=k, init="k-means++", max_iter=300, n_init=1)
model.fit(tfidf_data)

# sorts the centroid coordinates in descending order. most important words (highest weight in the cluster) come first
order_centroids = model.cluster_centers_.argsort()[:, ::-1]
terms = tfidf.get_feature_names_out()

# write results to file
with open ("tweet_topic_chatgpt.txt", "w", encoding="utf-8") as f:
    for i in range(k):
        f.write(f"Cluster {i}")
        f.write("\n")
        for ind in order_centroids[i, :10]:
            f.write (' %s' % terms[ind],)
            f.write("\n")
        f.write("\n")
        f.write("\n")