### Import LIbraries

In [64]:
import pandas as pd
import re
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer, PorterStemmer
from nltk.tokenize import word_tokenize
from nltk.corpus import wordnet
from nltk import pos_tag
import emoji
import nltk
from gensim.models import Phrases
from gensim.models.phrases import Phraser
from nltk.corpus import words


### Load Tweets for Preprocessing


In [65]:
df = pd.read_csv('metoo_tweets_dec2017.csv',usecols=[1])
print(df.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 398670 entries, 0 to 398669
Data columns (total 1 columns):
 #   Column  Non-Null Count   Dtype 
---  ------  --------------   ----- 
 0   text    395561 non-null  object
dtypes: object(1)
memory usage: 3.0+ MB
None


### Filter Tweets taht don't have #METOO tag

In [66]:
df = df.dropna(subset=['text']) # Drop NaN rows. 
#df['text'] = df['text'].str.lower() # Convert to lower case
df = df[df['text'].str.contains('metoo', case=False, na=False)] # Filter rows that have #metoo and drop others. 
print(df.head())
print(df.info())

                                                text
0    American Harem.. #MeToo https://t.co/HjExLJdGuF
1  @johnconyersjr  @alfranken  why have you guys ...
3  Women have been talking about this crap the en...
4  .@BetteMidler please speak to this sexual assa...
5  We can't keep turning a blind eye and pretend ...
<class 'pandas.core.frame.DataFrame'>
Index: 338674 entries, 0 to 398669
Data columns (total 1 columns):
 #   Column  Non-Null Count   Dtype 
---  ------  --------------   ----- 
 0   text    338674 non-null  object
dtypes: object(1)
memory usage: 5.2+ MB
None


#### Preprocessing

In [67]:
def split_hashtag(hashtag):
    # Split based on capitalization patterns
    return ' '.join(re.findall(r'[A-Z]?[a-z]+|[A-Z]+(?![a-z])', hashtag))

df = df.drop_duplicates(subset=['text']).reset_index(drop=True) # remove duplicates
df['text'] = df['text'].apply(lambda x: re.sub(r'(\w)(#)', r'\1 \2', x))
df['text'] = df['text'].apply(lambda x: re.sub(r'#metoo', '', x, flags=re.IGNORECASE)) # Remove MeToo hashtag

df['text'] = df['text'].apply(lambda x:  # Find hashtags and split them into words
    ' '.join([split_hashtag(word[1:]) if word.startswith('#') else word for word in x.split()])
)
df['text'] = df['text'].str.lower() # lower case for easier processing


In [68]:
print(df.info())
print(df.head())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 138179 entries, 0 to 138178
Data columns (total 1 columns):
 #   Column  Non-Null Count   Dtype 
---  ------  --------------   ----- 
 0   text    138179 non-null  object
dtypes: object(1)
memory usage: 1.1+ MB
None
                                                text
0           american harem.. https://t.co/hjexljdguf
1  @johnconyersjr @alfranken why have you guys no...
2  women have been talking about this crap the en...
3  .@bettemidler please speak to this sexual assa...
4  we can't keep turning a blind eye and pretend ...


In [69]:
# Apply transformations to the 'text' column
df['text'] = df['text'].apply(lambda x: 
    emoji.demojize( # Convert emojis to words
        re.sub(r'[^a-zA-Z\s]', '', # REMOVE punctuation and numbers 
                      re.sub(r"\bmetoo\b", '', # REMOVE word metoo
                             re.sub(r"@\w+", '', # REMOVE mentions
                                    re.sub(r'http\S+|www\S+|https\S+', '',x) # REMOVE URLs
                                   
                            )
                     )
              )
    )
)

df['text'] = df['text'].apply(lambda x: re.sub(r'#', '', x)) # Remove hashtags

In [70]:
print(df.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 138179 entries, 0 to 138178
Data columns (total 1 columns):
 #   Column  Non-Null Count   Dtype 
---  ------  --------------   ----- 
 0   text    138179 non-null  object
dtypes: object(1)
memory usage: 1.1+ MB
None


In [71]:
print(df.tail())

                                                     text
138174  chief justice john roberts orders misconduct r...
138175   need to start tweeting after this game ja xvs...
138176            what microsoft learned from our moment 
138177   say victims of sexual harassment in japan via...
138178  chief justice john roberts orders misconduct r...


### Further processing tweets for Topic Modeling

In [72]:
# Tokenizing the words
nltk.download('punkt_tab')
df['text'] = df['text'].apply(word_tokenize)

[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\LENOVO\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


In [75]:
# Lemmatization to put words in its origin
lemmatizer = WordNetLemmatizer()
df['text'] = df['text'].apply(lambda x: [lemmatizer.lemmatize(word) for word in x])
print(df.head())

                                                text
0                                  [american, harem]
1  [why, have, you, guy, not, resigned, yet, libe...
2  [woman, have, been, talking, about, this, crap...
3  [please, speak, to, this, sexual, assault, by,...
4  [we, cant, keep, turning, a, blind, eye, and, ...


In [76]:
# Remove Stop words
stop_words = set(stopwords.words('english'))
df['text'] = df['text'].apply(lambda x: [word for word in x if word not in stop_words])
print(df.head())

                                                text
0                                  [american, harem]
1           [guy, resigned, yet, liberal, hypocrisy]
2  [woman, talking, crap, entire, time, finally, ...
3        [please, speak, sexual, assault, interview]
4  [cant, keep, turning, blind, eye, pretend, isn...


In [77]:
# Remove short words 
df['text'] = df['text'].apply(lambda x: [word for word in x if len(word) > 2])
print(df.head())

                                                text
0                                  [american, harem]
1           [guy, resigned, yet, liberal, hypocrisy]
2  [woman, talking, crap, entire, time, finally, ...
3        [please, speak, sexual, assault, interview]
4  [cant, keep, turning, blind, eye, pretend, isn...


In [23]:
from collections import Counter

# Flatten the list of all tokenized words in the 'text' column
all_words = [word for tokens in df['text'] for word in tokens]
word_counts = Counter(all_words)

# Set a frequency threshold
min_freq = 10 

# Filter the words based on this minimum frequency
df['text'] = df['text'].apply(lambda x: [word for word in x if word_counts[word] >= min_freq])

print(df.head())

                                                text
0                                         [american]
1           [guy, resigned, yet, liberal, hypocrisy]
2  [woman, talking, crap, entire, time, finally, ...
3        [please, speak, sexual, assault, interview]
4  [cant, keep, turning, blind, eye, pretend, isn...


##  Process Single-Word Documents

In [78]:
from gensim.models import Word2Vec
from sklearn.cluster import KMeans
import numpy as np
from collections import defaultdict

# Filter out single-word documents
single_word_docs = df[df['text'].apply(len) == 1]
multi_word_docs = df[df['text'].apply(len) != 1]

# Extract single words (retain duplicates across the dataset)
single_words = [doc[0] for doc in single_word_docs['text']]

# Train Word2Vec on multi-word documents
multi_word_texts = multi_word_docs['text'].tolist()
model = Word2Vec(sentences=multi_word_texts, vector_size=100, min_count=1, workers=4)

# Filter single words to include only those present in the Word2Vec model's vocabulary
single_words = [word for word in single_words if word in model.wv]

# Get word embeddings for single words
word_vectors = np.array([model.wv[word] for word in single_words])

# Cluster single-word embeddings
num_clusters = 20
kmeans = KMeans(n_clusters=num_clusters, random_state=42)
kmeans.fit(word_vectors)

# Group single words into clusters
clusters = defaultdict(list)
for i, word in enumerate(single_words):
    clusters[kmeans.labels_[i]].append(word)

# Remove duplicates within each cluster/document
filtered_clusters = {cluster_id: list(set(words)) for cluster_id, words in clusters.items()}

# Create new documents from filtered clusters
new_documents = [" ".join(words) for words in filtered_clusters.values()]

# Combine new documents with the original multi-word documents
final_documents = multi_word_texts + [doc.split() for doc in new_documents]

# Update the DataFrame with the processed documents
df_processed = pd.DataFrame({'text': final_documents})

# Display the clusters and the processed DataFrame
print("Clusters:")
for cluster_id, words in filtered_clusters.items():
    print(f"Cluster {cluster_id}: {', '.join(words)}")

print("\nProcessed DataFrame:")
print(df_processed)

  super()._check_params_vs_input(X, default_n_init=10)


Clusters:
Cluster 3: sick, anyone, thats, feel, count, going, happened, damon, god, hell, trying, know, understand, cant, happen, anything, shes, away, answer, bullshit, sure, family, course, hand, try, ever, honest, shit, hug, love, used, something, everything, kid, life, nice, got, hey, really, shut, respect, sorry, told, sister, lady, hurt, funny, man, wish, raped, never, wonder, much, harassed, would, even, ill, exactly, well, worse, better, seriously, liberal, friend, actually, getting, damn, yeah, lost, sad, said, doe, wait, thinking, also, hate, joke, abused, everyone, thing, like, dude, girl, though, lol, bad, guess, hear, daughter, remember, mean, seems, done, youre, saying, still, fuck, gon, stay, guy, maybe, fucking, since, theyre, alone
Cluster 16: number, pay, care, problem, truth, real, clear, happens, act, matter, hollywood, abuse, consent, report, everywhere, isnt, attention, case, workplace, bill, way, amp, accusation, stop, continue, must, shame, justice, end, linda, 

### Save the preprocessed and tokenized words

In [79]:
df_processed.to_pickle('tokenized_tweets_lemmatzation.pkl')

### PORTER STEMMER 

In [12]:

stemmer = PorterStemmer() # Initialize Stemmer

df['text'] = df['text'].apply(lambda x: [stemmer.stem(word) for word in x])  #Stemming every word

print(df.head())

                                                text
0                                  [american, harem]
1               [guy, resign, yet, liber, hypocrisi]
2  [women, talk, crap, entir, time, final, someon...
3         [pleas, speak, sexual, assault, interview]
4  [cant, keep, turn, blind, eye, pretend, isnt, ...


In [13]:
# Remove short words and infrequent words
df['text'] = df['text'].apply(lambda x: [word for word in x if len(word) > 2])
print(df.head())

                                                text
0                                  [american, harem]
1               [guy, resign, yet, liber, hypocrisi]
2  [women, talk, crap, entir, time, final, someon...
3         [pleas, speak, sexual, assault, interview]
4  [cant, keep, turn, blind, eye, pretend, isnt, ...


In [66]:
# Create N-Grams for words that don't make sense individually
sentences = df['text'].tolist() # convert tokens back to sentences
bigram = Phrases(sentences, min_count=5, threshold=100) # define phrases
bigram_mod = Phraser(bigram) # initialize Phraser with bigram settings. 
df['text'] = df['text'].apply(lambda x: bigram_mod[x])
print(df.head())

                                                text
0                                  [american, harem]
1               [guy, resign, yet, liber, hypocrisi]
2  [women, talk, crap, entir, time, final, someon...
3         [pleas, speak, sexual, assault, interview]
4  [cant, keep, turn_blind, eye, pretend, isnt, r...


In [28]:
df.to_pickle('tokenized_tweets_stemmer.pkl')

In [14]:
df.to_excel('tokenized_tweets_stemmer.xlsx', index=False)