### Import LIbraries

In [1]:
import pandas as pd
import re
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer, PorterStemmer
from nltk.tokenize import word_tokenize
import emoji
import nltk
from gensim.models import Phrases
from gensim.models.phrases import Phraser
from nltk.corpus import words


### Load Tweets for Preprocessing


In [2]:
df = pd.read_csv('metoo_tweets_dec2017.csv',usecols=[1])
print(df.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 398670 entries, 0 to 398669
Data columns (total 1 columns):
 #   Column  Non-Null Count   Dtype 
---  ------  --------------   ----- 
 0   text    395561 non-null  object
dtypes: object(1)
memory usage: 3.0+ MB
None


### Filter Tweets taht don't have #METOO tag

In [3]:
df = df.dropna(subset=['text']) # Drop NaN rows. 
#df['text'] = df['text'].str.lower() # Convert to lower case
df = df[df['text'].str.contains('metoo', case=False, na=False)]#df = df[df['text'].str.contains('#MeToo')] # Filter rows that have #metoo and drop others. 
print(df.head())
print(df.info())

                                                text
0    American Harem.. #MeToo https://t.co/HjExLJdGuF
1  @johnconyersjr  @alfranken  why have you guys ...
3  Women have been talking about this crap the en...
4  .@BetteMidler please speak to this sexual assa...
5  We can't keep turning a blind eye and pretend ...
<class 'pandas.core.frame.DataFrame'>
Index: 338674 entries, 0 to 398669
Data columns (total 1 columns):
 #   Column  Non-Null Count   Dtype 
---  ------  --------------   ----- 
 0   text    338674 non-null  object
dtypes: object(1)
memory usage: 5.2+ MB
None


#### Preprocessing

In [4]:
def split_hashtag(hashtag):
    # Split based on capitalization patterns
    return ' '.join(re.findall(r'[A-Z]?[a-z]+|[A-Z]+(?![a-z])', hashtag))

df = df.drop_duplicates(subset=['text']).reset_index(drop=True) # remove duplicates
df['text'] = df['text'].apply(lambda x: re.sub(r'#metoo', '', x, flags=re.IGNORECASE)) # Remove MeToo hashtag

df['text'] = df['text'].apply(lambda x:  # Find hashtags and split them into words
    ' '.join([split_hashtag(word[1:]) if word.startswith('#') else word for word in x.split()])
)
df['text'] = df['text'].str.lower() # lower case for easier processing


In [5]:
# Apply transformations to the 'text' column
df['text'] = df['text'].apply(lambda x: 
    emoji.demojize( # Convert emojis to words
        re.sub(r'[^a-zA-Z\s]', '', # REMOVE punctuation and numbers 
                      re.sub(r"\bmetoo\b", '', # REMOVE word metoo
                             re.sub(r"@\w+", '', # REMOVE mentions
                                    re.sub(r'http\S+|www\S+|https\S+', '',x) # REMOVE URLs
                                   
                            )
                     )
              )
    )
)

df['text'] = df['text'].apply(lambda x: re.sub(r'#', '', x)) # Remove hashtags

In [6]:
print(df.iloc[9])

text    black lives matter with yup ok
Name: 9, dtype: object


In [7]:
print(df.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 138179 entries, 0 to 138178
Data columns (total 1 columns):
 #   Column  Non-Null Count   Dtype 
---  ------  --------------   ----- 
 0   text    138179 non-null  object
dtypes: object(1)
memory usage: 1.1+ MB
None


In [8]:
df = df.drop_duplicates(subset=['text']).reset_index(drop=True)
print(df.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 113383 entries, 0 to 113382
Data columns (total 1 columns):
 #   Column  Non-Null Count   Dtype 
---  ------  --------------   ----- 
 0   text    113383 non-null  object
dtypes: object(1)
memory usage: 885.9+ KB
None


In [9]:
print(df.tail())

                                                     text
113378  rt  with the attacks on matt damon today the m...
113379                                          wait oops
113380  my bro claims is just a witchhunt and refuses ...
113381   need to start tweeting after this game ja xvs...
113382   say victims of sexual harassment in japan via...


### Further processing tweets for Topic Modeling

In [10]:
# Tokenizing the words
nltk.download('punkt_tab')
df['text'] = df['text'].apply(word_tokenize)

[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\LENOVO\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


In [11]:
# Remove Stop words
stop_words = set(stopwords.words('english'))
df['text'] = df['text'].apply(lambda x: [word for word in x if word not in stop_words])
print(df.head())

                                                text
0                                  [american, harem]
1          [guys, resigned, yet, liberal, hypocrisy]
2  [women, talking, crap, entire, time, finally, ...
3        [please, speak, sexual, assault, interview]
4  [cant, keep, turning, blind, eye, pretend, isn...


In [39]:
# Lemmatization to put words in its origin
lemmatizer = WordNetLemmatizer()
df['text'] = df['text'].apply(lambda x: [lemmatizer.lemmatize(word) for word in x])
print(df.head())

                                                text
0                                  [american, harem]
1           [guy, resigned, yet, liberal, hypocrisy]
2  [woman, talking, crap, entire, time, finally, ...
3        [please, speak, sexual, assault, interview]
4  [cant, keep, turning, blind, eye, pretend, isn...


In [40]:
# Remove short words and infrequent words
df['text'] = df['text'].apply(lambda x: [word for word in x if len(word) > 2])
print(df.head())

                                                text
0                                  [american, harem]
1           [guy, resigned, yet, liberal, hypocrisy]
2  [woman, talking, crap, entire, time, finally, ...
3        [please, speak, sexual, assault, interview]
4  [cant, keep, turning, blind, eye, pretend, isn...


In [43]:
from collections import Counter

# Flatten the list of all tokenized words in the 'text' column
all_words = [word for tokens in df['text'] for word in tokens]
word_counts = Counter(all_words)

# Set a frequency threshold
min_freq = 10 

# Filter the words based on this minimum frequency
df['text'] = df['text'].apply(lambda x: [word for word in x if word_counts[word] >= min_freq])

print(df.head())

                                                text
0                                         [american]
1           [guy, resigned, yet, liberal, hypocrisy]
2  [woman, talking, crap, entire, time, finally, ...
3        [please, speak, sexual, assault, interview]
4  [cant, keep, turning, blind, eye, pretend, isn...


In [50]:
# Create N-Grams for words that don't make sense individually
sentences = df['text'].tolist() # convert tokens back to sentences
bigram = Phrases(sentences, min_count=5, threshold=100) # define phrases
bigram_mod = Phraser(bigram) # initialize Phraser with bigram settings. 
df['text'] = df['text'].apply(lambda x: bigram_mod[x])
print(df.head())

                                                text
0                                  [american, harem]
1           [guy, resigned, yet, liberal, hypocrisy]
2  [woman, talking, crap, entire, time, finally, ...
3        [please, speak, sexual, assault, interview]
4  [cant, keep, turning, blind_eye, pretend, isnt...


### Save the preprocessed and tokenized words

In [44]:
df.to_pickle('tokenized_tweets_lemmatzation.pkl')

### PORTER STEMMER 

In [12]:

stemmer = PorterStemmer() # Initialize Stemmer

df['text'] = df['text'].apply(lambda x: [stemmer.stem(word) for word in x])  #Stemming every word

print(df.head())

                                                text
0                                  [american, harem]
1               [guy, resign, yet, liber, hypocrisi]
2  [women, talk, crap, entir, time, final, someon...
3         [pleas, speak, sexual, assault, interview]
4  [cant, keep, turn, blind, eye, pretend, isnt, ...


In [13]:
# Remove short words and infrequent words
df['text'] = df['text'].apply(lambda x: [word for word in x if len(word) > 2])
print(df.head())

                                                text
0                                  [american, harem]
1               [guy, resign, yet, liber, hypocrisi]
2  [women, talk, crap, entir, time, final, someon...
3         [pleas, speak, sexual, assault, interview]
4  [cant, keep, turn, blind, eye, pretend, isnt, ...


In [66]:
# Create N-Grams for words that don't make sense individually
sentences = df['text'].tolist() # convert tokens back to sentences
bigram = Phrases(sentences, min_count=5, threshold=100) # define phrases
bigram_mod = Phraser(bigram) # initialize Phraser with bigram settings. 
df['text'] = df['text'].apply(lambda x: bigram_mod[x])
print(df.head())

                                                text
0                                  [american, harem]
1               [guy, resign, yet, liber, hypocrisi]
2  [women, talk, crap, entir, time, final, someon...
3         [pleas, speak, sexual, assault, interview]
4  [cant, keep, turn_blind, eye, pretend, isnt, r...


In [28]:
df.to_pickle('tokenized_tweets_stemmer.pkl')

In [14]:
df.to_excel('tokenized_tweets_stemmer.xlsx', index=False)