### Import LIbraries

In [79]:
import pandas as pd
import re
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
import emoji
import nltk
from gensim.models import Phrases
from gensim.models.phrases import Phraser

### Load Tweets for Preprocessing


In [62]:
df = pd.read_csv('metoo_tweets_dec2017.csv',usecols=[1])
print(df.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 398670 entries, 0 to 398669
Data columns (total 1 columns):
 #   Column  Non-Null Count   Dtype 
---  ------  --------------   ----- 
 0   text    395561 non-null  object
dtypes: object(1)
memory usage: 3.0+ MB
None


### Filter Tweets taht don't have #METOO tag

In [63]:
df = df.dropna(subset=['text']) # Drop NaN rows. 
df['text'] = df['text'].str.lower() # Convert to lower case
df = df[df['text'].str.contains('#metoo')] # Filter rows that have #metoo and drop others. 
print(df.head())
print(df.info())

                                                text
0    american harem.. #metoo https://t.co/hjexljdguf
1  @johnconyersjr  @alfranken  why have you guys ...
3  women have been talking about this crap the en...
4  .@bettemidler please speak to this sexual assa...
5  we can't keep turning a blind eye and pretend ...
<class 'pandas.core.frame.DataFrame'>
Index: 338554 entries, 0 to 398669
Data columns (total 1 columns):
 #   Column  Non-Null Count   Dtype 
---  ------  --------------   ----- 
 0   text    338554 non-null  object
dtypes: object(1)
memory usage: 5.2+ MB
None


#### Preprocessing

In [65]:
df['text'] = df['text'].apply(lambda x: 
    emoji.demojize( # Convert emojis to words
        re.sub(r'[^a-zA-Z\s]', '', # REMOVE punctuation and numbers
               re.sub(r"#metoo", '', # REMOVE #metoo hashtag 
                      re.sub(r"#", '', # REMOVE other hashtags but keep words
                             re.sub(r"@\w+", '', # REMOVE mentions
                                    re.sub(r'http\S+|www\S+|https\S+', '', x) # REMOVE URLs
                                   )
                            )
                     )
              )
    )
)
print(df.head())



                                                text
0                              american harem metoo 
1      why have you guys not resigned yet liberal...
3  women have been talking about this crap the en...
4   please speak to this sexual assault by  durin...
5  we cant keep turning a blind eye and pretend t...


### Further processing tweets for Topic Modeling

In [72]:
# Tokenizing the words
nltk.download('punkt_tab')
df['text'] = df['text'].apply(word_tokenize)

[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\LENOVO\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping tokenizers\punkt_tab.zip.


In [74]:
# Remove Stop words
stop_words = set(stopwords.words('english'))
df['text'] = df['text'].apply(lambda x: [word for word in x if word not in stop_words])
print(df.head())

                                                text
0                           [american, harem, metoo]
1   [guys, resigned, yet, liberal, hypocrisy, metoo]
3  [women, talking, crap, entire, time, finally, ...
4  [please, speak, sexual, assault, interview, me...
5  [cant, keep, turning, blind, eye, pretend, isn...


In [75]:
# Lemmatization to put words in its origin
lemmatizer = WordNetLemmatizer()
df['text'] = df['text'].apply(lambda x: [lemmatizer.lemmatize(word) for word in x])
print(df.head())

                                                text
0                           [american, harem, metoo]
1    [guy, resigned, yet, liberal, hypocrisy, metoo]
3  [woman, talking, crap, entire, time, finally, ...
4  [please, speak, sexual, assault, interview, me...
5  [cant, keep, turning, blind, eye, pretend, isn...


In [77]:
# Remove short words and infrequent words
df['text'] = df['text'].apply(lambda x: [word for word in x if len(word) > 2])
print(df.head())

                                                text
0                           [american, harem, metoo]
1    [guy, resigned, yet, liberal, hypocrisy, metoo]
3  [woman, talking, crap, entire, time, finally, ...
4  [please, speak, sexual, assault, interview, me...
5  [cant, keep, turning, blind, eye, pretend, isn...


In [80]:
# Create N-Grams for words that don't make sense individually
sentences = df['text'].tolist() # convert tokens back to sentences
bigram = Phrases(sentences, min_count=5, threshold=100) # define phrases
bigram_mod = Phraser(bigram) # initialize Phraser with bigram settings. 
df['text'] = df['text'].apply(lambda x: bigram_mod[x])
print(df.head())

                                                text
0                           [american, harem, metoo]
1    [guy, resigned, yet, liberal, hypocrisy, metoo]
3  [woman, talking, crap, entire, time, finally, ...
4  [please, speak, sexual, assault, interview, me...
5  [cant, keep, turning, blind_eye, pretend, isnt...


### Save the preprocessed and tokenized words

In [81]:
df.to_pickle('tokenized_tweets.pkl')