In [1]:
import pandas as pd
import numpy as np
from nltk.corpus import stopwords
from nltk.tokenize import TweetTokenizer
import string

### Loading Data for Preprocessing

In [8]:
#Read File
df=pd.read_excel('tweets/tweets.xlsx', index_col=0)
df = pd.DataFrame(df.copy(), columns = ['id', 'text','label'])
df.head()

Unnamed: 0,id,text,label
0,1462903753649053703,Why should the #EMU only be for epilepsy? We r...,1
1,1462903869529284608,And yet I listen to a youth orchestra and they...,1
2,1462903886000361483,"Your #thoughts create ur reality, including ur...",1
3,1462903890639302660,5 Signs of toxic overthinking 🚩\nDid you know ...,1
4,1462903928174088192,#MentalHealthMonday reminder - if you are expe...,1


### Pre-proccesing Tweets

Pre-processing steps we do:
1. Remove any duplicated tweets by looking at their tweet_ids
2. Tokenize using the NLTK TweetTokenizer
3. Remove punctuation
4. Remove stopwords
4. Remove the 'RT' symbol
5. Remove URLs
6. Remove the hashtag symbol. Keep the word of the hashtag though

In [9]:
def deduplicate_and_tokenize(df):
    
    #removing any duplicate tweets. Looking at the tweet_id
    df = df.drop_duplicates(subset='id', keep='first')
    
    #tokenizing, converting everything to lowercase, and reducing length of repeated chars ('Baaaaad' becomes 'Baaad')
    tokenizer = TweetTokenizer(preserve_case=False, reduce_len=True)
    df['tokenized_text'] = df.apply(lambda row: tokenizer.tokenize(row['text']), axis=1)
    return df

In [10]:
def preprocess(tokens, ignore_urls=True, ignore_rt_char=True, ignore_hashtags=True,
              ignore_mentions=True, ignore_numbers=True,remove_stopwords=True, ignore_punctuation=True, min_tweet_length=1):
    
    cleaned_tokens = []
    num_of_hashtags = 0
    punctuations = list(string.punctuation)
    sword = stopwords.words('english')
    
    for token in tokens:  
        if remove_stopwords and token in sword: # ignore stopword
                 continue
        if ignore_urls and ( token.startswith('https:') or token.startswith('http:') ): #ignoring URLs
            continue
        if ignore_rt_char and token == 'rt': # ignore 'retweet' symbol
             continue
        if ignore_hashtags and token.startswith('#'): # ignore hashtags
            token=token.replace('#','')
            num_of_hashtags+=1
        if ignore_mentions and token.startswith('@'): # ignore mentions
             continue
        if ignore_numbers and token.isnumeric(): #ignore numbers
            continue
        if ignore_punctuation and token in punctuations:
            continue
        
        cleaned_tokens.append(token)
        
    #tweets should a minimum length
    #tweets should also have some words that are not hashtags
    if (len(cleaned_tokens) < min_tweet_length) or ((len(cleaned_tokens) - num_of_hashtags) < min_tweet_length):
        return np.nan
    else:
        return cleaned_tokens

In [11]:
tweets = deduplicate_and_tokenize(df)
processed_text = pd.Series([preprocess(tokens) for tokens in tweets.tokenized_text])
tweets['processed_text'] = processed_text

In [12]:
tweets.head()

Unnamed: 0,id,text,label,tokenized_text,processed_text
0,1462903753649053703,Why should the #EMU only be for epilepsy? We r...,1,"[why, should, the, #emu, only, be, for, epilep...","[emu, epilepsy, report, complete, remission, d..."
1,1462903869529284608,And yet I listen to a youth orchestra and they...,1,"[and, yet, i, listen, to, a, youth, orchestra,...","[yet, listen, youth, orchestra, shyte, beat, t..."
2,1462903886000361483,"Your #thoughts create ur reality, including ur...",1,"[your, #thoughts, create, ur, reality, ,, incl...","[thoughts, create, ur, reality, including, ur,..."
3,1462903890639302660,5 Signs of toxic overthinking 🚩\nDid you know ...,1,"[5, signs, of, toxic, overthinking, 🚩, did, yo...","[signs, toxic, overthinking, 🚩, know, toxic, o..."
4,1462903928174088192,#MentalHealthMonday reminder - if you are expe...,1,"[#mentalhealthmonday, reminder, -, if, you, ar...","[mentalhealthmonday, reminder, experiencing, a..."


In [13]:
#Removing empty strings if any
print(tweets.shape)
tweets = tweets.dropna()
print(tweets.shape)

(18800, 5)
(15566, 5)


### Saving pre-processed tweets to CSV

Question for the group - in what format should we save the emojis?
Creating a CSV from Pandas replaces the emoji with some unicode characters, but creating an Excel file keeps them.

In [26]:
tweets.to_csv('tweets/tweets_preprocessed.csv', index=None, header=True)

### Reading the pre-proccesed tweets

In [16]:
import pandas as pd
df = pd.read_csv('tweets/tweets_preprocessed.csv')
df.head()

Unnamed: 0,id,text,label,tokenized_text,processed_text
0,1462903753649053703,Why should the #EMU only be for epilepsy? We r...,1,"['why', 'should', 'the', '#emu', 'only', 'be',...","['emu', 'epilepsy', 'report', 'complete', 'rem..."
1,1462903869529284608,And yet I listen to a youth orchestra and they...,1,"['and', 'yet', 'i', 'listen', 'to', 'a', 'yout...","['yet', 'listen', 'youth', 'orchestra', 'shyte..."
2,1462903886000361483,"Your #thoughts create ur reality, including ur...",1,"['your', '#thoughts', 'create', 'ur', 'reality...","['thoughts', 'create', 'ur', 'reality', 'inclu..."
3,1462903890639302660,5 Signs of toxic overthinking 🚩\nDid you know ...,1,"['5', 'signs', 'of', 'toxic', 'overthinking', ...","['signs', 'toxic', 'overthinking', '🚩', 'know'..."
4,1462903928174088192,#MentalHealthMonday reminder - if you are expe...,1,"['#mentalhealthmonday', 'reminder', '-', 'if',...","['mentalhealthmonday', 'reminder', 'experienci..."


In [17]:
from ast import literal_eval
Sentences=[]
for i in df.index:
    a= literal_eval(df['processed_text'][i])
    Sentences.append(a)


In [18]:
Sentences

[['emu',
  'epilepsy',
  'report',
  'complete',
  'remission',
  'depression',
  '😥',
  'patient',
  'using',
  'novel',
  'paradigm',
  'advancing',
  'dbs',
  'brain',
  '🧠',
  'stimulation',
  'using',
  'personalized',
  'mapping',
  'brain',
  'networks',
  '...',
  'thread',
  '1/5'],
 ['yet',
  'listen',
  'youth',
  'orchestra',
  'shyte',
  'beat',
  'tune',
  'yet',
  '...',
  'yeah',
  'morose',
  'filled',
  'bitter',
  'melancholy',
  'much',
  'wrong',
  'world',
  'done',
  'nothing',
  'improve',
  'shame',
  'bipolar',
  'depression'],
 ['thoughts',
  'create',
  'ur',
  'reality',
  'including',
  'ur',
  'emotions',
  'mentalhealth',
  'want',
  'change',
  'way',
  'feel',
  'change',
  'think',
  'quickest',
  'way',
  'change',
  'feel',
  'change',
  'think',
  'depression',
  'anxiety',
  'brain',
  'monday'],
 ['signs',
  'toxic',
  'overthinking',
  '🚩',
  'know',
  'toxic',
  'overthinking',
  'may',
  'symptom',
  'depression',
  'anxiety',
  'flip',
  'sid