In [24]:
import pandas as pd
import nltk
from nltk.corpus import stopwords
import re
import string
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
from nltk.corpus import wordnet

# Clean Data

In [25]:
# Read the file into a dataframe.
df = pd.read_csv("depression.csv", index_col=0)

df.head(5)

Unnamed: 0,post_id,post_created,post_text,user_id,followers,friends,favourites,statuses,retweets,label
0,637894677824413696,Sun Aug 30 07:48:37 +0000 2015,It's just over 2 years since I was diagnosed w...,1013187241,84,211,251,837,0,1
1,637890384576778240,Sun Aug 30 07:31:33 +0000 2015,"It's Sunday, I need a break, so I'm planning t...",1013187241,84,211,251,837,1,1
2,637749345908051968,Sat Aug 29 22:11:07 +0000 2015,Awake but tired. I need to sleep but my brain ...,1013187241,84,211,251,837,0,1
3,637696421077123073,Sat Aug 29 18:40:49 +0000 2015,RT @SewHQ: #Retro bears make perfect gifts and...,1013187241,84,211,251,837,2,1
4,637696327485366272,Sat Aug 29 18:40:26 +0000 2015,It’s hard to say whether packing lists are mak...,1013187241,84,211,251,837,1,1


## Drop Unnecessary Columns

##### Only columns relevant for exploring the underlying themes of the tweets and how these themes correlate with depression are kept.

In [26]:
# Drop unnecessaru columns.
df = df.drop(columns=['user_id', 'followers', 'friends', 'favourites', 'statuses', 'retweets'])
# Remove the index column.
df.reset_index(drop=True, inplace=True)

df.head(5)

Unnamed: 0,post_id,post_created,post_text,label
0,637894677824413696,Sun Aug 30 07:48:37 +0000 2015,It's just over 2 years since I was diagnosed w...,1
1,637890384576778240,Sun Aug 30 07:31:33 +0000 2015,"It's Sunday, I need a break, so I'm planning t...",1
2,637749345908051968,Sat Aug 29 22:11:07 +0000 2015,Awake but tired. I need to sleep but my brain ...,1
3,637696421077123073,Sat Aug 29 18:40:49 +0000 2015,RT @SewHQ: #Retro bears make perfect gifts and...,1
4,637696327485366272,Sat Aug 29 18:40:26 +0000 2015,It’s hard to say whether packing lists are mak...,1


## Handle Duplicated Data

In [27]:
 print(f"Number of duplicated rows before removal: {df.duplicated().sum()}")

Number of duplicated rows before removal: 119


In [28]:
# Remove the duplicated rows.
df.duplicated() 
df = df.drop_duplicates()

In [29]:
# Confirm that the duplicated rows have been removed.
print(f"Number of duplicated rows after removal: {df.duplicated().sum()}")

Number of duplicated rows after removal: 0


## Convert Data Types

### Convert Tweet Identifiers

In [30]:
# Rename the column from 'post_id' to 'id'.
df = df.rename(columns={'post_id': 'id'})

# Convert the identifiers to strings.
df['id'] = df['id'].astype(str)


### Convert Dates

In [31]:
# Rename the column from 'post_created' to 'date'.
df = df.rename(columns={'post_created': 'date'})

# Convert the 'date' column to datetime format.
df['date'] = pd.to_datetime(df['date'], format="%a %b %d %H:%M:%S %z %Y")

# Extract the relevant datetime components.
df['weekday'] = df['date'].dt.strftime('%A')
df['month'] = df['date'].dt.strftime('%B')
df['year'] = df['date'].dt.year
df['day'] = df['date'].dt.day
df['hour'] = df['date'].dt.hour

### Convert Tweets

In [32]:
# Rename the column from 'post_text' to 'tweet'.
df = df.rename(columns={'post_text': 'tweet'})

# Convert the tweets to strings.
df['tweet'] = df['tweet'].astype(str)

print(df['tweet'])

0        It's just over 2 years since I was diagnosed w...
1        It's Sunday, I need a break, so I'm planning t...
2        Awake but tired. I need to sleep but my brain ...
3        RT @SewHQ: #Retro bears make perfect gifts and...
4        It’s hard to say whether packing lists are mak...
                               ...                        
19995                A day without sunshine is like night.
19996    Boren's Laws: (1) When in charge, ponder. (2) ...
19997    The flow chart is a most thoroughly oversold p...
19998    Ships are safe in harbor, but they were never ...
19999       Black holes are where God is dividing by zero.
Name: tweet, Length: 19881, dtype: object


## Clean Tweets

### Remove URLs

In [33]:
# Function to remove URLs from a tweet using regex.
def remove_urls(tweet):
    return re.sub(r'http\S+', '', tweet)

# Apply remove_urls to 'tweet' column.
df['tweet'] = df['tweet'].apply(remove_urls)

print(df['tweet'])

0        It's just over 2 years since I was diagnosed w...
1        It's Sunday, I need a break, so I'm planning t...
2        Awake but tired. I need to sleep but my brain ...
3        RT @SewHQ: #Retro bears make perfect gifts and...
4        It’s hard to say whether packing lists are mak...
                               ...                        
19995                A day without sunshine is like night.
19996    Boren's Laws: (1) When in charge, ponder. (2) ...
19997    The flow chart is a most thoroughly oversold p...
19998    Ships are safe in harbor, but they were never ...
19999       Black holes are where God is dividing by zero.
Name: tweet, Length: 19881, dtype: object


### Remove Mentions

##### Remove @username mentions.

In [34]:
# Function to remove mentions from a tweet using regex.
def remove_mentions(tweet):
    return re.sub(r'@\w+', '', tweet)

# Apply remove_mentions to 'tweet' column.
df['tweet'] = df['tweet'].apply(remove_mentions)

print(df['tweet'])

0        It's just over 2 years since I was diagnosed w...
1        It's Sunday, I need a break, so I'm planning t...
2        Awake but tired. I need to sleep but my brain ...
3        RT : #Retro bears make perfect gifts and are g...
4        It’s hard to say whether packing lists are mak...
                               ...                        
19995                A day without sunshine is like night.
19996    Boren's Laws: (1) When in charge, ponder. (2) ...
19997    The flow chart is a most thoroughly oversold p...
19998    Ships are safe in harbor, but they were never ...
19999       Black holes are where God is dividing by zero.
Name: tweet, Length: 19881, dtype: object


### Remove Punctuation

In [35]:
# Function to remove punctuation from a tweet.
def remove_punctuation(tweet):
    return tweet.translate(str.maketrans('', '', string.punctuation))

# Apply remove_punctuation to 'tweet' column.
df['tweet'] = df['tweet'].apply(remove_punctuation)

print(df['tweet'])

0        Its just over 2 years since I was diagnosed wi...
1        Its Sunday I need a break so Im planning to sp...
2        Awake but tired I need to sleep but my brain h...
3        RT  Retro bears make perfect gifts and are gre...
4        It’s hard to say whether packing lists are mak...
                               ...                        
19995                 A day without sunshine is like night
19996    Borens Laws 1 When in charge ponder 2 When in ...
19997    The flow chart is a most thoroughly oversold p...
19998    Ships are safe in harbor but they were never m...
19999        Black holes are where God is dividing by zero
Name: tweet, Length: 19881, dtype: object


### Remove Extra Spaces

In [36]:
# Function to remove extra spaces from a tweet with regex.
def remove_extra_spaces(tweet):
    return re.sub(r'\s+', ' ', tweet).strip()

# Apply remove_extra_spaces to 'tweet' column.
df['tweet'] = df['tweet'].apply(remove_extra_spaces)

print(df['tweet'])

0        Its just over 2 years since I was diagnosed wi...
1        Its Sunday I need a break so Im planning to sp...
2        Awake but tired I need to sleep but my brain h...
3        RT Retro bears make perfect gifts and are grea...
4        It’s hard to say whether packing lists are mak...
                               ...                        
19995                 A day without sunshine is like night
19996    Borens Laws 1 When in charge ponder 2 When in ...
19997    The flow chart is a most thoroughly oversold p...
19998    Ships are safe in harbor but they were never m...
19999        Black holes are where God is dividing by zero
Name: tweet, Length: 19881, dtype: object


### Convert Text to Lowercase

In [37]:
# Convert the 'tweets' column to lowercase.
df['tweet'] = df['tweet'].str.lower()

print(df['tweet'])

0        its just over 2 years since i was diagnosed wi...
1        its sunday i need a break so im planning to sp...
2        awake but tired i need to sleep but my brain h...
3        rt retro bears make perfect gifts and are grea...
4        it’s hard to say whether packing lists are mak...
                               ...                        
19995                 a day without sunshine is like night
19996    borens laws 1 when in charge ponder 2 when in ...
19997    the flow chart is a most thoroughly oversold p...
19998    ships are safe in harbor but they were never m...
19999        black holes are where god is dividing by zero
Name: tweet, Length: 19881, dtype: object


### Lemmatise Words

In [38]:
nltk.download('punkt_tab')
nltk.download('wordnet')
nltk.download('averaged_perceptron_tagger_eng')

[nltk_data] Downloading package punkt_tab to
[nltk_data]     /Users/melodyflavel/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/melodyflavel/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger_eng to
[nltk_data]     /Users/melodyflavel/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger_eng is already up-to-
[nltk_data]       date!


True

In [39]:
# Initialise lemmatiser.
lemmatiser = WordNetLemmatizer()

#### Lemmatise Nouns

In [40]:
# Function for lemmatising nouns in a tweet.
def lemmatise_nouns(tweet):
    # Tokenise the tweet into individual words.
    words = word_tokenize(tweet)
    # Lemmatise each noun in the tweet.
    lemmatised_words = [lemmatiser.lemmatize(word) for word in words]
    # Join the lemmatised words back into a string.
    return " ".join(lemmatised_words)

# Apply lemmatise_nouns to 'tweet' column.
df['tweet'] = df['tweet'].apply(lemmatise_nouns)

print(df['tweet'])

0        it just over 2 year since i wa diagnosed with ...
1        it sunday i need a break so im planning to spe...
2        awake but tired i need to sleep but my brain h...
3        rt retro bear make perfect gift and are great ...
4        it ’ s hard to say whether packing list are ma...
                               ...                        
19995                 a day without sunshine is like night
19996    borens law 1 when in charge ponder 2 when in t...
19997    the flow chart is a most thoroughly oversold p...
19998    ship are safe in harbor but they were never me...
19999         black hole are where god is dividing by zero
Name: tweet, Length: 19881, dtype: object


#### Lemmatise Verbs

The WordNetLemmatizer assumes all words are nouns. To get more accurate lemmatisation, such that verbs and adjectives are also lemmatised, part-of-speech (POS) tags can be used to specify the correct word category, allowing the lemmatiser to reduce words to their base forms based on their actual usage in the sentence. For example, 'running' is lemmatised to 'run', and 'better' to 'good'.

In [41]:
# Function for lemmatising words in a tweet with POS.
def lemmatise_words_with_pos(tweet):
    # Tokenise the tweet into individual words.
    words = word_tokenize(tweet)
    # Get POS tags for each word.
    pos_tags = nltk.pos_tag(words)
    
    # Map POS tags to WordNet POS tags.
    def get_wordnet_pos(tag):
        if tag.startswith('V'):
            return wordnet.VERB
        elif tag.startswith('N'):
            return wordnet.NOUN
        elif tag.startswith('R'):
            return wordnet.ADV
        else:
            return wordnet.NOUN
    # Lemmatise based on POS tags.
    lemmatised_words = [lemmatiser.lemmatize(word, get_wordnet_pos(tag)) for word, tag in pos_tags]

    # Join the lemmatised words back into a string.
    return " ".join(lemmatised_words)

# Apply lemmatise_words_with_pos to 'tweet' column.
df['tweet'] = df['tweet'].apply(lemmatise_words_with_pos)

print(df['tweet'])

0        it just over 2 year since i wa diagnose with a...
1        it sunday i need a break so im planning to spe...
2        awake but tire i need to sleep but my brain ha...
3        rt retro bear make perfect gift and be great f...
4        it ’ s hard to say whether pack list be make l...
                               ...                        
19995                 a day without sunshine be like night
19996    borens law 1 when in charge ponder 2 when in t...
19997    the flow chart be a most thoroughly oversold p...
19998    ship be safe in harbor but they be never mean ...
19999            black hole be where god be divide by zero
Name: tweet, Length: 19881, dtype: object


### Remove Stop Words

In [42]:
# Download stopwords.
nltk.download('stopwords')

# Load the stopwords from nltk.
stop_words = set(stopwords.words('english'))

# Add additional stopwords.
additional_stopwords = {
    
    # Common verbs.
    'go', 'get', 'wait', 'may', 'find', 'say', 'thank', 'see', 'could', 'would', 'like', 'look', 'know', 'try', 'make', 'talk',
    'stop', 'let', 'watch', 'keep', 'happen', 'take', 'wear', 'call', 'come', 'tell', 'stay', 'move', 'give', 'check',
    
    
    # Common nouns.
    'week', 'way', 'time', 'morning', 'day', 'year', 'today', 'anything', 'something', 'end', 'thing', 'mind', 'one', 'part', 'name',

    # Common pronouns.
    'he', 'she',

    # Common adjectives and adverbs.
    'much', 'still', 'new', 'back', 'real', 'anytime', 'even', 'right', 'also', 'cool', 'already', 'ever', 'sure', 'long', 'pretty', 
    'since', 'old', 'first', 'really', 'around', 'two',

    # Common interjections.
    'thanks', 'hey', 'hello', 'wow', 'oh', 'ok', 'well', 'yeah', 'actually', 'maybe', 'yes', 'no', 'please', 'haha',

    # Contractions and shorthands.
    'cant', 'via', 'thats', 'youre', 'dont', 'theyre', 'shes', 'doesnt', 'didnt', 'whats',
    
    # Twitter-specific words.
    'rt', 'follow', 'twitter', 'tweet',
    
    # Undefined words.
    'amp', 'lt3', 'yong', 'na', 'gon', 'wan', 'wearepayting', 'foryong', 'bestmusicvideo', 'pillowtalk', 'joe', 'gt', 'lol', 'omg', 
    'paytforluckysun', 'michael'
}
stop_words.update(additional_stopwords)

# Function to remove stop words from a tweet.
def remove_stop_words(tweet):
    # Tokenise the text.
    tokens = word_tokenize(tweet)
    
    # Remove stopwords.
    tokens = [word for word in tokens if word not in stop_words and len(word) > 2]
    
    # Return the cleaned text.
    return ' '.join(tokens)
    #return ' '.join([word for word in tweet.split() if word not in stop_words])

# Apply remove_stop_words to 'tweet' column.
df['tweet'] = df['tweet'].apply(remove_stop_words)

print(df['tweet'])

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/melodyflavel/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


0        diagnose anxiety depression moment reflect far...
1        sunday need break planning spend little possib...
2                         awake tire need sleep brain idea
3        retro bear perfect gift great beginner stitch ...
4        hard whether pack list life easier reinforce n...
                               ...                        
19995                               without sunshine night
19996    borens law charge ponder trouble delegate doub...
19997    flow chart thoroughly oversold piece program d...
19998                          ship safe harbor never mean
19999                           black hole god divide zero
Name: tweet, Length: 19881, dtype: object


## Handle Missing Data

##### There was no missing data in this dataset at this point in the processing.

In [43]:
# Print the number of NaN values in each column.
print("NaN values per column before data cleaning: ")
print(df.isnull().sum())

NaN values per column before data cleaning: 
id         0
date       0
tweet      0
label      0
weekday    0
month      0
year       0
day        0
hour       0
dtype: int64


In [44]:
# Save the cleaned data to depression_cleaned.csv.
df.to_csv('depression_cleaned.csv', index=False)