In [318]:
import pandas as pd
import nltk
from nltk.corpus import stopwords
import re
import string
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
from nltk.corpus import wordnet

# Clean Data

In [319]:
# Read the file into a dataframe.
df = pd.read_csv("depression.csv")

### Drop Unnecessary Columns

##### Only columns relevant for exploring the underlying themes of the tweets and how these themes correlate with depression are kept.

In [320]:
df = df.drop(columns=['user_id', 'followers', 'friends', 'favourites', 'statuses', 'retweets'])

In [321]:
df = pd.read_csv("depression.csv")

### Handle Missing Data

##### There was no missing data in this dataset.

In [322]:
# Print the number of NaN values in each column.
print("NaN values per column before data cleaning: ")
print(df.isnull().sum())

NaN values per column before data cleaning: 
Unnamed: 0      0
post_id         0
post_created    0
post_text       0
user_id         0
followers       0
friends         0
favourites      0
statuses        0
retweets        0
label           0
dtype: int64


### Handle Duplicated Data

##### There were no duplicated rows in this dataset.

In [323]:
 print(f"Number of duplicated rows before removal: {df.duplicated().sum()}")

Number of duplicated rows before removal: 0


### Convert Data Types

#### Convert Tweet Identifiers

In [324]:
# Rename the column from 'post_id' to 'id'.
df = df.rename(columns={'post_id': 'id'})

# Convert the identifiers to strings.
df['id'] = df['id'].astype(str)


#### Convert Dates

In [325]:
# Rename the column from 'post_created' to 'date'.
df = df.rename(columns={'post_created': 'date'})

# Convert the 'date' column to datetime format.
df['date'] = pd.to_datetime(df['date'], format="%a %b %d %H:%M:%S %z %Y")

# Extract the relevant datetime components.
df['weekday'] = df['date'].dt.strftime('%A')
df['month'] = df['date'].dt.strftime('%B')
df['year'] = df['date'].dt.year
df['day'] = df['date'].dt.day
df['hour'] = df['date'].dt.hour

#### Convert Tweets

In [326]:
# Rename the column from 'post_text' to 'tweet'.
df = df.rename(columns={'post_text': 'tweet'})

# Convert the tweets to strings.
df['tweet'] = df['tweet'].astype(str)

print(df['tweet'])

0        It's just over 2 years since I was diagnosed w...
1        It's Sunday, I need a break, so I'm planning t...
2        Awake but tired. I need to sleep but my brain ...
3        RT @SewHQ: #Retro bears make perfect gifts and...
4        It’s hard to say whether packing lists are mak...
                               ...                        
19995                A day without sunshine is like night.
19996    Boren's Laws: (1) When in charge, ponder. (2) ...
19997    The flow chart is a most thoroughly oversold p...
19998    Ships are safe in harbor, but they were never ...
19999       Black holes are where God is dividing by zero.
Name: tweet, Length: 20000, dtype: object


## Clean Tweets

### Remove URLs

In [327]:
# Function to remove URLs from a tweet using regex.
def remove_urls(tweet):
    return re.sub(r'http\S+', '', tweet)

# Apply remove_urls to 'tweet' column.
df['tweet'] = df['tweet'].apply(remove_urls)

print(df['tweet'])

0        It's just over 2 years since I was diagnosed w...
1        It's Sunday, I need a break, so I'm planning t...
2        Awake but tired. I need to sleep but my brain ...
3        RT @SewHQ: #Retro bears make perfect gifts and...
4        It’s hard to say whether packing lists are mak...
                               ...                        
19995                A day without sunshine is like night.
19996    Boren's Laws: (1) When in charge, ponder. (2) ...
19997    The flow chart is a most thoroughly oversold p...
19998    Ships are safe in harbor, but they were never ...
19999       Black holes are where God is dividing by zero.
Name: tweet, Length: 20000, dtype: object


### Remove Mentions

##### Remove @username mentions.

In [328]:
# Function to remove mentions from a tweet using regex.
def remove_mentions(tweet):
    return re.sub(r'@\w+', '', tweet)

# Apply remove_mentions to 'tweet' column.
df['tweet'] = df['tweet'].apply(remove_mentions)

print(df['tweet'])

0        It's just over 2 years since I was diagnosed w...
1        It's Sunday, I need a break, so I'm planning t...
2        Awake but tired. I need to sleep but my brain ...
3        RT : #Retro bears make perfect gifts and are g...
4        It’s hard to say whether packing lists are mak...
                               ...                        
19995                A day without sunshine is like night.
19996    Boren's Laws: (1) When in charge, ponder. (2) ...
19997    The flow chart is a most thoroughly oversold p...
19998    Ships are safe in harbor, but they were never ...
19999       Black holes are where God is dividing by zero.
Name: tweet, Length: 20000, dtype: object


### Remove Punctuation

In [329]:
# Function to remove punctuation from a tweet.
def remove_punctuation(tweet):
    return tweet.translate(str.maketrans('', '', string.punctuation))

# Apply remove_punctuation to 'tweet' column.
df['tweet'] = df['tweet'].apply(remove_punctuation)

print(df['tweet'])

0        Its just over 2 years since I was diagnosed wi...
1        Its Sunday I need a break so Im planning to sp...
2        Awake but tired I need to sleep but my brain h...
3        RT  Retro bears make perfect gifts and are gre...
4        It’s hard to say whether packing lists are mak...
                               ...                        
19995                 A day without sunshine is like night
19996    Borens Laws 1 When in charge ponder 2 When in ...
19997    The flow chart is a most thoroughly oversold p...
19998    Ships are safe in harbor but they were never m...
19999        Black holes are where God is dividing by zero
Name: tweet, Length: 20000, dtype: object


### Remove Stop Words

In [330]:
# Download stopwords.
nltk.download('stopwords')

# Load the stopwords from nltk.
stop_words = set(stopwords.words('english'))

# Function to remove stop words from a tweet.
def remove_stop_words(tweet):
    return ' '.join([word for word in tweet.split() if word not in stop_words])

# Apply remove_stop_words to 'tweet' column.
df['tweet'] = df['tweet'].apply(remove_stop_words)

print(df['tweet'])

0        Its 2 years since I diagnosed anxiety depressi...
1        Its Sunday I need break Im planning spend litt...
2                     Awake tired I need sleep brain ideas
3        RT Retro bears make perfect gifts great beginn...
4        It’s hard say whether packing lists making lif...
                               ...                        
19995                    A day without sunshine like night
19996    Borens Laws 1 When charge ponder 2 When troubl...
19997    The flow chart thoroughly oversold piece progr...
19998                   Ships safe harbor never meant stay
19999                        Black holes God dividing zero
Name: tweet, Length: 20000, dtype: object


[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/melodyflavel/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


### Remove Extra Spaces

In [331]:
# Function to remove extra spaces from a tweet with regex.
def remove_extra_spaces(tweet):
    return re.sub(r'\s+', ' ', tweet).strip()

# Apply remove_extra_spaces to 'tweet' column.
df['tweet'] = df['tweet'].apply(remove_extra_spaces)

print(df['tweet'])

0        Its 2 years since I diagnosed anxiety depressi...
1        Its Sunday I need break Im planning spend litt...
2                     Awake tired I need sleep brain ideas
3        RT Retro bears make perfect gifts great beginn...
4        It’s hard say whether packing lists making lif...
                               ...                        
19995                    A day without sunshine like night
19996    Borens Laws 1 When charge ponder 2 When troubl...
19997    The flow chart thoroughly oversold piece progr...
19998                   Ships safe harbor never meant stay
19999                        Black holes God dividing zero
Name: tweet, Length: 20000, dtype: object


### Convert Text to Lowercase

In [332]:
# Convert the 'tweets' column to lowercase.
df['tweet'] = df['tweet'].str.lower()

print(df['tweet'])

0        its 2 years since i diagnosed anxiety depressi...
1        its sunday i need break im planning spend litt...
2                     awake tired i need sleep brain ideas
3        rt retro bears make perfect gifts great beginn...
4        it’s hard say whether packing lists making lif...
                               ...                        
19995                    a day without sunshine like night
19996    borens laws 1 when charge ponder 2 when troubl...
19997    the flow chart thoroughly oversold piece progr...
19998                   ships safe harbor never meant stay
19999                        black holes god dividing zero
Name: tweet, Length: 20000, dtype: object


### Lemmatise Words

In [333]:
nltk.download('punkt_tab')
nltk.download('wordnet')
nltk.download('averaged_perceptron_tagger_eng')

[nltk_data] Downloading package punkt_tab to
[nltk_data]     /Users/melodyflavel/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/melodyflavel/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger_eng to
[nltk_data]     /Users/melodyflavel/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger_eng is already up-to-
[nltk_data]       date!


True

In [334]:
# Initialise lemmatiser.
lemmatiser = WordNetLemmatizer()

#### Lemmatise Nouns

In [335]:
# Function for lemmatising nouns in a tweet.
def lemmatise_nouns(tweet):
    # Tokenise the tweet into individual words.
    words = word_tokenize(tweet)
    # Lemmatise each noun in the tweet.
    lemmatised_words = [lemmatiser.lemmatize(word) for word in words]
    # Join the lemmatised words back into a string.
    return " ".join(lemmatised_words)

# Apply lemmatise_nouns to 'tweet' column.
df['tweet'] = df['tweet'].apply(lemmatise_nouns)

print(df['tweet'])

0        it 2 year since i diagnosed anxiety depression...
1        it sunday i need break im planning spend littl...
2                      awake tired i need sleep brain idea
3        rt retro bear make perfect gift great beginner...
4        it ’ s hard say whether packing list making li...
                               ...                        
19995                    a day without sunshine like night
19996    borens law 1 when charge ponder 2 when trouble...
19997    the flow chart thoroughly oversold piece progr...
19998                    ship safe harbor never meant stay
19999                         black hole god dividing zero
Name: tweet, Length: 20000, dtype: object


#### Lemmatise Verbs

The WordNetLemmatizer assumes all words are nouns. To get more accurate lemmatisation, such that verbs and adjectives are also lemmatised, part-of-speech (POS) tags can be used to specify the correct word category, allowing the lemmatiser to reduce words to their base forms based on their actual usage in the sentence. For example, 'running' is lemmatised to 'run', and 'better' to 'good'.

In [None]:
# Function for lemmatising words in a tweet with POS.
def lemmatise_words_with_pos(tweet):
    # Tokenise the tweet into individual words.
    words = word_tokenize(tweet)
    # Get POS tags for each word.
    pos_tags = nltk.pos_tag(words)
    
    # Map POS tags to WordNet POS tags.
    def get_wordnet_pos(tag):
        if tag.startswith('V'):
            return wordnet.VERB
        elif tag.startswith('N'):
            return wordnet.NOUN
        elif tag.startswith('R'):
            return wordnet.ADV
        else:
            return wordnet.NOUN
    # Lemmatise based on POS tags.
    lemmatised_words = [lemmatiser.lemmatize(word, get_wordnet_pos(tag)) for word, tag in pos_tags]

    # Join the lemmatised words back into a string.
    return " ".join(lemmatised_words)

# Apply lemmatise_words_with_pos to 'tweet' column.
df['tweet'] = df['tweet'].apply(lemmatise_words_with_pos)

print(df['tweet'])

In [None]:
print(f'pandas version: {pd.__version__}')
print(f'nltk version: {nltk.__version__}')
#print(f'stopwords version: {stopwords.__name__}')  # Stopwords is part of nltk, version is same as nltk
print(f're version: {re.__version__}')  # 're' doesn't have a version number, so skip this
print(f'string version: {string.__name__}')  # 'string' doesn't have a version number, so skip this
print(f'WordNetLemmatizer version: {nltk.stem.WordNetLemmatizer.__module__}')  # No version for lemmatizer specifically, same as nltk
#print(f'word_tokenize version: {nltk.tokenize.__version__}')
#print(f'wordnet version: {wordnet.__version__}')

In [None]:
# Save the cleaned data to depression_cleaned.csv.
df.to_csv('depression_cleaned.csv', index=False)