In [1]:
import pandas as pd

# NOTE(review): duplicate of the pandas import above; harmless but redundant.
import pandas as pd
import numpy as np
import re
import nltk
# Fetch the NLTK resources that the later cells rely on.
nltk.download('stopwords')  # word lists read through stopwords.words(...)
nltk.download('punkt')  # tokenizer models used by word_tokenize
nltk.download('averaged_perceptron_tagger')  # tagger model behind nltk.pos_tag
nltk.download('wordnet')  # lexical database used by WordNetLemmatizer
from nltk.corpus import stopwords
# NOTE(review): no later cell shown here reads this module-level list;
# remove_stop_words builds its own local set with the same name.
stop_words = stopwords.words('english')
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet

# notebook configurations
# Raise the column display width so long tweets are shown up to 1000 characters.
pd.options.display.max_colwidth = 1000

import warnings
# Silence every warning for the rest of the session (deprecation notices included).
warnings.filterwarnings("ignore")

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...


In [2]:
# Load the labelled tweets from the Colab sample_data folder.
raw_csv_path = "/content/sample_data/Suicide_Ideation_Dataset(Twitter-based).csv"
df = pd.read_csv(raw_csv_path)

In [3]:
# Preview the raw data; pandas abbreviates the middle rows with "...".
df

Unnamed: 0,Tweet,Suicide
0,making some lunch,Not Suicide post
1,@Alexia You want his money.,Not Suicide post
2,@dizzyhrvy that crap took me forever to put together. iâm going to go sleep for DAYS,Potential Suicide post
3,@jnaylor #kiwitweets Hey Jer! Since when did you start twittering?,Not Suicide post
4,Trying out &quot;Delicious Library 2&quot; with mixed results The bar code thought I wanted to add a sport bra instead of a drill Cool app tho!,Not Suicide post
...,...,...
1782,i have forgotten how much i love my Nokia N95-1,Not Suicide post
1783,Starting my day out with a positive attitude! To be great watch greatness!,Not Suicide post
1784,"@belledame222 Hey, it's 5 am...give a girl some credit for trying.",Not Suicide post
1785,2 drunken besties stumble into my room and we run around with sober CJ and drunk Hope knocking on doors. Good times at 3am.,Not Suicide post


In [4]:
# Show the first five rows (head() defaults to n=5).
df.head()

Unnamed: 0,Tweet,Suicide
0,making some lunch,Not Suicide post
1,@Alexia You want his money.,Not Suicide post
2,@dizzyhrvy that crap took me forever to put together. iâm going to go sleep for DAYS,Potential Suicide post
3,@jnaylor #kiwitweets Hey Jer! Since when did you start twittering?,Not Suicide post
4,Trying out &quot;Delicious Library 2&quot; with mixed results The bar code thought I wanted to add a sport bra instead of a drill Cool app tho!,Not Suicide post


In [5]:
# Print dtypes and non-null counts per column; a non-null count below the
# number of entries means the column has missing values.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1787 entries, 0 to 1786
Data columns (total 2 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   Tweet    1785 non-null   object
 1   Suicide  1787 non-null   object
dtypes: object(2)
memory usage: 28.0+ KB


## Drop empty rows

In our case the rows without a Tweet entry do not have a significant meaning, hence we may drop them.

In [6]:
# List the rows whose Tweet text is missing.
df.loc[df["Tweet"].isna()]

Unnamed: 0,Tweet,Suicide
497,,Potential Suicide post
1017,,Not Suicide post


In [7]:
# dropna() returns a new DataFrame and leaves `df` untouched, so the result must
# be assigned back. Without the assignment the rows with a missing Tweet stay in
# `df`, and the str(x) call in the punctuation step turns them into the literal
# text "nan". Only the Tweet column is checked, matching the intent of this section.
df = df.dropna(subset=["Tweet"])
df

Unnamed: 0,Tweet,Suicide
0,making some lunch,Not Suicide post
1,@Alexia You want his money.,Not Suicide post
2,@dizzyhrvy that crap took me forever to put together. iâm going to go sleep for DAYS,Potential Suicide post
3,@jnaylor #kiwitweets Hey Jer! Since when did you start twittering?,Not Suicide post
4,Trying out &quot;Delicious Library 2&quot; with mixed results The bar code thought I wanted to add a sport bra instead of a drill Cool app tho!,Not Suicide post
...,...,...
1782,i have forgotten how much i love my Nokia N95-1,Not Suicide post
1783,Starting my day out with a positive attitude! To be great watch greatness!,Not Suicide post
1784,"@belledame222 Hey, it's 5 am...give a girl some credit for trying.",Not Suicide post
1785,2 drunken besties stumble into my room and we run around with sober CJ and drunk Hope knocking on doors. Good times at 3am.,Not Suicide post


### Remove punctuation

Removing punctuation in data cleaning is important for noise reduction, text standardization, tokenization, and improving language syntax. Punctuation marks usually do not carry significant meaning on their own and can introduce unnecessary noise in the text data. By removing punctuation, the text is standardized, making it easier for the model to process and analyze. Additionally, punctuation marks are typically treated as separate tokens during tokenization, and removing them helps create cleaner and more meaningful tokens.

In [8]:
def _strip_non_alphanumeric(text):
    """Replace every character that is not an ASCII letter, digit or whitespace with a space.

    The value is passed through str() first, so a non-string value is cleaned
    as its string representation.
    """
    return re.sub(r"[^a-zA-Z0-9\s]", " ", str(text))


df["Tweet"] = df["Tweet"].apply(_strip_non_alphanumeric)
df

Unnamed: 0,Tweet,Suicide
0,making some lunch,Not Suicide post
1,Alexia You want his money,Not Suicide post
2,dizzyhrvy that crap took me forever to put together i m going to go sleep for DAYS,Potential Suicide post
3,jnaylor kiwitweets Hey Jer Since when did you start twittering,Not Suicide post
4,Trying out quot Delicious Library 2 quot with mixed results The bar code thought I wanted to add a sport bra instead of a drill Cool app tho,Not Suicide post
...,...,...
1782,i have forgotten how much i love my Nokia N95 1,Not Suicide post
1783,Starting my day out with a positive attitude To be great watch greatness,Not Suicide post
1784,belledame222 Hey it s 5 am give a girl some credit for trying,Not Suicide post
1785,2 drunken besties stumble into my room and we run around with sober CJ and drunk Hope knocking on doors Good times at 3am,Not Suicide post


### Remove words containing numbers

Removing words that contain digits while cleaning the tweets eliminates numerical information that is unlikely to be relevant for the task at hand. In tweets, such tokens are mostly user handles, model numbers, or times (for example `belledame222`, `N95`, `3am`), which rarely help to tell the two classes apart. By removing these words, we focus on descriptive and discriminative features, simplifying the text representation for more accurate predictions.


In [9]:
# Keep only the whitespace-separated words that contain no digit, then re-join them.
df["Tweet"] = df["Tweet"].apply(
    lambda text: " ".join(
        token for token in text.split() if re.search(r'\d', token) is None
    )
)

In [10]:
# Check the first five cleaned tweets.
df["Tweet"].head(5)

0                                                                                                                             making some lunch
1                                                                                                                     Alexia You want his money
2                                                            dizzyhrvy that crap took me forever to put together i m going to go sleep for DAYS
3                                                                                jnaylor kiwitweets Hey Jer Since when did you start twittering
4    Trying out quot Delicious Library quot with mixed results The bar code thought I wanted to add a sport bra instead of a drill Cool app tho
Name: Tweet, dtype: object

### Remove single-character words

Removing single-character strings during data cleaning is important as they often do not provide meaningful information and can introduce noise. By removing them, we can improve the quality of the text data and focus on more relevant words. This helps to reduce dimensionality, eliminate unnecessary noise, and improve the efficiency of subsequent text analysis tasks.

In [11]:
# Delete every word character that stands alone between two word boundaries
# (e.g. the "i" and "m" left over from "i'm"); surrounding spaces are kept.
df["Tweet"] = df["Tweet"].str.replace(pat=r'\b\w\b', repl="", regex=True)

### Lowercase

Lowercasing text during data cleaning in our project is important for standardization and consistency. It treats words with different cases as the same, reducing the vocabulary size and improving performance in subsequent tasks. Lowercasing ensures that words like `Hello` and `hello` are represented uniformly, making it easier to compare and analyze the text data.

In [12]:
# Lowercase every tweet with the vectorized string accessor.
df["Tweet"] = df["Tweet"].str.lower()

### Remove stop words

Stop words are commonly used words such as "a," "the," "is," which do not carry significant meaning and can introduce noise to the analysis. By removing stop words, we can reduce the dimensionality of the data and focus on more informative and content-rich words.

In [13]:
def remove_stop_words(input_string):
    """Return ``input_string`` with English stop words removed.

    The text is split with ``word_tokenize``; every token whose lowercase form
    is in NLTK's English stop-word list is discarded, and the surviving tokens
    are joined back together with single spaces.
    """
    pieces = word_tokenize(input_string)
    english_stops = set(stopwords.words("english"))

    kept = []
    for piece in pieces:
        if piece.lower() in english_stops:
            continue
        kept.append(piece)

    return " ".join(kept)

In [14]:
# Strip stop words from every entry of the "Tweet" column
df["Tweet"] = df["Tweet"].apply(remove_stop_words)

In [15]:
# Inspect the tweets after stop-word removal.
df["Tweet"]

0                                                                                                            making lunch
1                                                                                                       alexia want money
2                                                            dizzyhrvy crap took forever put together going go sleep days
3                                                                       jnaylor kiwitweets hey jer since start twittering
4       trying quot delicious library quot mixed results bar code thought wanted add sport bra instead drill cool app tho
                                                              ...                                                        
1782                                                                                            forgotten much love nokia
1783                                                                 starting day positive attitude great watch greatness
1784                    

### Part-of-Speech (POS) Tagging and Lemmatize the words

POS tagging identifies the grammatical category of each word, while lemmatization reduces words to their base form. By using POS tags, we accurately lemmatize words, ensuring consistent representation across different grammatical forms.

In [16]:
def get_wordnet_pos(tag):
    """Map a Penn Treebank POS tag to the WordNet POS constant for the lemmatizer.

    Parameters
    ----------
    tag : str
        Tag as produced by ``nltk.pos_tag`` (e.g. "NNS", "VBP", "JJ", "RB").

    Returns
    -------
    str
        ``wordnet.NOUN``, ``wordnet.VERB``, ``wordnet.ADJ`` or ``wordnet.ADV``.
        Any other tag falls back to ``wordnet.NOUN``.
    """
    if tag.startswith("N"):
        return wordnet.NOUN
    if tag.startswith("V"):
        return wordnet.VERB
    if tag.startswith("J"):
        # JJ / JJR / JJS are adjectives; these were previously sent to ADV by mistake.
        return wordnet.ADJ
    if tag.startswith("RB"):
        # RB / RBR / RBS are adverbs; "RB" rather than "R" keeps particles (RP) out.
        return wordnet.ADV
    return wordnet.NOUN

In [17]:
# Shared lemmatizer instance; the column-wide lemmatization cell below reuses it.
lemmatizer = WordNetLemmatizer()
sent = "kids smart watch for girls toy for kids gift for girls watches"

# Sanity check on one sentence: print each token with its tagger output and
# the lemma obtained for the mapped WordNet POS.
for word, w_tag in nltk.pos_tag(word_tokenize(sent)):
    lemma_tag = get_wordnet_pos(w_tag)
    lemma = lemmatizer.lemmatize(word, pos=lemma_tag)
    print(f"{word} ({w_tag}) --> {lemma} ({lemma_tag})")

kids (NNS) --> kid (n)
smart (VBP) --> smart (v)
watch (NN) --> watch (n)
for (IN) --> for (n)
girls (NNS) --> girl (n)
toy (NN) --> toy (n)
for (IN) --> for (n)
kids (NNS) --> kid (n)
gift (NN) --> gift (n)
for (IN) --> for (n)
girls (NNS) --> girl (n)
watches (NNS) --> watch (n)


In [18]:
def _lemmatize_text(text):
    """Tokenize and POS-tag ``text``, lemmatize each token with its mapped WordNet POS, and re-join with spaces."""
    tagged_tokens = nltk.pos_tag(word_tokenize(text))
    return " ".join(
        lemmatizer.lemmatize(token, pos=get_wordnet_pos(pos_tag))
        for token, pos_tag in tagged_tokens
    )


df["Tweet"] = df["Tweet"].apply(_lemmatize_text)

In [19]:
# Take an independent copy: a plain assignment would only create a second name
# for the same object, so any later change to `df` would silently alter
# `df_cleaned` as well (and vice versa).
df_cleaned = df.copy()

In [20]:
# Write the cleaned data next to the raw file; index=False leaves the row index out of the CSV.
cleaned_csv_path = "/content/sample_data/Suicide_Ideation_Dataset(Twitter-based)_Cleaned.csv"
df_cleaned.to_csv(cleaned_csv_path, index=False)