In [1]:
import pandas as pd
import numpy as np
import os

In [2]:
# Load and preprocess data
dataset_dir = os.path.join('..', 'Dataset')
data_path = os.path.join(dataset_dir, 'Suicide_Detection.csv')

data = pd.read_csv(data_path)
data.head()

Unnamed: 0.1,Unnamed: 0,text,class
0,2,Ex Wife Threatening SuicideRecently I left my ...,suicide
1,3,Am I weird I don't get affected by compliments...,non-suicide
2,4,Finally 2020 is almost over... So I can never ...,non-suicide
3,8,i need helpjust help me im crying so hard,suicide
4,9,"I’m so lostHello, my name is Adam (16) and I’v...",suicide


In [3]:
data['class'].replace({'suicide':1 , 'non-suicide':0} , inplace = True)
data.head()

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  data['class'].replace({'suicide':1 , 'non-suicide':0} , inplace = True)
  data['class'].replace({'suicide':1 , 'non-suicide':0} , inplace = True)


Unnamed: 0.1,Unnamed: 0,text,class
0,2,Ex Wife Threatening SuicideRecently I left my ...,1
1,3,Am I weird I don't get affected by compliments...,0
2,4,Finally 2020 is almost over... So I can never ...,0
3,8,i need helpjust help me im crying so hard,1
4,9,"I’m so lostHello, my name is Adam (16) and I’v...",1


In [4]:
texts = data['text'].values
labels = data['class'].values

In [5]:
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import emoji

# Download required NLTK data
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('omw-1.4')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Kiran\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Kiran\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\Kiran\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


True

In [6]:
# Initialize stop words and lemmatizer
stop_words = set(stopwords.words("english"))
lemmatizer = WordNetLemmatizer()

In [7]:
def process_text(text):
    # Convert emojis to text
    text = emoji.demojize(text, delimiters=(" ", " "))
    
    # Remove punctuation
    text = re.sub(r'[^\w\s]', '', text)
    
    # Tokenize and remove stop words, apply lemmatization
    tokens = text.split()
    tokens = [lemmatizer.lemmatize(word.lower()) for word in tokens if word.lower() not in stop_words]
    
    # Join tokens back into a string
    processed_text = " ".join(tokens)
    
    return processed_text

# def process_text_data(text_data):
#     # Check if text_data is a numpy array
#     if isinstance(text_data, np.ndarray):
#         processed_data = np.array([process_text(text) for text in text_data])
#     else:
#         raise TypeError("Input should be a numpy.ndarray")
#     return processed_data


In [8]:
text_data = texts.tolist()

# processed_data = process_text_data(text_data)
processed_data = [process_text(text) for text in text_data]
# for original, processed in zip(text_data[:2], processed_data[:2]):
#     print("Original:", original)
#     print("Processed:", processed)
#     print()

In [9]:
text_data[-1]

"I still haven't beaten the first boss in Hollow Knight. I've only fought it a few times and I always die really early in the fight. I'm terrible at this game y'all. :("

In [10]:
processed_data[-1]

'still havent beaten first bos hollow knight ive fought time always die really early fight im terrible game yall'

In [11]:
data['process_text'] = processed_data
data.head()

Unnamed: 0.1,Unnamed: 0,text,class,process_text
0,2,Ex Wife Threatening SuicideRecently I left my ...,1,ex wife threatening suiciderecently left wife ...
1,3,Am I weird I don't get affected by compliments...,0,weird dont get affected compliment coming some...
2,4,Finally 2020 is almost over... So I can never ...,0,finally 2020 almost never hear 2020 bad year e...
3,8,i need helpjust help me im crying so hard,1,need helpjust help im cry hard
4,9,"I’m so lostHello, my name is Adam (16) and I’v...",1,im losthello name adam 16 ive struggling year ...


In [12]:
import numpy as np
import pandas as pd
import nltk
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation
from nltk import pos_tag, word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import emoji

# Download NLTK resources
nltk.download('averaged_perceptron_tagger')
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('punkt_tab')
nltk.download('averaged_perceptron_tagger_eng')

stop_words = set(stopwords.words("english"))
lemmatizer = WordNetLemmatizer()

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\Kiran\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Kiran\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Kiran\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Kiran\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\Kiran\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger_eng to
[nltk_data]     C:\Users\Kiran\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_pe

In [13]:
# TF-IDF
tfidf_vectorizer = TfidfVectorizer(max_features=5000)
tfidf_features = tfidf_vectorizer.fit_transform(data['process_text'])
tfidf_df = pd.DataFrame(tfidf_features.toarray(), columns=tfidf_vectorizer.get_feature_names_out())

# tfidf_df

In [15]:
# # POS Tagging
# pos_tagged_data = [pos_tag(word_tokenize(text)) for text in processed_data]
# pos_tagged_df = pd.DataFrame(pos_tagged_data, columns=["Word", "POS"])

# pos_tagged_df

In [16]:
# Bag of H-grams (unigrams, bigrams, trigrams)
count_vectorizer = CountVectorizer(ngram_range=(1, 3), max_features=5000)
ngram_features = count_vectorizer.fit_transform(data['process_text'])
ngram_df = pd.DataFrame(ngram_features.toarray(), columns=count_vectorizer.get_feature_names_out())

# ngram_df

In [18]:
 # Latent Dirichlet Allocation (LDA)
lda = LatentDirichletAllocation(n_components=5, random_state=42)
lda_features = lda.fit_transform(ngram_features)
lda_df = pd.DataFrame(lda_features, columns=[f"Topic_{i+1}" for i in range(lda.n_components)])

# lda_df

In [7]:
# # Combine all features
# combined_df = pd.concat([tfidf_df, ngram_df, lda_df], axis=1)
# combined_df.head()

In [20]:
tfidf_df.head()

Unnamed: 0,000,00001010,00100000,01100001,01100100,01100101,01100111,01101000,01101001,01101100,...,youtube,youve,yr,yt,zany_face,zero,zoloft,zombie,zone,zoom
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [21]:
ngram_df.head()

Unnamed: 0,00100000,01100001,01100101,01101110,01101111,01110010,01110100,01110101,10,10 minute,...,younger,youre,youre going,youth,youtube,youve,yr,zero,zone,zoom
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [22]:
lda_df.head()

Unnamed: 0,Topic_1,Topic_2,Topic_3,Topic_4,Topic_5
0,0.003019,0.987837,0.003081,0.002987,0.003077
1,0.010623,0.010771,0.957358,0.01053,0.01072
2,0.481153,0.301032,0.188816,0.014344,0.014655
3,0.025101,0.025344,0.898917,0.02503,0.025608
4,0.000864,0.869612,0.091377,0.000856,0.037291
