In [3]:
import pandas as pd
import re
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.cluster import KMeans
from collections import Counter

# Download the NLTK resources used below (cached locally after the first run).
# Both the legacy and the current resource names are requested:
#   - 'punkt' / 'averaged_perceptron_tagger' are what older NLTK releases load;
#   - 'punkt_tab' / 'averaged_perceptron_tagger_eng' are what NLTK 3.9+ looks up
#     from word_tokenize() and pos_tag(). Without them a fresh kernel on a
#     current NLTK install fails with LookupError.
# nltk.download() reports a missing or failed package on stdout instead of
# raising, so requesting a name the installed index does not know is harmless.
for resource in (
    'punkt',
    'punkt_tab',
    'averaged_perceptron_tagger',
    'averaged_perceptron_tagger_eng',
    'stopwords',
):
    nltk.download(resource)

# Complete dataset: 96 hand-written (comment, label) pairs.
# Entries strictly alternate between the two labels, "Normal" and "Depressed",
# so the classes are balanced (48 each).
data = [
    ("I've been feeling quite good lately, enjoying my hobbies and spending time with friends.", "Normal"),
    ("I don't feel like doing anything anymore. Everything seems pointless.", "Depressed"),
    ("I'm feeling pretty stable these days, not much bothers me.", "Normal"),
    ("Waking up in the morning is a struggle. I feel drained and empty.", "Depressed"),
    ("I find pleasure in small things and feel content with my life.", "Normal"),
    ("Nothing interests me anymore, and I can't shake this feeling of sadness.", "Depressed"),
    ("I've been productive and focused, achieving my goals without much stress.", "Normal"),
    ("Every day feels the same, and I just want to disappear.", "Depressed"),
    ("I feel connected with my loved ones and enjoy our time together.", "Normal"),
    ("It's hard to even get out of bed; everything feels heavy and overwhelming.", "Depressed"),
    ("My mood has been stable, and I'm coping well with daily challenges.", "Normal"),
    ("I don't see the point in talking to anyone. I feel so isolated and alone.", "Depressed"),
    ("I've been laughing more and feeling optimistic about the future.", "Normal"),
    ("Nothing makes me happy anymore; I feel numb.", "Depressed"),
    ("I'm at peace with where I am in life and look forward to what's next.", "Normal"),
    ("I cry for no reason, and the sadness won't go away.", "Depressed"),
    ("I feel energized and motivated to take on new projects.", "Normal"),
    ("It's like a dark cloud is hanging over me, and I can't escape it.", "Depressed"),
    ("I've been feeling emotionally balanced and content.", "Normal"),
    ("I don't care about anything anymore. Life just feels so bleak.", "Depressed"),
    ("I'm happy with my life and the progress I'm making.", "Normal"),
    ("The sadness never leaves, and I feel hopeless all the time.", "Depressed"),
    ("I've been enjoying my work and feeling satisfied with my achievements.", "Normal"),
    ("I feel like I'm drowning in despair and can't find a way out.", "Depressed"),
    ("I'm feeling well and enjoying time with my family.", "Normal"),
    ("I don't see a future for myself; everything seems so dark.", "Depressed"),
    ("I'm able to relax and enjoy my free time.", "Normal"),
    ("It's hard to even smile; I feel so down and out.", "Depressed"),
    ("I've been feeling calm and at ease with my life.", "Normal"),
    ("The weight of my thoughts is crushing me, and I feel so low.", "Depressed"),
    ("I'm looking forward to the weekend and spending time outdoors.", "Normal"),
    ("I can't find joy in anything; I feel so empty inside.", "Depressed"),
    ("I've been feeling good about my relationships and personal growth.", "Normal"),
    ("The sadness is overwhelming, and I can't see a way through it.", "Depressed"),
    ("I feel relaxed and content with how things are going.", "Normal"),
    ("I can't stop thinking about how worthless I feel.", "Depressed"),
    ("I've been in a positive mood and enjoying life's little moments.", "Normal"),
    ("I feel so detached from everything, like I'm just going through the motions.", "Depressed"),
    ("I've been handling stress well and staying optimistic.", "Normal"),
    ("It's like a constant fog of sadness that won't lift.", "Depressed"),
    ("I'm enjoying my hobbies and finding fulfillment in my daily routine.", "Normal"),
    ("I feel so lost and hopeless, like there's no point in anything.", "Depressed"),
    ("I've been feeling happy and content with my progress in life.", "Normal"),
    ("I don't want to be around people; I just want to be alone in my sadness.", "Depressed"),
    ("I'm in a good place emotionally, feeling positive and hopeful.", "Normal"),
    ("Everything feels meaningless, and I can't escape the sadness.", "Depressed"),
    ("I've been feeling good about myself and my future.", "Normal"),
    ("I can't find the strength to face the day; everything feels so dark.", "Depressed"),
    ("I'm at peace with my life and enjoying my time with loved ones.", "Normal"),
    ("I feel like I'm just existing, not really living.", "Depressed"),
    ("I've been feeling upbeat and looking forward to new experiences.", "Normal"),
    ("The sadness never leaves, and I feel like I'm stuck in a dark hole.", "Depressed"),
    ("I'm enjoying my life and feel grateful for what I have.", "Normal"),
    ("I can't seem to escape these negative thoughts; they consume me.", "Depressed"),
    ("I'm feeling positive about my future and the direction I'm heading.", "Normal"),
    ("It's hard to get through the day without feeling completely overwhelmed.", "Depressed"),
    ("I'm feeling good about where I am and the progress I'm making.", "Normal"),
    ("I feel so disconnected from the world around me.", "Depressed"),
    ("I've been happy and content with my work-life balance.", "Normal"),
    ("Nothing brings me joy anymore, and I feel so empty.", "Depressed"),
    ("I'm satisfied with how things are going in my life.", "Normal"),
    ("The sadness is relentless, and I can't seem to shake it off.", "Depressed"),
    ("I've been feeling good about my health and overall well-being.", "Normal"),
    ("I feel like I'm trapped in my own mind, and I can't get out.", "Depressed"),
    ("I've been content with my life and enjoying my daily routine.", "Normal"),
    ("Everything feels so heavy, like I'm carrying the weight of the world.", "Depressed"),
    ("I'm in a good mood and feeling positive about the future.", "Normal"),
    ("The sadness is overwhelming, and I feel so alone.", "Depressed"),
    ("I've been feeling emotionally stable and content with my life.", "Normal"),
    ("I feel like I'm drowning in a sea of sadness, and there's no way out.", "Depressed"),
    ("I've been feeling good and enjoying the little things in life.", "Normal"),
    ("I feel so hopeless and can't find a way to move forward.", "Depressed"),
    ("I'm feeling positive about the direction my life is taking.", "Normal"),
    ("The sadness is always there, lurking in the background.", "Depressed"),
    ("I've been feeling well and content with where I am in life.", "Normal"),
    ("I feel like I'm just going through the motions, not really living.", "Depressed"),
    ("I'm happy with how things are going in my life and feel at peace.", "Normal"),
    ("The sadness is overwhelming, and I can't escape it.", "Depressed"),
    ("I've been feeling optimistic about the future and excited about new opportunities.", "Normal"),
    ("I feel so empty inside, like nothing matters anymore.", "Depressed"),
    ("I'm satisfied with how things are going and feeling positive about the future.", "Normal"),
    ("The sadness is suffocating, and I can't breathe.", "Depressed"),
    ("I've been feeling happy and content with my relationships and personal growth.", "Normal"),
    ("I feel like I'm stuck in a dark place, and I can't find a way out.", "Depressed"),
    ("I've been feeling good about my life and the direction I'm heading.", "Normal"),
    ("The sadness is overwhelming, and I can't escape this darkness.", "Depressed"),
    ("I'm feeling positive and looking forward to the future.", "Normal"),
    ("I feel so lost and alone, like no one understands me.", "Depressed"),
    ("I've been feeling content with my life and the progress I've made.", "Normal"),
    ("The sadness never leaves, and I can't see a way out.", "Depressed"),
    ("I've been enjoying life and feeling good about my future.", "Normal"),
    ("I feel like I'm drowning in sadness, and there's no one to save me.", "Depressed"),
    ("I'm happy with where I am in life and looking forward to new challenges.", "Normal"),
    # NOTE(review): the next entry is an exact duplicate (same text and label)
    # of an earlier row in this list; confirm whether that is intentional.
    ("The sadness is overwhelming, and I can't escape it.", "Depressed"),
    ("I've been feeling positive and excited about the future.", "Normal"),
    ("I feel like I'm trapped in a never-ending cycle of sadness.", "Depressed")
]

# Convert to DataFrame: one row per pair, raw text in 'Comment' and the
# class name in 'Label'.
df = pd.DataFrame(data, columns=['Comment', 'Label'])

# Text normalisation helper used before tokenisation and vectorisation
def clean_text(text):
    """Return a lower-cased, digit-free, punctuation-free copy of ``text``.

    The substitutions run in a fixed order:

    1. every run of digits is deleted;
    2. every character that is neither a word character nor whitespace is
       deleted outright, with no space inserted, so "I've" becomes "ive" and
       "work-life" becomes "worklife". Underscores survive because ``\\w``
       matches them;
    3. every run of whitespace is collapsed to a single space.

    Leading and trailing whitespace is stripped from the result.

    Args:
        text: The string to normalise.

    Returns:
        The normalised string (possibly empty).
    """
    cleaned = text.lower()
    # Order matters: whitespace is collapsed last so that gaps left behind by
    # removed digits and punctuation are squeezed as well.
    for pattern, replacement in (
        (r'\d+', ''),
        (r'[^\w\s]', ''),
        (r'\s+', ' '),
    ):
        cleaned = re.sub(pattern, replacement, cleaned)
    return cleaned.strip()

# --- Text cleaning -----------------------------------------------------------
# Keep the normalised text next to the raw comment so both stay inspectable.
df['Cleaned_Comment'] = df['Comment'].map(clean_text)

# --- Tokenisation + stopword filtering ---------------------------------------
# NOTE(review): clean_text strips apostrophes, so contractions reach this step
# as "ive" / "dont" / "im" and may not match the NLTK stopword entries.
stop_words = set(stopwords.words('english'))
df['Tokens'] = (
    df['Cleaned_Comment']
    .map(word_tokenize)
    .map(lambda tokens: [token for token in tokens if token not in stop_words])
)

# --- Bag of Words ------------------------------------------------------------
# The vectorizer is fitted on the cleaned text, not on the filtered tokens,
# so stopwords remain part of the vocabulary.
vectorizer = CountVectorizer()
X = vectorizer.fit_transform(df['Cleaned_Comment'])

# Dense, column-labelled view of the count matrix for easier reading
bow_df = pd.DataFrame(
    X.toarray(),
    columns=vectorizer.get_feature_names_out(),
)

# --- Preview of each stage (first five rows) ---------------------------------
for caption, column in (
    ("Original Comments:\n", 'Comment'),
    ("Cleaned Comments:\n", 'Cleaned_Comment'),
    ("Tokenized Comments:\n", 'Tokens'),
):
    print(caption, df[column].head(), "\n")
print("Bag of Words Representation:\n", bow_df.head(), "\n")


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.


Original Comments:
 0    I've been feeling quite good lately, enjoying ...
1    I don't feel like doing anything anymore. Ever...
2    I'm feeling pretty stable these days, not much...
3    Waking up in the morning is a struggle. I feel...
4    I find pleasure in small things and feel conte...
Name: Comment, dtype: object 

Cleaned Comments:
 0    ive been feeling quite good lately enjoying my...
1    i dont feel like doing anything anymore everyt...
2    im feeling pretty stable these days not much b...
3    waking up in the morning is a struggle i feel ...
4    i find pleasure in small things and feel conte...
Name: Cleaned_Comment, dtype: object 

Tokenized Comments:
 0    [ive, feeling, quite, good, lately, enjoying, ...
1    [dont, feel, like, anything, anymore, everythi...
2    [im, feeling, pretty, stable, days, much, both...
3    [waking, morning, struggle, feel, drained, empty]
4    [find, pleasure, small, things, feel, content,...
Name: Tokens, dtype: object 

Bag of Words Re

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [4]:
# Feature Engineering: Part-of-Speech (POS) Tagging
# Each row's token list is tagged independently, giving a list of (token, tag) pairs.
# NOTE(review): the tagger sees lower-cased tokens with stopwords and
# punctuation already removed, so it has less sentence context to work with.
df['POS_Tags'] = df['Tokens'].map(nltk.pos_tag)

pos_preview = df['POS_Tags'].head()
print("POS Tags:\n", pos_preview, "\n")

POS Tags:
 0    [(ive, JJ), (feeling, NN), (quite, RB), (good,...
1    [(dont, JJ), (feel, NN), (like, IN), (anything...
2    [(im, NN), (feeling, VBG), (pretty, RB), (stab...
3    [(waking, VBG), (morning, NN), (struggle, NN),...
4    [(find, VB), (pleasure, NN), (small, JJ), (thi...
Name: POS_Tags, dtype: object 



In [5]:
# Unigrams: single-word counts over the cleaned comments.
# ngram_range=(1, 1) matches CountVectorizer's default, so this yields the same
# features as the bag-of-words `vectorizer` fitted earlier.
vectorizer_uni = CountVectorizer(ngram_range=(1, 1))
X_uni = vectorizer_uni.fit_transform(raw_documents=df['Cleaned_Comment'])

# Dense preview of the first five documents
unigram_preview = pd.DataFrame(
    X_uni.toarray(),
    columns=vectorizer_uni.get_feature_names_out(),
).head()
print("Unigrams:\n", unigram_preview, "\n")

Unigrams:
    able  about  achievements  achieving  all  alone  always  am  and  anymore  \
0     0      0             0          0    0      0       0   0    1        0   
1     0      0             0          0    0      0       0   0    0        1   
2     0      0             0          0    0      0       0   0    0        0   
3     0      0             0          0    0      0       0   0    1        0   
4     0      0             0          0    0      0       0   0    1        0   

   ...  what  whats  where  with  without  wont  work  worklife  world  \
0  ...     0      0      0     1        0     0     0         0      0   
1  ...     0      0      0     0        0     0     0         0      0   
2  ...     0      0      0     0        0     0     0         0      0   
3  ...     0      0      0     0        0     0     0         0      0   
4  ...     0      0      0     1        0     0     0         0      0   

   worthless  
0          0  
1          0  
2          0

In [6]:
# Bigrams: counts of every pair of adjacent words in the cleaned comments
vectorizer_bi = CountVectorizer(
    ngram_range=(2, 2),  # min_n == max_n == 2, i.e. two-word sequences only
)
X_bi = vectorizer_bi.fit_transform(raw_documents=df['Cleaned_Comment'])

In [7]:
# Trigrams: counts of every run of three adjacent words in the cleaned comments
vectorizer_tri = CountVectorizer(
    ngram_range=(3, 3),  # min_n == max_n == 3, i.e. three-word sequences only
)
X_tri = vectorizer_tri.fit_transform(raw_documents=df['Cleaned_Comment'])

In [9]:
# Feature Engineering: TF-IDF Vectorization (alternative to raw N-gram counts)
# One vocabulary covering unigrams, bigrams and trigrams, weighted by TF-IDF
# instead of plain occurrence counts.
tfidf_vectorizer = TfidfVectorizer(
    ngram_range=(1, 3),
)
X_tfidf = tfidf_vectorizer.fit_transform(raw_documents=df['Cleaned_Comment'])

In [10]:
# Feature Engineering: Clustering (using KMeans)
# Groups the TF-IDF vectors into two clusters and stores each comment's cluster
# id in df['Cluster']. KMeans is unsupervised: the ids are arbitrary integers
# and are not guaranteed to line up with the 'Label' column.
num_clusters = 2  # Assuming 2 clusters: Normal and Depressed
kmeans = KMeans(
    n_clusters=num_clusters,
    # Pin n_init explicitly. The library default changes from 10 to 'auto'
    # across scikit-learn releases, and leaving it implicit emits a
    # FutureWarning on the versions in between. 10 is the value that was in
    # effect when that warning is shown, so results there are unchanged.
    n_init=10,
    random_state=42,  # fixed seed so the centroid initialisation is repeatable
)
df['Cluster'] = kmeans.fit_predict(X_tfidf)

  super()._check_params_vs_input(X, default_n_init=10)


In [11]:
# Preview the first five rows of each sparse feature matrix as a dense,
# column-labelled DataFrame. Each frame is built just before it is printed.
for caption, matrix, fitted_vectorizer in (
    ("Bigrams:\n", X_bi, vectorizer_bi),
    ("Trigrams:\n", X_tri, vectorizer_tri),
    ("TF-IDF Features:\n", X_tfidf, tfidf_vectorizer),
):
    feature_preview = pd.DataFrame(
        matrix.toarray(),
        columns=fitted_vectorizer.get_feature_names_out(),
    ).head()
    print(caption, feature_preview, "\n")

# Cluster id assigned to each of the first five comments
cluster_preview = df[['Comment', 'Cluster']].head()
print("Clustering Results (Cluster Labels):\n", cluster_preview, "\n")

Bigrams:
    able to  about anything  about how  about my  about myself  about new  \
0        0               0          0         0             0          0   
1        0               0          0         0             0          0   
2        0               0          0         0             0          0   
3        0               0          0         0             0          0   
4        0               0          0         0             0          0   

   about the  about where  achieving my  all the  ...  with my  with where  \
0          0            0             0        0  ...        0           0   
1          0            0             0        0  ...        0           0   
2          0            0             0        0  ...        0           0   
3          0            0             0        0  ...        0           0   
4          0            0             0        0  ...        1           0   

   without feeling  without much  wont go  wont lift  work and  