In [1]:
import pandas as pd

pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)

# Read the CSV file into a DataFrame
df_combined_data = pd.read_csv('/content/drive/MyDrive/sentiments_mentalH/Combined Data.csv')

# Display the first 5 rows
print(df_combined_data.head().to_markdown(index=False, numalign="left", stralign="left"))

# Show columns and their types
print(df_combined_data.info())


| Unnamed: 0   | statement                                                                      | status   |
|:-------------|:-------------------------------------------------------------------------------|:---------|
| 0            | oh my gosh                                                                     | Anxiety  |
| 1            | trouble sleeping, confused mind, restless heart. All out of tune               | Anxiety  |
| 2            | All wrong, back off dear, forward doubt. Stay in a restless and restless place | Anxiety  |
| 3            | I've shifted my focus to something else but I'm still worried                  | Anxiety  |
| 4            | I'm restless and restless, it's been a month now, boy. What do you mean?       | Anxiety  |
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 53043 entries, 0 to 53042
Data columns (total 3 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   Unnamed: 0  53043 non-null  int64 
 1   stat

The dataset contains text data in the statement column and corresponding labels in the status column. We will preprocess the text data, then split it into training and testing sets. We will then vectorize the text data and train a machine learning model on the training set. Finally, we will evaluate the model on the testing set, print the accuracy and classification report, and save the model using pickle.

In [5]:
import pandas as pd
import re

# Drop rows with null values in `statement` column
df_combined_data.dropna(subset=['statement'], inplace=True)

# Preprocessing
def preprocess_text(text):
    # Convert to lowercase
    text = text.lower()
    # Remove URLs
    text = re.sub(r'http\S+|www\S+|https\S+', '', text, flags=re.MULTILINE)
    # Remove mentions and hashtags
    text = re.sub(r'(@[A-Za-z0-9]+)|(#[A-Za-z0-9]+)', '', text)
    # Remove punctuations
    text = re.sub(r'[^\w\s]', '', text)
    return text


# Apply preprocessing to the 'statement' column
df_combined_data['cleaned_statement'] = df_combined_data['statement'].astype(str).apply(preprocess_text)

# Split data into training and testing sets
from sklearn.model_selection import train_test_split

X = df_combined_data['cleaned_statement']
y = df_combined_data['status']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Create TF-IDF features using `TfidfVectorizer`
from sklearn.feature_extraction.text import TfidfVectorizer

vectorizer = TfidfVectorizer(max_features=5000)  # Adjust max_features as needed
X_train_tfidf = vectorizer.fit_transform(X_train)
X_test_tfidf = vectorizer.transform(X_test)

# Train a logistic regression classifier
from sklearn.linear_model import LogisticRegression

model = LogisticRegression()
model.fit(X_train_tfidf, y_train)

# Evaluate the model's performance on the test set using accuracy and a classification report
from sklearn.metrics import accuracy_score, classification_report

y_pred = model.predict(X_test_tfidf)
accuracy = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred)

print(f"Accuracy: {accuracy}")
print("Classification Report:\n", report)

# Save the model and vectorizer
import pickle
import os

model_filename = 'mental_health_sentiment_model.pkl'
vectorizer_filename = 'tfidf_vectorizer.pkl'
folder_path = '/content/drive/MyDrive/sentiments_mentalH'

os.makedirs(folder_path, exist_ok=True)

with open(os.path.join(folder_path, model_filename), 'wb') as f:
    pickle.dump(model, f)

with open(os.path.join(folder_path, vectorizer_filename), 'wb') as f:
    pickle.dump(vectorizer, f)

print(f"Model saved to {os.path.join(folder_path, model_filename)}")
print(f"Vectorizer saved to {os.path.join(folder_path, vectorizer_filename)}")


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Accuracy: 0.7773559836765683
Classification Report:
                       precision    recall  f1-score   support

             Anxiety       0.83      0.78      0.81       755
             Bipolar       0.84      0.75      0.79       527
          Depression       0.72      0.75      0.73      3016
              Normal       0.86      0.95      0.90      3308
Personality disorder       0.85      0.47      0.61       237
              Stress       0.70      0.52      0.60       536
            Suicidal       0.70      0.66      0.68      2158

            accuracy                           0.78     10537
           macro avg       0.79      0.70      0.73     10537
        weighted avg       0.77      0.78      0.77     10537

Model saved to /content/drive/MyDrive/sentiments_mentalH/mental_health_sentiment_model.pkl
Vectorizer saved to /content/drive/MyDrive/sentiments_mentalH/tfidf_vectorizer.pkl
