In [1]:
import pandas as pd

In [2]:
df = pd.read_csv('Combined Data.csv')


In [3]:
df.head()

Unnamed: 0.1,Unnamed: 0,statement,status
0,0,oh my gosh,Anxiety
1,1,"trouble sleeping, confused mind, restless hear...",Anxiety
2,2,"All wrong, back off dear, forward doubt. Stay ...",Anxiety
3,3,I've shifted my focus to something else but I'...,Anxiety
4,4,"I'm restless and restless, it's been a month n...",Anxiety


In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 53043 entries, 0 to 53042
Data columns (total 3 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   Unnamed: 0  53043 non-null  int64 
 1   statement   52681 non-null  object
 2   status      53043 non-null  object
dtypes: int64(1), object(2)
memory usage: 1.2+ MB


In [5]:
df.isnull().sum()

Unnamed: 0      0
statement     362
status          0
dtype: int64

In [6]:
df['status'].unique()

array(['Anxiety', 'Normal', 'Depression', 'Suicidal', 'Stress', 'Bipolar',
       'Personality disorder'], dtype=object)

In [7]:
mode_statement = df['statement'].mode()[0]
df['statement'].fillna(mode_statement,inplace=True)
df.isnull().sum()

Unnamed: 0    0
statement     0
status        0
dtype: int64

In [8]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 53043 entries, 0 to 53042
Data columns (total 3 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   Unnamed: 0  53043 non-null  int64 
 1   statement   53043 non-null  object
 2   status      53043 non-null  object
dtypes: int64(1), object(2)
memory usage: 1.2+ MB


In [9]:
import nltk
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import string

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\ASUS\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\ASUS\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\ASUS\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [10]:
def clean_text(text):

    text = text.lower()

    text = "".join([char for char in text if char not in string.punctuation])

    text = " ".join([word for word in text.split() if word not in stopwords.words('english')])
    lemmatizer = WordNetLemmatizer()
    text = " ".join([lemmatizer.lemmatize(word) for word in text.split()])
    return text

In [11]:
a = df['statement'][1]
a


'trouble sleeping, confused mind, restless heart. All out of tune'

In [12]:

b = clean_text(a)
print(b)

trouble sleeping confused mind restless heart tune


In [13]:
df['clean_statement'] = df['statement'].apply(clean_text)

In [14]:
df

Unnamed: 0.1,Unnamed: 0,statement,status,clean_statement
0,0,oh my gosh,Anxiety,oh gosh
1,1,"trouble sleeping, confused mind, restless hear...",Anxiety,trouble sleeping confused mind restless heart ...
2,2,"All wrong, back off dear, forward doubt. Stay ...",Anxiety,wrong back dear forward doubt stay restless re...
3,3,I've shifted my focus to something else but I'...,Anxiety,ive shifted focus something else im still worried
4,4,"I'm restless and restless, it's been a month n...",Anxiety,im restless restless month boy mean
...,...,...,...,...
53038,53038,Nobody takes me seriously I’ve (24M) dealt wit...,Anxiety,nobody take seriously i’ve 24m dealt depressio...
53039,53039,"selfishness ""I don't feel very good, it's lik...",Anxiety,selfishness dont feel good like dont belong wo...
53040,53040,Is there any way to sleep better? I can't slee...,Anxiety,way sleep better cant sleep night med didnt help
53041,53041,"Public speaking tips? Hi, all. I have to give ...",Anxiety,public speaking tip hi give presentation work ...


In [18]:
df = df.head(1000)

In [19]:
df

Unnamed: 0.1,Unnamed: 0,statement,status,clean_statement
0,0,oh my gosh,Anxiety,oh gosh
1,1,"trouble sleeping, confused mind, restless hear...",Anxiety,trouble sleeping confused mind restless heart ...
2,2,"All wrong, back off dear, forward doubt. Stay ...",Anxiety,wrong back dear forward doubt stay restless re...
3,3,I've shifted my focus to something else but I'...,Anxiety,ive shifted focus something else im still worried
4,4,"I'm restless and restless, it's been a month n...",Anxiety,im restless restless month boy mean
...,...,...,...,...
995,995,please make it,Normal,please make
996,996,Okay then okay,Normal,okay okay
997,997,My pain is big,Normal,pain big
998,998,My whole life is full of surprise,Normal,whole life full surprise


In [22]:
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, classification_report
from sklearn.feature_extraction.text import TfidfVectorizer

In [23]:
from sklearn.feature_extraction.text import TfidfVectorizer

vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(df['clean_statement'])
y = df['status']

In [24]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [27]:
models = {
    'Logistic Regression': LogisticRegression(),
    'Random Forest': RandomForestClassifier(),
    'SVM': SVC(),
    'Naive Bayes': MultinomialNB()
}

In [28]:
best_model = None
best_accuracy = 0

In [30]:
for name, model in models.items():
    print(f"evaluating {name}..............\n")
    if name == 'Logistic Regression':
        param_grid = {'C': [0.1, 1, 10]}
    elif name == 'Random Forest':
        param_grid = {'n_estimators': [50, 100, 200]}
    elif name == 'SVM':
        param_grid = {'C': [0.1, 1, 10], 'kernel': ['linear', 'rbf']}
    else:  # Naive Bayes
        param_grid = {}
        
        
    grid_search = GridSearchCV(model, param_grid, cv=5, scoring='accuracy')
    grid_search.fit(X_train, y_train)
    y_pred = grid_search.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    
    print(f"Accuracy: {accuracy}")
    print(classification_report(y_test, y_pred))
    
    if accuracy > best_accuracy:
        best_accuracy = accuracy
        best_model = grid_search.best_estimator_
        
print(f"\nBest performing model: {type(best_model).__name__} with accuracy: {best_accuracy}")

evaluating Logistic Regression..............

Accuracy: 0.94
              precision    recall  f1-score   support

     Anxiety       0.96      0.96      0.96       147
      Normal       0.89      0.89      0.89        53

    accuracy                           0.94       200
   macro avg       0.92      0.92      0.92       200
weighted avg       0.94      0.94      0.94       200

evaluating Random Forest..............

Accuracy: 0.95
              precision    recall  f1-score   support

     Anxiety       0.98      0.95      0.97       147
      Normal       0.88      0.94      0.91        53

    accuracy                           0.95       200
   macro avg       0.93      0.95      0.94       200
weighted avg       0.95      0.95      0.95       200

evaluating SVM..............

Accuracy: 0.955
              precision    recall  f1-score   support

     Anxiety       0.98      0.96      0.97       147
      Normal       0.89      0.94      0.92        53

    accuracy        