In [2]:
import nltk
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from nltk.tokenize import word_tokenize, sent_tokenize
from sklearn.model_selection import train_test_split
from textblob import TextBlob


In [3]:
nltk.download('vader_lexicon')
nltk.download('stopwords')
nltk.download('punkt')


[nltk_data] Downloading package vader_lexicon to
[nltk_data]     C:\Users\anjan\AppData\Roaming\nltk_data...
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\anjan\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\stopwords.zip.
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\anjan\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping tokenizers\punkt.zip.


True

In [4]:
import pandas as pd

In [5]:
df=pd.read_csv("Combined Data.csv")

In [6]:
df=df[df['status']!="bipolar"]

In [7]:
df=df.dropna()
df.info

<bound method DataFrame.info of        Unnamed: 0                                          statement   status
0               0                                         oh my gosh  Anxiety
1               1  trouble sleeping, confused mind, restless hear...  Anxiety
2               2  All wrong, back off dear, forward doubt. Stay ...  Anxiety
3               3  I've shifted my focus to something else but I'...  Anxiety
4               4  I'm restless and restless, it's been a month n...  Anxiety
...           ...                                                ...      ...
53038       53038  Nobody takes me seriously I’ve (24M) dealt wit...  Anxiety
53039       53039  selfishness  "I don't feel very good, it's lik...  Anxiety
53040       53040  Is there any way to sleep better? I can't slee...  Anxiety
53041       53041  Public speaking tips? Hi, all. I have to give ...  Anxiety
53042       53042  I have really bad door anxiety! It's not about...  Anxiety

[52681 rows x 3 columns]>

In [8]:
df.drop(columns=['Unnamed: 0'],inplace=True,index=0)

In [9]:
df.info

<bound method DataFrame.info of                                                statement   status
1      trouble sleeping, confused mind, restless hear...  Anxiety
2      All wrong, back off dear, forward doubt. Stay ...  Anxiety
3      I've shifted my focus to something else but I'...  Anxiety
4      I'm restless and restless, it's been a month n...  Anxiety
5      every break, you must be nervous, like somethi...  Anxiety
...                                                  ...      ...
53038  Nobody takes me seriously I’ve (24M) dealt wit...  Anxiety
53039  selfishness  "I don't feel very good, it's lik...  Anxiety
53040  Is there any way to sleep better? I can't slee...  Anxiety
53041  Public speaking tips? Hi, all. I have to give ...  Anxiety
53042  I have really bad door anxiety! It's not about...  Anxiety

[52680 rows x 2 columns]>

In [10]:
sia=SentimentIntensityAnalyzer()

In [11]:
def get_vader_sentiment(text):
    return sia.polarity_scores(text)

def get_textblob_sentiment(text):
    analysis = TextBlob(text)
    return analysis.sentiment.polarity

# Apply sentiment analysis
df['VADER_Sentiment'] = df['statement'].apply(lambda x: get_vader_sentiment(x)['compound'])
df['TextBlob_Polarity'] = df['statement'].apply(get_textblob_sentiment)

# Function to map sentiment score to categories based on thresholds
def categorize_sentiment(vader_score, textblob_score):
    if vader_score < -0.5 or textblob_score < -0.5:
        return 'Sucidial'
    elif -0.5 <= vader_score < -0.1 or -0.5 <= textblob_score < -0.1:
        return 'Depression'
    elif -0.1 <= vader_score < 0.1 or -0.1 <= textblob_score < 0.1:
        return 'Anxiety'
    else:
        return 'Normal'

# Categorize based on both VADER and TextBlob sentiment scores
df['Predicted_Category'] = df.apply(lambda row: categorize_sentiment(row['VADER_Sentiment'], row['TextBlob_Polarity']), axis=1)

# Display the DataFrame with the calculated sentiments and predicted categories
print(df[['statement', 'VADER_Sentiment', 'TextBlob_Polarity', 'Predicted_Category']])

                                               statement  VADER_Sentiment  \
1      trouble sleeping, confused mind, restless hear...          -0.7269   
2      All wrong, back off dear, forward doubt. Stay ...          -0.7351   
3      I've shifted my focus to something else but I'...          -0.4215   
4      I'm restless and restless, it's been a month n...          -0.4939   
5      every break, you must be nervous, like somethi...          -0.2144   
...                                                  ...              ...   
53038  Nobody takes me seriously I’ve (24M) dealt wit...           0.1704   
53039  selfishness  "I don't feel very good, it's lik...          -0.9834   
53040  Is there any way to sleep better? I can't slee...           0.1635   
53041  Public speaking tips? Hi, all. I have to give ...          -0.6853   
53042  I have really bad door anxiety! It's not about...          -0.8603   

       TextBlob_Polarity Predicted_Category  
1              -0.300000     

In [12]:
df.info

<bound method DataFrame.info of                                                statement   status  \
1      trouble sleeping, confused mind, restless hear...  Anxiety   
2      All wrong, back off dear, forward doubt. Stay ...  Anxiety   
3      I've shifted my focus to something else but I'...  Anxiety   
4      I'm restless and restless, it's been a month n...  Anxiety   
5      every break, you must be nervous, like somethi...  Anxiety   
...                                                  ...      ...   
53038  Nobody takes me seriously I’ve (24M) dealt wit...  Anxiety   
53039  selfishness  "I don't feel very good, it's lik...  Anxiety   
53040  Is there any way to sleep better? I can't slee...  Anxiety   
53041  Public speaking tips? Hi, all. I have to give ...  Anxiety   
53042  I have really bad door anxiety! It's not about...  Anxiety   

       VADER_Sentiment  TextBlob_Polarity Predicted_Category  
1              -0.7269          -0.300000           Sucidial  
2            

In [13]:
label_mapping = {'Anxiety': 0, 'Normal': 3, 'Depression': 1, 'Sucidial': 2}
df['Category'] = df['Predicted_Category'].map(label_mapping)

In [14]:
df=df.drop(columns=['statement'],axis=1)
df.info

<bound method DataFrame.info of         status  VADER_Sentiment  TextBlob_Polarity Predicted_Category  \
1      Anxiety          -0.7269          -0.300000           Sucidial   
2      Anxiety          -0.7351          -0.250000           Sucidial   
3      Anxiety          -0.4215           0.000000         Depression   
4      Anxiety          -0.4939          -0.312500         Depression   
5      Anxiety          -0.2144          -0.500000         Depression   
...        ...              ...                ...                ...   
53038  Anxiety           0.1704           0.134932             Normal   
53039  Anxiety          -0.9834          -0.003266           Sucidial   
53040  Anxiety           0.1635           0.500000             Normal   
53041  Anxiety          -0.6853          -0.035714           Sucidial   
53042  Anxiety          -0.8603          -0.119444           Sucidial   

       Category  
1             2  
2             2  
3             1  
4             1  
5

In [15]:
X=df['VADER_Sentiment']
print(X)
X=X.values
y=df['Category'].values
X=X.reshape(-1,1)
y=y.reshape(-1,1)

1       -0.7269
2       -0.7351
3       -0.4215
4       -0.4939
5       -0.2144
          ...  
53038    0.1704
53039   -0.9834
53040    0.1635
53041   -0.6853
53042   -0.8603
Name: VADER_Sentiment, Length: 52680, dtype: float64


In [16]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [17]:
from sklearn.svm import SVC
from sklearn import svm
from sklearn.metrics import classification_report, accuracy_score

In [18]:
# svm_classifier = svm.SVC(kernel='linear', C=1.0)
# svm_classifier.fit(X_train, y_train)
# y_pred=svm_classifier.predict(X_test)
# print("Accuracy:", accuracy_score(y_test, y_pred))
# print("Classification Report:\n", classification_report(y_test, y_pred))

In [19]:
X_train.shape, y_train.shape

((42144, 1), (42144, 1))

In [20]:
X_train

array([[ 0.    ],
       [-0.7568],
       [-0.9026],
       ...,
       [-0.9665],
       [ 0.5719],
       [-0.9402]])

In [21]:
from sklearn.ensemble import RandomForestClassifier
RF_model = RandomForestClassifier(n_estimators=100, random_state=42)
RF_model.fit(X_train,y_train)
y_pred=RF_model.predict(X_test)

print("Accuracy:", accuracy_score(y_test, y_pred))

  return fit_method(estimator, *args, **kwargs)


Accuracy: 0.8367501898253606


In [28]:
RF_model.predict([[-1]])

array([2], dtype=int64)

In [22]:
Standard_vector=SVC(kernel='linear',random_state=42)
Standard_vector.fit(X_train, y_train)
y_pred = Standard_vector.predict(X_test)

print("Accuracy:", accuracy_score(y_test, y_pred))
print("Classification Report:\n", classification_report(y_test, y_pred))


  y = column_or_1d(y, warn=True)


Accuracy: 0.8234624145785877
Classification Report:
               precision    recall  f1-score   support

           0       0.68      0.71      0.70      2701
           1       0.96      0.65      0.78      1478
           2       0.99      0.98      0.99      4632
           3       0.58      0.71      0.64      1725

    accuracy                           0.82     10536
   macro avg       0.80      0.76      0.78     10536
weighted avg       0.84      0.82      0.83     10536



In [23]:
import pickle
import pandas as pd
import h5py



In [24]:
pip install h5py

Defaulting to user installation because normal site-packages is not writeable
Note: you may need to restart the kernel to use updated packages.


In [29]:
with open("rf_model.pkl", "wb") as file:
    pickle.dump(RF_model, file)

In [25]:
import pandas as pd
import pickle

# Read the CSV file in chunks
chunk_size = 53042  # Define the size of each chunk (adjust as needed)
csv_file = 'Combined Data.csv'

# Open a binary file to store the pickled data
with open('data.pkl', 'wb') as pickle_file:
    # Iterate over the CSV file in chunks
    for chunk in pd.read_csv(csv_file, chunksize=chunk_size):
        # Convert the chunk to a list of records (or keep it as a DataFrame if needed)
        chunk_list = chunk.to_dict(orient='records')
        
        # Dump each chunk to the pickle file
        pickle.dump(chunk_list, pickle_file)
        
        print(f"Saved chunk with {len(chunk_list)} records to pickle.")

Saved chunk with 53042 records to pickle.
Saved chunk with 1 records to pickle.
