In [1]:
#preprocess............
import pandas as pd
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer, WordNetLemmatizer
from nltk.tokenize import word_tokenize
import re

nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')



# FILE PATH
df = pd.read_csv(r'C:\Users\nh013\Desktop\Mental Health FAQ for Chatbot\Mental_Health_FAQ.csv')



# FUNCTION TO PERFORM NLP PREPROCESS
def preprocess_text(text):
    
    # REMOVE URLS
    text = re.sub(r'http\S+', '', text, flags=re.MULTILINE)
    
    # CONVERT TO LOWERCASE
    text = text.lower()
    
    # TOKENIZATION
    tokens = word_tokenize(text)
    
    # REMOVE STOP WORDS
    stop_words = set(stopwords.words('english'))
    tokens = [token for token in tokens if token not in stop_words]
    
    # STEMMING
    stemmer = PorterStemmer()
    tokens = [stemmer.stem(token) for token in tokens]
    
    # LEMMATIZATION
    lemmatizer = WordNetLemmatizer()
    tokens = [lemmatizer.lemmatize(token) for token in tokens]
    
    # JOIN TOKEN BACK INTO TEXT
    preprocessed_text = ' '.join(tokens)
    
    return preprocessed_text

# APPLY PREPROCESS FUNCTION TO QUESTION AND ANSWER COL
df['Questions'] = df['Questions'].apply(preprocess_text)
df['Answers'] = df['Answers'].apply(preprocess_text)


print("Preprocessed DataFrame:")
print(df)


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\nh013\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\nh013\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\nh013\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


Preprocessed DataFrame:
    Question_ID                       Questions  \
0       1590140               mean mental ill ?   
1       2110618             mental ill affect ?   
2       6361820               caus mental ill ?   
3       9434130          warn sign mental ill ?   
4       7657263        peopl mental ill recov ?   
..          ...                             ...   
93      4373204            know 'm drink much ?   
94      7807643        cannabi danger , legal ?   
95      4352464          convinc kid use drug ?   
96      6521784  legal statu ( evid ) cbd oil ?   
97      3221856                     evid vape ?   

                                              Answers  
0   mental ill health condit disrupt personâ€™ tho...  
1   estim mental ill affect 1 5 adult america , 1 ...  
2   estim mental ill affect 1 5 adult america , 1 ...  
3   symptom mental health disord vari depend type ...  
4   heal mental ill , earli identif treatment vita...  
..                         

In [2]:
#perform text vectorization

import pandas as pd
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer, WordNetLemmatizer
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import TfidfVectorizer
import re

nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')



# FILE PATH
df = pd.read_csv(r'C:\Users\nh013\Desktop\Mental Health FAQ for Chatbot\Mental_Health_FAQ.csv')



# FUNCTION TO PERFORM NLP PREPROCESS
def preprocess_text(text):
    
    # REMOVE URLS
    text = re.sub(r'http\S+', '', text, flags=re.MULTILINE)
    
    # CONVERT TO LOWERCASE
    text = text.lower()
    
    # TOKENIZATION
    tokens = word_tokenize(text)
    
    # REMOVE STOP WORDS
    stop_words = set(stopwords.words('english'))
    tokens = [token for token in tokens if token not in stop_words]
    
    # STEMMING
    stemmer = PorterStemmer()
    tokens = [stemmer.stem(token) for token in tokens]
    
    # LEMMATIZATION
    lemmatizer = WordNetLemmatizer()
    tokens = [lemmatizer.lemmatize(token) for token in tokens]
    
    # JOIN TOKEN BACK INTO TEXT
    preprocessed_text = ' '.join(tokens)
    
    return preprocessed_text

# APPLY PREPROCESS FUNCTION TO QUESTION AND ANSWER COL
df['Questions'] = df['Questions'].apply(preprocess_text)
df['Answers'] = df['Answers'].apply(preprocess_text)


# TF-IDF Vectorization
tfidf_vectorizer = TfidfVectorizer()
tfidf_matrix_questions = tfidf_vectorizer.fit_transform(df['Questions'])
tfidf_matrix_answers = tfidf_vectorizer.fit_transform(df['Answers'])


print("TF-IDF Matrix for Questions:")
print(tfidf_matrix_questions.toarray())

print("TF-IDF Matrix for Answers:")
print(tfidf_matrix_answers.toarray())

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\nh013\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\nh013\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\nh013\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


TF-IDF Matrix for Questions:
[[0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 ...
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]]
TF-IDF Matrix for Answers:
[[0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 ...
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]]


In [6]:
#check shape 

import pandas as pd
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer, WordNetLemmatizer
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
import re

nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')



# FILE PATH
df = pd.read_csv(r'C:\Users\nh013\Desktop\Mental Health FAQ for Chatbot\Mental_Health_FAQ.csv')



# FUNCTION TO PERFORM NLP PREPROCESS
def preprocess_text(text):
    
    # REMOVE URLS
    text = re.sub(r'http\S+', '', text, flags=re.MULTILINE)
    
    # CONVERT TO LOWERCASE
    text = text.lower()
    
    # TOKENIZATION
    tokens = word_tokenize(text)
    
    # REMOVE STOP WORDS
    stop_words = set(stopwords.words('english'))
    tokens = [token for token in tokens if token not in stop_words]
    
    # STEMMING
    stemmer = PorterStemmer()
    tokens = [stemmer.stem(token) for token in tokens]
    
    # LEMMATIZATION
    lemmatizer = WordNetLemmatizer()
    tokens = [lemmatizer.lemmatize(token) for token in tokens]
    
    # JOIN TOKEN BACK INTO TEXT
    preprocessed_text = ' '.join(tokens)
    
    return preprocessed_text

# APPLY PREPROCESS FUNCTION TO QUESTION AND ANSWER COL
df['Questions'] = df['Questions'].apply(preprocess_text)
df['Answers'] = df['Answers'].apply(preprocess_text)


# TF-IDF Vectorization
tfidf_vectorizer = TfidfVectorizer()
tfidf_matrix_questions = tfidf_vectorizer.fit_transform(df['Questions'])
tfidf_matrix_answers = tfidf_vectorizer.fit_transform(df['Answers'])


#COMBINE'Questions' and 'Answers' INTO A SINGLE TEXT COL
df['Text'] = df['Questions'] + ' ' + df['Answers']


# SPLIT THE DATA INTO TRAINING AND TESTING SET
X_train, X_test, y_train, y_test = train_test_split(df['Text'], df['Answers'], test_size=0.2, random_state=42)


print("Shape of X_train:", X_train.shape)
print("Shape of X_test:", X_test.shape)
print("Shape of y_train:", y_train.shape)
print("Shape of y_test:", y_test.shape)

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\nh013\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\nh013\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\nh013\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


Shape of X_train: (78,)
Shape of X_test: (20,)
Shape of y_train: (78,)
Shape of y_test: (20,)


In [21]:
#PERFORM SEQUENCE TO SEQUENCE MODEL 

import pandas as pd
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer, WordNetLemmatizer
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, RepeatVector, TimeDistributed, Dense
import re

nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')

# FILE PATH
df = pd.read_csv(r'C:\Users\nh013\Desktop\Mental Health FAQ for Chatbot\Mental_Health_FAQ.csv')

# FUNCTION TO PERFORM NLP PREPROCESS
def preprocess_text(text):
    # REMOVE URLS
    text = re.sub(r'http\S+', '', text, flags=re.MULTILINE)
    # CONVERT TO LOWERCASE
    text = text.lower()
    # TOKENIZATION
    tokens = word_tokenize(text)
    # REMOVE STOP WORDS
    stop_words = set(stopwords.words('english'))
    tokens = [token for token in tokens if token not in stop_words]
    # STEMMING
    stemmer = PorterStemmer()
    tokens = [stemmer.stem(token) for token in tokens]
    # LEMMATIZATION
    lemmatizer = WordNetLemmatizer()
    tokens = [lemmatizer.lemmatize(token) for token in tokens]
    # JOIN TOKEN BACK INTO TEXT
    preprocessed_text = ' '.join(tokens)
    return preprocessed_text



# APPLY PREPROCESS FUNCTION TO QUESTION AND ANSWER COL
df['Questions'] = df['Questions'].apply(preprocess_text)
df['Answers'] = df['Answers'].apply(preprocess_text)

# COMBINE 'Questions' and 'Answers' INTO A SINGLE TEXT COL
df['Text'] = df['Questions'] + ' ' + df['Answers']




# SPLIT THE DATA INTO TRAINING AND TESTING SET
X_train, X_test, y_train, y_test = train_test_split(df['Text'], df['Answers'], test_size=0.2, random_state=42)

# TF-IDF VECTORIZATION FOR TRAINING DATA 
tfidf_vectorizer = TfidfVectorizer()
X_train_tfidf = tfidf_vectorizer.fit_transform(X_train)



#   X_train_tfidf IS A  2D ARRAY, RESHAPE IT TO  3D
X_train_reshaped = X_train_tfidf.toarray().reshape(X_train_tfidf.shape[0], 1, X_train_tfidf.shape[1])

# ONE HOT ENCODE THE TARGET SEQUENCE
encoder = OneHotEncoder(sparse=False)
y_train_encoded = encoder.fit_transform(y_train.values.reshape(-1, 1))



# BUILD A SIMPLE SEQUENCE-TO-SEQUENCE MODEL
model = Sequential()
model.add(LSTM(50, input_shape=(X_train_reshaped.shape[1], X_train_reshaped.shape[2])))
model.add(RepeatVector(X_train_reshaped.shape[1]))
model.add(LSTM(50, return_sequences=True))
model.add(TimeDistributed(Dense(encoder.categories_[0].shape[0], activation='softmax')))
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

# RESHAPE y_train_ENCODED TO HAVE THE SAME SHAPE AS THE MODEL OUTPUT
y_train_encoded_reshaped = y_train_encoded.reshape(y_train_encoded.shape[0], 1, y_train_encoded.shape[1])



# FIT THE MODEL
history = model.fit(X_train_reshaped, y_train_encoded_reshaped, epochs=10, batch_size=32, validation_split=0.2)

# EVALUATE THE MODEL
test_loss, test_accuracy = model.evaluate(X_train_reshaped, y_train_encoded_reshaped)
print(f'Test Loss: {test_loss}, Test Accuracy: {test_accuracy}')



# PREDICT ON TEST SET
X_test_tfidf = tfidf_vectorizer.transform(X_test)
X_test_reshaped = X_test_tfidf.toarray().reshape(X_test_tfidf.shape[0], 1, X_test_tfidf.shape[1])
predictions = model.predict(X_test_reshaped)

# PRINT SAMPLE PREDICTIONS
for i in range(5):  
    print(f"Input: {X_test.iloc[i]}")
    
    # DECODE ONE HOT ENCODED PREDICTION
    decoded_output = encoder.inverse_transform(predictions[i].reshape(1, -1))
    
    # JOIN TOKENS BACK INTO TEXT 
    decoded_output_text = ' '.join(decoded_output[0])
    
    print(f"Predicted Output: {decoded_output_text}")
    print("\n")



[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\nh013\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\nh013\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\nh013\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Test Loss: 4.345366477966309, Test Accuracy: 0.4871794879436493
Input: 's differ antidepress ? mani differ type antidepress medic , work differ way . antidepress divid “ class ” base chemic messeng brain ( call neurotransmitt ) thought influenc . class may contain sever differ medic , slightli differ way work . , ’ find common class exampl common medic . first name gener name name bracket brand name . ssri select serotonin reuptak inhibitor : fluoxetin ( prozac ) , paroxetin ( paxil ) , citalopram ( celexa ) , escitalopram ( cipralex ) , sertralin ( zoloft ) snri serotonin norepinephrin reuptak inhibitor : venlafaxin ( effexor ) duloxetin ( cymbalta ) ndri norepinephrine-dopamin reuptak inhibitor : bupropion ( wellbutrin zyban ) nassa noradrenerg specif serotonerg antidepress : mirtazapin ( remeron ) , also class teca tetracycl antidepress sari serotonin antagonist reuptak inh

In [6]:
# PERFORM FINE TUNING TO GET HIGH ACCURACY

import pandas as pd
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer, WordNetLemmatizer
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, RepeatVector, TimeDistributed, Dense
import re

nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')



# FILE PATH
df = pd.read_csv(r'C:\Users\nh013\Desktop\Mental Health FAQ for Chatbot\Mental_Health_FAQ.csv')

# FUNCTION TO PERFORM NLP PREPROCESS
def preprocess_text(text):
    # REMOVE URLS
    text = re.sub(r'http\S+', '', text, flags=re.MULTILINE)
    # CONVERT TO LOWERCASE
    text = text.lower()
    # TOKENIZATION
    tokens = word_tokenize(text)
    # REMOVE STOP WORDS
    stop_words = set(stopwords.words('english'))
    tokens = [token for token in tokens if token not in stop_words]
    # STEMMING
    stemmer = PorterStemmer()
    tokens = [stemmer.stem(token) for token in tokens]
    # LEMMATIZATION
    lemmatizer = WordNetLemmatizer()
    tokens = [lemmatizer.lemmatize(token) for token in tokens]
    # JOIN TOKEN BACK INTO TEXT
    preprocessed_text = ' '.join(tokens)
    return preprocessed_text

# APPLY PREPROCESS FUNCTION TO QUESTION AND ANSWER COL
df['Questions'] = df['Questions'].apply(preprocess_text)
df['Answers'] = df['Answers'].apply(preprocess_text)

# COMBINE 'Questions' and 'Answers' INTO A SINGLE TEXT COL
df['Text'] = df['Questions'] + ' ' + df['Answers']



# SPLIT THE DATA INTO TRAINING AND TESTING SET
X_train, X_test, y_train, y_test = train_test_split(df['Text'], df['Answers'], test_size=0.2, random_state=42)

# TF-IDF Vectorization for training data
tfidf_vectorizer = TfidfVectorizer()
X_train_tfidf = tfidf_vectorizer.fit_transform(X_train)




#  X_train_tfidf IS A  2D ARRAY, RESHAPE IT TO  3D
X_train_reshaped = X_train_tfidf.toarray().reshape(X_train_tfidf.shape[0], 1, X_train_tfidf.shape[1])

# ONE HOT ENCODE THE TARGET SEQUENCE
encoder = OneHotEncoder(sparse=False)
y_train_encoded = encoder.fit_transform(y_train.values.reshape(-1, 1))




# BUILD A SIMPLE SEQUENCE-TO-SEQUENCE MODEL
model = Sequential()
model.add(LSTM(100, input_shape=(X_train_reshaped.shape[1], X_train_reshaped.shape[2])))
model.add(RepeatVector(X_train_reshaped.shape[1]))
model.add(LSTM(100, return_sequences=True))
model.add(TimeDistributed(Dense(encoder.categories_[0].shape[0], activation='softmax')))
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

# RESHAPE y_train_ENCODED TO HAVE THE SAME SHAPE AS THE MODEL OUTPUT
y_train_encoded_reshaped = y_train_encoded.reshape(y_train_encoded.shape[0], 1, y_train_encoded.shape[1])

# FIT THE MODEL
history = model.fit(X_train_reshaped, y_train_encoded_reshaped, epochs=20, batch_size=64, validation_split=0.2)

# EVALUATE THE MODEL
test_loss, test_accuracy = model.evaluate(X_train_reshaped, y_train_encoded_reshaped)
print(f'Test Loss: {test_loss}, Test Accuracy: {test_accuracy}')

# PREDICT ON TEST SET
X_test_tfidf = tfidf_vectorizer.transform(X_test)
X_test_reshaped = X_test_tfidf.toarray().reshape(X_test_tfidf.shape[0], 1, X_test_tfidf.shape[1])
predictions = model.predict(X_test_reshaped)

# PRINT SAMPLE PREDICTIONS
for i in range(5):  
    print(f"Input: {X_test.iloc[i]}")
    
    # DECODE ONE HOT ENCODED PREDICTIONS
    decoded_output = encoder.inverse_transform(predictions[i].reshape(1, -1))
    
    # JOIN TOKENS BACK INTO TEXT
    decoded_output_text = ' '.join(decoded_output[0])
    
    print(f"Predicted Output: {decoded_output_text}")
    print("\n")


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\nh013\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\nh013\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\nh013\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20
Test Loss: 4.324565410614014, Test Accuracy: 0.6410256624221802
Input: 's differ antidepress ? mani differ type antidepress medic , work differ way . antidepress divid “ class ” base chemic messeng brain ( call neurotransmitt ) thought influenc . class may contain sever differ medic , slightli differ way work . , ’ find common class exampl common medic . first name gener name name bracket brand name . ssri select serotonin reuptak inhibitor : fluoxetin ( prozac ) , paroxetin ( paxil ) , citalopram ( celexa ) , escitalopram ( cipralex ) , sertralin ( zoloft ) snri serotonin norepinephrin reuptak inhibitor : venlafaxin ( effexor ) duloxetin ( cymbalta ) ndri norepinephrine-dopamin reuptak inhibitor : bupropion ( wellbutrin zyban ) nassa noradrenerg specif serot