In [None]:
import pickle
import re

import emoji
import numpy as np
import pandas as pd
import snscrape.modules.twitter as sntwitter
from sklearn import metrics
from sklearn.ensemble import RandomForestClassifier, StackingClassifier, VotingClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from sklearn.model_selection import StratifiedKFold, cross_validate
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import make_pipeline
from sklearn.svm import SVC
from sklearn.utils import resample
from xgboost import XGBClassifier

# DATA COLLECTION

In [None]:
# Search phrases used to scrape candidate tweets. Each entry is one search
# query made of double-quoted terms joined with AND.
# Filipino phrases (the first entry previously had an unbalanced quote
# around "sarili").
filkeywords = ['"patayin" AND "ang" AND "sarili"', '"gusto" AND "ko" AND "ng" AND "mamatay"', '"pagpapakamatay"',
               '"hindi" AND "ipinanganak"', '"sana" AND "patay" AND "nalang"', '"hindi" AND "na" AND "magigising"',
               '"ayoko" AND "ng" AND "mabuhay"','"tapusin" AND "ang" AND "buhay"', '"wala" AND "akong" AND "silbi"']


# English phrases.
enkeywords = ['"depressed"', '"depression"', '"want" AND "to" AND "die"', '"kill" AND "myself"', '"suicide"', '"suicidal"',
              '"no" AND "reason" AND "to" AND "live"', '"hate" AND "my" AND "life"', '"end" AND "my" AND "life"',
              '"cut" AND "myself"']

# Accumulates one single-element list ([text]) per scraped tweet.
tweets = []

In [None]:
# Scrape tweets for every Filipino search phrase inside the fixed date window
# and collect each tweet's text as a one-element row.
for fil_phrase in filkeywords:
    print("SCRAPING KEYWORD: ",fil_phrase)
    query = fil_phrase+" until:2022-11-23 since:2020-01-01"
    fil_scraper = sntwitter.TwitterSearchScraper(query)
    tweets.extend([found.content] for found in fil_scraper.get_items())

In [None]:
# Scrape tweets for every English search phrase. These queries carry a
# geocode filter (a point plus a 724km radius) in addition to the date window.
for en_phrase in enkeywords:
    print("SCRAPING KEYWORD: ",en_phrase)
    query = en_phrase+" geocode:12.879721,121.774017,724km until:2022-11-23 since:2020-01-01"
    en_scraper = sntwitter.TwitterSearchScraper(query)
    tweets.extend([found.content] for found in en_scraper.get_items())

In [None]:
# Turn the scraped rows into a frame with a single "Tweet" column, then add
# blank "language" and "Label" columns (in that order) and save it as CSV.
df = pd.DataFrame(tweets, columns=['Tweet'])
df["language"] = ""
df["Label"] = ""
df.to_csv(r'D:\user\Thesis dataset\suicide_ideation.csv', encoding='utf-8', index=False)

# DATA PREPROCESSING (CLEANING)

In [None]:
# Load the scraped tweets to be cleaned.
# NOTE(review): this path differs from the one the scrape was saved to above — confirm it is the same data.
df = pd.read_csv(r'D:\user\Thesis dataset (updated)\PRESENTATION\suicide_ideation.csv') # LOAD CSV FILE

In [None]:
def clean_tweet(tweet):
    """Normalize one raw tweet into lowercase, letters-only, space-separated text.

    Args:
        tweet: Raw tweet text. Non-string values (for example the NaN that
            pandas uses for an empty CSV cell) are treated as empty text
            instead of raising on ``.lower()``.

    Returns:
        str: The cleaned tweet, or "" when nothing survives the cleaning.
    """
    if not isinstance(tweet, str):
        return ""

    temp = tweet.lower()
    temp = emoji.replace_emoji(temp, replace="")  # strip emoji
    # Drop apostrophes first so contractions stay one token ("don't" -> "dont")
    # instead of being split by the non-alphanumeric pass below.
    temp = re.sub("'", "", temp)
    temp = re.sub("@[A-Za-z0-9_]+", "", temp)  # mentions
    temp = re.sub("#[A-Za-z0-9_]+", "", temp)  # hashtags
    temp = re.sub(r'http\S+', '', temp)  # links
    temp = re.sub('[()!?]', ' ', temp)  # punctuation
    # Raw string: "\[" in a normal string literal is an invalid escape sequence.
    temp = re.sub(r'\[.*?\]', ' ', temp)  # bracketed text, brackets included
    temp = re.sub("[^a-z0-9]", " ", temp)  # anything left that is not a-z / 0-9
    # Digits are deleted (not replaced by a space), so "a1b" becomes "ab".
    temp = ''.join(ch for ch in temp if not ch.isdigit())
    temp = " ".join(temp.split())  # collapse runs of whitespace

    return temp

In [None]:
# Overwrite the "Tweet" column with the cleaned text (clean_tweet runs once per row).
df['Tweet'] = df['Tweet'].apply(clean_tweet)

In [None]:
# Drop rows whose "Tweet" text repeats an earlier row; only the "Tweet" column is compared.
df = df.drop_duplicates(subset=["Tweet"])

In [None]:
# Remove rows whose tweet is empty or missing.
# The results are assigned back instead of using inplace=True on a column
# selection: that form is chained in-place mutation, which recent pandas
# warns about and which does not modify `df` under Copy-on-Write.
df['Tweet'] = df['Tweet'].replace('', np.nan)
df = df.dropna(subset=['Tweet'])

In [None]:
# Save the cleaned tweets as UTF-8 CSV, without writing the index column.
df.to_csv(r'D:\user\Thesis dataset (updated)\PRESENTATION\cleaned_suicide_ideation.csv', encoding='utf-8', index=False)

# DATA PREPROCESSING (LANGUAGE IDENTIFICATION)

In [None]:
# Load the cleaned tweets for language filtering.
# NOTE(review): the "_v2" file is not written by any cell shown here — presumably it is the
# cleaned CSV with the "language" column filled in by a separate step; confirm.
df = pd.read_csv(r'D:\user\Thesis dataset (updated)\PRESENTATION\cleaned_suicide_ideation_v2.csv') # LOAD CSV FILE

In [None]:
# Keep only the rows tagged as Filipino ("fil") or English ("en").
language_tag = df["language"]
is_fil_or_en = (language_tag == "fil") | (language_tag == "en")
df = df.loc[is_fil_or_en]

In [None]:
# Save the Filipino/English-only tweets as UTF-8 CSV, without writing the index column.
df.to_csv(r'D:\user\Thesis dataset (updated)\PRESENTATION\cleaned_suicide_ideation_v2.1.csv', encoding='utf-8', index=False)

# DATASET BALANCING

In [None]:
# Load the dataset to be balanced; the code below expects "Tweet" and "Label" columns.
# NOTE(review): this is the same path the raw scrape was saved to — confirm the file
# now holds the cleaned and labeled data.
df = pd.read_csv(r'D:\user\Thesis dataset\suicide_ideation.csv') #LOAD CLEANED & LABELED DATASET

In [None]:
# Separate the tweets into one frame per phrase/term group.
#
# Groups are processed in order and each matched tweet is removed from `df`
# before the next group is tried, so a tweet lands in the FIRST group it
# matches and the order of this list matters. A tweet matches a group when it
# contains every term of the group. Matching is plain substring search, so
# short terms such as 'ng', 'na', 'to' or 'no' also match inside longer words.
keyword_groups = [
    ('patayin', 'ang', 'sarili'),
    ('sana', 'patay', 'nalang'),
    ('gusto', 'ko', 'ng', 'mamatay'),
    ('pagpapakamatay',),
    ('hindi', 'ipinanganak'),
    ('hindi', 'na', 'magigising'),
    ('ayoko', 'ng', 'mabuhay'),
    ('tapusin', 'ang', 'buhay'),
    ('wala', 'akong', 'silbi'),
    ('handa', 'na', 'akong', 'mamatay'),
    ('depressed',),
    ('suicide',),
    ('depression',),
    ('suicidal',),
    ('want', 'to', 'die'),
    ('kill', 'myself'),
    ('no', 'reason', 'to', 'live'),
    ('hate', 'my', 'life'),
    ('cut', 'myself'),
    ('end', 'my', 'life'),
]

keyword_frames = []
for terms in keyword_groups:
    mask = pd.Series(True, index=df.index)
    for term in terms:
        # na=False: a missing tweet counts as "no match" instead of putting
        # NaN into the mask, which would make the boolean indexing fail.
        mask &= df['Tweet'].str.contains(term, regex=False, na=False)
    matched = df[mask]
    keyword_frames.append(matched)
    df = df.drop(index=matched.index)  # leftovers go on to the next group

# Expose the groups under their original names, which the following cells use.
(filtered_df1, filtered_df2, filtered_df3, filtered_df4, filtered_df5,
 filtered_df6, filtered_df7, filtered_df8, filtered_df9, filtered_df10,
 filtered_df11, filtered_df12, filtered_df13, filtered_df14, filtered_df15,
 filtered_df16, filtered_df17, filtered_df18, filtered_df19, filtered_df20) = keyword_frames

In [None]:
# First fourteen keyword groups.
filtered_df_list = [
    filtered_df1, filtered_df2, filtered_df3, filtered_df4, filtered_df5,
    filtered_df6, filtered_df7, filtered_df8, filtered_df9, filtered_df10,
    filtered_df11, filtered_df12, filtered_df13, filtered_df14,
]
# Remaining six keyword groups.
filtered_df_list2 = [
    filtered_df15, filtered_df16, filtered_df17,
    filtered_df18, filtered_df19, filtered_df20,
]
indexnum = 0  # counter, starts at zero

In [None]:
# Balance every keyword group by downsampling its not-at-risk tweets
# (Label 'B') to the number of at-risk tweets (Label 'A'), then stack the
# balanced groups into one frame with a fresh 0..n-1 index.
balanced_parts = []
for filtered_df in filtered_df_list:
    at_risk = filtered_df[filtered_df.Label == 'A']
    not_at_risk = filtered_df[filtered_df.Label == 'B']

    # Only shrink 'B' when it really is the larger class. resample() draws
    # WITH replacement by default, which would put duplicate tweets into the
    # "downsampled" class; replace=False draws distinct rows. It cannot draw
    # more rows than exist, so a 'B' class that is not larger is kept whole.
    if len(not_at_risk) > len(at_risk):
        not_at_risk = resample(not_at_risk,
                               replace=False,
                               n_samples=len(at_risk),  # match the minority class
                               random_state=1)  # fixed seed: same rows on every run

    balanced_parts.append(pd.concat([at_risk, not_at_risk]))

# A single concat replaces the repeated DataFrame.append calls
# (DataFrame.append no longer exists in pandas 2.x).
balanced_df = pd.concat(balanced_parts, ignore_index=True)

In [None]:
# Append, unchanged, the phrase/term groups that contain fewer than 100 tweets.
# pd.concat is used because DataFrame.append no longer exists in pandas 2.x;
# the result gets a fresh 0..n-1 index, as the append loop produced.
balanced_df = pd.concat([balanced_df, *filtered_df_list2], ignore_index=True)

In [None]:
#REPRESENT THE LABELS AS NUMBERS
# Define a mapping function
def map_class(x):
    """Translate a letter label into its numeric class.

    Args:
        x: Label value; 'A' and 'B' are the recognised labels.

    Returns:
        1 for 'A', 0 for 'B', and None for any other value.
    """
    for letter, number in (('A', 1), ('B', 0)):
        if x == letter:
            return number
    return None
# Replace each letter label with the value map_class returns for it.
balanced_df["Label"] = balanced_df["Label"].apply(map_class)

In [None]:
# Shuffle the rows: draw a random ordering of the index labels, then reorder
# the frame to follow it. No seed is set, so the order differs between runs.
shuffled_labels = np.random.permutation(balanced_df.index)
balanced_df = balanced_df.reindex(shuffled_labels)

In [None]:
# Save the balanced, shuffled dataset as UTF-8 CSV, without writing the index column.
balanced_df.to_csv(r'D:\user\Thesis dataset\balanced_suicide_ideation.csv', encoding='utf-8', index=False)

# TOKENIZATION AND VECTORIZATION

In [3]:
# Load the dataset used for training; the code below reads its "Tweet" and "Label" columns.
# NOTE(review): this file name differs from the balanced CSV saved above — confirm it is the intended input.
df = pd.read_csv(r'D:\user\Thesis dataset (updated)\BALANCED BY KEYWORD\shuffled_balanced_full_suicide_ideation.csv') # LOAD THE BALANCED LABELED DATASET

In [4]:
# TF-IDF vectorizer; min_df=5 ignores terms that appear in fewer than 5 tweets.
tfidf = TfidfVectorizer(min_df=5)

# Features are the tweet text, the target is the label column.
features = df["Tweet"]
label = df["Label"]

X, y = features, label

# 80/20 train/test split, stratified on the label and seeded so it is repeatable.
X_train, X_test, y_train, y_test = train_test_split(features, label, test_size=0.2, random_state=1, stratify=label)

# Fit the vectorizer on the training text only, then reuse that fitted
# vocabulary for the test text so nothing from the test set is learned.
X_train_vec = tfidf.fit_transform(X_train)
X_test_vec = tfidf.transform(X_test)

# Save the fitted vectorizer. The with-block closes the file once the dump
# finishes, so the pickle is fully flushed to disk.
with open(r"C:\Users\user\Presentation\Trained Models\Demo Models\tfidf_vocab.pkl", "wb") as vocab_file:
    pickle.dump(tfidf, vocab_file)

# TRAINING AND TESTING OF MODELS WITH NO TUNING

In [6]:
# Standalone models, all with default hyperparameters.
rf = RandomForestClassifier()   # bagging ensemble
xgb = XGBClassifier()           # boosting ensemble

# Base learners of the soft-voting ensemble.
voting_rf = RandomForestClassifier()
voting_xgb = XGBClassifier()
voting_svm = SVC(probability=True)
voting_knn = KNeighborsClassifier()
voting_nb = MultinomialNB()

voting_estimators = [
    ('voting_rf', voting_rf),
    ('voting_xgb', voting_xgb),
    ('voting_svm', voting_svm),
    ('voting_knn', voting_knn),
    ('voting_nb', voting_nb),
]
voting_model = VotingClassifier(estimators=voting_estimators, voting='soft')

# Base learners of the stacking ensemble.
stacking_knn = KNeighborsClassifier()
stacking_rf = RandomForestClassifier()
stacking_xgb = XGBClassifier()
stacking_nb = MultinomialNB()
stacking_svm = SVC(probability=True)

stacking_estimators = [
    ('stacking_knn', stacking_knn),
    ('stacking_rf', stacking_rf),
    ('stacking_xgb', stacking_xgb),
    ('stacking_nb', stacking_nb),
    ('stacking_svm', stacking_svm),
]
stacking_model = StackingClassifier(estimators=stacking_estimators, cv=5)



In [None]:
# Train the four models on the vectorized training set, in this order:
# random forest, XGBoost, voting ensemble, stacking ensemble.
for estimator in (rf, xgb, voting_model, stacking_model):
    estimator.fit(X_train_vec, y_train)

In [None]:
# Models to evaluate: random forest, XGBoost, voting ensemble, stacking ensemble.
models = [rf,xgb,voting_model,stacking_model]

In [None]:
# Score every trained model on the held-out test set.
for model in models:
    y_pred = model.predict(X_test_vec)

    # Print the model's class name first so each block of scores can be
    # attributed to a model (the other evaluation loops do the same).
    print(type(model).__name__)
    print("ACCURACY OF THE MODEL: ", metrics.accuracy_score(y_test, y_pred))
    print("PRECISION: ", metrics.precision_score(y_test, y_pred))
    print("Recall: ", metrics.recall_score(y_test, y_pred))
    print("F1 SCORE OF THE MODEL: ", metrics.f1_score(y_test, y_pred))

# MODEL TUNING

In [8]:
# Hyperparameter search spaces used to tune the models.
rf_search = {
    'n_estimators': [100, 500, 1000],
    'max_depth': [5, 10, 20, 30, 50],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    'max_features': ['sqrt', 'log2'],
    'bootstrap': [True, False]
}

xgb_search = {
    'max_depth': [3, 4, 5, 6, 7],
    'learning_rate': [0.01, 0.1, 0.3, 0.5],
    'n_estimators': [50, 100, 200, 300],
    'subsample': [0.5, 0.6, 0.7, 0.8, 0.9, 1],
    'colsample_bytree': [0.5, 0.6, 0.7, 0.8, 0.9, 1],
    'gamma': [0, 0.1, 0.2, 0.3, 0.4],
    'reg_alpha': [0, 0.1, 0.5, 1, 10],
    'reg_lambda': [0, 0.1, 0.5, 1, 10]
}

# Base-learner grid shared by both ensembles, written as "<learner>__<parameter>".
ensemble_base_search = {
    'rf__n_estimators': [500, 1000],
    'rf__max_depth': [5, 10, 20],
    'xgb__n_estimators': [50, 100, 200],
    'xgb__max_depth': [3, 4, 5, 6, 7],
    'svm__C': [0.1, 1, 10, 100],
    'svm__kernel': ['linear', 'poly', 'rbf', 'sigmoid'],
    'svm__gamma': ['scale', 'auto'],
    'knn__n_neighbors': [3, 5, 7, 9, 11, 13],
    'knn__weights': ['uniform', 'distance'],
    'nb__alpha': [0.1, 0.5, 1.0, 2.0],
}

# A nested parameter has to be addressed through the name its estimator is
# registered under in the ensemble ('voting_rf', 'stacking_rf', ...). The bare
# 'rf__...' style keys are rejected as invalid parameters, so each key gets
# the matching ensemble prefix.
voting_search = {f"voting_{key}": values for key, values in ensemble_base_search.items()}
stacking_search = {f"stacking_{key}": values for key, values in ensemble_base_search.items()}

# Fresh, unfitted models to be tuned.
rf = RandomForestClassifier()
xgb = XGBClassifier()

voting_svm = SVC(probability=True)
voting_knn = KNeighborsClassifier()
voting_nb = MultinomialNB()
voting_rf = RandomForestClassifier()
voting_xgb = XGBClassifier()

# Define the voting ensemble model
voting_model = VotingClassifier(estimators=[('voting_rf', voting_rf),
                                            ('voting_xgb', voting_xgb),
                                            ('voting_svm', voting_svm),
                                            ('voting_knn', voting_knn),
                                            ('voting_nb', voting_nb)], voting='soft')

stacking_svm = SVC(probability=True)
stacking_knn = KNeighborsClassifier()
stacking_nb = MultinomialNB()
stacking_rf = RandomForestClassifier()
stacking_xgb = XGBClassifier()

# Define the stacking ensemble model
stacking_model = StackingClassifier(estimators=[('stacking_knn', stacking_knn),
                                                ('stacking_rf', stacking_rf),
                                                ('stacking_xgb', stacking_xgb),
                                                ('stacking_nb', stacking_nb),
                                                ('stacking_svm', stacking_svm)], cv=5)


In [None]:
# Tune the random forest with an exhaustive grid search (5-fold CV on the training set).
gs = GridSearchCV(estimator=rf,param_grid=rf_search,cv=5)
gs.fit(X_train_vec, y_train)
print("Best hyperparameters:", gs.best_params_)
best_model = gs.best_estimator_  # estimator refitted with the best parameters

# Score the tuned model on the held-out test set.
y_pred = best_model.predict(X_test_vec)

print("ACCURACY OF THE MODEL: ", metrics.accuracy_score(y_test, y_pred))
print("PRECISION: ", metrics.precision_score(y_test, y_pred))
print("Recall: ", metrics.recall_score(y_test, y_pred))
print("F1 SCORE OF THE MODEL: ", metrics.f1_score(y_test, y_pred))

# Save the tuned model. The with-block closes the file once the dump
# finishes, so the pickle is fully flushed to disk.
with open(r"C:\Users\user\Presentation\Trained Models\Demo Models\tuned_rf.pk", "wb") as model_file:
    pickle.dump(best_model, model_file)

In [None]:
# Models still to be tuned and, position by position, the search space for each.
models = [xgb,voting_model,stacking_model]
search_spaces = [xgb_search,voting_search, stacking_search]

In [None]:
# Tune XGBoost, the voting ensemble and the stacking ensemble with a
# randomized search (150 sampled settings, 5-fold CV on the training set).

# Where each tuned model is saved, keyed by the model's class name.
tuned_model_paths = {
    "XGBClassifier": r"C:\Users\user\Presentation\Trained Models\Demo Models\tuned_xgb.pk",
    "VotingClassifier": r"C:\Users\user\Presentation\Trained Models\Demo Models\tuned_voting.pk",
    "StackingClassifier": r"C:\Users\user\Presentation\Trained Models\Demo Models\tuned_stacking.pk",
}

# Pair every model with its own search space. This zips `search_spaces` (the
# list of spaces); the undefined singular name `search_space` raised NameError.
for model, search_space in zip(models, search_spaces):

    rs = RandomizedSearchCV(estimator=model,param_distributions=search_space,n_iter=150,cv=5)
    rs.fit(X_train_vec, y_train)
    print("Best hyperparameters:", rs.best_params_)
    best_model = rs.best_estimator_  # estimator refitted with the best parameters

    # Score the tuned model on the held-out test set.
    y_pred = best_model.predict(X_test_vec)

    modelname = type(model).__name__
    print(modelname)

    print("ACCURACY OF THE MODEL: ", metrics.accuracy_score(y_test, y_pred))
    print("PRECISION: ", metrics.precision_score(y_test, y_pred))
    print("Recall: ", metrics.recall_score(y_test, y_pred))
    print("F1 SCORE OF THE MODEL: ", metrics.f1_score(y_test, y_pred))

    # Save the tuned model; a model type without a registered path is not saved.
    # The with-block closes the file so the pickle is fully flushed to disk.
    save_path = tuned_model_paths.get(modelname)
    if save_path is not None:
        with open(save_path, "wb") as model_file:
            pickle.dump(best_model, model_file)

# K-FOLD CROSS VALIDATION

In [None]:
# Load the dataset used for k-fold cross-validation ("Tweet" and "Label" columns are read below).
df = pd.read_csv(r'D:\user\Thesis dataset (updated)\BALANCED BY KEYWORD\shuffled_balanced_full_suicide_ideation.csv')

In [None]:
# TF-IDF vectorizer with min_df=5.
tfidf = TfidfVectorizer(min_df=5)

# Features are the tweet text.
features = df["Tweet"]
# Target is the label column.
label = df["Label"]

X, y = features, label

# Tokenize and vectorize ALL tweets (there is no train/test split in this cell).
# NOTE(review): because the vectorizer is fitted on the whole dataset, any
# cross-validation run on X_vec has already seen the held-out folds' text — confirm this is acceptable.
X_vec = tfidf.fit_transform(X)

In [None]:
# Fresh, unfitted models with default hyperparameters.
rf = RandomForestClassifier()   # bagging ensemble
xgb = XGBClassifier()           # boosting ensemble

# Base learners of the soft-voting ensemble.
voting_rf = RandomForestClassifier()
voting_xgb = XGBClassifier()
voting_svm = SVC(probability=True)
voting_knn = KNeighborsClassifier()
voting_nb = MultinomialNB()

voting_estimators = [
    ('voting_rf', voting_rf),
    ('voting_xgb', voting_xgb),
    ('voting_svm', voting_svm),
    ('voting_knn', voting_knn),
    ('voting_nb', voting_nb),
]
voting_model = VotingClassifier(estimators=voting_estimators, voting='soft')

# Base learners of the stacking ensemble.
stacking_knn = KNeighborsClassifier()
stacking_rf = RandomForestClassifier()
stacking_xgb = XGBClassifier()
stacking_nb = MultinomialNB()
stacking_svm = SVC(probability=True)

stacking_estimators = [
    ('stacking_knn', stacking_knn),
    ('stacking_rf', stacking_rf),
    ('stacking_xgb', stacking_xgb),
    ('stacking_nb', stacking_nb),
    ('stacking_svm', stacking_svm),
]
stacking_model = StackingClassifier(estimators=stacking_estimators, cv=5)

In [None]:
# Models to cross-validate: random forest, XGBoost, voting ensemble, stacking ensemble.
models = [rf,xgb,voting_model,stacking_model]

In [None]:
# Stratified 5-fold cross-validation of every model.
skf = StratifiedKFold(n_splits=5)

# Performance metrics to compute, paired with the label printed for each.
scoring = ['accuracy', 'precision', 'recall', 'f1']
metric_labels = [('Accuracy', 'test_accuracy'),
                 ('Precision', 'test_precision'),
                 ('Recall', 'test_recall'),
                 ('F1-score', 'test_f1')]

for model in models:
    # The vectorizer lives inside the pipeline, so it is refitted on the
    # training part of every fold. Cross-validating on a matrix vectorized
    # from the whole dataset would let each fold's vocabulary and IDF weights
    # be learned from its own held-out tweets. The raw text `X` is therefore
    # passed here, not the pre-vectorized matrix.
    fold_pipeline = make_pipeline(TfidfVectorizer(min_df=5), model)
    results = cross_validate(fold_pipeline, X, y, cv=skf, scoring=scoring)

    # Print the model name (of the classifier, not the wrapping pipeline).
    print(type(model).__name__)

    # Per-fold scores ...
    for metric_label, result_key in metric_labels:
        print(f"{metric_label}:", results[result_key])

    # ... followed by the averages over the folds.
    print()
    for metric_label, result_key in metric_labels:
        print(f"Average {metric_label}: {results[result_key].mean()}")
    print("\n")

# TRAINING THE OTHER SETS OF MODELS

In [None]:
# Load the dataset for the models saved below with the "_17k" suffix.
df = pd.read_csv(r'D:\user\Thesis dataset (updated)\BALANCED BY KEYWORD\shuffled_balanced_full_suicide_ideation.csv')

In [None]:
# TF-IDF vectorizer with min_df=5.
tfidf = TfidfVectorizer(min_df=5)

# Features are the tweet text.
features = df["Tweet"]
# Target is the label column.
label = df["Label"]

X, y = features, label

# Tokenize and vectorize ALL tweets (there is no train/test split in this cell).
X_vec = tfidf.fit_transform(X)

In [None]:
# Train the four models on the full vectorized dataset and save them,
# together with the fitted vectorizer, under the "_17k" file names.
rf = RandomForestClassifier()   # bagging ensemble
xgb = XGBClassifier()           # boosting ensemble

voting_svm = SVC(probability=True)
voting_knn = KNeighborsClassifier()
voting_nb = MultinomialNB()
voting_rf = RandomForestClassifier()
voting_xgb = XGBClassifier()

# Define the voting ensemble model
voting_model = VotingClassifier(estimators=[('voting_rf', voting_rf),
                                            ('voting_xgb', voting_xgb),
                                            ('voting_svm', voting_svm),
                                            ('voting_knn', voting_knn),
                                            ('voting_nb', voting_nb)], voting='soft')

stacking_svm = SVC(probability=True)
stacking_knn = KNeighborsClassifier()
stacking_nb = MultinomialNB()
stacking_rf = RandomForestClassifier()
stacking_xgb = XGBClassifier()

# Define the stacking ensemble model
stacking_model = StackingClassifier(estimators=[('stacking_knn', stacking_knn),
                                                ('stacking_rf', stacking_rf),
                                                ('stacking_xgb', stacking_xgb),
                                                ('stacking_nb', stacking_nb),
                                                ('stacking_svm', stacking_svm)], cv=5)

rf.fit(X_vec, y)
xgb.fit(X_vec, y)
voting_model.fit(X_vec, y)
stacking_model.fit(X_vec, y)

# Output file for every object to persist, in the order they are written.
artifacts_17k = [
    (r"C:\Users\user\Presentation\Trained Models\Demo Models\tfidf_vocab_17k.pkl", tfidf),
    (r"C:\Users\user\Presentation\Trained Models\Demo Models\rf_demo_17k.pk", rf),
    (r"C:\Users\user\Presentation\Trained Models\Demo Models\xgb_demo_17k.pk", xgb),
    (r"C:\Users\user\Presentation\Trained Models\Demo Models\voting_demo_17k.pk", voting_model),
    (r"C:\Users\user\Presentation\Trained Models\Demo Models\stacking_demo_17k.pk", stacking_model),
]
for artifact_path, artifact in artifacts_17k:
    # The with-block closes each file, so every pickle is fully flushed to disk.
    with open(artifact_path, "wb") as artifact_file:
        pickle.dump(artifact, artifact_file)

In [None]:
# Load the dataset for the models saved below with the "_60k" suffix.
df = pd.read_csv(r'D:\user\Thesis dataset (updated)\FULL DATASET\shuffled_cleaned_labeled_suicide_ideation.csv')

In [None]:
# TF-IDF vectorizer with min_df=5.
tfidf = TfidfVectorizer(min_df=5)

# Features are the tweet text.
features = df["Tweet"]
# Target is the label column.
label = df["Label"]

X, y = features, label

# Tokenize and vectorize ALL tweets (there is no train/test split in this cell).
X_vec = tfidf.fit_transform(X)

In [None]:
# Train the four models on the full vectorized dataset and save them,
# together with the fitted vectorizer, under the "_60k" file names.
rf = RandomForestClassifier()   # bagging ensemble
xgb = XGBClassifier()           # boosting ensemble

voting_svm = SVC(probability=True)
voting_knn = KNeighborsClassifier()
voting_nb = MultinomialNB()
voting_rf = RandomForestClassifier()
voting_xgb = XGBClassifier()

# Define the voting ensemble model
voting_model = VotingClassifier(estimators=[('voting_rf', voting_rf),
                                            ('voting_xgb', voting_xgb),
                                            ('voting_svm', voting_svm),
                                            ('voting_knn', voting_knn),
                                            ('voting_nb', voting_nb)], voting='soft')

stacking_svm = SVC(probability=True)
stacking_knn = KNeighborsClassifier()
stacking_nb = MultinomialNB()
stacking_rf = RandomForestClassifier()
stacking_xgb = XGBClassifier()

# Define the stacking ensemble model
stacking_model = StackingClassifier(estimators=[('stacking_knn', stacking_knn),
                                                ('stacking_rf', stacking_rf),
                                                ('stacking_xgb', stacking_xgb),
                                                ('stacking_nb', stacking_nb),
                                                ('stacking_svm', stacking_svm)], cv=5)

rf.fit(X_vec, y)
xgb.fit(X_vec, y)
voting_model.fit(X_vec, y)
stacking_model.fit(X_vec, y)

# Output file for every object to persist, in the order they are written.
artifacts_60k = [
    (r"C:\Users\user\Presentation\Trained Models\Demo Models\tfidf_vocab_60k.pkl", tfidf),
    (r"C:\Users\user\Presentation\Trained Models\Demo Models\rf_demo_60k.pk", rf),
    (r"C:\Users\user\Presentation\Trained Models\Demo Models\xgb_demo_60k.pk", xgb),
    (r"C:\Users\user\Presentation\Trained Models\Demo Models\voting_demo_60k.pk", voting_model),
    (r"C:\Users\user\Presentation\Trained Models\Demo Models\stacking_demo_60k.pk", stacking_model),
]
for artifact_path, artifact in artifacts_60k:
    # The with-block closes each file, so every pickle is fully flushed to disk.
    with open(artifact_path, "wb") as artifact_file:
        pickle.dump(artifact, artifact_file)