In [20]:
#import necessary libraries

import pandas as pd
import numpy as np
import warnings
import nltk
import re
import string
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn import metrics
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, recall_score
from sklearn.preprocessing import MinMaxScaler
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC
from sklearn.svm import LinearSVC
from sklearn import metrics
from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet
from tabulate import tabulate

warnings.filterwarnings('ignore')

In [22]:
#download nltk libraries
nltk.download('vader_lexicon')
nltk.download('stopwords')
stopwords = nltk.corpus.stopwords.words('english')
nltk.download('wordnet')
nltk.download('omw-1.4')
nltk.download('averaged_perceptron_tagger')
word_lemmatizer = WordNetLemmatizer()


[nltk_data] Downloading package vader_lexicon to
[nltk_data]     C:\Users\user\AppData\Roaming\nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\user\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\user\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\user\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\user\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


In [3]:
#Import dataset
dataset = pd.read_csv('suicide.csv')

#split dataset into three
split_dataset = np.array_split(dataset,3)
dataset = split_dataset[0]

#drop "Unnamed column from dataset"
dataset = dataset.drop('Unnamed: 0', axis=1)
dataset.head(10)


Unnamed: 0,text,class
0,Ex Wife Threatening SuicideRecently I left my ...,suicide
1,Am I weird I don't get affected by compliments...,non-suicide
2,Finally 2020 is almost over... So I can never ...,non-suicide
3,i need helpjust help me im crying so hard,suicide
4,"I’m so lostHello, my name is Adam (16) and I’v...",suicide
5,Honetly idkI dont know what im even doing here...,suicide
6,[Trigger warning] Excuse for self inflicted bu...,suicide
7,It ends tonight.I can’t do it anymore. \nI quit.,suicide
8,"Everyone wants to be ""edgy"" and it's making me...",non-suicide
9,My life is over at 20 years oldHello all. I am...,suicide


#### Data Cleaning: Removing Stopwords,Tokenization,Normalization

In [4]:
#Find missing values in the dataset
dataset.isnull().sum()

text     0
class    0
dtype: int64

In [23]:
#Remove stopwords
dataset['text'] = dataset['text'].apply(lambda x: ' '.join([word for word in x.split() if word not in set(stopwords)]))

In [6]:
X = dataset.drop('class', axis=1)
y = dataset['class']

In [7]:
# To remove emails
email_regex = r'([a-zA-Z0-9_.+-]+@[a-zA-Z0-9-]+\.[a-zA-Z0-9-.]+)'
regex_to_remove = [email_regex, r'Subject:', r'Re:']

for i in range(0, len(X)):
    # removing all special character
    review = re.sub('[^a-zA-Z]', ' ', str(X['text'][i]))
    
    #remove all urls
    review = re.sub(r"http?[A-Za-z0-9]+", "", str(X['text'][i]))

    #timestamps and date
    review = re.sub(r'(2[0-3]|[01][0-9]|[0-9]):([0-5][0-9]|[0-9]):([0-5][0-9]|[0-9])',"",str(X['text'][i]))
    review = re.sub(r"^([1-9]|0[1-9]|1[0-9]|2[0-9]|3[0-1])(\.|-|/)([1-9]|0[1-9]|1[0-2])(\.|-|/)([0-9][0-9]|19[0-9][0-9]|20[0-9][0-9])$","",str(X['text'][i]))


   # make document as lowerCase
    review = review.lower()
    # splitting the documents into words for ex ['iam', 'omar']
    review = review.split()
    # perfrom  lemmatization 
    review = [word_lemmatizer.lemmatize(word) for word in review if not word in stop_words]
    # join the document again
    review = ' '.join(review)
    
    # removing emails
    for r in regex_to_remove:
        X['text'][i] = re.sub(r, '', review)

### Split Dataset

In [8]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)

### Vectorization


Perform Tf-IDF and Tf-IDF with 2 grams of words

In [9]:
tfidf_vectorizer = TfidfVectorizer(max_features=10000)
tfidf_vectorizer_n12 = TfidfVectorizer(max_features=10000, ngram_range=(1,2))


X_tfidf_train = tfidf_vectorizer.fit_transform(X_train['text'])
X_tfidf_test = tfidf_vectorizer.transform(X_test['text'])

X_tfidf_train_n12= tfidf_vectorizer_n12.fit_transform(X_train['text'])
X_tfidf_test_n12=tfidf_vectorizer_n12.transform(X_test['text'])

### Bag of Words

In [10]:
vectorizer = CountVectorizer()
  
X_bow_train = vectorizer.fit_transform(X_train['text'])
X_bow_test = vectorizer.transform(X_test['text'])

### VADER Sentiment Analyzer 

In [11]:
from nltk.sentiment.vader import SentimentIntensityAnalyzer as SIA

def get_vader_scores(data):
    sid = SIA()
    vader_df=data.copy()
    vader_df['scores'] = vader_df['text'].apply(lambda txt: sid.polarity_scores(str(txt)))
    
    vader_df['neg_score'] = vader_df['scores'].apply(lambda txt: txt['neg'])
    vader_df['neu_score'] =vader_df['scores'].apply(lambda txt: txt['neu'])
    vader_df['pos_score'] = vader_df['scores'].apply(lambda txt: txt['pos'])
    vader_df['compound'] = vader_df['scores'].apply(lambda txt: txt['compound'])
    vader_df.drop('scores', axis=1, inplace=True)
    vader_df.drop('text', axis=1, inplace=True)
    return vader_df

In [12]:
X_vader_train = get_vader_scores(X_train)
X_vader_test= get_vader_scores(X_test)

### Model Fitting

##### Feature Selection : Logistic Regression

In [13]:
from sklearn.svm import LinearSVC
from sklearn.feature_selection import SelectFromModel

# We Can select any model but linearSVC has l1 norm penality which deals with sparse
lsvc = LinearSVC(C=100, penalty='l1', max_iter=500, dual=False)
lsvc.fit(X_tfidf_train, y_train)

# This function select the best features that has high weigh
fs = SelectFromModel(lsvc, prefit=True)
# This function redeuce X to the selected features
X_selection = fs.transform(X_tfidf_train)
X_test_selection = fs.transform(X_tfidf_test)


lsvc.fit(X_tfidf_train_n12, y_train)
fs_n12 = SelectFromModel(lsvc, prefit=True)
X_selection_n12 = fs_n12.transform(X_tfidf_train_n12)
X_test_selection_n12 = fs_n12.transform(X_tfidf_test_n12)

lsvc.fit(X_bow_train, y_train)
fs_n12 = SelectFromModel(lsvc, prefit=True)
X_selection_bow = fs_n12.transform(X_bow_train)
X_test_selection_bow = fs_n12.transform(X_bow_test)

In [14]:
lsvc = LinearSVC(C=1000, penalty='l1', max_iter=500, dual=False)
lsvc.fit(X_selection, y_train)
y_predict_tfidf = lsvc.predict(X_test_selection)

lsvc.fit(X_selection_n12,y_train)
y_predict_tfidf_n12 = lsvc.predict(X_test_selection_n12)

lsvc.fit(X_selection_bow,y_train)
y_predict_bow = lsvc.predict(X_test_selection_bow)

lsvc.fit(X_vader_train,y_train)
y_predict_vader = lsvc.predict(X_vader_test)

linear_svm_tfidf_results=metrics.precision_recall_fscore_support(y_test, y_predict_tfidf)
linear_svm_tfidf_n12_results=metrics.precision_recall_fscore_support(y_test, y_predict_tfidf_n12)
linear_svm_bow_results=metrics.precision_recall_fscore_support(y_test, y_predict_bow)
vader_svm_results=metrics.precision_recall_fscore_support(y_test, y_predict_vader)

In [15]:
tfidf_acc= metrics.accuracy_score(y_test, y_predict_tfidf)
tfidf_n12_acc=accuracy_score(y_test, y_predict_tfidf_n12)
bow_acc= accuracy_score(y_test, y_predict_bow)
vader_acc=accuracy_score(y_test, y_predict_vader)

In [16]:
data1 = [['TF-IDF','TF-IDF 2-grams ','bag of words','vader'],
         ['precision',linear_svm_tfidf_results[0][0],linear_svm_tfidf_n12_results[0][0],linear_svm_bow_results[0][0],
          vader_svm_results[0][0]],
         ['recall',linear_svm_tfidf_results[1][0],linear_svm_tfidf_n12_results[1][0],linear_svm_bow_results[1][0],
          vader_svm_results[1][0]],
         ['F1-score',linear_svm_tfidf_results[2][0],linear_svm_tfidf_n12_results[2][0],linear_svm_bow_results[2][0],
          vader_svm_results[2][0]],
        ['accuracy',tfidf_acc,tfidf_n12_acc,bow_acc,
          vader_acc]]

In [19]:
print(tabulate(data1,headers='firstrow',tablefmt='fancy_grid'))

╒═══════════╤══════════╤═══════════════════╤════════════════╤══════════╕
│           │   TF-IDF │   TF-IDF 2-grams  │   bag of words │    vader │
╞═══════════╪══════════╪═══════════════════╪════════════════╪══════════╡
│ precision │ 0.865235 │          0.866395 │       0.866266 │ 0.672957 │
├───────────┼──────────┼───────────────────┼────────────────┼──────────┤
│ recall    │ 0.850177 │          0.843629 │       0.881192 │ 0.688205 │
├───────────┼──────────┼───────────────────┼────────────────┼──────────┤
│ F1-score  │ 0.857639 │          0.854861 │       0.873665 │ 0.680496 │
├───────────┼──────────┼───────────────────┼────────────────┼──────────┤
│ accuracy  │ 0.858842 │          0.85673  │       0.872544 │ 0.676792 │
╘═══════════╧══════════╧═══════════════════╧════════════════╧══════════╛


### Random Classfier

In [24]:
clf = RandomForestClassifier(max_depth=10)
clf.fit(X_selection, y_train)
y_predict_tfidf_2 = clf.predict(X_test_selection)

clf.fit(X_selection_n12, y_train)
y_predict_tfidf_n12_2 = clf.predict(X_test_selection_n12)

clf.fit(X_selection_bow, y_train)
y_predict_bow_2 = clf.predict(X_test_selection_bow)

clf.fit(X_vader_train, y_train)
y_predict_vader_2 = clf.predict(X_vader_test)

In [25]:
RandomForest_tfidf_results=metrics.precision_recall_fscore_support(y_test, y_predict_tfidf_2)
RandomForest_tfidf_n12_results=metrics.precision_recall_fscore_support(y_test, y_predict_tfidf_n12_2)
RandomForest_bow_results=metrics.precision_recall_fscore_support(y_test, y_predict_bow_2)
RandomForest_vader_results=metrics.precision_recall_fscore_support(y_test, y_predict_vader_2)

In [26]:
RandomForest_tfidf_acc= metrics.accuracy_score(y_test, y_predict_tfidf_2)
RandomForest_tfidf_n12_acc=accuracy_score(y_test, y_predict_tfidf_n12_2)
RandomForest_bow_acc= accuracy_score(y_test, y_predict_bow_2)
RandomForest_vader_acc=accuracy_score(y_test, y_predict_vader_2)

In [27]:
data2 = [['TF-IDF','TF-IDF 2-grams ','bag of words','vader'],
         ['precision',RandomForest_tfidf_results[0][0],RandomForest_tfidf_n12_results[0][0],RandomForest_bow_results[0][0],
          RandomForest_vader_results[0][0]],
         ['recall',RandomForest_tfidf_results[1][0],RandomForest_tfidf_n12_results[1][0],RandomForest_bow_results[1][0],
          RandomForest_vader_results[1][0]],
         ['F1-score',RandomForest_tfidf_results[2][0],RandomForest_tfidf_n12_results[2][0],RandomForest_bow_results[2][0],
          RandomForest_vader_results[2][0]],
        ['accuracy',RandomForest_tfidf_acc,RandomForest_tfidf_n12_acc, RandomForest_bow_acc,
          RandomForest_vader_acc]]

In [28]:
print(tabulate(data2,headers='firstrow',tablefmt='fancy_grid'))

╒═══════════╤══════════╤═══════════════════╤════════════════╤══════════╕
│           │   TF-IDF │   TF-IDF 2-grams  │   bag of words │    vader │
╞═══════════╪══════════╪═══════════════════╪════════════════╪══════════╡
│ precision │ 0.790608 │          0.794895 │       0.780578 │ 0.769284 │
├───────────┼──────────┼───────────────────┼────────────────┼──────────┤
│ recall    │ 0.903679 │          0.909537 │       0.907211 │ 0.768157 │
├───────────┼──────────┼───────────────────┼────────────────┼──────────┤
│ F1-score  │ 0.843371 │          0.848361 │       0.839144 │ 0.76872  │
├───────────┼──────────┼───────────────────┼────────────────┼──────────┤
│ accuracy  │ 0.832127 │          0.837384 │       0.826051 │ 0.76883  │
╘═══════════╧══════════╧═══════════════════╧════════════════╧══════════╛


### Ensemble Methods

In [29]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import VotingClassifier
from sklearn.linear_model import LogisticRegression,  SGDClassifier
from sklearn.svm import SVC

In [30]:
log_clf = LogisticRegression(solver="lbfgs")
rnd_clf = RandomForestClassifier(n_estimators=100)
svm_clf = SVC(gamma="scale", probability=True)
sgd = SGDClassifier(alpha=.0001, max_iter=50, loss='log',
                                       penalty="elasticnet", n_jobs=-1)

voting_clf = VotingClassifier(
estimators=[('lr', log_clf), ('rf', rnd_clf), ('svc', svm_clf)],
voting='soft')

In [31]:
voting_clf.fit(X_selection, y_train)
y_predict_tfidf_4 = voting_clf.predict(X_test_selection)

voting_clf.fit(X_selection_n12, y_train)
y_predict_tfidf_n12_4 = voting_clf.predict(X_test_selection_n12)

voting_clf.fit(X_selection_bow, y_train)
y_predict_bow_4 = voting_clf.predict(X_test_selection_bow)

voting_clf.fit(X_vader_train, y_train)
y_predict_vader_4 = voting_clf.predict(X_vader_test)

In [None]:
en_tfidf_results=metrics.precision_recall_fscore_support(y_test, y_predict_tfidf_4)
en_tfidf_n12_results=metrics.precision_recall_fscore_support(y_test, y_predict_tfidf_n12_4)
en_bow_results=metrics.precision_recall_fscore_support(y_test, y_predict_bow_4)
en_vader_results=metrics.precision_recall_fscore_support(y_test, y_predict_vader_4)

In [None]:
en_tfidf_acc= metrics.accuracy_score(y_test, y_predict_tfidf_4)
en_tfidf_n12_acc=accuracy_score(y_test, y_predict_tfidf_n12_4)
en_bow_acc= accuracy_score(y_test, y_predict_bow_4)
en_vader_acc=accuracy_score(y_test, y_predict_vader_4)

In [None]:
data3= [['TF-IDF','TF-IDF 2-grams ','bag of words','vader'],
        ['precision',en_tfidf_results[0][0],en_tfidf_n12_results[0][0],en_bow_results[0][0],
          en_vader_results[0][0]],
         ['recall',en_tfidf_results[1][0],en_tfidf_n12_results[1][0],en_bow_results[1][0],
          en_vader_results[1][0]],
         ['F1-score',en_tfidf_results[2][0],en_tfidf_n12_results[2][0],en_bow_results[2][0],
          en_vader_results[2][0]],
       ['accuracy',en_tfidf_acc,en_tfidf_n12_acc, en_bow_acc,
          en_vader_acc]]

In [None]:
print(tabulate(data3,headers='firstrow',tablefmt='fancy_grid'))

### Bagging Methods

In [None]:
from sklearn.ensemble import BaggingClassifier
from sklearn.tree import DecisionTreeClassifier
bag_clf = BaggingClassifier(
DecisionTreeClassifier(), n_estimators=500,
max_samples=100, bootstrap=True, n_jobs=-1)

bag_clf.fit(X_selection, y_train)
y_pred_5 = bag_clf.predict(X_test_selection)

bag_clf.fit(X_selection_n12, y_train)
y_pred_n12_5 = bag_clf.predict(X_test_selection_n12)

bag_clf.fit(X_selection_bow, y_train)
y_pred_bow_5 = bag_clf.predict(X_test_selection_bow)

bag_clf.fit(X_vader_train, y_train)
y_pred_vader_5 = bag_clf.predict(X_vader_test)

In [None]:
bag_tfidf_results=metrics.precision_recall_fscore_support(y_test, y_pred_5)
bag_tfidf_n12_results=metrics.precision_recall_fscore_support(y_test, y_pred_n12_5)
bag_bow_results=metrics.precision_recall_fscore_support(y_test, y_pred_bow_5)
bag_vader_results=metrics.precision_recall_fscore_support(y_test, y_pred_vader_5)

In [None]:
bag_tfidf_acc= metrics.accuracy_score(y_test, y_pred_5)
bag_tfidf_n12_acc=accuracy_score(y_test, y_pred_n12_5)
bag_bow_acc= accuracy_score(y_test, y_pred_bow_5)
bag_vader_acc=accuracy_score(y_test, y_pred_vader_5)

In [None]:
data4= [['TF-IDF','TF-IDF 2-grams ','bag of words','vader'],
        ['precision',bag_tfidf_results[0][0],bag_tfidf_n12_results[0][0],bag_bow_results[0][0],
          bag_vader_results[0][0]],
         ['recall',bag_tfidf_results[1][0],bag_tfidf_n12_results[1][0],bag_bow_results[1][0],
          bag_vader_results[1][0]],
         ['F1-score',bag_tfidf_results[2][0],bag_tfidf_n12_results[2][0],bag_bow_results[2][0],
          bag_vader_results[2][0]],
        ['accuracy',bag_tfidf_acc,bag_tfidf_n12_acc, bag_bow_acc,
          bag_vader_acc]]

In [None]:
print(tabulate(data4,headers='firstrow',tablefmt='fancy_grid'))

### Obtaining Tweets form Twitter

In [1]:
#load environment variable

from dotenv import load_dotenv
load_dotenv()

True

In [29]:
from tweepy import OAuthHandler
import tweepy
import pandas as pd
import matplotlib.pyplot as plt

In [82]:
#keys
import os

consumer_key= "put your key here"
consumer_secret="put your key here"
access_token="put your key here"
access_token_secret="put your key here"
bearer_token = "put your key here"

In [83]:
client = tweepy.Client( bearer_token=bearer_token, consumer_key=consumer_key, consumer_secret=consumer_secret, access_token=access_token, access_token_secret=access_token_secret,  wait_on_rate_limit=True)

In [84]:
#Passing in my twitter API authentication keys
auth = tweepy.OAuth1UserHandler(consumer_key, consumer_secret,access_token, access_token_secret)

In [85]:
#Instantiating the tweepy API
api = tweepy.API(auth)

In [144]:
#Query
query = 'better off without me'

In [145]:
#set time frame

#from 2016
start_time = '2016-01-01T00:00:00Z'

#to 2021
end_time = '2021-12-31-01T00:00:00Z'

In [146]:
# obtain maximum 100 tweets for query for thepast 7 days
response =  client.search_recent_tweets(query=query,expansions=['author_id'], max_results = 100, user_fields=["location","username","id"], place_fields = ["country"], tweet_fields=['created_at',"lang","text"])

In [147]:
#extract names, usernames, tweet_ids and tweets from the response
usernames = []
tweets = []
location =[]

for tweet in response.data:
    for details in response.includes['users']:
        usernames.append( details.username)
        location.append(details.location)
        tweets.append(tweet.text)

In [148]:
#Creating a pandas dataframe for the obtained results
tweet_df = pd.DataFrame(columns=["User_ID", "Name", "Username", "Profile_Image_URL", "Tweet_ID","Language","Tweet"])
tweet_df["Username"] = usernames
tweet_df["Tweet"] = tweets
tweet_df['Location'] = location

tweet_df

Unnamed: 0,User_ID,Name,Username,Profile_Image_URL,Tweet_ID,Language,Tweet,Location
0,,,SJKHC83,,,,I still have feelings for you &amp; no matter ...,
1,,,YOUHAVENONIPS,,,,I still have feelings for you &amp; no matter ...,
2,,,cupidflu,,,,I still have feelings for you &amp; no matter ...,honami . . . project sekai!
3,,,jbwhittaker,,,,I still have feelings for you &amp; no matter ...,"Strongsville, OH"
4,,,adhdistic,,,,I still have feelings for you &amp; no matter ...,hd. zzipa tumblr
...,...,...,...,...,...,...,...,...
8923,,,regretfulbot,,,,I don't know if anyone considers me to be a fr...,quotebot
8924,,,harmmms,,,,I don't know if anyone considers me to be a fr...,everywhere
8925,,,iamstephyg,,,,I don't know if anyone considers me to be a fr...,"Los Angeles, CA"
8926,,,watcherskyduo,,,,I don't know if anyone considers me to be a fr...,©pouistired


In [149]:
tweets_unique = pd.unique(tweet_df['Tweet'])

In [150]:
tweets_unique_df = pd.DataFrame(tweets_unique, columns=["Tweets"])

In [151]:
suicide_tweets = tweets_unique_df.to_csv("tweets_phrase_better_off_without_me.csv")

### Predictions

In [None]:
#Run the program
if __name__ == "__main__":
    
    #fit data to a naive bayes classifier
    classifier = RandomForestClassifier.train(X_train)

    # the first tweet in our dataframe
    example_tweet = tweet_df["Tweet"].values[0]

    #predict
    print((example_tweet,classifier.classify(dict([token, True] for token in tweet_df))))
    

