In [1]:
import pandas as pd
# NLTK Lib
import nltk
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\abedk\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\abedk\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [2]:
#reading the data
data = pd.read_csv("reddit.csv",encoding="ISO-8859-1")
data.head()

Unnamed: 0,text,label
0,I recently went through a breakup and she said...,depression
1,"I do not know how to navigate these feelings, ...",depression
2,"So I have been with my bf for 5 months , and h...",depression
3,I am so exhausted of this. Just when I think I...,SuicideWatch
4,I have been severly bullied since i was 5 till...,depression


In [3]:
#checking the count of the dependent variable
data['label'].value_counts()

depression      10371
SuicideWatch     9992
Name: label, dtype: int64

## Remove Punctuation

In [4]:
#library that contains punctuation
import string
string.punctuation

'!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'

In [5]:
#defining the function to remove punctuation
def remove_punctuation(text):
    punctuationfree="".join([i for i in text if i not in string.punctuation])
    return punctuationfree
#storing the puntuation free text
data['clean_msg']= data['text'].apply(lambda x:remove_punctuation(x))
data.head()

Unnamed: 0,text,label,clean_msg
0,I recently went through a breakup and she said...,depression,I recently went through a breakup and she said...
1,"I do not know how to navigate these feelings, ...",depression,I do not know how to navigate these feelings n...
2,"So I have been with my bf for 5 months , and h...",depression,So I have been with my bf for 5 months and he...
3,I am so exhausted of this. Just when I think I...,SuicideWatch,I am so exhausted of this Just when I think I ...
4,I have been severly bullied since i was 5 till...,depression,I have been severly bullied since i was 5 till...


## Lower

In [6]:
data['msg_lower']= data['clean_msg'].apply(lambda x: x.lower())
data.head()

Unnamed: 0,text,label,clean_msg,msg_lower
0,I recently went through a breakup and she said...,depression,I recently went through a breakup and she said...,i recently went through a breakup and she said...
1,"I do not know how to navigate these feelings, ...",depression,I do not know how to navigate these feelings n...,i do not know how to navigate these feelings n...
2,"So I have been with my bf for 5 months , and h...",depression,So I have been with my bf for 5 months and he...,so i have been with my bf for 5 months and he...
3,I am so exhausted of this. Just when I think I...,SuicideWatch,I am so exhausted of this Just when I think I ...,i am so exhausted of this just when i think i ...
4,I have been severly bullied since i was 5 till...,depression,I have been severly bullied since i was 5 till...,i have been severly bullied since i was 5 till...


## Tokenization

In [7]:
#defining function for tokenization
import re
def tokenization(text):
    tokens = re.split('W+',text)
    return tokens
#applying function to the column
data['msg_tokenied']= data['msg_lower'].apply(lambda x: tokenization(x))
data['msg_tokenied'][0]

['i recently went through a breakup and she said she still wants to be friends so i said i can try doing that but when she talks to me about things it always hurts i just want to lose feelings so all this pain can stop it hurts so much and i cannot even cry about it i do not want to hurt her because she said she does not want to never speak to me again but i do not know what to do here when we were together she always hurt me so i do not know why i still love her i wish we never met it would be much less painful how do i lose feelings']

## Removing Stop Words

In [8]:
#Stop words present in the library
stopwords = nltk.corpus.stopwords.words('english')
stopwords[0:10]
['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're"]

['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're"]

In [9]:
#defining the function to remove stopwords from tokenized text
def remove_stopwords(text):
    output= [i for i in text if i not in stopwords]
    return output

In [10]:
#applying the function
data['no_stopwords']= data['msg_tokenied'].apply(lambda x:remove_stopwords(x))

## Stemming

In [11]:
#importing the Stemming function from nltk library
from nltk.stem.porter import PorterStemmer
#defining the object for stemming
porter_stemmer = PorterStemmer()

In [12]:
#defining a function for stemming
def stemming(text):
    stem_text = [porter_stemmer.stem(word) for word in text]
    return stem_text
data['msg_stemmed']=data['no_stopwords'].apply(lambda x: stemming(x))

## Lemmatization

In [13]:
from nltk.stem import WordNetLemmatizer
#defining the object for Lemmatization
wordnet_lemmatizer = WordNetLemmatizer()

In [14]:
#defining the function for lemmatization
def lemmatizer(text):
    lemm_text = [wordnet_lemmatizer.lemmatize(word) for word in text]
    return lemm_text
data['msg_lemmatized']=data['no_stopwords'].apply(lambda x:lemmatizer(x))

In [15]:
data['msg_lemmatized']

0        [i recently went through a breakup and she sai...
1        [i do not know how to navigate these feelings ...
2        [so i have been with my bf for 5 months  and h...
3        [i am so exhausted of this just when i think i...
4        [i have been severly bullied since i was 5 til...
                               ...                        
20358    [i took 50mg of seroquel a few hours after i d...
20359    [that is what has happened to me last week and...
20360    [ever just feel alone in a house full of peopl...
20361    [politicians neighbors corporations society cu...
20362    [i feel like i am just existing but for what i...
Name: msg_lemmatized, Length: 20363, dtype: object