# Import CSV

In [1]:
import pandas as pd
df = pd.read_csv('DSC 288 Raw Reddit Dataset.csv')
print(df)

                                                   Title  \
0      Your anger may be caused by a magnesium defici...   
1      Do you ever get so angry that you wish to kill...   
2      The thing is that when your emotional distress...   
3      Does anyone else just avoid people because you...   
4      Was anyone else “not allowed” to get upset gro...   
...                                                  ...   
12946                                 Looking for advice   
12947                        New panic attack experience   
12948  I’m having pretty bad suicidal thoughts, think...   
12949      Anxiety symptoms replicating low blood sugar?   
12950  I've had a couple of panic attacks because of ...   

                                                    Text  Score  Topic  
0      I tried absolutely everything to fix my chroni...    743  anger  
1      I don't know if it's my anger issues talking o...    692  anger  
2                                                    NaN    

# Text Preprocessing

In [2]:
#Check for null values
df.isnull().sum()

Unnamed: 0,0
Title,0
Text,3814
Score,0
Topic,0


In [3]:
#Some reddit posts are just titles and no body text. For consistency, we will remove them
df.dropna(inplace=True)

print(df.shape)

(9137, 4)


In [4]:
#Normalizing
import re
import string
from bs4 import BeautifulSoup

#following https://www.geeksforgeeks.org/text-preprocessing-for-nlp-tasks/#text-preprocessing-technique-in-nlp
  #https://codefinity.com/blog/A-Comprehensive-Guide-to-Text-Preprocessing-with-NLTK

In [5]:
#Lowercase
print(df.loc[0])
df['Title_pro'] = df['Title'].str.lower()
df['Text_pro'] = df['Text'].str.lower()
print(df.loc[0])

Title    Your anger may be caused by a magnesium defici...
Text     I tried absolutely everything to fix my chroni...
Score                                                  743
Topic                                                anger
Name: 0, dtype: object
Title        Your anger may be caused by a magnesium defici...
Text         I tried absolutely everything to fix my chroni...
Score                                                      743
Topic                                                    anger
Title_pro    your anger may be caused by a magnesium defici...
Text_pro     i tried absolutely everything to fix my chroni...
Name: 0, dtype: object


In [6]:
#Remove punctuation
print(df.loc[12947])
df['Title_pro'] = df['Title_pro'].str.translate(str.maketrans('', '', string.punctuation))
df['Text_pro'] = df['Text_pro'].str.translate(str.maketrans('', '', string.punctuation))
print(df.loc[12947])

Title                              New panic attack experience
Text         Hi guys, I just wanted to see if anyone has ex...
Score                                                       12
Topic                                                    panic
Title_pro                          new panic attack experience
Text_pro     hi guys, i just wanted to see if anyone has ex...
Name: 12947, dtype: object
Title                              New panic attack experience
Text         Hi guys, I just wanted to see if anyone has ex...
Score                                                       12
Topic                                                    panic
Title_pro                          new panic attack experience
Text_pro     hi guys i just wanted to see if anyone has exp...
Name: 12947, dtype: object


In [7]:
#Remove Emojis and Special characters
print(df.loc[12947])
df['Title_pro'] = df['Title_pro'].apply(lambda x: ''.join(char for char in x if char.isalnum() or char.isspace()))
df['Text_pro'] = df['Text_pro'].apply(lambda x: ''.join(char for char in x if char.isalnum() or char.isspace()))
print(df.loc[12947])

Title                              New panic attack experience
Text         Hi guys, I just wanted to see if anyone has ex...
Score                                                       12
Topic                                                    panic
Title_pro                          new panic attack experience
Text_pro     hi guys i just wanted to see if anyone has exp...
Name: 12947, dtype: object
Title                              New panic attack experience
Text         Hi guys, I just wanted to see if anyone has ex...
Score                                                       12
Topic                                                    panic
Title_pro                          new panic attack experience
Text_pro     hi guys i just wanted to see if anyone has exp...
Name: 12947, dtype: object


In [8]:
#Remove numbers
print(df.loc[16])
df['Title_pro'] = df['Title_pro'].str.replace('\d+', '', regex=True)
df['Text_pro'] = df['Text_pro'].str.replace('\d+', '', regex=True)
print(df.loc[16])

Title            Punched my boss in the face, broke his nose. 
Text         I (30M) punched my boss (45ishM) square in the...
Score                                                      198
Topic                                                    anger
Title_pro          punched my boss in the face broke his nose 
Text_pro     i 30m punched my boss 45ishm square in the fac...
Name: 16, dtype: object
Title            Punched my boss in the face, broke his nose. 
Text         I (30M) punched my boss (45ishM) square in the...
Score                                                      198
Topic                                                    anger
Title_pro          punched my boss in the face broke his nose 
Text_pro     i m punched my boss ishm square in the face kn...
Name: 16, dtype: object


# Advanced Preprocessing

In [9]:
#Advanced Preprocessing
!pip install nltk
import nltk
nltk.download('punkt_tab')
from nltk.corpus import stopwords
nltk.download('stopwords')
from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer
nltk.download('wordnet')



[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...


True

In [10]:
#Tokenizing
print(df.loc[0])
df['Title_pro'] = df['Title_pro'].apply(nltk.word_tokenize)
df['Text_pro'] = df['Text_pro'].apply(nltk.word_tokenize)
print(df.loc[0])

Title        Your anger may be caused by a magnesium defici...
Text         I tried absolutely everything to fix my chroni...
Score                                                      743
Topic                                                    anger
Title_pro    your anger may be caused by a magnesium defici...
Text_pro     i tried absolutely everything to fix my chroni...
Name: 0, dtype: object
Title        Your anger may be caused by a magnesium defici...
Text         I tried absolutely everything to fix my chroni...
Score                                                      743
Topic                                                    anger
Title_pro    [your, anger, may, be, caused, by, a, magnesiu...
Text_pro     [i, tried, absolutely, everything, to, fix, my...
Name: 0, dtype: object


In [11]:
#Remove Stop Words
print(df.loc[0])
stop_words = set(stopwords.words('english'))
df['Title_pro'] = df['Title_pro'].apply(lambda x: [word for word in x if word not in stop_words])
df['Text_pro'] = df['Text_pro'].apply(lambda x: [word for word in x if word not in stop_words])
print(df.loc[0])

Title        Your anger may be caused by a magnesium defici...
Text         I tried absolutely everything to fix my chroni...
Score                                                      743
Topic                                                    anger
Title_pro    [your, anger, may, be, caused, by, a, magnesiu...
Text_pro     [i, tried, absolutely, everything, to, fix, my...
Name: 0, dtype: object
Title        Your anger may be caused by a magnesium defici...
Text         I tried absolutely everything to fix my chroni...
Score                                                      743
Topic                                                    anger
Title_pro    [anger, may, caused, magnesium, deficiency, ne...
Text_pro     [tried, absolutely, everything, fix, chronic, ...
Name: 0, dtype: object


In [12]:
#Lemmitization
print(df.loc[20])
lemmatizer = WordNetLemmatizer()
df['Title_pro'] = df['Title_pro'].apply(lambda x: [lemmatizer.lemmatize(word) for word in x])
df['Text_pro'] = df['Text_pro'].apply(lambda x: [lemmatizer.lemmatize(word) for word in x])
print(df.loc[20])

Title                “Anger is the part of you that loves you”
Text         After hearing this, it really changed the way ...
Score                                                      179
Topic                                                    anger
Title_pro                                 [anger, part, loves]
Text_pro     [hearing, really, changed, way, view, anger, m...
Name: 20, dtype: object
Title                “Anger is the part of you that loves you”
Text         After hearing this, it really changed the way ...
Score                                                      179
Topic                                                    anger
Title_pro                                  [anger, part, love]
Text_pro     [hearing, really, changed, way, view, anger, m...
Name: 20, dtype: object


In [13]:
#Convert back into string. Makes issues when extracting a list from a csv
df['Title_pro'] = df['Title_pro'].apply(lambda x: ' '.join(x))
df['Text_pro'] = df['Text_pro'].apply(lambda x: ' '.join(x))
print(df['Title_pro'])

0        anger may caused magnesium deficiency neuron l...
1                          ever get angry wish kill moment
3          anyone else avoid people know theyll make angry
4                    anyone else allowed get upset growing
5        wish people understood outburst dont happen sm...
                               ...                        
12945       fastest way ive found get panic attack control
12946                                       looking advice
12947                          new panic attack experience
12948    im pretty bad suicidal thought thinking cant h...
12949          anxiety symptom replicating low blood sugar
Name: Title_pro, Length: 9137, dtype: object


In [18]:
#Remove any empty rows
new_df = df[df['Title'].str.contains('1984')]
print(new_df)

    Title                                               Text  Score  Topic  \
112  1984  Anyone else feel like we're just completely at...     58  anger   

    Title_pro                                           Text_pro  
112            anyone else feel like completely mercy soulles...  


In [19]:
print(df.shape)
df = df[df['Title_pro'].str.len() > 0]
df = df[df['Text_pro'].str.len() > 0]
print(df.shape)

(9137, 6)
(9078, 6)


In [20]:
#Create a combined title and text column
df['T_T_pro'] = df['Title_pro'] +" "+ df['Text_pro']
print(df['T_T_pro'])

0        anger may caused magnesium deficiency neuron l...
1        ever get angry wish kill moment dont know ange...
3        anyone else avoid people know theyll make angr...
4        anyone else allowed get upset growing always s...
5        wish people understood outburst dont happen sm...
                               ...                        
12945    fastest way ive found get panic attack control...
12946    looking advice hi im f ive professionally diag...
12947    new panic attack experience hi guy wanted see ...
12948    im pretty bad suicidal thought thinking cant h...
12949    anxiety symptom replicating low blood sugar sc...
Name: T_T_pro, Length: 9078, dtype: object


# Save to New CSV

In [21]:
df.to_csv('DSC 288 Processed Reddit Dataset.csv', index=False)