# Import CSV

In [106]:
import pandas as pd
# Load the raw Reddit export into a DataFrame; the file is expected to sit in
# the notebook's working directory.
df = pd.read_csv(filepath_or_buffer='DSC 288 Raw Reddit Dataset.csv')
# Plain-text dump of the frame (pandas truncates the middle rows/columns).
print(df)

                                                   Title  \
0      Your anger may be caused by a magnesium defici...   
1      Do you ever get so angry that you wish to kill...   
2      The thing is that when your emotional distress...   
3      Does anyone else just avoid people because you...   
4      Was anyone else “not allowed” to get upset gro...   
...                                                  ...   
12944                                Anxiety and bruxism   
12945                                      im giving up    
12946                                   holding breath?    
12947                                       I’m curious!   
12948                           How to break the cycle?    

                                                    Text  Score  Topic  
0      I tried absolutely everything to fix my chroni...    741  anger  
1      I don't know if it's my anger issues talking o...    680  anger  
2                                                    NaN    

# Text Preprocessing

In [107]:
# Count missing values per column (isna is the canonical name; isnull is its alias).
df.isna().sum()

Unnamed: 0,0
Title,0
Text,3836
Score,0
Topic,0


In [108]:
# Some Reddit posts consist of a title only, with no body text. For
# consistency those rows are removed. Note that this drops a row when ANY
# column is null; in the null counts above only `Text` has missing values.
df.dropna(axis=0, how='any', inplace=True)

# Remaining (rows, columns) after the drop.
print(df.shape)

(9113, 4)


In [109]:
#Normalizing
import re
import string
from bs4 import BeautifulSoup

#following https://www.geeksforgeeks.org/text-preprocessing-for-nlp-tasks/#text-preprocessing-technique-in-nlp
  #https://codefinity.com/blog/A-Comprehensive-Guide-to-Text-Preprocessing-with-NLTK

In [110]:
# Lowercase
# Create the working columns `Title_pro` / `Text_pro` as lower-cased copies of
# the raw columns; the raw columns are left untouched.
print(df.loc[0])  # row 0 before the transformation
for raw_col in ('Title', 'Text'):
    df[f'{raw_col}_pro'] = df[raw_col].str.lower()
print(df.loc[0])  # row 0 after the transformation

Title    Your anger may be caused by a magnesium defici...
Text     I tried absolutely everything to fix my chroni...
Score                                                  741
Topic                                                anger
Name: 0, dtype: object
Title        Your anger may be caused by a magnesium defici...
Text         I tried absolutely everything to fix my chroni...
Score                                                      741
Topic                                                    anger
Title_pro    your anger may be caused by a magnesium defici...
Text_pro     i tried absolutely everything to fix my chroni...
Name: 0, dtype: object


In [111]:
# Remove punctuation
# Build one deletion table for every character in string.punctuation and apply
# it to both working columns. string.punctuation is ASCII-only, so characters
# such as the curly apostrophe are not removed by this step.
print(df.loc[12947])  # before
strip_punct = str.maketrans(dict.fromkeys(string.punctuation))
for col in ('Title_pro', 'Text_pro'):
    df[col] = df[col].str.translate(strip_punct)
print(df.loc[12947])  # after

Title                                             I’m curious!
Text         I’m curious what age range and genders are of ...
Score                                                       10
Topic                                                    panic
Title_pro                                         i’m curious!
Text_pro     i’m curious what age range and genders are of ...
Name: 12947, dtype: object
Title                                             I’m curious!
Text         I’m curious what age range and genders are of ...
Score                                                       10
Topic                                                    panic
Title_pro                                          i’m curious
Text_pro     i’m curious what age range and genders are of ...
Name: 12947, dtype: object


In [112]:
# Remove emojis and special characters
# Keep only characters that are alphanumeric or whitespace; every other
# character is dropped without inserting a replacement.
print(df.loc[12947])  # before
for col in ('Title_pro', 'Text_pro'):
    df[col] = df[col].apply(
        lambda text: ''.join(filter(lambda ch: ch.isalnum() or ch.isspace(), text))
    )
print(df.loc[12947])  # after

Title                                             I’m curious!
Text         I’m curious what age range and genders are of ...
Score                                                       10
Topic                                                    panic
Title_pro                                          i’m curious
Text_pro     i’m curious what age range and genders are of ...
Name: 12947, dtype: object
Title                                             I’m curious!
Text         I’m curious what age range and genders are of ...
Score                                                       10
Topic                                                    panic
Title_pro                                           im curious
Text_pro     im curious what age range and genders are of o...
Name: 12947, dtype: object


In [113]:
# Remove numbers
# Delete every run of digits from both working columns.
# The pattern is a raw string: '\d' inside a normal string literal is an
# invalid escape sequence, which raises a DeprecationWarning on older Pythons
# and a SyntaxWarning on Python 3.12+.
# Digits embedded in a token are stripped too, so '30m' becomes 'm'.
print(df.loc[16])  # before
df['Title_pro'] = df['Title_pro'].str.replace(r'\d+', '', regex=True)
df['Text_pro'] = df['Text_pro'].str.replace(r'\d+', '', regex=True)
print(df.loc[16])  # after

Title            Punched my boss in the face, broke his nose. 
Text         I (30M) punched my boss (45ishM) square in the...
Score                                                      200
Topic                                                    anger
Title_pro          punched my boss in the face broke his nose 
Text_pro     i 30m punched my boss 45ishm square in the fac...
Name: 16, dtype: object
Title            Punched my boss in the face, broke his nose. 
Text         I (30M) punched my boss (45ishM) square in the...
Score                                                      200
Topic                                                    anger
Title_pro          punched my boss in the face broke his nose 
Text_pro     i m punched my boss ishm square in the face kn...
Name: 16, dtype: object


# Advanced Preprocessing

In [114]:
#Advanced Preprocessing
!pip install nltk
import nltk
nltk.download('punkt_tab')
from nltk.corpus import stopwords
nltk.download('stopwords')
from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer
nltk.download('wordnet')



[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [115]:
# Tokenizing
# Replace each cleaned string with the list of tokens produced by NLTK's
# word tokenizer.
print(df.loc[0])  # before: plain strings
for col in ('Title_pro', 'Text_pro'):
    df[col] = df[col].apply(lambda text: nltk.word_tokenize(text))
print(df.loc[0])  # after: lists of tokens

Title        Your anger may be caused by a magnesium defici...
Text         I tried absolutely everything to fix my chroni...
Score                                                      741
Topic                                                    anger
Title_pro    your anger may be caused by a magnesium defici...
Text_pro     i tried absolutely everything to fix my chroni...
Name: 0, dtype: object
Title        Your anger may be caused by a magnesium defici...
Text         I tried absolutely everything to fix my chroni...
Score                                                      741
Topic                                                    anger
Title_pro    [your, anger, may, be, caused, by, a, magnesiu...
Text_pro     [i, tried, absolutely, everything, to, fix, my...
Name: 0, dtype: object


In [116]:
# Remove stop words
# NOTE(review): apostrophes were stripped in an earlier cell, so contractions
# (e.g. "dont") presumably no longer match the NLTK stop-word list -- confirm
# whether that is intended.
print(df.loc[0])  # before
stop_words = set(stopwords.words('english'))


def drop_stop_words(tokens):
    """Return the tokens that are not in the English stop-word set, in order."""
    kept = []
    for token in tokens:
        if token not in stop_words:
            kept.append(token)
    return kept


for col in ('Title_pro', 'Text_pro'):
    df[col] = df[col].apply(drop_stop_words)
print(df.loc[0])  # after

Title        Your anger may be caused by a magnesium defici...
Text         I tried absolutely everything to fix my chroni...
Score                                                      741
Topic                                                    anger
Title_pro    [your, anger, may, be, caused, by, a, magnesiu...
Text_pro     [i, tried, absolutely, everything, to, fix, my...
Name: 0, dtype: object
Title        Your anger may be caused by a magnesium defici...
Text         I tried absolutely everything to fix my chroni...
Score                                                      741
Topic                                                    anger
Title_pro    [anger, may, caused, magnesium, deficiency, ne...
Text_pro     [tried, absolutely, everything, fix, chronic, ...
Name: 0, dtype: object


In [117]:
# Lemmatization
# Map every token through WordNetLemmatizer.lemmatize using its default
# arguments (in the sample row, "loves" becomes "love").
print(df.loc[20])  # before
lemmatizer = WordNetLemmatizer()
for col in ('Title_pro', 'Text_pro'):
    df[col] = df[col].apply(lambda tokens: list(map(lemmatizer.lemmatize, tokens)))
print(df.loc[20])  # after

Title                “Anger is the part of you that loves you”
Text         After hearing this, it really changed the way ...
Score                                                      176
Topic                                                    anger
Title_pro                                 [anger, part, loves]
Text_pro     [hearing, really, changed, way, view, anger, m...
Name: 20, dtype: object
Title                “Anger is the part of you that loves you”
Text         After hearing this, it really changed the way ...
Score                                                      176
Topic                                                    anger
Title_pro                                  [anger, part, love]
Text_pro     [hearing, really, changed, way, view, anger, m...
Name: 20, dtype: object


# Save to New CSV

In [118]:
# Write the processed frame next to the notebook, without the index column.
# NOTE(review): `Title_pro` / `Text_pro` hold Python lists at this point; CSV
# presumably stores them as their string representation, so a reader will need
# to parse them back into lists -- confirm with the downstream consumer.
df.to_csv(path_or_buf='DSC 288 Processed Reddit Dataset.csv', index=False)