# This file cleans the data. Collect the data first in "pushshift_func.ipynb".

# Imports

In [390]:
import pandas as pd
import numpy as np
import re
from textblob import TextBlob
from nltk.corpus import stopwords
import string

In [391]:
# Load the raw comments from CSV into a DataFrame (path is relative to this notebook)
df_comment = pd.read_csv('../data/comment.csv')

# Info

In [392]:
# Summarise the columns: dtypes, non-null counts and memory usage
df_comment.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 116443 entries, 0 to 116442
Data columns (total 4 columns):
 #   Column     Non-Null Count   Dtype 
---  ------     --------------   ----- 
 0   subreddit  116443 non-null  object
 1   body       116443 non-null  object
 2   auth       116443 non-null  object
 3   time       116443 non-null  int64 
dtypes: int64(1), object(3)
memory usage: 3.6+ MB


In [393]:
# Count how many comments belong to each subreddit (checks class balance)
df_comment['subreddit'].value_counts()

mildlyinteresting    58355
interestingasfuck    58088
Name: subreddit, dtype: int64

In [394]:
# Number of rows that are exact duplicates of an earlier row
df_comment.duplicated().sum()

338

# Removing Duplicates, Deleted/Removed, and Bots

In [395]:
# Drop rows that exactly repeat an earlier row, keeping the first occurrence
df_comment = df_comment.drop_duplicates()

In [396]:
# Drop every comment whose body contains a link ("http" also matches "https").
# na=True marks a missing body as a match, so such rows are dropped as well.
has_link = df_comment['body'].str.contains('http', na=True)
df_comment = df_comment[~has_link]

In [397]:
# Keep only rows that pass all three checks:
#   - the author is not '[deleted]'
#   - the body is not '[removed]'
#   - the author is not AutoModerator
keep_row = (
    (df_comment['auth'] != '[deleted]')
    & (df_comment['body'] != '[removed]')
    & (df_comment['auth'] != 'AutoModerator')
)
df_comment = df_comment[keep_row]

In [398]:
# Drop comments that mention "savevideo" (users asking a bot to save a video).
# na=True marks a missing body as a match, so such rows are dropped as well.
mentions_savevideo = df_comment['body'].str.contains('savevideo', na=True)
df_comment = df_comment[~mentions_savevideo]

In [399]:
# Authors confirmed to be bots. This list will most likely grow as more data is added.
bots = [
    'reply-guy-bot',
    'savevideobot',
    'stabbot',
    'SaveVideo',
    'fakefakebotdetective',
]

# Drop every comment written by one of the bot accounts listed above
is_bot = df_comment['auth'].isin(bots)
df_comment = df_comment[~is_bot]

In [400]:
# Inspection only: show the remaining comments whose author name contains "bot".
# This is a case-sensitive substring match, so ordinary names such as
# "thewholerobot" are listed too; nothing is removed here.
df_comment[df_comment['auth'].str.contains('bot')]

Unnamed: 0,subreddit,body,auth,time
915,mildlyinteresting,Well that's a weird super power,thewholerobot,1642803964
1155,mildlyinteresting,You're definitely creepy,thewholerobot,1642802325
1163,mildlyinteresting,Are you sitting nekkid on your phone maybe?,thewholerobot,1642802275
1167,mildlyinteresting,Eye balls,thewholerobot,1642802248
1543,interestingasfuck,No no no! This is reddit and we must dedicate...,TinFoilRobotProphet,1642800741
...,...,...,...,...
113808,mildlyinteresting,Unless you're a municipality than profit away!...,kelvin_klein_bottle,1642108225
113946,mildlyinteresting,The thing is he could jerk off in front of me ...,Spacbot,1642106999
114232,mildlyinteresting,How is your shorter hallway a raise?,cudntbebothered,1642104589
114504,mildlyinteresting,Naw. Doesn't even peak for another 100years bruh.,thewholerobot,1642102462


In [401]:
# Replace line-feed and carriage-return characters with a space across the frame,
# then trim leading/trailing whitespace from the comment body (first pass)
line_breaks = [r'\n', r'\r']
df_comment = df_comment.replace(to_replace=line_breaks, value=' ', regex=True)
df_comment['body'] = df_comment['body'].str.strip()

# Feature Engineering

In [402]:
# Encode the subreddit column as a binary label:
#   interestingasfuck -> 1
#   mildlyinteresting -> 0
subreddit_labels = {'interestingasfuck': 1, 'mildlyinteresting': 0}
df_comment['subreddit'] = df_comment['subreddit'].map(subreddit_labels)

In [403]:
# Length features per comment: number of characters and number of
# whitespace-separated words
df_comment['num_of_chars'] = df_comment['body'].map(len)
df_comment['word_count'] = df_comment['body'].map(lambda text: len(text.split()))

In [404]:
# count number of capital letters in each comment. assign to feature
def capital_letters(post):
    '''Accepts text and returns how many of its characters are uppercase'''
    return sum(1 for char in post if char.isupper())

df_comment['capital_count'] = df_comment['body'].map(capital_letters)

In [405]:
# Binary flags: 1 when the comment contains "?", "!" or "..." respectively, otherwise 0
df_comment['question_mark'] = df_comment['body'].map(lambda text: int('?' in text))
df_comment['exclaimation'] = df_comment['body'].map(lambda text: int('!' in text))
df_comment['dot_dot_dot'] = df_comment['body'].map(lambda text: int('...' in text))

In [406]:
# list of regex to see if a comment has text in quotes. accounts for multiple ways a user would use quotes
quotes = [
    # [0] the whole comment is a single double-quoted span
    '^["]{1}[^"]+["]{1}$',
    # [1] quoted span with a non-quote character on both sides
    '[^"]["]{1}[^"]+["]{1}[^"]',
    # [2] quoted span followed by a non-quote character (preceding one optional)
    '[^"]?["]{1}[^"]+["]{1}[^"]',
    # [3] quoted span preceded by a non-quote character (following one optional)
    '[^"]["]{1}[^"]+["]{1}[^"]?'
]


def find_quotes(comment):
    '''Accepts text and returns 1 if any pattern in `quotes` matches it, otherwise 0'''
    for pattern in quotes:
        if re.search(pattern, comment):
            return 1
    return 0

df_comment['quotes'] = df_comment['body'].map(find_quotes)

In [407]:
# list of regex to see if a comment has text in italics. accounts for multiple ways a user would use italics
italics = [
    # [0] the whole comment is a single *...* span
    '^[*]{1}[^*]+[*]{1}$',
    # [1] *...* span with a non-asterisk character on both sides
    '[^*][*]{1}[^*]+[*]{1}[^*]',
    # [2] *...* span followed by a non-asterisk character (preceding one optional)
    '[^*]?[*]{1}[^*]+[*]{1}[^*]',
    # [3] *...* span preceded by a non-asterisk character (following one optional)
    '[^*][*]{1}[^*]+[*]{1}[^*]?'
]


def find_italics(comment):
    '''Accepts text and returns 1 if any pattern in `italics` matches it, otherwise 0'''
    for pattern in italics:
        if re.search(pattern, comment):
            return 1
    return 0

df_comment['italics'] = df_comment['body'].map(find_italics)

In [408]:
# list of regex to see if a comment has text in bold. accounts for multiple ways a user would use bold
bold = [
    # [0] the whole comment is a single **...** span
    '^[*]{2}[^*]+[*]{2}$',
    # [1] **...** span with a non-asterisk character on both sides
    '[^*][*]{2}[^*]+[*]{2}[^*]',
    # [2] **...** span followed by a non-asterisk character (preceding one optional)
    '[^*]?[*]{2}[^*]+[*]{2}[^*]',
    # [3] **...** span preceded by a non-asterisk character (following one optional)
    '[^*][*]{2}[^*]+[*]{2}[^*]?'
]


def find_bold(comment):
    '''Accepts text and returns 1 if any pattern in `bold` matches it, otherwise 0'''
    for pattern in bold:
        if re.search(pattern, comment):
            return 1
    return 0

df_comment['bold'] = df_comment['body'].map(find_bold)

### Sentiment

In [409]:
# FUNCTION TAKEN FROM CLASS LESSON
def detect_polarity(text):
    '''Accepts text and returns the polarity score TextBlob assigns to it'''
    sentiment = TextBlob(text).sentiment
    return sentiment.polarity

In [410]:
# FUNCTION TAKEN FROM CLASS LESSON
def detect_subjectivity(text):
    '''Accepts text and returns the subjectivity score TextBlob assigns to it'''
    sentiment = TextBlob(text).sentiment
    return sentiment.subjectivity

In [411]:
# Sentiment features. TextBlob is run once per comment and both scores are read
# from that single result; building a separate TextBlob for polarity and for
# subjectivity would analyse every comment twice.
sentiment = df_comment['body'].map(lambda text: TextBlob(text).sentiment)
df_comment['polarity'] = sentiment.map(lambda result: result.polarity)
df_comment['subjectivity'] = sentiment.map(lambda result: result.subjectivity)

# Cleaning Comments

In [None]:
# turns all text into lowercase
# (the capital-letter count and sentiment scores were computed earlier, on the original casing)
df_comment['body'] = df_comment['body'].str.lower()

In [None]:
# Normalise the typographic apostrophe (’) to the plain ASCII one (') so that
# more contractions can be recognised and expanded
df_comment['body'] = df_comment['body'].str.replace("’", "'", regex=False)

In [None]:
# https://stackoverflow.com/questions/19790188/expanding-english-language-contractions-in-python
# Contraction dictionary taken from Stack Overflow. It is used to expand contractions in the text.

contractions = {
    "ain't": "am not",
    "aren't": "are not",
    "can't": "cannot",
    "can't've": "cannot have",
    "'cause": "because",
    "could've": "could have",
    "couldn't": "could not",
    "couldn't've": "could not have",
    "didn't": "did not",
    "doesn't": "does not",
    "don't": "do not",
    "hadn't": "had not",
    "hadn't've": "had not have",
    "hasn't": "has not",
    "haven't": "have not",
    "he'd": "he would",
    "he'd've": "he would have",
    "he'll": "he will",
    "he'll've": "he will have",
    "he's": "he is",
    "how'd": "how did",
    "how'd'y": "how do you",
    "how'll": "how will",
    "how's": "how is",
    "i'd": "I would",
    "i'd've": "I would have",
    "i'll": "I will",
    "i'll've": "I will have",
    "i'm": "I am",
    "i've": "I have",
    "isn't": "is not",
    "it'd": "it would",
    "it'd've": "it would have",
    "it'll": "it will",
    "it'll've": "it will have",
    "it's": "it is",
    "let's": "let us",
    "ma'am": "madam",
    "mayn't": "may not",
    "might've": "might have",
    "mightn't": "might not",
    "mightn't've": "might not have",
    "must've": "must have",
    "mustn't": "must not",
    "mustn't've": "must not have",
    "needn't": "need not",
    "needn't've": "need not have",
    "o'clock": "of the clock",
    "oughtn't": "ought not",
    "oughtn't've": "ought not have",
    "shan't": "shall not",
    "sha'n't": "shall not",
    "shan't've": "shall not have",
    "she'd": "she would",
    "she'd've": "she would have",
    "she'll": "she will",
    "she'll've": "she will have",
    "she's": "she is",
    "should've": "should have",
    "shouldn't": "should not",
    "shouldn't've": "should not have",
    "so've": "so have",
    "so's": "so is",
    "that'd": "that would",
    "that'd've": "that would have",
    "that's": "that is",
    "there'd": "there would",
    "there'd've": "there would have",
    "there's": "there is",
    "they'd": "they would",
    "they'd've": "they would have",
    "they'll": "they will",
    "they'll've": "they will have",
    "they're": "they are",
    "they've": "they have",
    "to've": "to have",
    "wasn't": "was not",
    "we'd": "we would",
    "we'd've": "we would have",
    "we'll": "we will",
    "we'll've": "we will have",
    "we're": "we are",
    "we've": "we have",
    "weren't": "were not",
    "what'll": "what will",
    "what'll've": "what will have",
    "what're": "what are",
    "what's": "what is",
    "what've": "what have",
    "when's": "when is",
    "when've": "when have",
    "where'd": "where did",
    "where's": "where is",
    "where've": "where have",
    "who'll": "who will",
    "who'll've": "who will have",
    "who's": "who is",
    "who've": "who have",
    "why's": "why is",
    "why've": "why have",
    "will've": "will have",
    "won't": "will not",
    "won't've": "will not have",
    "would've": "would have",
    "wouldn't": "would not",
    "wouldn't've": "would not have",
    "y'all": "you all",
    "y'all'd": "you all would",
    "y'all'd've": "you all would have",
    "y'all're": "you all are",
    "y'all've": "you all have",
    "you'd": "you would",
    "you'd've": "you would have",
    "you'll": "you will",
    "you'll've": "you will have",
    "you're": "you are",
    "you've": "you have",
}

In [None]:
# Snapshot of the contraction dictionary's keys as a plain list
contractions_list = list(contractions)

In [None]:
# https://stackoverflow.com/questions/265960/best-way-to-strip-punctuation-from-a-string
# Learned this and took it from stackoverflow. Author username is Brian
# strips all punctuation

df_comment['body'] = df_comment['body'].map(lambda x: x.translate(str.maketrans('', '', string.punctuation)))

In [None]:
# function to expand contractions
def expand_contractions(post):
    expanded_sentence = []

    for word in post.split():
        if word.lower() in contractions_list:
            word = contractions[word.lower()]

        expanded_sentence.append(word)

    return ' '.join(expanded_sentence)

df_comment['body'] = df_comment['body'].map(expand_contractions)

In [None]:
# removes all "'s" from the end of words
df_comment['body'] = df_comment['body'].map(lambda x: x.replace("'s", ""))

In [None]:
# Drop comments whose body is now an empty string or a single space
is_blank = df_comment['body'].isin([' ', ''])
df_comment = df_comment[~is_blank]

In [None]:
# function to get the average length of a word for a comment
# this feature had to be engineered after the cleaning above was done
def get_avg_word_length(post):
    '''Accepts text and returns its mean word length, rounded with round().

    Words are the whitespace-separated tokens of the text. Text with no words
    (empty or whitespace only) returns 0: np.mean of an empty list is NaN and
    round() raises ValueError on NaN.
    '''
    word_lengths = [len(word) for word in post.split()]

    if not word_lengths:
        return 0

    return round(np.mean(word_lengths))

df_comment['avg_word_length'] = df_comment['body'].map(get_avg_word_length)

In [None]:
# list of custom stop words
custom_stop_words = [
    # short interjections, slang and word fragments
    "yep", "yes", "yup", "no", "nope", "na", "lol", "idk",
    "hi", "yo", "ok", "ha", "le", "wa", "nt",
    # common words, including contractions written without their apostrophe
    "dont", "youre", "thats", "got", "im", "ive",
    # the 26 single lowercase letters, a through z
    *string.ascii_lowercase,
    "like",
]

In [None]:
# Full stop-word list: NLTK's English stop words followed by the custom additions
stop_words = [*stopwords.words('english'), *custom_stop_words]

In [None]:
# function to count the stop words of a comment
# this feature had to be engineered after the cleaning above was done
def stop_word_count(post):
    '''Accepts text and returns how many of its words are in `stop_words`.

    Words are the whitespace-separated tokens of the text; each one is
    lowercased and stripped before the lookup.
    '''
    return sum(1 for word in post.split() if word.lower().strip() in stop_words)

df_comment['stop_word_count'] = df_comment['body'].map(stop_word_count)

In [None]:
# function to remove stop words
def stop_word_remove(post):
    '''Accepts text and returns it without the words found in `stop_words`.

    Each whitespace-separated word is lowercased and stripped for the lookup
    only; the words that are kept retain their original form and are
    re-joined with single spaces.
    '''
    kept_words = [word for word in post.split() if word.lower().strip() not in stop_words]
    return ' '.join(kept_words)

df_comment['body'] = df_comment['body'].map(stop_word_remove)

In [None]:
# removes anything except letters and spaces (each other character becomes a space)
df_comment['body'] = df_comment['body'].map(lambda x: re.sub(r"[^A-Za-z ]", " ", x))

# removes excessive whitespace and strips whitespace
# (raw string: "\s" inside a normal string literal is an invalid escape sequence,
# which newer Python versions warn about)
df_comment['body'] = df_comment['body'].map(lambda x: re.sub(r'\s+', ' ', x))
df_comment['body'] = df_comment['body'].str.strip()

In [None]:
# Drop posts whose body ended up as an empty string or a single space
is_blank = df_comment['body'].isin([' ', ''])
df_comment = df_comment[~is_blank]

In [None]:
# The author and timestamp columns will not be used from here on, so drop them
df_comment = df_comment.drop(columns=['auth', 'time'])

# Saving cleaned data

In [None]:
# saves cleaned data to new csv (index=False leaves the DataFrame index out of the file)
df_comment.to_csv('../data/comment_clean.csv', index=False)

# The data is now clean: it can be modelled in "models.ipynb" or explored in "eda.ipynb".