In [256]:
import pandas as pd
import numpy as np
import os
import glob
import re

In [257]:
import nltk
# nltk.download('stopwords') 
# nltk.download('words')
from nltk.corpus import stopwords
from nltk.corpus import words as common_words

In [258]:
from symspellpy.symspellpy import SymSpell, Verbosity

def loadCommonDictionary():
    """Return the NLTK ``words`` corpus as a list of common English words.

    Returns:
        The list produced by ``common_words.words()``, or ``None`` (after
        printing a notice) when the corpus cannot be read.
    """
    try:
        words = common_words.words()
    except (LookupError, OSError):
        # NLTK reports a corpus that has not been downloaded with LookupError
        # rather than OSError, so both are caught for the notice to appear.
        print("Common Word Dictionary Not Found")
        return None
    return words

def loadComplexDictionary():
    """Return the path of the SymSpell frequency dictionary.

    The file is expected to be named ``symspellpy_frequency_dist.txt`` and to
    live in the current working directory.

    Returns:
        The absolute path as a string. The path is returned even when the file
        is missing (a "File Not Found" notice is printed in that case) so that
        existing callers keep receiving the same value. ``None`` is returned
        only if the working directory itself cannot be resolved.
    """
    try:
        dictionary_path = os.path.join(os.getcwd(), "symspellpy_frequency_dist.txt")
    except OSError:
        print("File Not Found")
        return None
    # Building the path never touches the file, so check for it explicitly;
    # otherwise a missing dictionary would go unreported here.
    if not os.path.isfile(dictionary_path):
        print("File Not Found")
    return dictionary_path

In [259]:
# Spell-correction engine used by complex_correction().
symObj = SymSpell(max_dictionary_edit_distance = 2, prefix_length = 7, compact_level = 2)
commonWords = loadCommonDictionary()
if commonWords is not None:
    # spell_check() only tests membership, once per word of every post; a set
    # makes each test constant-time instead of a scan over the whole word list.
    commonWords = set(commonWords)
dictionary = loadComplexDictionary()
# Kept as the last expression so the cell still displays the load result.
symObj.load_dictionary(dictionary, term_index = 0, count_index = 1)

True

In [260]:
# Collect the CSV files in the current working directory. The names are sorted
# so the load order (and therefore the row order of the combined frame) does
# not depend on the order in which the filesystem lists them.
mylist = sorted(glob.glob("*.csv"))

In [261]:
columns = ['post_id', 'user_id', 'timestamp', 'subreddit', 'post_title', 'post_body']
_data = pd.DataFrame(columns=columns)
# Read every file first and concatenate once. Appending each file to the empty
# template inside the loop rebuilt `data` from scratch on every iteration, so
# only the last CSV survived; DataFrame.append is also gone from pandas >= 2.0.
frames = [pd.read_csv(file) for file in mylist]
if frames:
    data = pd.concat(frames, ignore_index=True)
    # Expected columns come first (created as NaN when a file lacks them),
    # followed by any additional columns found in the files.
    extra_columns = [name for name in data.columns if name not in columns]
    data = data.reindex(columns=columns + extra_columns)
else:
    # No CSV files: fall back to an empty frame with the expected schema so
    # that `data` is always defined.
    data = _data.copy()

In [262]:
from datetime import datetime, timezone
def utc_to_real(utc_ts):
    """Format a Unix timestamp as a ``YYYY-MM-DD HH:MM:SS`` string in UTC.

    Args:
        utc_ts: Seconds since the epoch; anything ``int()`` accepts.

    Returns:
        The formatted UTC date-time string.

    Raises:
        ValueError / TypeError: if ``utc_ts`` cannot be converted by ``int()``.
    """
    # datetime.utcfromtimestamp is deprecated; the timezone-aware call below
    # produces the same wall-clock fields for UTC.
    moment = datetime.fromtimestamp(int(utc_ts), tz=timezone.utc)
    return moment.strftime('%Y-%m-%d %H:%M:%S')

In [263]:
def join_title_and_body(dataset):
    """Merge ``post_title`` and ``post_body`` into a single ``text`` column.

    Each row's two values are converted to strings and joined with an
    underscore. The frame is modified in place: ``text`` is added and the two
    source columns are dropped. Nothing is returned.
    """
    source_columns = ['post_title', 'post_body']

    def _glue(row):
        # Stringify both cells of the row, then join them with '_'.
        return '_'.join(row.values.astype(str))

    combined = dataset[source_columns].apply(_glue, axis=1)
    dataset['text'] = combined
    dataset.drop(columns=source_columns, inplace=True)

In [264]:
def remove_URLS(text):
    """Delete every run of non-whitespace characters that starts with ``http``."""
    url_pattern = re.compile(r'http\S+')
    return url_pattern.sub('', text)

In [265]:
def to_lower_case(text):
    """Return ``text`` with all cased characters converted to lower case."""
    lowered = text.lower()
    return lowered

In [266]:
def remove_numbers(text):
    """Return ``text`` without any character for which ``str.isdigit`` is true."""
    kept_chars = []
    for char in text:
        if char.isdigit():
            continue
        kept_chars.append(char)
    return ''.join(kept_chars)

In [267]:
def remove_punc(text):
    """Strip every character that is neither a word character nor whitespace."""
    non_word_or_space = re.compile(r'[^\w\s]')
    return non_word_or_space.sub('', text)

In [268]:
def remove_(text):
    """Replace each underscore in ``text`` with a single space."""
    pieces = text.split('_')
    return ' '.join(pieces)

In [269]:
def remove_repeat_words(text):
    """Collapse runs of the same word repeated back to back into one word.

    The text is split on whitespace, every word equal to the word immediately
    before it is dropped, and the remaining words are joined with single spaces.
    """
    tokens = text.split()
    kept = []
    for position, token in enumerate(tokens):
        # Compare against the previous word of the original sequence.
        if position > 0 and tokens[position - 1] == token:
            continue
        kept.append(token)
    return ' '.join(kept)

In [270]:
# Lazily filled cache so the stop-word corpus is read at most once per kernel.
_STOPWORD_CACHE = {}

def remove_stopwords(text):
    """Return ``text`` with NLTK's English stop words removed.

    The text is split on whitespace; words found in
    ``stopwords.words('english')`` are dropped and the rest are joined with
    single spaces. Matching is exact, so callers should lower-case first.
    """
    english_stopwords = _STOPWORD_CACHE.get('english')
    if english_stopwords is None:
        # Previously the corpus list was rebuilt and scanned for every single
        # word; loading it once into a frozenset keeps the same result while
        # making each membership test constant-time.
        english_stopwords = frozenset(stopwords.words('english'))
        _STOPWORD_CACHE['english'] = english_stopwords
    return ' '.join(word for word in text.split() if word not in english_stopwords)

In [271]:
def complex_correction(word):
    """Look ``word`` up in ``symObj`` and return the suggestion list.

    The lookup uses ``Verbosity.CLOSEST`` with a maximum edit distance of 1;
    whatever ``symObj.lookup`` returns is passed back unchanged.
    """
    return symObj.lookup(
        phrase=word,
        verbosity=Verbosity.CLOSEST,
        max_edit_distance=1,
    )

def spell_check(text):
    """Replace words missing from ``commonWords`` with their first suggestion.

    The text is split on whitespace. A word found in ``commonWords`` is kept as
    is; any other word is passed to ``complex_correction`` and replaced by the
    first suggestion's ``term`` when at least one suggestion comes back. The
    words are re-joined with single spaces.
    """
    corrected = []
    for token in text.split():
        replacement = token
        if token not in commonWords:
            candidates = complex_correction(token)
            if len(candidates) != 0:
                replacement = candidates[0].term
        corrected.append(replacement)
    return ' '.join(corrected)

In [272]:
# Each transform reads and writes a single column, so apply it to that Series
# directly. Row-wise DataFrame.apply(axis=1) builds a Series per row (slow) and
# returns a DataFrame instead of a Series when the frame is empty.
data['timestamp'] = data['timestamp'].apply(utc_to_real)
join_title_and_body(data)

# Order matters: URLs are removed before punctuation is stripped, underscores
# are turned into spaces after punctuation removal, and spelling is corrected
# before stop words are dropped.
text_cleaning_steps = (
    remove_URLS,
    to_lower_case,
    remove_numbers,
    remove_punc,
    remove_,
    remove_repeat_words,
    spell_check,
    remove_stopwords,
)
for cleaning_step in text_cleaning_steps:
    data['text'] = data['text'].apply(cleaning_step)

In [274]:
from nltk.stem import WordNetLemmatizer
# nltk.download('wordnet')
wordnet_lemmatizer = WordNetLemmatizer()
def lemma_to_verb(text):
    """Lemmatize every whitespace-separated word of ``text`` as a verb.

    Each word is passed to ``wordnet_lemmatizer.lemmatize`` with part-of-speech
    tag ``'v'`` and the results are joined with single spaces.
    """
    return ' '.join(
        wordnet_lemmatizer.lemmatize(token, 'v') for token in text.split()
    )

In [275]:
# Lemmatize the cleaned text column directly; a row-wise DataFrame.apply is
# slower and returns a DataFrame rather than a Series on an empty frame.
data['text'] = data['text'].apply(lemma_to_verb)

In [285]:
# Binary label: 1 for rows whose subreddit is 'SuicideWatch', 0 for all others.
is_suicide_watch = data['subreddit'] == 'SuicideWatch'
data['class'] = np.where(is_suicide_watch, 1, 0)