# Forum Posts Processing

## Text Normalization

In [1]:
from contractions import CONTRACTION_MAP
import re
import nltk
import string
from nltk.stem import WordNetLemmatizer
import spacy
nlp = spacy.load('en_core_web_lg')

In [2]:
# function to tokenize text
def tokenize_text(text):
    """Split ``text`` into word tokens with NLTK's ``word_tokenize``.

    Returns:
        A list of token strings, each with surrounding whitespace stripped.
    """
    return [word.strip() for word in nltk.word_tokenize(text)]

# function to expand contractions
def expand_contractions(text, contraction_mapping):
    """Replace contractions in ``text`` with their expanded forms.

    Matching is case-insensitive. When the matched text and its expansion
    start with the same letter, the original letter (and so its case) is
    kept; otherwise the expansion is capitalised only if the match was.

    Args:
        text: String to process.
        contraction_mapping: Dict mapping a contraction (e.g. ``"can't"``)
            to its expansion (e.g. ``"cannot"``). The caller in this file
            passes CONTRACTION_MAP from the custom contractions module.

    Returns:
        ``text`` with every matched contraction expanded and all remaining
        apostrophes removed.
    """
    # Longest keys first so "can't've" wins over its prefix "can't".
    # Empty keys are skipped: they would match at every position.
    ordered_keys = sorted((key for key in contraction_mapping if key),
                          key=len, reverse=True)
    if not ordered_keys:
        return re.sub("'", "", text)

    # Keys are literal text, so escape them before building the alternation.
    contractions_pattern = re.compile(
        '({})'.format('|'.join(re.escape(key) for key in ordered_keys)),
        flags=re.IGNORECASE | re.DOTALL)
    # Fallback lookup for keys that are not stored in lower case.
    lowercase_mapping = {key.lower(): value
                         for key, value in contraction_mapping.items()}

    def expand_match(contraction):
        match = contraction.group(0)
        expanded_contraction = (contraction_mapping.get(match)
                                or contraction_mapping.get(match.lower())
                                or lowercase_mapping.get(match.lower()))
        if not expanded_contraction:
            # No usable expansion: leave the matched text untouched.
            return match
        first_char = match[0]
        if first_char.lower() == expanded_contraction[0].lower():
            # Same letter: keep the casing the author typed.
            return first_char + expanded_contraction[1:]
        if first_char.isupper():
            # Different letter ("Ain't" -> "Is not"): carry over the capital.
            return expanded_contraction[0].upper() + expanded_contraction[1:]
        # Different first character ("'cause" -> "because"): use as is.
        return expanded_contraction

    expanded_text = contractions_pattern.sub(expand_match, text)
    # Drop any apostrophes that were not part of a known contraction.
    expanded_text = re.sub("'", "", expanded_text)
    return expanded_text

# function for POS tags 
from nltk.corpus import wordnet as wn
def pos_tag_text(text):
    """Tag every token of ``text`` with a WordNet part-of-speech constant.

    The text is parsed with the module-level spaCy pipeline ``nlp``; each
    token's spaCy ``pos_`` label is translated to the matching WordNet
    constant so the WordNet lemmatizer can use it.

    Returns:
        A list of ``(lowercased token text, wordnet tag)`` tuples. The tag
        is ``None`` for any label other than ADJ, VERB, NOUN or ADV.
    """
    spacy_to_wordnet = {
        'ADJ': wn.ADJ,
        'VERB': wn.VERB,
        'NOUN': wn.NOUN,
        'ADV': wn.ADV,
    }
    return [(token.orth_.lower(), spacy_to_wordnet.get(token.pos_))
            for token in nlp(text)]

# function to lemmatize text based on POS tags
wnl = WordNetLemmatizer()


def lemmatize_text(text):
    """Lemmatize ``text`` token by token using its part-of-speech tags.

    Tokens that ``pos_tag_text`` gives a WordNet tag are lemmatized with
    that tag; tokens tagged ``None`` are passed through as they are.

    Returns:
        The resulting tokens joined with single spaces.
    """
    lemmas = []
    for word, pos_tag in pos_tag_text(text):
        if pos_tag:
            lemmas.append(wnl.lemmatize(word, pos_tag))
        else:
            lemmas.append(word)
    return ' '.join(lemmas)

# function to remove special characters
def remove_special_characters(text):
    """Strip ASCII punctuation from every token of ``text``.

    The text is tokenized with ``tokenize_text`` and each character listed
    in ``string.punctuation`` is deleted from each token.

    Returns:
        The non-empty remaining tokens joined with single spaces.
    """
    pattern = re.compile('[{}]'.format(re.escape(string.punctuation)))
    stripped_tokens = (pattern.sub('', token) for token in tokenize_text(text))
    # Tokens that were pure punctuation are now empty; drop them so the
    # joined text has no doubled, leading or trailing spaces.
    return ' '.join(token for token in stripped_tokens if token)

# NLTK's English stopwords, minus 'not', 'against' and 'down' so that those
# three words survive stopword removal, plus extra words to filter out.
stopword_list = list(
    set(nltk.corpus.stopwords.words('english')) - {'not', 'against', 'down'}
)
stopword_list += [
    'would', 'could', 'come', 'go', 'get',
    'tell', 'listen', 'one', 'two', 'three',
    'four', 'five', 'six', 'seven', 'eight',
    'nine', 'zero', 'join', 'find', 'make',
    'say', 'ask', 'tell', 'see', 'try', 'back',
    'also', 'send', 'iphone', 'forum', 'use', 'mobile', 'app',
]

def remove_stopwords(text):
    """Remove every token of ``text`` that appears in ``stopword_list``.

    Returns:
        The remaining tokens joined with single spaces.
    """
    # Build the set on each call so later edits to stopword_list are still
    # honoured, while each token lookup is a hash probe instead of a scan.
    stopword_set = set(stopword_list)
    return ' '.join(token for token in tokenize_text(text)
                    if token not in stopword_set)

def keep_text_characters(text):
    """Keep only the tokens of ``text`` that contain an ASCII letter.

    Tokens without any character in ``a-z`` / ``A-Z`` (for example pure
    numbers) are dropped; kept tokens are not modified.

    Returns:
        The kept tokens joined with single spaces.
    """
    has_letter = re.compile('[a-zA-Z]').search
    return ' '.join(token for token in tokenize_text(text)
                    if has_letter(token))

def normalize_corpus(corpus, lemmatize=True,
                     only_text_chars=False,
                     tokenize=False):
    """Run the full normalization pipeline over every document in ``corpus``.

    Steps per document, in order: expand contractions, lemmatize (or just
    lowercase when ``lemmatize`` is false), strip punctuation, remove
    stopwords, and optionally drop tokens that contain no letter.

    Args:
        corpus: Iterable of text strings.
        lemmatize: Lemmatize with ``lemmatize_text`` when true; otherwise
            only lowercase the text.
        only_text_chars: When true, also apply ``keep_text_characters``.
        tokenize: When true, return each document as a list of tokens
            instead of a single string.

    Returns:
        A list with one normalized entry per input document.
    """
    def normalize(document):
        document = expand_contractions(document, CONTRACTION_MAP)
        # lemmatize_text lowercases tokens itself; mirror that otherwise.
        document = lemmatize_text(document) if lemmatize else document.lower()
        document = remove_stopwords(remove_special_characters(document))
        if only_text_chars:
            document = keep_text_characters(document)
        return tokenize_text(document) if tokenize else document

    return [normalize(document) for document in corpus]

In [3]:
import pandas as pd
# grasscity.com forum (the source that is loaded in this run)
raw = pd.read_csv('grasscity.csv',header=0)

# cannabis.com forum: stored in two CSV files. Uncomment to load both and
# stack them into one frame with a fresh index instead of the data above.
# raw1 = pd.read_csv('cannabis1.csv',header=0)
# raw2 = pd.read_csv('cannabis2.csv',header=0)
# raw = pd.concat([raw1,raw2],axis = 0)
# raw = raw.reset_index(drop=True)

# marijuana.com forum: same two-file layout as cannabis.com.
# raw1 = pd.read_csv('marijuana1.csv',header=0)
# raw2 = pd.read_csv('marijuana2.csv',header=0)
# raw = pd.concat([raw1,raw2],axis = 0)
# raw = raw.reset_index(drop=True)

# Preview the first rows of the loaded posts.
raw.head()

Unnamed: 0,title,username,date,content
0,Driving?,Raex,"Dec 28, 2011",I've been wondering about this for a while no...
1,Driving?,jaykewashere,"Dec 28, 2011","I didn't know driving not high was possible ,..."
2,Driving?,rastaballer209,"Dec 28, 2011",Maybe if your a desent driver.i do and a few ...
3,Driving?,amandaa125,"Dec 28, 2011",Well I don't think they could give you a DUI ...
4,Driving?,The Nickatina,"Dec 28, 2011",as lonq as you dont qo too deep into your tho...


In [4]:
# Work only with the post bodies, i.e. the 'content' column.
dataset = raw['content']
dataset.head()

0     I've been wondering about this for a while no...
1     I didn't know driving not high was possible ,...
2     Maybe if your a desent driver.i do and a few ...
3     Well I don't think they could give you a DUI ...
4     as lonq as you dont qo too deep into your tho...
Name: content, dtype: object

In [5]:
import numpy as np
# Normalize every post: lemmatize, and drop tokens that contain no letter.
posts = np.array(dataset)
norm_posts = normalize_corpus(posts,
                              lemmatize=True,
                              only_text_chars=True)  
# Show the first normalized post as a quick sanity check.
print (norm_posts[0])

wonder favorite method relaxation sober drive hour hour music obviously appeal high notice lot member quite often whenever friend need drive somewhere actually designate driver always pretty sure dui applicable influence substance view drive high


In [6]:
# only for marijuana.com due to encoding issues
# dataset = np.array(raw['content'])  
# def normalize(corpus):
#     normalized_corpus = []    
#     for text in corpus:
#           text = keep_text_characters(text)
#           normalized_corpus.append(text)
#     return normalized_corpus
# dataset = normalize(dataset)
# dataset = pd.DataFrame(dataset)
# dataset.head()

In [7]:
# Put each normalized post next to its original text.
cleaned = pd.DataFrame(norm_posts, columns=['processed'])
# NOTE(review): `dataset` is a pandas Series, so this assignment aligns on
# the row index; presumably it is a default 0..n-1 index -- confirm.
cleaned['original'] = dataset

def drive_relevance(text):
    """Count the tokens of ``text`` that are driving-related keywords.

    Returns:
        The number of tokens equal to 'drive', 'driver' or 'dui'; repeated
        occurrences are each counted.
    """
    driving_terms = ['drive','driver','dui']
    return sum(1 for token in tokenize_text(text) if token in driving_terms)

# Score every processed post and keep only those that mention driving.
cleaned['drive_relevance'] = [drive_relevance(post) for post in cleaned['processed']]
# .copy() makes the filtered result an independent frame, so assigning new
# columns to it later does not trigger pandas' SettingWithCopyWarning.
cleaned = cleaned.loc[cleaned['drive_relevance'] > 0].copy()

def weed_relevance(text):
    """Count the tokens of ``text`` that are cannabis-related keywords.

    Returns:
        The number of tokens found in the keyword list below; repeated
        occurrences are each counted.
    """
    cannabis_terms = ['stone','stoner','high','zooted','bowl','bake','joint','blaze',
                      'toke','toked','toking','marijuana','cannabis']
    return sum(1 for token in tokenize_text(text) if token in cannabis_terms)

# Score cannabis-related keywords and keep only posts with at least one hit.
cleaned['weed_relevance'] = [weed_relevance(post) for post in cleaned['processed']]
cleaned = cleaned.loc[cleaned['weed_relevance']>0,]

# Drop rows that are identical in every column.
cleaned=cleaned.drop_duplicates()

# Optional: write the filtered posts to disk.
#cleaned.to_csv("cleaned.csv",header= True, index=False)

In [8]:
cleaned.head()

Unnamed: 0,processed,original,drive_relevance,weed_relevance
0,wonder favorite method relaxation sober drive ...,I've been wondering about this for a while no...,5,2
1,not know drive not high possible put hole road...,"I didn't know driving not high was possible ,...",1,1
3,well not think give dui drive high unlike alco...,Well I don't think they could give you a DUI ...,2,2
5,quote name amandaa125 well not think give dui ...,"[quote name='""amandaa125""']Well I don't think...",2,3
6,pretty funny like spot piss test splash cop ac...,It'd be pretty funny if it were like an on th...,1,1


In [9]:
cleaned.shape

(2265, 4)