In [1]:
import pandas as pd
import nltk
from nltk import pos_tag, word_tokenize
from nltk.corpus import wordnet, stopwords
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.stem import PorterStemmer
from nltk.tokenize import TreebankWordTokenizer
import numpy as np
import re 

from nltk.corpus import stopwords

nltk.download('averaged_perceptron_tagger')
nltk.download('stopwords')
nltk.download('wordnet')
STOPWORD = stopwords.words('english')

lem = WordNetLemmatizer()
stem = PorterStemmer()

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\Yong Han Ching\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package stopwords to C:\Users\Yong Han
[nltk_data]     Ching\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to C:\Users\Yong Han
[nltk_data]     Ching\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [2]:
df = pd.read_csv('../dataset/raw_data/fulltrain.csv', header = None)

In [27]:
# print(df[0].value_counts())

3    17870
1    14047
4     9995
2     6942
Name: 0, dtype: int64


In [8]:
def pos_to_morphy(pos_tag):
    """Convert POS tag to Morphy tag for Wordnet to recognise"""
    tag_dict = {"JJ": wordnet.ADJ,
                "NN": wordnet.NOUN,
                "VB": wordnet.VERB,
                "RB": wordnet.ADV}

    # Return tag if found, Noun if not found
    return tag_dict.get(pos_tag[:2], wordnet.NOUN)

In [231]:
def preprocess(document):
#     print(document)
    # Lowercasing all string elements
    document = [doc.lower() for doc in document]
    
    # Basic tokenization
    document = [re.sub(r'[\|/|-]', r' ', doc) for doc in document]
    
    #print(document)
    #         Stop word Removal
    filtered_words = []
    for doc in document:
        lst = [word for word in doc.split() if word not in STOPWORD]
        doc_string = ' '.join(lst)
        filtered_words.append(doc_string)
        
    document = filtered_words
    document = pd.Series(document)
    
    
# #     # ONLY ONE OF STEMMING OR LEMMATIZATION
# #     # Lemmatize to tokens after POS tagging -> Take very long?
#     document = [" ".join([lem.lemmatize(word.lower(), pos=pos_to_morphy(tag)) 
#                           for word, tag in pos_tag(TreebankWordTokenizer().tokenize(doc))]) for doc in document] 
    
    document = [" ".join([stem.stem(word.lower()) for word, tag in pos_tag(TreebankWordTokenizer().tokenize(doc))]) 
                for doc in document] 
    
    
    # Handle numbers: i.e. moneymoney used instead of <money> since will tokenize away the pointy brackets
    # Duplication of term will be used an "unique" term
    document = [re.sub(r'\$ +[0-9]+(.[0-9]+)?', 'moneymoney', doc) for doc in document]
    document = [re.sub(r'dollars?', 'moneymoney', doc) for doc in document]

    document = [re.sub(r'[0-9]+(.[0-9]+)? \%', 'percentpercent', doc) for doc in document]
    document = [re.sub(r'(\w)+ (percentage)+', 'percentpercent', doc) for doc in document]
    document = [re.sub(r'(\w)+ (\%|percent)+', 'percentpercent', doc) for doc in document]
    document = [re.sub(r'((hundred thousands?)|hundreds?|thousands?|millions?|billions?|trillions?)',
                            'numbernumber', doc) for doc in document]
    

#     print((document))

    return document

In [232]:
data = {'text': ['Tom to is this the loves this car 59%', '1000000 Joseph is playing amazingly', 'Krish is running great this MORNING!!!', 'John owes me $100']}  
  
# Create DataFrame  
data = pd.DataFrame(data)  

In [233]:
data['clean'] = preprocess(data['text'])

In [234]:
data.head()

Unnamed: 0,text,clean
0,Tom to is this the loves this car 59%,tom love percentpercent
1,1000000 Joseph is playing amazingly,1000000 joseph play amazingli
2,Krish is running great this MORNING!!!,krish run great morn ! ! !
3,John owes me $100,john owe moneymoney


In [235]:
df['clean'] = preprocess(df[1])

In [236]:
df.head()

Unnamed: 0,0,1,clean
0,1,"A little less than a decade ago, hockey fans w...","littl less decad ago , hockey fan bless slate ..."
1,1,The writers of the HBO series The Sopranos too...,writer hbo seri soprano took anoth dare storyt...
2,1,Despite claims from the TV news outlet to offe...,despit claim tv news outlet offer 'nonstop new...
3,1,After receiving 'subpar' service and experienc...,receiv 'subpar ' servic experienc unusu long w...
4,1,After watching his beloved Seattle Mariners pr...,watch belov seattl marin prevail san diego pad...


In [237]:
df.to_csv('../dataset/prep.csv') 