In [7]:
import string
import nltk
import pandas as pd
import seaborn as sns
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer, LancasterStemmer

nltk.download("stopwords")

import string
import re

import warnings
warnings.filterwarnings('ignore')
import matplotlib.pyplot as plt
pd.options.display.float_format = '{:20.4f}'.format

import sklearn
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer

from sklearn.naive_bayes import MultinomialNB
from sklearn.naive_bayes import GaussianNB

from sklearn.linear_model import SGDClassifier

import numpy as np

from scipy import sparse

import time

[nltk_data] Downloading package stopwords to
[nltk_data]     /home/ec2-user/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [9]:
test = pd.read_csv("s3://advancedml-koch-mathur-hinkson/test.csv")

In [10]:
test.shape

(97320, 2)

In [2]:
train = pd.read_csv("s3://advancedml-koch-mathur-hinkson/train.csv")

In [8]:
train.shape

(1804874, 45)

In [25]:
n = 100000  #chunk row size
train_sub_dfs = [train[i:i+n] for i in range(0,train.shape[0],n)]

[i.shape for i in train_sub_dfs]

[(100000, 45),
 (100000, 45),
 (100000, 45),
 (100000, 45),
 (100000, 45),
 (100000, 45),
 (100000, 45),
 (100000, 45),
 (100000, 45),
 (100000, 45),
 (100000, 45),
 (100000, 45),
 (100000, 45),
 (100000, 45),
 (100000, 45),
 (100000, 45),
 (100000, 45),
 (100000, 45),
 (4874, 45)]

In [3]:
# intialize stemmer
ps = PorterStemmer() 
ls = LancasterStemmer()

# define stopwords
stops = set(stopwords.words('english'))
stops.add('')

approved_stop_words = {"not", "get", "against", "haven", "haven't","aren't", 
                       "aren", "should", "shouldn", "shouldn't", "themselves", 
                       "them", "under", "over", 'won', "won't", "wouldn'", 
                       "wouldn't"}

stops = stops - approved_stop_words

In [4]:
def clean_text(text, stop_ws=stops, stemmer=ps, str_output=True):
    '''
    This auxiliary function cleans text.
    
    Methods used for cleaning are: 
        (1) transform string of text to list of words,
        (2) cleaned (lowercase, remove punctuation) and remove stop words,
        (3) Porter stemming of cleaned (lowercase, remove punctuation) text, 
        (4) Lancaster stemming of cleaned (lowercase, remove punctuation), 
        (5) cleaned (lowercase, remove punctuation) without removing stop words.
    
    Inputs:
        text (string) - A string of text.
        stemming (parameter) - either Porter or Lancaster stemming method
        remove_sw (boolean) - True/False remove stop words
    
    Outputs:
        Cleaned text per the input parameters.
    '''
    t = text.replace("-", " ").split(" ")
    t = [w.strip(string.punctuation) for w in t]
    
    if stop_ws:
        t = [w.lower() for w in t if w not in stop_ws]
    
    if stemmer:
        t = [stemmer.stem(w) for w in t]
    
    if str_output:
        return ' '.join(t)
    else:
        return t
    

In [5]:
def make_ngrams(preprocessed, n=2, str_output=True):
    '''
    Covert a list of preprocessed strings into ngrams of length n.
    Should return X ngrams of X words less (n - 1).
    '''
    ngrams_tuples = []

    # ensure that all ngrams are of length n by specifying list position of
    # first item in last ngram
    last_ngram_start = len(preprocessed) - (n - 1)

    # for each string from position i through last ngram start position, create
    # a tuple of length n
    for i in range(last_ngram_start):
        ngrams_tuples.append(tuple(preprocessed[i:i + n]))
    if str_output:
        return [' '.join(ngram) for ngram in ngrams_tuples]
    else:
        return ngrams_tuples

In [6]:
def print_elapsed_time(start, end, m):
    print(f"{m}...Elapsed Time:  {round((end - start)/60,3)} minutes")


In [None]:
def generate_categorical_features(df):
    start_time = time.perf_counter()

    df['split'] = df["comment_text"].apply(lambda x: x.split(" "))
    split_time = time.perf_counter()
    print_elapsed_time(start_time, split_time, m="Split comments")

    df['cleaned_w_stopwords_str'] = df["comment_text"].apply(clean_text,args=(None,None,True),)
    df['cleaned_w_stopwords'] = df["comment_text"].apply(clean_text,args=(None,None,False),)
    with_stopwords = time.perf_counter()
    print_elapsed_time(split_time, with_stopwords, m="Cleaned with stopwords")

    df['cleaned_no_stem_str'] = df["comment_text"].apply(clean_text,args=(stops,None, True),)
    df['cleaned_no_stem'] = df["comment_text"].apply(clean_text,args=(stops,None,False),)
    without_stopwords = time.perf_counter()
    print_elapsed_time(with_stopwords, without_stopwords, m="Cleaned without stopwords")

    df['cleaned_porter_str'] = df["comment_text"].apply(clean_text,args=(stops,ps,True),)
    df['cleaned_porter'] = df["comment_text"].apply(clean_text,args=(stops,ps,False),)
    porter_time = time.perf_counter()
    print_elapsed_time(without_stopwords, porter_time, m="Stemmed (Porter)")

    df['cleaned_lancaster_str'] = df["comment_text"].apply(clean_text,args=(stops,ls,True),)
    df['cleaned_lancaster'] = df["comment_text"].apply(clean_text,args=(stops,ls,False),)
    lancaster_time = time.perf_counter()
    print_elapsed_time(porter_time, lancaster_time, m="Stemmed (Lancaster)")

    df['bigrams_unstemmed'] = df["cleaned_no_stem"].apply(make_ngrams,args=(2, True),)
    bigrams_time = time.perf_counter()
    print_elapsed_time(lancaster_time, bigrams_time, m="Created bigrams")
    # df['trigram_porter'] = df["cleaned_porter"].apply(make_ngrams,args=(3, True),)
    # df['fourgram_porter'] = df["cleaned_porter"].apply(make_ngrams,args=(4, True),)
    # df['fivegram_porter'] = df["cleaned_porter"].apply(make_ngrams,args=(5, True),)
    #
    # df['bigram_lancaster'] = df["cleaned_lancaster"].apply(make_ngrams,args=(2, True),)
    # df['trigram_lancaster'] = df["cleaned_lancaster"].apply(make_ngrams,args=(3, True),)
    # df['fourgram_lancaster'] = df["cleaned_lancaster"].apply(make_ngrams,args=(4, True),)
    # df['fivegram_lancaster'] = df["cleaned_lancaster"].apply(make_ngrams,args=(5, True),)

    return df

In [None]:
def generate_continuous_features(df):

    df['perc_upper'] = df["comment_text"].apply(lambda x: 0 if x == 0 else round((len(re.findall(r'[A-Z]',x)) / len(x)), 3))
    pct_upper_time = time.perf_counter()
    print_elapsed_time(bigrams_time, pct_upper_time, m="Calculated uppercase pct")

    df['num_exclam'] = df["comment_text"].apply(lambda x:(len(re.findall(r'!',x))))
    punctuation_time = time.perf_counter()
    print_elapsed_time(pct_upper_time, punctuation_time, m="Count punctuation")

    df['num_words'] = df["split"].apply(lambda x: len(x))
    wordcount_time = time.perf_counter()
    print_elapsed_time(punctuation_time, wordcount_time, m="Count words")

    calc_stopwords_pct = lambda x, y: 0 if y == 0 else round((x - len(y)) / x, 3)
    df['perc_stopwords'] = df[["num_words", "cleaned_no_stem"]].apply(lambda x: calc_stopwords_pct(*x), axis=1)
    stops_pct_time = time.perf_counter()
    print_elapsed_time(wordcount_time, stops_pct_time, m="Count stopwords pct")

    df['num_upper_words'] = df["split"].apply(lambda x: sum(map(str.isupper, x)) )
    ct_upper_time = time.perf_counter()
    print_elapsed_time(stops_pct_time, ct_upper_time, m="Count uppercase words")

    return df

In [81]:
test_preprocessed = add_text_cleaning_cols(test) 

Split comments...Elapsed Time:0.01 minutes
Cleaned with stopwords...Elapsed Time:0.059 minutes
Cleaned without stopwords...Elapsed Time:0.082 minutes
Stemmed (Porter)...Elapsed Time:2.248 minutes
Stemmed (Lancaster)...Elapsed Time:1.815 minutes
Created bigrams...Elapsed Time:4.25 minutes
Calculated uppercase pct...Elapsed Time:0.009 minutes
Count punctuation...Elapsed Time:0.003 minutes
Count words...Elapsed Time:0.001 minutes
Count stopwords pct...Elapsed Time:0.035 minutes
Count uppercase words...Elapsed Time:0.008 minutes


In [82]:
test_preprocessed.to_pickle('test_preprocessed.pkl')

In [85]:
mini = train.head(10000)

In [87]:
mini_preprocessed = add_text_cleaning_cols(mini) 

Split comments...Elapsed Time:1.369 minutes
Cleaned with stopwords...Elapsed Time:2.512 minutes
Cleaned without stopwords...Elapsed Time:2.426 minutes
Stemmed (Porter)...Elapsed Time:2.696 minutes
Stemmed (Lancaster)...Elapsed Time:2.899 minutes
Created bigrams...Elapsed Time:13.037 minutes
Calculated uppercase pct...Elapsed Time:1.193 minutes
Count punctuation...Elapsed Time:1.187 minutes
Count words...Elapsed Time:1.183 minutes
Count stopwords pct...Elapsed Time:1.165 minutes
Count uppercase words...Elapsed Time:1.236 minutes


In [88]:
mini_preprocessed.to_pickle('mini_preprocessed.pkl')

In [None]:
train_preprocessed = gener 

In [None]:
train_preprocessed.to_pickle('train_preprocessed.pkl')