## Setup

In [1]:
reset -fs

In [2]:
import numpy as np
import pandas as pd
import re
import string

from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.mixture import GaussianMixture

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS
from nltk.stem.porter import PorterStemmer

In [3]:
RANDOM_STATE = 28

In [4]:
def tokenize(text):
    """
    Tokenize text and return a non-unique list of tokenized words
    found in the text. Normalize to lowercase, strip punctuation,
    remove stop words, drop words of length < 3, strip digits.
    """
    regex = re.compile('[' + re.escape(string.punctuation) + '0-9\\r\\t\\n]')
    nopunct = regex.sub(" ", text)  # delete stuff but leave at least a space to avoid clumping together
    words = nopunct.split(" ")
    words = [w for w in words if len(w) > 2]  # ignore a, an, to, at, be, ...
    words = [w.lower() for w in words]
    goodwords = [w for w in words if w not in ENGLISH_STOP_WORDS]
    return goodwords


def stemwords(words):
    """
    Given a list of tokens/words, return a new list with each word
    stemmed using a PorterStemmer.
    """
    stemmer = PorterStemmer()
    return [stemmer.stem(w) for w in words]


def tokenizer(text):
    return stemwords(tokenize(text))

## Loading Data 

In [5]:
# Comments File
comments = 'Data/CommentsApril2018.csv'
# comments = '/Users/Nina/Documents/classes/msds621/nina_nyt/nyt-comments/CommentsApril2018.csv'

In [6]:
# Read in File
comm = pd.read_csv(comments)

  interactivity=interactivity, compiler=compiler, result=result)


In [7]:
comm.columns

Index(['approveDate', 'articleID', 'articleWordCount', 'commentBody',
       'commentID', 'commentSequence', 'commentTitle', 'commentType',
       'createDate', 'depth', 'editorsSelection', 'inReplyTo', 'newDesk',
       'parentID', 'parentUserDisplayName', 'permID', 'picURL', 'printPage',
       'recommendations', 'recommendedFlag', 'replyCount', 'reportAbuseFlag',
       'sectionName', 'sharing', 'status', 'timespeople', 'trusted',
       'typeOfMaterial', 'updateDate', 'userDisplayName', 'userID',
       'userLocation', 'userTitle', 'userURL'],
      dtype='object')

Column "userTitle" has both NaN and string data types. Replace Nan with "Unknown" to have one uniform data type in the column

In [8]:
comm.iloc[:, 32] = comm.iloc[:, 32].replace(np.nan, 'Unknown', regex=True); comm.head()

Unnamed: 0,approveDate,articleID,articleWordCount,commentBody,commentID,commentSequence,commentTitle,commentType,createDate,depth,...,status,timespeople,trusted,typeOfMaterial,updateDate,userDisplayName,userID,userLocation,userTitle,userURL
0,1524594282,5adf6684068401528a2aa69b,781.0,How could the league possibly refuse this offe...,26853969.0,26853969.0,<br/>,comment,1524594011,1.0,...,approved,1,0,News,1524594282,Christopher Rillo,46566740.0,San Francisco,Unknown,
1,1524594252,5adf6684068401528a2aa69b,781.0,"So then the execs can be like ""yeah...we will ...",26853699.0,26853699.0,<br/>,comment,1524593146,1.0,...,approved,1,0,News,1524594252,Matt Brand,64324866.0,"Williamsburg, Brooklyn",Unknown,
2,1524594250,5adf6684068401528a2aa69b,781.0,I would not want to play chess against these c...,26853677.0,26853677.0,<br/>,comment,1524593032,1.0,...,approved,1,0,News,1524594250,Joseph,78105093.0,"Fayetteville, AR",Unknown,
3,1524593431,5adf6684068401528a2aa69b,781.0,Could the cheerleaders join the Actors' Equity...,26853784.0,26853784.0,<br/>,comment,1524593426,1.0,...,approved,0,0,News,1524593431,Stephen,81939618.0,"Phoenix, AZ",Unknown,
4,1524595048,5adf653f068401528a2aa697,656.0,Seeking conclusions which support preconceived...,26854236.0,26854236.0,<br/>,comment,1524595043,1.0,...,approved,1,0,News,1524595048,Paul Zorsky,58642997.0,Texas,Unknown,


In [9]:
filt = (comm.sectionName != "Unknown") & (comm.commentType == "comment")
data = comm.commentBody[filt]
sections = comm.sectionName[filt]

## Train/Test Split and Training

In [10]:
train_data, test_data, train_target, test_target = train_test_split(data, sections, random_state=RANDOM_STATE)

In [11]:
# http://zacstewart.com/2014/08/05/pipelines-of-featureunions-of-pipelines.html
# https://stackoverflow.com/questions/28384680/scikit-learns-pipeline-a-sparse-matrix-was-passed-but-dense-data-is-required
from sklearn.base import TransformerMixin
class DenseTransformer(TransformerMixin):

    def transform(self, X, y=None, **fit_params):
        return X.todense()

    def fit_transform(self, X, y=None, **fit_params):
        self.fit(X, y, **fit_params)
        return self.transform(X)

    def fit(self, X, y=None, **fit_params):
        return self

In [12]:
vectorizer = TfidfVectorizer(input='content',
                            analyzer='word',
#                             tokenizer=tokenizer,
                            stop_words='english',
                            decode_error='ignore')

gmm_model = Pipeline([('vec', vectorizer),
                      ('to_dense', DenseTransformer()),
                       ('clf', GaussianMixture(random_state=RANDOM_STATE))])

In [None]:
gmm_model.fit(train_data, train_target)

## Predictions and Metric Evaluation