## Setup

In [1]:
reset -fs

In [2]:
import numpy as np
import pandas as pd

from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.mixture import GaussianMixture

from sklearn.feature_extraction.text import TfidfVectorizer

In [3]:
# Fixed seed passed as random_state to both train_test_split and GaussianMixture
RANDOM_STATE = 28

## Loading Data 

In [4]:
# Relative path of the comments CSV that is loaded into a DataFrame below
comments = '../Data/CommentsApril2018.csv'

In [5]:
# Read only the first MAX_ROWS rows of the file. Passing nrows lets pandas stop
# parsing there instead of loading the entire CSV and discarding the rest; column
# dtypes are consequently inferred from these rows alone.
# NOTE(review): the reason for the 26492-row cut-off is not recorded here -- confirm.
MAX_ROWS = 26492
comm = pd.read_csv(comments, nrows=MAX_ROWS)

  interactivity=interactivity, compiler=compiler, result=result)


In [6]:
# Show which columns the loaded comments table provides
comm.columns

Index(['approveDate', 'articleID', 'articleWordCount', 'commentBody',
       'commentID', 'commentSequence', 'commentTitle', 'commentType',
       'createDate', 'depth', 'editorsSelection', 'inReplyTo', 'newDesk',
       'parentID', 'parentUserDisplayName', 'permID', 'picURL', 'printPage',
       'recommendations', 'recommendedFlag', 'replyCount', 'reportAbuseFlag',
       'sectionName', 'sharing', 'status', 'timespeople', 'trusted',
       'typeOfMaterial', 'updateDate', 'userDisplayName', 'userID',
       'userLocation', 'userTitle', 'userURL'],
      dtype='object')

In [8]:
# Keep rows whose commentType is "comment" and whose sectionName is neither
# missing nor the placeholder value "Unknown"
has_section = comm["sectionName"].notna() & comm["sectionName"].ne("Unknown")
filt = has_section & comm["commentType"].eq("comment")

# Text goes to `data`, the matching section labels to `sections`
data = comm.loc[filt, "commentBody"]
sections = comm.loc[filt, "sectionName"]

## Train/Test Split and Training

In [9]:
# Seeded split of comment bodies and their section labels; the test fraction is
# left at train_test_split's default
train_data, test_data, train_target, test_target = train_test_split(
    data,
    sections,
    random_state=RANDOM_STATE,
)

In [10]:
# http://zacstewart.com/2014/08/05/pipelines-of-featureunions-of-pipelines.html
# https://stackoverflow.com/questions/28384680/scikit-learns-pipeline-a-sparse-matrix-was-passed-but-dense-data-is-required
from sklearn.base import BaseEstimator, TransformerMixin


class DenseTransformer(TransformerMixin, BaseEstimator):
    """Stateless pipeline step that turns a sparse matrix into a dense array.

    The class derives from BaseEstimator in addition to TransformerMixin so
    that it exposes get_params/set_params, which scikit-learn relies on when
    a pipeline containing this step is cloned or tuned. fit_transform is
    inherited from TransformerMixin (fit followed by transform).
    """

    def fit(self, X, y=None, **fit_params):
        """Learn nothing; return self so the step can be chained."""
        return self

    def transform(self, X, y=None, **fit_params):
        """Return X converted to a dense array via its toarray() method.

        Input that has no toarray() method is returned unchanged rather than
        raising AttributeError, so the step is a no-op on already-dense data.
        """
        if hasattr(X, "toarray"):
            return X.toarray()
        return X

In [11]:
# Word-level TF-IDF features with the built-in English stop-word list;
# undecodable bytes are ignored instead of raising.
vectorizer = TfidfVectorizer(
    input='content',
    analyzer='word',
    stop_words='english',
    decode_error='ignore',
)

# TF-IDF -> dense array -> Gaussian mixture. Apart from the seed, the mixture
# is left at scikit-learn's default settings.
pipeline_steps = [
    ('vec', vectorizer),
    ('to_dense', DenseTransformer()),
    ('clf', GaussianMixture(random_state=RANDOM_STATE)),
]
gmm_model = Pipeline(pipeline_steps)

In [12]:
# Fit every pipeline step on the training comments.
# NOTE(review): GaussianMixture is an unsupervised estimator, so train_target
# should not influence the fitted model -- confirm this is intended.
gmm_model.fit(train_data, train_target)

Pipeline(memory=None,
     steps=[('vec', TfidfVectorizer(analyzer='word', binary=False, decode_error='ignore',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,
   ...var=1e-06, tol=0.001, verbose=0,
        verbose_interval=10, warm_start=False, weights_init=None))])

## Predictions and Metric Evaluation

In [13]:
# NOTE(review): Pipeline.score defers to the final step, and GaussianMixture.score
# reports the per-sample average log-likelihood of the data. The number shown is
# therefore not a classification accuracy, and test_target does not affect it.
gmm_model.score(test_data, test_target)

-43365.41810532012

## Save Model

In [None]:
import os
from joblib import dump, load

# Persist the fitted pipeline. The target folder is created first so the dump
# does not fail when ../Models does not exist yet.
# NOTE(review): the pipeline contains the notebook-defined DenseTransformer, so
# that class presumably has to be defined again before the file can be loaded.
model_path = '../Models/gmm_small.joblib'
os.makedirs(os.path.dirname(model_path), exist_ok=True)
dump(gmm_model, model_path)



## Save Notebook

In [None]:
import os
import dill

# Save the current interpreter session to disk. The target folder is created
# first so the dump does not fail when ../Notebook_Saves does not exist yet.
session_path = '../Notebook_Saves/GMM_Exploration.db'
os.makedirs(os.path.dirname(session_path), exist_ok=True)
dill.dump_session(session_path)