In [2]:
from utils import build_corpus
from sklearn.base import BaseEstimator, TransformerMixin
import warnings; warnings.simplefilter('ignore')

In [15]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.preprocessing import StandardScaler

class Vectorizer(BaseEstimator, TransformerMixin):
    def __init__(self, vectorizer='count', ngram_range=(1,1), max_df=1.0, min_df=1, norm='l2'):
        self.vectorizer = vectorizer
        self.ngram_range = ngram_range
        self.max_df = max_df
        self.min_df = min_df
        self.norm = norm
    def fit(self, X, y=None):
        return self
    def transform(self, X, y=None):
        if self.vectorizer == 'tfidf':
            vec = TfidfVectorizer(ngram_range=self.ngram_range, max_df=self.max_df, min_df=self.min_df, norm=self.norm)
        else:
            vec = CountVectorizer(ngram_range=self.ngram_range, max_df=self.max_df, min_df=self.min_df)
        return vec.fit_transform(X)

class OptionalSVD(BaseEstimator, TransformerMixin):
    def __init__(self, compute=False, n_components=2):
        self.compute = compute
        self.n_components = n_components
    def fit(self, X, y=None):
        return self
    def transform(self, X, y=None):
        if self.compute:
            svd = TruncatedSVD(n_components=self.n_components, random_state=42).fit(X)
            self.components_ = svd.components_
            self.explained_variance_ = svd.explained_variance_
            self.explained_variance_ratio_ = svd.explained_variance_ratio_
            self.singular_values_ = svd.singular_values_
            X = svd.transform(X)
        return X
    
class OptionalScaler(BaseEstimator, TransformerMixin):
    def __init__(self, scale=False):
        self.scale = scale
    def fit(self, X, y=None):
        return self
    def transform(self, X, y=None):
        if self.scale:
            X = StandardScaler().fit_transform(X)
        return X

In [4]:
from sklearn.model_selection import train_test_split

sources = (('../data/articles/breitbart', 'bb'), ('../data/articles/thinkprogress', 'tp'))
data = build_corpus(sources)
X = data['text'].copy()
y = data['class'].copy()
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [17]:
from sklearn.linear_model import LinearRegression
from sklearn.pipeline import Pipeline

vectorizer = 'count'
ngram_range = (1,1)
max_df = 1.0
min_df = 1
norm = 'l2'
reduce = True
n_components = 100
scale = False

pipeline = Pipeline([
    ('vect', Vectorizer(vectorizer, ngram_range, max_df, min_df, norm)),
    ('reducer', OptionalSVD(reduce, n_components)),
    ('scaler', OptionalScaler(scale)),
    ('estimator', LinearRegression())
])

#data_prepared = pipeline.fit_transform(X_train)

In [12]:
from sklearn.model_selection import cross_val_score
import numpy as np

lin_reg = LinearRegression()
scores = cross_val_score(lin_reg, data_prepared, y_train, scoring='neg_mean_squared_error', cv=10)
lin_rmse_scores = np.sqrt(-scores)
print(lin_rmse_scores)

[0.4382944  0.40974743 0.38050611 0.36649439 0.41808696 0.40623127
 0.38003281 0.38540035 0.38216054 0.38572819]


In [15]:
def display_scores(scores):
    print("Scores:", scores)
    print("Mean:", scores.mean())
    print("Standard deviation:", scores.std())
display_scores(lin_rmse_scores)

Scores: [0.4382944  0.40974743 0.38050611 0.36649439 0.41808696 0.40623127
 0.38003281 0.38540035 0.38216054 0.38572819]
Mean: 0.3952682436131147
Standard deviation: 0.020832552827688793


In [None]:
from sklearn.model_selection import GridSearchCV

#param_grid = [
#    {'vectorizer':['count', 'tfidf'], 'ngram_range':[(1,1), (1,2), (1,3)], 'max_df': [1.0, 0.9,0.8], 'min_df': [1, 0.05, 0.2], 
#     'reduce': [True,False], 'n_components': [100,200,300], 'scale': [True, False]}
#]

param_grid = [
    {'vect__vectorizer': ['tfidf'], 'vect__max_df': (1.0, 0.9, 0.8), 'vect__min_df': (1, 0.05, 0.2)},
    {'vect__vectorizer': ['tfidf'], 'reducer__reduce': (True,False), 'scaler__scale': (True,False), 'scaler__n_components': (100,200,300)}
]
#param_grid = {
#    'vect__vectorizer': ('count', 'tfidf')
#}

grid_search = GridSearchCV(pipeline, param_grid, cv=5, n_jobs=-1, scoring='neg_mean_squared_error')
grid_search.fit(X_train, y_train)

In [19]:
grid_search.best_estimator_

Pipeline(memory=None,
     steps=[('vect', Vectorizer(max_df=1.0, min_df=1, ngram_range=(1, 1), norm='l2',
      vectorizer='count')), ('reducer', OptionalSVD(compute=True, n_components=100)), ('scaler', OptionalScaler(scale=False)), ('estimator', LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=False))])