In [24]:
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline,FeatureUnion
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn import metrics
from sklearn.cross_validation import train_test_split
from sklearn.feature_selection import SelectKBest
import numpy as np
from sklearn.grid_search import GridSearchCV
import pandas as pd
from pprint import pprint
from sklearn import preprocessing

from sklearn.ensemble import RandomForestClassifier
from mord import LogisticAT
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import BernoulliNB

In [25]:
data = pd.read_pickle('data_clean_text.p')

In [35]:
X = data.drop(['bathrooms', 'latitude', 'longitude', 'building_id', 'created', 'street_address', 'description', 'display_address', 'features', 'listing_id', 'manager_id', 'photos', 'interest_level', 'neighborhood'],axis=1)
Y = data['interest_level'].copy()
#Y = Y.replace(['low', 'medium', 'high'], [1,2,3])

for column in X.columns.values:
    if column == 'cleantext':
        next
    else:
        if np.max(X[column])==0:
            next
        else:
            X[column] = X[column]/np.max(X[column])

X_train,X_test,Y_train,Y_test = train_test_split(X,Y,train_size=0.8)


In [4]:
class ItemSelector(BaseEstimator, TransformerMixin):
    """Switch for selecting text or non text features"""
    def __init__(self, key):
        self.key = key

    def fit(self, x, y=None):
        return self

    def transform(self, data):
        if self.key=='text':
            return data['cleantext']
        else:
            return data.drop('cleantext',axis=1)


class VectorChooser(BaseEstimator, TransformerMixin):
    """Switch for choosing between vectorizers"""
    def __init__(self, vtype='tfidf',binary=False,ngram_range=(1,1)):
        self.vtype = vtype
        self.tfidf = TfidfVectorizer(binary,ngram_range,stop_words='english')
        self.count_vector = CountVectorizer(binary,ngram_range,stop_words='english')

    def fit(self, x, y=None):
        if self.vtype=='tfidf':
            return self.tfidf.fit(x)
        else:
            return self.count_vector.fit(x)

    def transform(self, data):
        if self.vtype=='tfidf':
            return self.tfidf.transform(data)
        else:
            return self.count_vector.transform(data)

In [5]:
import warnings
warnings.filterwarnings("ignore")

In [15]:
# Logistic Regression
pipeline = Pipeline([
    # Use FeatureUnion to combine the features from subject and body
    ('union', FeatureUnion(
        transformer_list=[

            # Pipeline for pulling features from the text field
            ('text', Pipeline([
                ('selector', ItemSelector(key='text')),
                ('vectorizer', VectorChooser(binary = True)),
            ])),

            # Pipeline for K best selection of non-text features
            ('alt_features', Pipeline([
                ('selector', ItemSelector(key='features')),
                ('kbest', SelectKBest())
            ]))

        ]
    )),

    ('logistic', LogisticRegression())
])

parameters = {
    'logistic__solver': ('sag', 'lbfgs', 'newton-cg')
}

#Grid search across one model
grid_search = GridSearchCV(pipeline, parameters, n_jobs=-1, verbose=1,scoring = 'mean_absolute_error')

print("Performing grid search...")
print("pipeline:", [name for name, _ in pipeline.steps])
print("parameters:")
pprint(parameters)
grid_search.fit(X_train, Y_train)

print("Best score: %0.3f" % grid_search.best_score_)
print("Best parameters set:")
best_parameters = grid_search.best_estimator_.get_params()
for param_name in sorted(parameters.keys()):
    print("\t%s: %r" % (param_name, best_parameters[param_name]))

Performing grid search...
pipeline: ['union', 'logistic']
parameters:
{'logistic__solver': ('sag', 'lbfgs', 'newton-cg')}
Fitting 3 folds for each of 3 candidates, totalling 9 fits


[Parallel(n_jobs=-1)]: Done   9 out of   9 | elapsed:   44.9s finished


Best score: -0.356
Best parameters set:
	logistic__solver: 'sag'


In [152]:
# Random Forest
pipeline = Pipeline([
    # Use FeatureUnion to combine the features from subject and body
    ('union', FeatureUnion(
        transformer_list=[

            # Pipeline for pulling features from the text field
            ('text', Pipeline([
                ('selector', ItemSelector(key='text')),
                ('vectorizer', VectorChooser()),
            ])),

            # Pipeline for K best selection of non-text features
            ('alt_features', Pipeline([
                ('selector', ItemSelector(key='features')),
                ('kbest', SelectKBest())
            ]))

        ]
    )),

    ('randomforest', RandomForestClassifier())
])
          
parameters = {
    'union__text__vectorizer__ngram_range': ((1,1),(1, 2)),
    'union__text__vectorizer__binary': (True, False),
    'randomforest__min_samples_split': (1,10,100),
    'randomforest__min_samples_leaf': (10,100,1000),
}

#Grid search across one model
grid_search = GridSearchCV(pipeline, parameters, n_jobs=-1, verbose=1, scoring = 'mean_absolute_error')

print("Performing grid search...")
print("pipeline:", [name for name, _ in pipeline.steps])
print("parameters:")
pprint(parameters)
grid_search.fit(X_train, Y_train)

print("Best score: %0.3f" % grid_search.best_score_)
print("Best parameters set:")
best_parameters = grid_search.best_estimator_.get_params()
for param_name in sorted(parameters.keys()):
    print("\t%s: %r" % (param_name, best_parameters[param_name]))

Performing grid search...
pipeline: ['union', 'randomforest']
parameters:
{'randomforest__min_samples_leaf': (10, 100, 1000),
 'randomforest__min_samples_split': (1, 10, 100),
 'union__text__vectorizer__binary': (True, False),
 'union__text__vectorizer__ngram_range': ((1, 1), (1, 2))}
Fitting 3 folds for each of 36 candidates, totalling 108 fits


[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:  2.0min
[Parallel(n_jobs=-1)]: Done 108 out of 108 | elapsed:  4.6min finished


Best score: -0.379
Best parameters set:
	randomforest__min_samples_leaf: 10
	randomforest__min_samples_split: 1
	union__text__vectorizer__binary: True
	union__text__vectorizer__ngram_range: (1, 2)


In [150]:
#Naive Bayes
pipeline = Pipeline([
    # Use FeatureUnion to combine the features from subject and body
    ('union', FeatureUnion(
        transformer_list=[

            # Pipeline for pulling features from the text field
            ('text', Pipeline([
                ('selector', ItemSelector(key='text')),
                ('vectorizer', VectorChooser()),
            ])),

            # Pipeline for K best selection of non-text features
            ('alt_features', Pipeline([
                ('selector', ItemSelector(key='features')),
                ('kbest', SelectKBest())
            ]))

        ]
    )),

    ('nb', BernoulliNB())
])
          
parameters = {
    'union__text__vectorizer__ngram_range': ((1,1),(1, 2)),
    'union__text__vectorizer__binary': (True, False),
    'nb__alpha': np.power(10.0, np.arange(-2,2)),
}

#Grid search across one model
grid_search = GridSearchCV(pipeline, parameters, n_jobs=-1, verbose=1, scoring = 'mean_absolute_error')

print("Performing grid search...")
print("pipeline:", [name for name, _ in pipeline.steps])
print("parameters:")
pprint(parameters)
grid_search.fit(X_train, Y_train)

print("Best score: %0.3f" % grid_search.best_score_)
print("Best parameters set:")
best_parameters = grid_search.best_estimator_.get_params()
for param_name in sorted(parameters.keys()):
    print("\t%s: %r" % (param_name, best_parameters[param_name]))

Performing grid search...
pipeline: ['union', 'nb']
parameters:
{'nb__alpha': array([  0.01,   0.1 ,   1.  ,  10.  ]),
 'union__text__vectorizer__binary': (True, False),
 'union__text__vectorizer__ngram_range': ((1, 1), (1, 2))}
Fitting 3 folds for each of 16 candidates, totalling 48 fits


[Parallel(n_jobs=-1)]: Done  48 out of  48 | elapsed:  2.2min finished


Best score: -0.387
Best parameters set:
	nb__alpha: 10.0
	union__text__vectorizer__binary: True
	union__text__vectorizer__ngram_range: (1, 1)


In [18]:
#Logistic AT
pipeline = Pipeline([
    # Use FeatureUnion to combine the features from subject and body
    ('union', FeatureUnion(
        transformer_list=[

            # Pipeline for pulling features from the text field
            ('text', Pipeline([
                ('selector', ItemSelector(key='text')),
                ('vectorizer', VectorChooser()),
            ])),

            # Pipeline for K best selection of non-text features
            ('alt_features', Pipeline([
                ('selector', ItemSelector(key='features')),
                ('kbest', SelectKBest())
            ]))

        ]
    )),

    ('logisticat', LogisticAT())
])
          
parameters = {
    'union__text__vectorizer__ngram_range': ((1,1),(1, 2),(2,2)),
    'union__text__vectorizer__binary': (True, False),
    'union__text__vectorizer__vtype': ('tfidf', 'count_vectorizer'),
    'union__alt_features__kbest__k': (5, 10, 15, 20),
    'logisticat__alpha': (.0001,.001,.01,.1,1,10,100,1000,10000)
}

#Grid search across one model
grid_search = GridSearchCV(pipeline, parameters, n_jobs=-1, verbose=1,scoring = 'mean_absolute_error')

print("Performing grid search...")
print("pipeline:", [name for name, _ in pipeline.steps])
print("parameters:")
pprint(parameters)
grid_search.fit(X_train, Y_train)

print("Best score: %0.3f" % grid_search.best_score_)
print("Best parameters set:")
best_parameters = grid_search.best_estimator_.get_params()
for param_name in sorted(parameters.keys()):
    print("\t%s: %r" % (param_name, best_parameters[param_name]))

Performing grid search...
pipeline: ['union', 'logisticat']
parameters:
{'logisticat__alpha': (0.0001, 0.001, 0.01, 0.1, 1, 10, 100, 1000, 10000),
 'union__alt_features__kbest__k': (5, 10, 15, 20),
 'union__text__vectorizer__binary': (True, False),
 'union__text__vectorizer__ngram_range': ((1, 1), (1, 2), (2, 2)),
 'union__text__vectorizer__vtype': ('tfidf', 'count_vectorizer')}
Fitting 3 folds for each of 432 candidates, totalling 1296 fits


[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed: 10.0min


KeyboardInterrupt: 

In [None]:
binary_vectorizer = TfidfVectorizer(ngram_range=(1,2),stop_words='english')
binary_vectorizer.fit(X['cleantext'])

In [None]:
X_train_vect = binary_vectorizer.transform(X_train['cleantext'])
X_test_vect = binary_vectorizer.transform(X_test['cleantext'])

In [None]:
X_train_bin = X_train.drop(['cleantext'],axis=1)
data_coo = coo_matrix(X_train_bin.values,dtype=np.float64)
X_train_data = hstack([data_coo,X_train_vect])

In [None]:
X_test_bin = X_test.drop(['cleantext'],axis=1)
data_coo = coo_matrix(X_test_bin.values,dtype=np.float64)
X_test_data = hstack([data_coo,X_test_vect])