In [14]:
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline,FeatureUnion
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn import metrics
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import BernoulliNB
from sklearn.feature_selection import SelectKBest
from sklearn.feature_extraction import DictVectorizer
import numpy as np
from scipy.sparse import coo_matrix, hstack
import pandas as pd

In [6]:
# Load the pre-processed listings frame from a local pickle file.
# NOTE(review): unpickling can execute arbitrary code, so only load this file
# if it comes from a trusted source.
data = pd.read_pickle('data_clean_text.p')

In [45]:
# Feature frame: every column of `data` except the ones listed below
# (the list includes the target column `interest_level`).
X = data.drop(
    [
        'bathrooms', 'latitude', 'longitude', 'building_id', 'created',
        'street_address', 'description', 'display_address', 'features',
        'listing_id', 'manager_id', 'photos', 'interest_level', 'neighborhood',
    ],
    axis=1,
)
# Target labels, copied so later edits to `data` cannot alter them.
Y = data['interest_level'].copy()
# 80/20 train/test split. The fixed random_state makes the split (and every
# score computed from it) reproducible across kernel restarts.
X_train, X_test, Y_train, Y_test = train_test_split(
    X, Y, train_size=0.8, random_state=42
)

In [42]:
class ItemSelector(BaseEstimator, TransformerMixin):
    """Stateless column switch used at the head of each feature branch.

    With ``key == 'text'`` the transformer yields the ``'cleantext'`` column;
    with any other key it yields the input with ``'cleantext'`` dropped.
    """

    def __init__(self, key):
        self.key = key

    def fit(self, x, y=None):
        # Nothing is learned from the data.
        return self

    def transform(self, data):
        wants_text = self.key == 'text'
        return data['cleantext'] if wants_text else data.drop('cleantext', axis=1)


class VectorChooser(BaseEstimator, TransformerMixin):
    """Switch for choosing between TF-IDF and raw-count text vectorizers.

    Parameters
    ----------
    vtype : str, default 'tfidf'
        'tfidf' selects ``TfidfVectorizer``; any other value selects
        ``CountVectorizer``.
    binary : bool, default False
        Forwarded to the chosen vectorizer's ``binary`` option.
    ngram_range : tuple (min_n, max_n), default (1, 1)
        Forwarded to the chosen vectorizer's ``ngram_range`` option.

    Attributes
    ----------
    vectorizer_ : TfidfVectorizer or CountVectorizer
        The fitted underlying vectorizer; set by ``fit``.
    """
    def __init__(self, vtype='tfidf',binary=False,ngram_range=(1,1)):
        # Store each constructor argument unchanged under its own name:
        # get_params()/set_params()/clone() read and write these attributes,
        # so they must exist for grid search and cloning to work.
        self.vtype = vtype
        self.binary = binary
        self.ngram_range = ngram_range

    def _make_vectorizer(self):
        """Build an unfitted vectorizer from the current parameter values."""
        vectorizer_cls = TfidfVectorizer if self.vtype == 'tfidf' else CountVectorizer
        # Keyword arguments are required here: the first two positional
        # parameters of both vectorizers are `input` and `encoding`, not
        # `binary` and `ngram_range`.
        return vectorizer_cls(
            binary=self.binary,
            ngram_range=self.ngram_range,
            stop_words='english',
        )

    def fit(self, x, y=None):
        """Fit the selected vectorizer on the documents in ``x``; returns self."""
        # Built at fit time so values changed through set_params() take effect.
        self.vectorizer_ = self._make_vectorizer().fit(x)
        return self

    def transform(self, data):
        """Vectorize ``data`` with the vectorizer fitted in ``fit``."""
        return self.vectorizer_.transform(data)

In [43]:
pipeline = Pipeline([
    # Two feature branches (text and non-text) are built side by side,
    # concatenated, and passed to a logistic-regression classifier.

    # Use FeatureUnion to combine the text and non-text feature blocks
    ('union', FeatureUnion(
        transformer_list=[

            # Pipeline for vectorizing the 'cleantext' column
            # (VectorChooser() defaults to vtype='tfidf')
            ('text', Pipeline([
                ('selector', ItemSelector(key='text')),
                ('vectorizer', VectorChooser()),
            ])),

            # Pipeline for K best selection of non-text features
            # (any key other than 'text' makes ItemSelector drop 'cleantext')
            ('alt_features', Pipeline([
                ('selector', ItemSelector(key='features')),
                ('kbest', SelectKBest())
            ]))

        ]
    )),

    # Logistic-regression classifier on the combined features
    ('logistic',LogisticRegression())
])

In [46]:
# Fit both feature branches and the classifier on the training split only.
# NOTE(review): the recorded output contains a stray `f = msb / msw` line,
# presumably a divide warning raised while SelectKBest scored the features;
# confirm whether some non-text columns are constant.
pipeline.fit(X_train,Y_train)

  f = msb / msw


Pipeline(steps=[('union', FeatureUnion(n_jobs=1,
       transformer_list=[('text', Pipeline(steps=[('selector', ItemSelector(key='text')), ('vectorizer', VectorChooser(binary=None, ngram_range=None, vtype='tfidf'))])), ('alt_features', Pipeline(steps=[('selector', ItemSelector(key='features')), ('kbest', Sel...ty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False))])

In [47]:
# Hold-out accuracy: fraction of test rows whose predicted interest level
# equals the true label. Uses the library metric instead of a Python-level
# sum over an element-wise divided Series.
metrics.accuracy_score(Y_test, pipeline.predict(X_test))

0.70104346064226208

In [None]:
# Stand-alone TF-IDF vectorizer over unigrams and bigrams with English stop
# words removed. Despite its name it is not binary (no `binary=True` is
# passed); the name is kept because the following cells reference it.
binary_vectorizer = TfidfVectorizer(ngram_range=(1,2),stop_words='english')
# Fit on the training text only, so vocabulary and IDF weights are not learned
# from test rows (matches how the pipeline above is fitted).
binary_vectorizer.fit(X_train['cleantext'])

In [None]:
# Vectorize the text column of each split with the already-fitted vectorizer
# (training split first, then test split).
X_train_vect, X_test_vect = (
    binary_vectorizer.transform(split['cleantext'])
    for split in (X_train, X_test)
)

In [None]:
# Non-text part of the training split: every column except 'cleantext'.
X_train_bin = X_train.drop(labels=['cleantext'], axis=1)
# Wrap those values as a float64 sparse matrix so they can be stacked with
# the vectorized text.
data_coo = coo_matrix(X_train_bin.values, dtype=np.float64)
# Column-wise concatenation: [non-text columns | text vector columns].
X_train_data = hstack((data_coo, X_train_vect))

In [None]:
# Non-text part of the test split: every column except 'cleantext'.
X_test_bin = X_test.drop(labels=['cleantext'], axis=1)
# Wrap those values as a float64 sparse matrix (this rebinds `data_coo`,
# which the previous cell used for the training split).
data_coo = coo_matrix(X_test_bin.values, dtype=np.float64)
# Column-wise concatenation: [non-text columns | text vector columns].
X_test_data = hstack((data_coo, X_test_vect))