In [1]:
# import the necessary packages
import pandas as pd
import pickle
import datetime
from dateutil import parser
import re

In [3]:
# Load the pickled DataFrame that the rest of the notebook cleans.
# NOTE(review): unpickling can execute arbitrary code -- only load files
# from a trusted source.
PICKLED_DATA_PATH = 'pickled_data.p'
data = pd.read_pickle(PICKLED_DATA_PATH)

# Ordered (pattern, replacement) pairs applied to every lowercased description.
# Order matters: later patterns rely on what earlier ones removed or inserted.
# Patterns containing regex escapes are raw strings so Python does not treat
# sequences such as "\d" as (invalid) string escapes.
_SYMBOL_AND_MARKUP_SUBS = (
    # punctuation -> space (excess spaces are collapsed later)
    (r'[#!?*";,()]', ' '),
    # text artifacts
    ('website_redacted', ' '),
    ('\xa0', ' '),
    ('\r', ''),
    ('\t', ''),
    ('_', ' '),
    # drop a '$' that is not directly followed by a digit
    (r'[$](?!\d)', ' '),
    # drop every run of three or more digits
    (r'\d{3,}', ' '),
    # drop a literal ".00" (cents).  The original pattern '.00' left the dot
    # unescaped, so ANY character followed by "00" was removed (e.g. ":00").
    (r'\.00', ' '),
    # drop '.', ':' and '-' unless they directly follow a digit
    (r'(?<!\d)[.:-]', ' '),
    # insert a space at every letter->digit and digit->letter boundary
    (r'((?<=[a-z])\d)', r' \1'),
    (r'(\d(?=[a-z]))', r'\1 '),
    # remove html tags
    (r'<.*?>', ' '),
    # '&' -> "and"
    ('[&]', ' and '),
    # "w/" -> "with".  The word boundary keeps the abbreviation from firing
    # inside words such as "low/high" (the original turned that into
    # "lo with high").
    (r'\bw/', ' with '),
    # any remaining '/' -> space
    ('/', ' '),
)

# Abbreviation expansions; these run after whitespace has been collapsed so
# the single-space delimiters in the patterns can match.
_ABBREVIATION_SUBS = (
    (' br ', ' bedroom '),
    ('sq ft', 'square feet'),
    # trailing space restored: the original replacement ' stainless steel'
    # glued the expansion onto the following word
    (' ss ', ' stainless steel '),
    # spelling fixed: the original replacement read 'staineless steel'
    (' s s ', ' stainless steel '),
)


def _clean_description(raw_text):
    """Return the normalised, lowercased form of one description string.

    Applies the symbol/markup substitutions in order, collapses runs of
    spaces and strips the ends, then expands the known abbreviations.
    The stray "<a" fragment is NOT removed here; that happens when
    ``text_cleaned`` is built below.
    """
    cleaned = raw_text.lower()
    for pattern, replacement in _SYMBOL_AND_MARKUP_SUBS:
        cleaned = re.sub(pattern, replacement, cleaned)
    # replace excess spaces
    cleaned = re.sub(' +', ' ', cleaned).strip()
    for pattern, replacement in _ABBREVIATION_SUBS:
        cleaned = re.sub(pattern, replacement, cleaned)
    return cleaned


text_raw = list(data['description'])

# lowercased and normalised descriptions (may still contain a stray "<a")
text_lower = [_clean_description(s) for s in text_raw]

# remove leftover "<a" fragments (every occurrence, not only a trailing one)
text_cleaned = [re.sub('<a', '', s).strip() for s in text_lower]


In [4]:
# Attach the cleaned text as a new column and persist the frame.
data['cleantext'] = text_cleaned
# NOTE(review): the original cell also ran `data.drop('description', axis=1)`
# without using the result.  DataFrame.drop returns a new frame, so that call
# never removed anything; the dead statement is dropped here.  'description'
# is deliberately left in `data` because the feature-selection cell further
# down still lists it in its own drop call.
data.to_pickle('data_clean_text.p')

done


In [None]:
# Columns removed from `data` to form the feature frame X.
_NON_FEATURE_COLUMNS = [
    'bathrooms',
    'bedrooms',
    'building_id',
    'created',
    'description',
    'street_address',
    'features',
    'interest_level',
    'latitude',
    'listing_id',
    'longitude',
    'manager_id',
    'photos',
    'neighborhood',
    'display_address',
]

# Feature frame: everything in `data` except the columns listed above.
X = data.drop(_NON_FEATURE_COLUMNS, axis=1)

# Target: an independent copy of the interest_level column.
Y = data['interest_level'].copy()

In [None]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn import metrics
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import BernoulliNB
from sklearn.feature_selection import SelectKBest
from sklearn.feature_extraction import DictVectorizer
import numpy as np
from scipy.sparse import coo_matrix, hstack

In [None]:
# Despite its name, `binary_vectorizer` is a TF-IDF vectorizer over unigrams
# and bigrams with English stop words removed.
# NOTE(review): it is fitted on every row of X, not only on a training split
# -- confirm that is intended.
binary_vectorizer = TfidfVectorizer(
    stop_words='english',
    ngram_range=(1, 2),
)
binary_vectorizer.fit(X['cleantext'])

In [None]:
# Vectorise the cleaned text of each split with the already-fitted vectorizer.
X_train_vect, X_test_vect = (
    binary_vectorizer.transform(split['cleantext'])
    for split in (X_train, X_test)
)

In [None]:
# Training features without the text column.
X_train_bin = X_train.drop('cleantext', axis=1)

# Convert those columns to a float64 sparse matrix so they can be stacked
# side by side with the vectorised text.
data_coo = coo_matrix(X_train_bin.values, dtype=np.float64)

# Final training matrix: non-text columns first, then the text features.
X_train_data = hstack((data_coo, X_train_vect))

In [None]:
# Test features without the text column.
X_test_bin = X_test.drop('cleantext', axis=1)

# Convert those columns to a float64 sparse matrix (this rebinds `data_coo`,
# as the original cell did).
data_coo = coo_matrix(X_test_bin.values, dtype=np.float64)

# Final test matrix: non-text columns first, then the text features.
X_test_data = hstack((data_coo, X_test_vect))


In [None]:
# NOTE(review): the original cell called `pickle.to_pickle()`, which does not
# exist in the standard-library pickle module and raised AttributeError.
# Assumed intent: persist the assembled feature matrices -- confirm the file
# name and the payload before relying on this.
with open('train_test_features.p', 'wb') as features_file:
    pickle.dump(
        {'X_train_data': X_train_data, 'X_test_data': X_test_data},
        features_file,
    )