In [1]:
df = pd.read_parquet('review-business.parquet')
df.head(1)

Unnamed: 0,business_id,categories,city,latitude,longitude,name,neighborhoods,open,review_count,stars,state,date,review_id,user_id,text
0,-tCiAYMimz6yQaaXiK7e6Q,"[American (Traditional), Restaurants]",Las Vegas,36.086334,-115.129363,Village Pub & Grill,[Southeast],True,45,3.0,NV,2014-08-26,UWNK-FcCyq5dyBs4rt4lHQ,wKiefYFGWmleBgfsoDlYow,I love this place every time I fly out of here...


In [2]:
# Selects used features
df = df[['categories', 'city', 'latitude', 'longitude', 'review_count', 'stars', 'state', 'text']]
df['target'] = df['categories'].apply(set(['Nightlife']).issubset)
del df['categories']

# Shows some rows
df[df['target'] == True].head(3)
df[df['target'] == False].head(3)

Unnamed: 0,city,latitude,longitude,review_count,stars,state,text,target
26,Las Vegas,36.127754,-115.224318,19,4.0,NV,I've only been here a couple of times. In fact...,True
27,Las Vegas,36.127754,-115.224318,19,4.0,NV,"Used to be my high school hangout. School, wor...",True
49,Pittsburgh,40.429391,-79.922302,4,3.5,PA,Some of my friends really like this place so w...,True


Unnamed: 0,city,latitude,longitude,review_count,stars,state,text,target
0,Las Vegas,36.086334,-115.129363,45,3.0,NV,I love this place every time I fly out of here...,False
1,Las Vegas,36.18336,-115.30983,46,4.0,NV,Capistrami great as always. \n\nThe service en...,False
2,Las Vegas,36.18336,-115.30983,46,4.0,NV,delish.\n\ncheese steak. to die for\n\ni alway...,False


In [3]:
nightlife = df[df['target'] == True]
len(nightlife)

10549

In [4]:
restaurants = df[df['target'] == False]
len(restaurants)

42589

In [5]:
# Down-sampling to mitigate class-imbalanced data
df = pd.concat([nightlife, restaurants.sample(n=10549)])

In [20]:
from sklearn.model_selection import train_test_split

X = df[df.columns[df.columns != 'target']]
y = df['target']
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.75, test_size=0.25, shuffle=True, stratify=None)

In [7]:
# From text to a feature vector
from sklearn.feature_extraction import text

# bow_transform = text.CountVectorizer(max_features=500, min_df=0.0, max_df=1.0)
bow_transform = text.CountVectorizer()
X_train_bow = bow_transform.fit_transform(X_train['text'])
X_text_bow = bow_transform.transform(X_test['text'])
len(bow_transform.vocabulary_)

35294

In [8]:
# Feature scaling (TF-IDF & L2 normalization) 
tfidf_trfm = text.TfidfTransformer(norm='l2')
X_train_tfidf = tfidf_trfm.fit_transform(X_train_bow)
X_test_tfidf = tfidf_trfm.transform(X_text_bow)

In [9]:
# Uses logistic regression
from sklearn.linear_model import LogisticRegression

clf = LogisticRegression(solver='liblinear', C=1.0).fit(X_train_tfidf, y_train)
clf.score(X_text_bow, y_test)

0.762085308056872

In [10]:
from sklearn import tree

clf = tree.DecisionTreeClassifier(max_depth=32)
clf.fit(X_train_tfidf, y_train)
clf.score(X_text_bow, y_test)

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=32,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=None,
            splitter='best')

0.586350710900474

In [11]:
# K-means model stacking 
from sklearn.cluster import MiniBatchKMeans
# from sklearn.cluster import KMeans

kmeans = MiniBatchKMeans(n_clusters=8, max_iter=300, batch_size=100, random_state=0)
# kmeans = KMeans(n_clusters=2, max_iter=300, random_state=0)
X_train_kmeans = kmeans.fit(X_train_tfidf)
X_test_kmeans = kmeans.predict(X_test_tfidf)
X_train_kmeans.labels_
X_test_kmeans

array([3, 3, 3, ..., 3, 7, 2], dtype=int32)

array([2, 6, 2, ..., 1, 3, 7], dtype=int32)

In [12]:
# Replaces texts with their cluster IDs
X_train['text_kmeans'] = X_train_kmeans.labels_
del X_train['text']
X_test['text_kmeans'] = X_test_kmeans
del X_test['text']

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  after removing the cwd from sys.path.


In [13]:
X_train.head(1)
X_test.head(1)

Unnamed: 0,city,latitude,longitude,review_count,stars,state,text_kmeans
7359,Scottsdale,33.466126,-111.917539,151,3.0,AZ,3


Unnamed: 0,city,latitude,longitude,review_count,stars,state,text_kmeans
8481,Charlotte,35.161825,-80.737726,15,2.5,NC,2


In [14]:
import category_encoders as ce

oe = ce.OrdinalEncoder(cols=['city', 'state'], handle_unknown='impute')
X_train = oe.fit_transform(X_train)
X_test = oe.transform(X_test)

In [15]:
# Handles some errors
X_train['city'] = X_train['city'].fillna(0).astype(np.int32)
X_test['city'] = X_test['city'].fillna(0).astype(np.int32)

In [16]:
X_train.head(10)
X_test.head(10)

Unnamed: 0,city,latitude,longitude,review_count,stars,state,text_kmeans
7359,1,33.466126,-111.917539,151,3.0,1,3
40716,2,36.10259,-115.170541,702,4.5,2,3
22396,3,33.658619,-111.956581,13,4.0,1,3
22011,4,43.074872,-89.396256,51,3.0,3,4
43809,2,36.126177,-115.193971,1514,3.5,2,2
1935,2,36.10156,-115.172953,496,4.0,2,7
52547,2,36.117632,-115.174907,1542,4.0,2,3
20440,1,33.50109,-111.9255,32,4.5,1,3
28819,3,33.656209,-112.013147,87,3.0,1,7
21835,5,40.434176,-79.922987,73,3.5,4,7


Unnamed: 0,city,latitude,longitude,review_count,stars,state,text_kmeans
8481,18,35.161825,-80.737726,15,2.5,10.0,2
14730,3,33.451575,-112.069986,273,3.5,1.0,6
20407,21,33.654815,-112.188568,49,3.5,1.0,2
13725,2,36.068778,-115.17684,84,2.5,2.0,3
16370,2,36.118379,-115.17262,317,4.0,2.0,2
18598,12,55.950748,-3.190079,21,3.5,7.0,2
17789,21,33.54056,-112.2633,67,2.0,1.0,7
45932,2,36.158033,-115.3352,214,3.5,2.0,6
48378,1,33.526354,-111.924964,114,3.5,1.0,3
37006,4,43.092098,-89.354732,131,3.5,3.0,3


In [19]:
clf = tree.DecisionTreeClassifier(max_depth=32)
clf.fit(X_train, y_train)
clf.score(X_test, y_test)

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=32,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=None,
            splitter='best')

0.8636

In [98]:
import pandas_profiling

X_test.profile_report(style={'full_width':True})

