In [1]:
import numpy as np
import pandas
from sklearn import clone
from sklearn import svm
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import classification_report
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier

# Load data

In [2]:
# Path to the raw dataset, relative to the notebook's working directory.
csv_path = 'marriott_data/marriott_dataset1.csv'

# Read the CSV, forcing the 'Account' column to NumPy's default integer type.
df_marriot = pandas.read_csv(
    filepath_or_buffer=csv_path,
    dtype={'Account': np.int_},
)

# Make BoW

In [3]:
# Values of the 'DetailedDescr' column as a plain Python list
# (presumably one free-text description per row -- confirm against the CSV).
list_detailed_descr = df_marriot['DetailedDescr'].tolist()

# Bag of character n-grams: lower-cased text, n-grams of length 2 to 15,
# using the 'char_wb' analyzer (character n-grams taken inside word boundaries).
vectorizer = CountVectorizer(lowercase=True, ngram_range=(2, 15), analyzer='char_wb')
document_term_matrix = vectorizer.fit_transform(list_detailed_descr)  # scipy.sparse CSR matrix

# Transform dataframes into features/target

In [4]:
# Classification target: the 'Account' column of the loaded dataset.
Y = df_marriot.loc[:, 'Account']

In [5]:
# Convert the scipy CSR document-term matrix into a pandas DataFrame with one
# row per document and one integer-labelled column per n-gram feature.
#
# `pandas.SparseDataFrame` / `pandas.SparseSeries` were removed in pandas 1.0.
# The previous row-by-row construction already densified every row through
# `.toarray()`, so a single bulk dense conversion yields the same values with
# the same default 0..n-1 row index (which lines up with `Y`).
X = pandas.DataFrame(document_term_matrix.toarray())

# Train-test split

In [6]:
# Hold out a test set. `test_size=0.25` is scikit-learn's default when neither
# `test_size` nor `train_size` is given; it is spelled out here so the split
# ratio is visible to the reader. `random_state=0` fixes the shuffle so the
# same rows are held out on every run.
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.25, random_state=0)

In [7]:
# Training split: number of feature rows and number of target values
X_train.shape[0], Y_train.shape[0]

(82, 82)

In [8]:
# Test split: number of feature rows and number of target values
X_test.shape[0], Y_test.shape[0]

(28, 28)

# Scale

In [9]:
# Fit the scaler on the training split only, so that no statistics from the
# held-out test rows leak into the scaling parameters.
scaler = preprocessing.StandardScaler()
scaler = scaler.fit(X_train)

In [10]:
# Apply the scaler fitted on the training split to both splits.
X_train_scaled, X_test_scaled = (
    scaler.transform(split) for split in (X_train, X_test)
)

# Classifications

All of the following classifiers are evaluated on the same 28 held-out test items.

In [11]:
def classify_and_score(classifier, X_train_scaled, Y_train, X_test_scaled, Y_test, print_report=False,
                       random_state=None, n_estimators=30):
    """Fit the named classifier on the training split and score it on the test split.

    Parameters
    ----------
    classifier : str
        Which model to build: 'decision_tree', 'linear_svc' or 'random_forest'.
    X_train_scaled, Y_train
        Features and targets the model is fitted on.
    X_test_scaled, Y_test
        Features and targets the model is evaluated on.
    print_report : bool, default False
        When true, print the fitted estimator, a blank line, and the
        classification report for the test split.
    random_state : int or None, default None
        Seed forwarded to the estimator's ``random_state`` argument. ``None``
        keeps the previous unseeded behaviour; pass an integer to make
        repeated runs comparable.
    n_estimators : int, default 30
        Number of trees; only used when ``classifier == 'random_forest'``.

    Returns
    -------
    The value of ``clf.score(X_test_scaled, Y_test)`` for the fitted estimator.

    Raises
    ------
    ValueError
        If ``classifier`` is not one of the supported names.
    """
    # Factories are wrapped in lambdas so that only the requested estimator is
    # instantiated, and so that an unknown name is rejected before any model
    # is built.
    factories = {
        'decision_tree': lambda: DecisionTreeClassifier(random_state=random_state),
        'linear_svc': lambda: svm.LinearSVC(random_state=random_state),
        'random_forest': lambda: RandomForestClassifier(n_estimators=n_estimators,
                                                        random_state=random_state),
    }
    if classifier not in factories:
        # Previously an unknown name fell through every branch and crashed
        # with an UnboundLocalError on `clf`; fail with a clear message instead.
        raise ValueError(
            'Unknown classifier {!r}; expected one of {}'.format(classifier, sorted(factories))
        )

    clf = factories[classifier]()
    clf.fit(X_train_scaled, Y_train)

    # Predictions are only needed for the report; `score` predicts internally.
    if print_report:
        Y_prediction = clf.predict(X_test_scaled)
        print(clf)
        print()
        print(classification_report(Y_test, Y_prediction))
    return clf.score(X_test_scaled, Y_test)

In [12]:
# Test-split score of the decision tree
classify_and_score(
    classifier='decision_tree',
    X_train_scaled=X_train_scaled,
    Y_train=Y_train,
    X_test_scaled=X_test_scaled,
    Y_test=Y_test,
)

0.14285714285714285

In [13]:
# Test-split score of the linear support vector classifier
classify_and_score(
    classifier='linear_svc',
    X_train_scaled=X_train_scaled,
    Y_train=Y_train,
    X_test_scaled=X_test_scaled,
    Y_test=Y_test,
)

0.071428571428571425

In [14]:
# Test-split score of the random forest
classify_and_score(
    classifier='random_forest',
    X_train_scaled=X_train_scaled,
    Y_train=Y_train,
    X_test_scaled=X_test_scaled,
    Y_test=Y_test,
)

0.14285714285714285