In [34]:
# Common modules
import numpy as np
import pandas as pd
import os
from matplotlib import pyplot as plt
from sklearn import preprocessing
from sklearn.model_selection import train_test_split, cross_val_score, cross_validate
from sklearn.metrics import accuracy_score, auc, roc_curve

# Decision tree specific modules
from sklearn import tree

from scipy.sparse import load_npz

from sklearn.feature_extraction.text import CountVectorizer #Can use tfidffvectorizer as well
import pandas as pd 
import os
import re
import numpy as np

from nltk.stem import PorterStemmer
from matplotlib import pyplot as plt
from collections import defaultdict, Counter

In [35]:
BASE_PATH = os.path.join(os.getcwd(), os.pardir)
DATA_PATH = os.path.join(BASE_PATH, 'data')

In [36]:
X_train = load_npz(os.path.join(DATA_PATH, 'training_feats.npz'))

In [37]:
X_test = load_npz(os.path.join(DATA_PATH, 'test_feats.npz'))

In [38]:
X_train.shape

(49308, 35520)

In [39]:
X_test.shape

(74659, 35520)

In [40]:
train_df = pd.read_json(os.path.join(BASE_PATH, '01-milestone1', 'imputed_train.json'))
# test_df =  pd.read_json(os.path.join(DATA_PATH, 'test.json.zip'))

In [41]:
# train_df = pd.DataFrame(train_df)
# test_df =  pd.DataFrame(test_df)

In [42]:
# X_train = train_df.drop(columns=['interest_level'])
y_train = train_df['interest_level']

# Preprocessing
As described in milestone 1, we will do some feature extraction.


In [12]:
def preprocess(train_df, min_feats=5):
    bathrooms = train_df['bathrooms']
    bedrooms = train_df['bedrooms']
    building_ids = train_df['building_id']
    latitudes = train_df['latitude']
    longitudes = train_df['longitude']
    manager_ids = train_df['manager_id']
    prices = train_df['price']
    
    datetime = pd.to_datetime(train_df['created'])
    
    months = datetime.dt.month
    days = datetime.dt.day
    hours = datetime.dt.hour
    
    # Where Monday = 0, and Sunday = 6
    weekdays = datetime.dt.dayofweek
    
    num_photos = train_df['photos'].str.len()
    
    features = train_df['features'].apply(lambda x: [i.lower() for i in x])

    feature_counts = Counter()
    for feature in features.tolist():
        feature_counts.update(feature)
    feature = sorted([k for (k,v) in feature_counts.items() if v > min_feats])
    
    key2original = defaultdict(list)
    k = 4
    for f in feature:
        cleaned = clean(f)
        key = cleaned[:k].strip()
        key2original[key].append(f)
    
    columns = list(key2original.keys())
    
    # reverse hash table
    original2key = {}
    for col in columns:
        for original in key2original[col]:
            original2key[original] = col
            
    all_listing_features = {}

    for index,row in train_df.iterrows():
        listing_features = {}
        features_found = []
        for feature in row['features']:
            feature = feature.lower()
            if feature in original2key:
                features_found.append(original2key[feature])
        for feature in columns:
                if feature not in features_found:
                    listing_features[feature] = 0
                else:
                    listing_features[feature] = 1
        all_listing_features[row['listing_id']] = listing_features

    one_hot_features = pd.DataFrame.from_dict(all_listing_features, orient='index')
    
    # Description attribute
    
    descriptions = train_df[['description']]
    # Removes symbols, numbers and stem the words to reduce dimentional space
    stemmer = PorterStemmer()
    
    descriptions['description_new'] = descriptions.description.apply(lambda x: clean_description(x, stemmer))

    cvect_desc = CountVectorizer(stop_words='english', max_features=200)
    full_sparse = cvect_desc.fit_transform(descriptions.description_new)

    # Renaming words to avoid collisions with other feature names in the model
    col_desc = ['desc_'+ i for i in cvect_desc.get_feature_names()] 
    count_vect_df = pd.DataFrame(full_sparse.todense(), columns=col_desc)
    descriptions = pd.concat([descriptions.reset_index(), count_vect_df],axis=1)
    
    descriptions = descriptions.drop(labels=['description', 'index', 'description_new'], axis=1)
    descriptions.index = train_df['listing_id']
    
    # The final dataframe to be returned
    final_train_df = pd.DataFrame()
    
    final_train_df['bathrooms'] = bathrooms
    final_train_df['bedrooms'] = bedrooms
    final_train_df['building_ids'] = building_ids 
    final_train_df['latitudes'] = latitudes
    final_train_df['longitudes'] = longitudes
    final_train_df['manager_ids'] = manager_ids
    final_train_df['prices'] = prices
    final_train_df['months'] = months
    final_train_df['days'] = days
    final_train_df['hours'] = hours
    final_train_df['weekdays'] = weekdays
    final_train_df['num_photos'] = num_photos
    final_train_df.index = train_df['listing_id']
    
    final_train_df = final_train_df.merge(descriptions, left_index=True, right_index=True)
    final_train_df = final_train_df.merge(one_hot_features, left_index=True, right_index=True)
    
    final_train_df = pd.concat([final_train_df, pd.get_dummies(final_train_df['building_ids'], prefix='building')], axis=1)
    final_train_df = pd.concat([final_train_df, pd.get_dummies(final_train_df['manager_ids'], prefix='manager')], axis=1)
    final_train_df = final_train_df.drop(['building_ids', 'manager_ids'], axis=1)

    return final_train_df
    

In [13]:
def clean(s):
    x = s.replace("-", "")
    x = x.replace(" ", "")
    x = x.replace("twenty four hour", "24")
    x = x.replace("24/7", "24")
    x = x.replace("24hr", "24")
    x = x.replace("24-hour", "24")
    x = x.replace("24hour", "24")
    x = x.replace("24 hour", "24")
    x = x.replace("common", "cm")
    x = x.replace("concierge", "doorman")
    x = x.replace("bicycle", "bike")
    x = x.replace("private", "pv")
    x = x.replace("deco", "dc")
    x = x.replace("decorative", "dc")
    x = x.replace("onsite", "os")
    x = x.replace("outdoor", "od")
    x = x.replace("ss appliances", "stainless")
    return x

def clean_description(x, stemmer):
    regex = re.compile('[^a-zA-Z ]')
    # For user clarity, broken it into three steps
    i = regex.sub(' ', x).lower()
    i = i.split(" ") 
    i= [stemmer.stem(l) for l in i]
    i= " ".join([l.strip() for l in i if (len(l)>2) ]) # Keeping words that have length greater than 2
    return i

def feature_hash(x):
    cleaned = clean(x, uniq)
    key = cleaned[:4].strip()
    return key

In [17]:
# X_train = preprocess(X_train)
# X_test = preprocess(test_df)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


In [11]:
# # For submission purposes, we need to store the listing_id of X_test
# X_test_listing_ids = X_test.index

## Training the model

We first instantiate a decision tree model and train it naively using all the features.

In [43]:
# Convert labels from {low, medium high} -> {0, 1, 2}
le = preprocessing.LabelEncoder()
y_train = le.fit_transform(y_train)

## Train-test split

We need to split the training data into a training set and a validation set (to evaluate the performance of the model).

For now we do 5-fold cross_validation.

In [44]:
seed = 36201431

In [45]:
from sklearn.feature_selection import mutual_info_classif

In [49]:
mutual_info = mutual_info_classif(X_train, y_train, random_state=seed)

In [50]:
top_1000_features = mutual_info.argsort()[-1000:][::-1]

In [47]:
X_train.shape

(49308, 35520)

In [48]:
X_test.shape

(74659, 35520)

In [51]:
# Only use these top 1000 features when training, to eliminate overfitting.
X_train_final = X_train[:,top_1000_features]
X_test_final = X_test[:,top_1000_features]

In [56]:
criterions = ['gini', 'entropy']
for criterion in criterions:
    model = tree.DecisionTreeClassifier(criterion=criterion)
    scores = cross_validate(model, X_train_final, y_train, scoring='neg_log_loss', cv=5)
    # Convert negative log loss to log loss
    test_scores = -1 * scores['test_score']
    print("Log loss score using {0} as the criterion:".format(criterion))
    print(test_scores.mean().round(4))

Log loss score using gini as the criterion:
11.6219
Log loss score using entropy as the criterion:
11.6409


Now we can try training on the entire train set. We will obtain a model, train it on the test dataset, then submit to kaggle for evaluation.

In [57]:
model = tree.DecisionTreeClassifier(criterion='gini')
model.fit(X_train_final, y_train)

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
                       max_features=None, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, presort=False,
                       random_state=None, splitter='best')

In [58]:
y_pred = model.predict_proba(X_test_final)

In [59]:
le.classes_

array(['high', 'low', 'medium'], dtype=object)

Since the order is not correct (low corresponds to the label 1, but in the csv they expect the low to be the third column), we must swap columns for y_pred.

In [None]:
def create_submission_csv(X_test, y_pred):
    df = pd.DataFrame(y_pred, columns=le.classes_)
    df.index = X_test.index
    df.index.name = 'listing_id'
    return df

In [None]:
output = create_submission_csv(X_test, y_pred)
output.to_csv('decision_tree_predictions_1.csv')

# Score from Kaggle
9.18820 <- ignore, this is the old score

Some additional things to try:

- entropy for information gain
- limiting max_depth to like 30 or so
- class_weight = balanced
- ccp_alpha = ??? 