In [109]:
__author__ = 'Ming Li'
# This notebook is Ming's submission for the leaf classification challenge on Kaggle.
import tensorflow as tf
from tensorflow.contrib import learn
from sklearn import metrics, cross_validation, naive_bayes, preprocessing, \
pipeline, linear_model, tree, decomposition, ensemble
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import warnings
from sklearn.externals.six import StringIO
from mpl_toolkits.mplot3d import Axes3D
import statsmodels.api as sm
from minglib import gradient_descent
import pydotplus
warnings.filterwarnings('ignore')

In [110]:
# Widen pandas' display limits so the wide leaf feature frames print without truncation.
# Fully-qualified option names are used because abbreviated patterns such as
# 'max_rows' / 'max_columns' match more than one option in newer pandas releases
# (display.* and styler.render.*) and then raise OptionError.
pd.set_option('display.width', 4000)
pd.set_option('display.max_colwidth', 4000)
pd.set_option('display.max_rows', 100)
pd.set_option('display.max_columns', 200)
# display.float_format has to be a callable; a bare '%.9f' string is rejected by
# the option validator in current pandas.
pd.set_option('display.float_format', '{:.9f}'.format)

In [111]:
# Load the labelled training set and the test set from the local data folder.
train = pd.read_csv('data/leaf/train.csv')
test = pd.read_csv('data/leaf/test.csv')

In [112]:
# train.dtypes

In [113]:
# Split the training frame by dtype:
#   regressand -> whatever is neither int nor float (the species label column)
#   regressors -> whatever is neither int nor object (the float feature columns)
# Both are copies, so later edits do not touch `train`.
regressand = train.select_dtypes(exclude=('int', 'float')).copy()
regressors = train.select_dtypes(exclude=('int', 'object')).copy()

# Codifying types of species

In [114]:
# Encode each species name as an integer class id (its position in the
# categorical's category list).
# pd.Categorical(...) replaces Categorical.from_array, which was deprecated and
# later removed from pandas.
species_categories = pd.Categorical(train['species'])
# class id -> species name; used later to translate predictions back to names
mapping = dict(enumerate(species_categories.categories))
# Rebuilt from `train` instead of dropping a column in place, so the cell can be
# re-run without failing on the already-removed 'species' column.
regressand = pd.DataFrame({'species_id': species_categories.codes}, index=train.index)

# Model generalization (10-fold cross-validation)

In [115]:
# Shuffled 10-fold splitter over the training rows; the fixed seed keeps the folds reproducible.
kf_generator = cross_validation.KFold(train.shape[0], n_folds=10, shuffle=True, random_state=1)

# Feature Scaling

In [116]:
# Standardise every feature column (centre, then divide by its standard deviation).
regressors_std = regressors.apply(preprocessing.scale)  # column-wise: axis=0 is apply's default

# Logistic Regression

In [117]:
# Prepend a column of ones so the intercept is learned as an ordinary coefficient.
# The matrix is built from `regressors` rather than from `regressors_std` itself,
# so re-running this cell cannot stack a second constant column.
regressors_scaled = regressors.apply(preprocessing.scale, axis=0)
regressors_std = np.column_stack((np.ones(regressors_scaled.shape[0]), regressors_scaled))  # add constant 1

In [118]:
# Hold out 20% of the rows for a quick accuracy check (seeded for reproducibility).
x_train, x_test, y_train, y_test = cross_validation.train_test_split(
    regressors_std,
    regressand,
    test_size=.2,
    random_state=1)

In [119]:
# The design matrix already carries a manual column of ones, so the estimator
# must not add an intercept of its own.
reg = linear_model.LogisticRegression(fit_intercept=False)  # constant is supplied manually

In [120]:
# Fit the logistic model on the training split.
reg.fit(x_train, y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=False,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [121]:
# Predicted class ids for the held-out rows.
prediction = reg.predict(x_test)

In [122]:
reg.coef_.shape  # 99 one-vs-rest coefficient vectors x 193 columns (192 features + the constant)

(99, 193)

In [123]:
# Accuracy of the logistic model on the held-out split.
metrics.accuracy_score(y_test, prediction)

0.96969696969696972

In [124]:
# Mean accuracy of the logistic model across the 10 cross-validation folds.
scores = cross_validation.cross_val_score(
    reg, regressors_std, regressand, cv=kf_generator, scoring='accuracy')
scores.mean()

0.97272727272727266

In [140]:
# Give the test frame a placeholder (NaN) label column and put its columns in the
# same order as train, so the two frames can be stacked.
test = test.assign(species=np.nan)[train.columns]

In [143]:
# Stack test and train rows into one frame so every leaf can be scored in a single pass.
combined = pd.concat([test,train])

In [145]:
# Order the stacked rows by leaf id.
combined = combined.sort_values('id')

In [146]:
# Build the scoring matrix for all rows (train + test).
# The rows are standardised with the statistics of the TRAINING features only,
# i.e. the same transformation `reg` was fitted on; re-scaling the combined frame
# would shift every feature's mean/std and feed the model inputs on a different scale.
# `regressors` (the training features) is deliberately left untouched because
# later cells still pair it with the train-only `regressand`.
train_scaler = preprocessing.StandardScaler().fit(regressors)
combined_features = combined[regressors.columns]
regressors_std = np.column_stack(
    (np.ones(combined_features.shape[0]), train_scaler.transform(combined_features)))  # add constant 1

In [147]:
# Predict a class id for every row of the combined frame.
combined['species_id'] = reg.predict(regressors_std)

In [148]:
# Translate the predicted class ids back to species names.
combined['species_predicted'] = combined['species_id'].map(mapping)

In [163]:
# Keep only the identifier, the true label (NaN for test rows) and the predicted label.
# Columns are selected by name: picking them by dtype only produced this set because
# `species_id` happened not to match the 'int' alias.
result = combined[['id', 'species', 'species_predicted']].copy()

In [190]:
# Spot-check one row: true species next to the predicted species.
result.iloc[1]

id                                       2
species              Pterocarya_Stenoptera
species_predicted    Pterocarya_Stenoptera
Name: 1, dtype: object

In [178]:
# Species name behind class id 3.
mapping[3]

'Acer_Opalus'

# Gaussian Naive Bayes

In [77]:
# 80/20 split of the unscaled features, using the same seed as the earlier split.
x_train, x_test, y_train, y_test = cross_validation.train_test_split(
    regressors, regressand, test_size=.2, random_state=1)

In [78]:
# Gaussian naive Bayes classifier with default settings.
clf = naive_bayes.GaussianNB()

In [79]:
# Fit on the training split.
clf.fit(x_train, y_train)

GaussianNB()

In [80]:
# Predicted class ids for the held-out rows.
prediction = clf.predict(x_test)

In [81]:
# Accuracy of the current classifier on the held-out split.
metrics.accuracy_score(y_test, prediction)

0.47474747474747475

In [82]:
# Mean accuracy of the current classifier across the 10 cross-validation folds.
scores = cross_validation.cross_val_score(
    clf, regressors, regressand, cv=kf_generator, scoring='accuracy')
scores.mean()

0.48181818181818176

# Random Forest

In [83]:
# Random forest with capped depth, capped leaf count and a minimum leaf size; seeded for reproducibility.
# Note: this rebinds `clf`, which previously held the naive Bayes model.
clf = ensemble.RandomForestClassifier(max_depth=100, max_leaf_nodes=500, min_samples_leaf=3, random_state=1)

In [84]:
# Fit the forest on the training split.
clf.fit(x_train, y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=100, max_features='auto', max_leaf_nodes=500,
            min_samples_leaf=3, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
            oob_score=False, random_state=1, verbose=0, warm_start=False)

In [85]:
# Predicted class ids for the held-out rows.
prediction = clf.predict(x_test)

In [86]:
# Accuracy of the current classifier on the held-out split.
metrics.accuracy_score(y_test, prediction)

0.83333333333333337

In [87]:
# Mean accuracy of the current classifier across the 10 cross-validation folds.
scores = cross_validation.cross_val_score(
    clf, regressors, regressand, cv=kf_generator, scoring='accuracy')
scores.mean()

0.83131313131313134

# Neural Network

In [3]:
# Cast the inputs to plain numeric types: integer class ids and float features.
target = regressand.values.astype(int)
features = regressors.astype(float)

NameError: name 'regressors' is not defined

In [4]:
# 80/20 split of the cast features and targets (seeded for reproducibility).
x_train, x_test, y_train, y_test = cross_validation.train_test_split(
    features,
    target,
    test_size=.2,
    random_state=1)

NameError: name 'features' is not defined

In [5]:
def main(unused_argv, hidden_units=(20, 20, 20, 20), steps=200, n_classes=99):
    """Train a fully connected DNN classifier on the notebook's split and print its accuracy.

    Reads ``x_train``, ``x_test``, ``y_train`` and ``y_test`` from notebook scope.

    Args:
        unused_argv: argument list handed over by ``tf.app.run``; ignored.
        hidden_units: width of each hidden layer (default: four layers of 20 units).
        steps: number of training steps passed to ``classifier.fit``.
        n_classes: number of target classes.

    Side effects:
        Prints the accuracy on the held-out split and publishes the predicted
        class ids under the notebook-level name ``predictions`` so that later
        cells can inspect them. Nothing is returned, because ``tf.app.run``
        forwards the return value of ``main`` to ``sys.exit``.
    """
    global predictions

    # Infer the input columns from the training data and build the network.
    feature_columns = learn.infer_real_valued_columns_from_input(x_train)
    classifier = learn.DNNClassifier(
        feature_columns=feature_columns, hidden_units=list(hidden_units), n_classes=n_classes)

    # Fit and predict.
    classifier.fit(x_train, y_train, steps=steps)
    predictions = list(classifier.predict(x_test, as_iterable=True))
    score = metrics.accuracy_score(y_test, predictions)
    print('Accuracy: {:.4f}'.format(score))

In [6]:
if __name__ == '__main__':
    # main() is called directly: tf.app.run() finishes with sys.exit(), which
    # raises SystemExit inside a notebook kernel, so nothing could ever be
    # assigned from its return value.
    main(None)

NameError: name 'x_train' is not defined

In [7]:
# Inspect the DNN predictions; this needs `predictions` to exist at notebook scope.
predictions

NameError: name 'predictions' is not defined