In [1]:
import time
import pandas as pd
import numpy as np
from pandas import DataFrame,Series

from sklearn import linear_model, cross_validation, feature_selection, manifold, decomposition, random_projection
from sklearn.preprocessing import MinMaxScaler,LabelEncoder
from sklearn.ensemble import GradientBoostingClassifier,BaggingRegressor,RandomForestClassifier
from sklearn.learning_curve import learning_curve
from sklearn.cross_validation import StratifiedKFold
from sklearn.svm import LinearSVC,SVC
from sklearn.metrics import log_loss

from sklearn.multiclass import OneVsRestClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.grid_search import GridSearchCV

import matplotlib.pyplot as plt
import seaborn as sns

%matplotlib inline



In [2]:
train_df = pd.read_csv('train.csv')
train_df.fillna(0,inplace=True)
train_df

Unnamed: 0,id,species,margin1,margin2,margin3,margin4,margin5,margin6,margin7,margin8,...,texture55,texture56,texture57,texture58,texture59,texture60,texture61,texture62,texture63,texture64
0,1,Acer_Opalus,0.007812,0.023438,0.023438,0.003906,0.011719,0.009766,0.027344,0.000000,...,0.007812,0.000000,0.002930,0.002930,0.035156,0.000000,0.000000,0.004883,0.000000,0.025391
1,2,Pterocarya_Stenoptera,0.005859,0.000000,0.031250,0.015625,0.025391,0.001953,0.019531,0.000000,...,0.000977,0.000000,0.000000,0.000977,0.023438,0.000000,0.000000,0.000977,0.039062,0.022461
2,3,Quercus_Hartwissiana,0.005859,0.009766,0.019531,0.007812,0.003906,0.005859,0.068359,0.000000,...,0.154300,0.000000,0.005859,0.000977,0.007812,0.000000,0.000000,0.000000,0.020508,0.002930
3,5,Tilia_Tomentosa,0.000000,0.003906,0.023438,0.005859,0.021484,0.019531,0.023438,0.000000,...,0.000000,0.000977,0.000000,0.000000,0.020508,0.000000,0.000000,0.017578,0.000000,0.047852
4,6,Quercus_Variabilis,0.005859,0.003906,0.048828,0.009766,0.013672,0.015625,0.005859,0.000000,...,0.096680,0.000000,0.021484,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.031250
5,8,Magnolia_Salicifolia,0.070312,0.093750,0.033203,0.001953,0.000000,0.152340,0.007812,0.000000,...,0.145510,0.000000,0.041992,0.000000,0.005859,0.000000,0.000000,0.000000,0.001953,0.013672
6,10,Quercus_Canariensis,0.021484,0.031250,0.017578,0.009766,0.001953,0.042969,0.039062,0.000000,...,0.085938,0.000000,0.040039,0.000000,0.009766,0.000000,0.000000,0.000000,0.039062,0.003906
7,11,Quercus_Rubra,0.000000,0.000000,0.037109,0.050781,0.003906,0.000000,0.003906,0.000000,...,0.038086,0.025391,0.009766,0.002930,0.021484,0.000000,0.037109,0.006836,0.002930,0.036133
8,14,Quercus_Brantii,0.005859,0.001953,0.033203,0.015625,0.001953,0.000000,0.023438,0.000000,...,0.000000,0.000000,0.008789,0.000000,0.017578,0.000000,0.000000,0.000977,0.033203,0.074219
9,15,Salix_Fragilis,0.000000,0.000000,0.009766,0.037109,0.072266,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.070312,0.013672,0.192380,0.000000,0.074219,0.000000,0.000000


In [3]:
le = LabelEncoder().fit(train_df.species)
labels = le.transform(train_df.species)
labels

array([ 3, 49, 65, 94, 84, 40, 54, 78, 53, 89, 98, 16, 74, 50, 58, 31, 43,
        4, 75, 44, 83, 84, 13, 66, 15,  6, 73, 22, 73, 31, 36, 27, 94, 88,
       12, 28, 21, 25, 20, 60, 84, 65, 69, 58, 23, 76, 18, 52, 54,  9, 48,
       47, 64, 81, 83, 36, 58, 21, 81, 20, 62, 88, 34, 92, 79, 82, 20, 32,
        4, 84, 36, 35, 72, 60, 71, 72, 52, 50, 54, 11, 51, 18, 47,  5,  8,
       37, 97, 20, 33,  1, 59,  1, 56,  1,  9, 57, 20, 79, 29, 16, 32, 54,
       93, 10, 46, 59, 84, 76, 15, 10, 15,  0, 69,  4, 51, 51, 94, 36, 39,
       62,  2, 24, 26, 35, 25, 87,  0, 55, 34, 38,  1, 45,  7, 93, 56, 38,
       21, 51, 75, 81, 74, 33, 20, 37,  9, 40, 60, 31, 83, 50, 71, 67, 30,
       66,  1, 43, 61, 23, 65, 84, 87, 46, 57, 16,  2, 28, 12, 96, 44, 76,
       29, 75, 41, 87, 67, 61, 30,  5, 12, 62,  3, 83, 81,  6, 85,  4, 37,
       57, 84, 39, 71, 61,  6, 76, 14, 31, 98, 40, 17, 51, 16, 42, 63, 86,
       37, 69, 86, 71, 80, 78, 14, 35, 25,  5, 39,  8,  9, 26, 44, 60, 13,
       14, 77, 13, 80, 87

In [4]:
df = train_df.copy()
df.species = labels
df.species

0       3
1      49
2      65
3      94
4      84
5      40
6      54
7      78
8      53
9      89
10     98
11     16
12     74
13     50
14     58
15     31
16     43
17      4
18     75
19     44
20     83
21     84
22     13
23     66
24     15
25      6
26     73
27     22
28     73
29     31
       ..
960    85
961    89
962    94
963    45
964    48
965    86
966    81
967    14
968     4
969    77
970    56
971    82
972     2
973    85
974    70
975    88
976     0
977    75
978    14
979    86
980    81
981    97
982    70
983    72
984    34
985    40
986     5
987    11
988    78
989    50
Name: species, dtype: int64

In [5]:
df.ix[:,2:] = MinMaxScaler().fit_transform(train_df.ix[:,2:])
df

Unnamed: 0,id,species,margin1,margin2,margin3,margin4,margin5,margin6,margin7,margin8,...,texture55,texture56,texture57,texture58,texture59,texture60,texture61,texture62,texture63,texture64
0,1,3,0.088883,0.114287,0.150003,0.022987,0.105264,0.031447,0.297875,0.000000,...,0.018181,0.000000,0.016951,0.014635,0.330258,0.000000,0.000000,0.012987,0.000000,0.179315
1,2,49,0.066662,0.000000,0.200000,0.091955,0.228070,0.006289,0.212763,0.000000,...,0.002274,0.000000,0.000000,0.004880,0.220178,0.000000,0.000000,0.002599,0.449433,0.158623
2,3,65,0.066662,0.047620,0.124998,0.045975,0.035085,0.018867,0.744676,0.000000,...,0.359096,0.000000,0.033896,0.004880,0.073387,0.000000,0.000000,0.000000,0.235957,0.020692
3,5,94,0.000000,0.019046,0.150003,0.034481,0.192976,0.062892,0.255324,0.000000,...,0.000000,0.004833,0.000000,0.000000,0.192654,0.000000,0.000000,0.046752,0.000000,0.337938
4,6,84,0.066662,0.019046,0.312499,0.057474,0.122806,0.050314,0.063826,0.000000,...,0.224999,0.000000,0.124293,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.220692
5,8,40,0.799991,0.457139,0.212499,0.011494,0.000000,0.490549,0.085101,0.000000,...,0.338639,0.000000,0.242939,0.000000,0.055040,0.000000,0.000000,0.000000,0.022470,0.096554
6,10,54,0.244439,0.152380,0.112499,0.057474,0.017542,0.138364,0.425526,0.000000,...,0.200000,0.000000,0.231640,0.000000,0.091743,0.000000,0.000000,0.000000,0.449433,0.027585
7,11,78,0.000000,0.000000,0.237498,0.298852,0.035085,0.000000,0.042550,0.000000,...,0.088636,0.125605,0.056500,0.014635,0.201822,0.000000,0.245154,0.018182,0.033711,0.255177
8,14,53,0.066662,0.009523,0.212499,0.091955,0.017542,0.000000,0.255324,0.000000,...,0.000000,0.000000,0.050848,0.000000,0.165129,0.000000,0.000000,0.002599,0.382021,0.524145
9,15,89,0.000000,0.000000,0.062502,0.218391,0.649115,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.351209,0.128436,0.332763,0.000000,0.197401,0.000000,0.000000


In [6]:
X = df.as_matrix()[:,2:]
y = df.as_matrix()[:,1]

In [7]:
params = {'C':[1500, 2000, 2500], 'tol': [0.0001]}
# solver='newton-cg' or 'lbfgs'
log_reg = linear_model.LogisticRegression(solver='lbfgs', multi_class='multinomial', max_iter=400)
clf = GridSearchCV(log_reg, params, scoring='log_loss', refit='True', n_jobs=-1, cv=10)
clf.fit(X, y)

print("best params: " + str(clf.best_params_))
for params, mean_score, scores in clf.grid_scores_:
  print("%0.3f (+/-%0.03f) for %r" % (mean_score, scores.std(), params))
  print(scores)

best params: {'C': 2500, 'tol': 0.0001}
-0.032 (+/-0.017) for {'C': 1500, 'tol': 0.0001}
[-0.03869518 -0.06174347 -0.01108963 -0.03980355 -0.04127395 -0.04444972
 -0.03862206 -0.01300113 -0.00986382 -0.01933478]
-0.030 (+/-0.016) for {'C': 2000, 'tol': 0.0001}
[-0.03741562 -0.05982214 -0.00965182 -0.03783047 -0.04023911 -0.04297178
 -0.03700403 -0.01149971 -0.00854624 -0.01736484]
-0.029 (+/-0.016) for {'C': 2500, 'tol': 0.0001}
[-0.03664041 -0.05881879 -0.00872983 -0.03557165 -0.03970965 -0.04184902
 -0.03585814 -0.01064577 -0.00764909 -0.01595601]


In [8]:
estimator = clf
# estimator.fit(X,y)

In [9]:
test_data = pd.read_csv('test.csv')
test_df = DataFrame(MinMaxScaler().fit_transform(test_data.ix[:,1:]))
test_df

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,182,183,184,185,186,187,188,189,190,191
0,0.227268,0.051549,0.465113,0.071431,0.041664,0.057555,0.066662,0.000000,0.069763,0.279077,...,0.019337,0.000000,0.101911,0.005497,0.186047,0.000000,0.000000,0.000000,0.044941,0.359487
1,0.090903,0.030926,0.383717,0.059527,0.041664,0.050361,0.088883,0.000000,0.395349,0.279077,...,0.000000,0.000000,0.044586,0.010989,0.162793,0.000000,0.000000,0.003954,0.426962,0.300663
2,0.000000,0.000000,0.011627,0.130952,0.437504,0.000000,0.266671,0.000000,0.139538,0.069763,...,0.364647,0.000000,0.006372,0.000000,0.000000,0.000000,0.000000,0.063241,0.000000,0.000000
3,0.000000,0.000000,0.058141,0.071431,0.187499,0.000000,0.044441,0.000000,0.046509,0.023254,...,0.035910,0.035398,0.019110,0.203303,0.162793,0.000000,0.000000,0.363638,0.000000,0.058825
4,0.022726,0.000000,0.093023,0.059527,0.416661,0.000000,0.111115,0.000000,0.069763,0.000000,...,0.000000,0.097345,0.108283,0.060440,0.488379,0.000000,0.000000,0.031619,0.112364,0.052286
5,0.249994,0.175260,0.127904,0.059527,0.166667,0.129498,0.444437,0.000000,0.046509,0.348840,...,0.000000,0.000000,0.000000,0.005497,0.593030,0.000000,0.000000,0.110673,0.000000,0.143792
6,0.181817,0.134025,0.279068,0.059527,0.062496,0.100722,0.488890,0.000000,0.000000,0.325586,...,0.005524,0.000000,0.000000,0.027474,0.360462,0.000000,0.000000,0.003954,0.000000,0.163403
7,0.090903,0.164951,0.069768,0.309527,0.000000,0.431671,0.044441,0.000000,0.139538,0.209302,...,0.135359,0.000000,0.197450,0.000000,0.139538,0.000000,0.000000,0.015809,0.033711,0.326805
8,0.045451,0.041235,0.441859,0.107144,0.166667,0.014388,0.133336,0.000000,0.116284,0.139538,...,0.290054,0.000000,0.152870,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.058825
9,0.000000,0.000000,0.034881,0.130952,0.583339,0.000000,0.177777,0.000000,0.139538,0.069763,...,0.552472,0.088494,0.025476,0.043954,0.162793,0.003220,0.000000,0.118578,0.000000,0.052286


In [10]:
species = train_df.species.unique()
species.sort()

predict = estimator.predict_proba(test_df.as_matrix())
result = DataFrame(predict,columns=species)

In [11]:
result.insert(0,'id',test_data.id)
result.to_csv('result8.csv',index=False)