In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

import seaborn as sns

from sklearn.model_selection import train_test_split

In [89]:
tr_data = pd.read_csv('../input/train.csv')
te_data = pd.read_csv('../input/test.csv')
print('train shape is: {} \r\ntest shape is: {} '.format(tr_data.shape,te_data.shape))

train shape is: (61878, 95) 
test shape is: (144368, 94) 


In [90]:
from sklearn.feature_extraction.text import TfidfTransformer

y = tr_data.target
tr_data.drop(['id','target'],axis=1,inplace=True)
te_data.drop(['id'],axis=1,inplace=True)

In [92]:
tfidf = TfidfTransformer()
tr_data_tfidf = pd.DataFrame(tfidf.fit_transform(tr_data).toarray())
te_data_tfidf = pd.DataFrame(tfidf.transform(te_data).toarray())
tr_data_tfidf.columns = [str(x)+'tfidf' for x in tr_data_tfidf.columns]
te_data_tfidf.columns = [str(x)+'tfidf' for x in te_data_tfidf.columns]

tr_data_log1p = pd.DataFrame(tr_data.apply(lambda x: np.log1p(x)))
te_data_log1p = pd.DataFrame(te_data.apply(lambda x: np.log1p(x)))
tr_data_log1p.columns = [str(x)+'log1p' for x in tr_data_log1p.columns]
te_data_log1p.columns = [str(x)+'log1p' for x in te_data_log1p.columns]

tr_data_comb = pd.concat([tr_data,tr_data_tfidf,tr_data_log1p],axis=1)
te_data_comb = pd.concat([te_data,te_data_tfidf,te_data_log1p],axis=1)

In [93]:
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
y_encoded = le.fit_transform(y)

In [95]:
import xgboost as xgb

params = {'objective':'multi:softprob',
                    'learning_rate':0.1,
                    'subsample':0.7,
                    'colsample_bytree':0.9,
                    'colsample_bylevel':0.7,
                    'max_depth':7,
                    'nthread':4,
                    'eval_metric':'mlogloss',
                    'num_class':9,
                    'seed':1234}

bst_cv = xgb.cv(params=params,dtrain=xgb.DMatrix(tr_data_comb,label=y_encoded),verbose_eval=2,
                nfold=5,early_stopping_rounds=20,num_boost_round=300)
 

[0]	train-mlogloss:1.93891+0.000549932	test-mlogloss:1.94651+0.00101682
[2]	train-mlogloss:1.60992+0.000297799	test-mlogloss:1.62966+0.00205509
[4]	train-mlogloss:1.39041+0.000374844	test-mlogloss:1.41948+0.00240912
[6]	train-mlogloss:1.22926+0.000876244	test-mlogloss:1.26626+0.00267889
[8]	train-mlogloss:1.10409+0.00100272	test-mlogloss:1.14804+0.00279386
[10]	train-mlogloss:1.00329+0.00122976	test-mlogloss:1.05369+0.00281979
[12]	train-mlogloss:0.92073+0.00129024	test-mlogloss:0.977189+0.00307157
[14]	train-mlogloss:0.852004+0.00133515	test-mlogloss:0.914367+0.00341608
[16]	train-mlogloss:0.794623+0.00119918	test-mlogloss:0.862261+0.00358521
[18]	train-mlogloss:0.745281+0.00138362	test-mlogloss:0.818268+0.00370527
[20]	train-mlogloss:0.703261+0.00155351	test-mlogloss:0.781197+0.00372586
[22]	train-mlogloss:0.667504+0.001804	test-mlogloss:0.750237+0.00360492
[24]	train-mlogloss:0.635793+0.00194436	test-mlogloss:0.723232+0.00377138
[26]	train-mlogloss:0.608901+0.00180291	test-mlogloss:

[222]	train-mlogloss:0.226438+0.00109861	test-mlogloss:0.474099+0.0045886
[224]	train-mlogloss:0.225254+0.00107641	test-mlogloss:0.473877+0.00453998
[226]	train-mlogloss:0.224125+0.00113173	test-mlogloss:0.473685+0.00452903
[228]	train-mlogloss:0.222958+0.00120603	test-mlogloss:0.473495+0.00457687
[230]	train-mlogloss:0.221642+0.0012641	test-mlogloss:0.473263+0.00465036
[232]	train-mlogloss:0.220431+0.00127763	test-mlogloss:0.472993+0.00474374
[234]	train-mlogloss:0.21918+0.00121132	test-mlogloss:0.472773+0.00471487
[236]	train-mlogloss:0.218028+0.00115937	test-mlogloss:0.472638+0.00470243
[238]	train-mlogloss:0.216864+0.00108937	test-mlogloss:0.472398+0.00478139
[240]	train-mlogloss:0.215707+0.00102855	test-mlogloss:0.472269+0.00481081
[242]	train-mlogloss:0.214491+0.00104122	test-mlogloss:0.472014+0.0049351
[244]	train-mlogloss:0.213267+0.00107247	test-mlogloss:0.471788+0.00492769
[246]	train-mlogloss:0.212229+0.00105273	test-mlogloss:0.471597+0.00490356
[248]	train-mlogloss:0.211066

In [96]:
bst = xgb.train(params=params,dtrain=xgb.DMatrix(tr_data_comb,label=y_encoded),num_boost_round=300)

In [97]:
pred = bst.predict(xgb.DMatrix(te_data_comb))
subm = pd.DataFrame(pred)
subm.columns = ['class_'+ str(x) for x in range(1,10)]
subm['id'] = pd.read_csv('../input/test.csv',usecols=['id'])
#subm.index_label = 'id'
subm.to_csv('../subm/tfidf_log1p_raw_xgb_sub1.csv',index=False)

#### let's ignore the tfidf transform for and see what you could also come up with based on the things we learned...

In [98]:
tr_data = pd.read_csv('../input/train.csv')
te_data = pd.read_csv('../input/test.csv')

y = tr_data.target
tr_data.drop(['id','target'],axis=1,inplace=True)
te_data.drop(['id'],axis=1,inplace=True)

from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
y_encoded = le.fit_transform(y)

In [99]:
tr_data['count_zeros'] = (tr_data == 0).astype(int).sum(axis=1)
te_data['count_zeros'] = (te_data == 0).astype(int).sum(axis=1)

tr_data['num_greater_than_3'] = (tr_data > 3).astype(int).sum(axis=1)
te_data['num_greater_than_3'] = (te_data > 3).astype(int).sum(axis=1)

tr_data['num_greater_than_10'] = (tr_data > 10).astype(int).sum(axis=1)
te_data['num_greater_than_10'] = (te_data > 10).astype(int).sum(axis=1)

In [113]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import confusion_matrix,log_loss
from sklearn.model_selection import train_test_split

X_train, X_val, y_train, y_val = train_test_split(tr_data,y_encoded,test_size = 0.2,random_state =12345)

knn2 = KNeighborsClassifier(n_jobs=4,n_neighbors=2,)
knn2.fit(X_train,y_train)
knn2_pred = knn2.predict_proba(X_val)
print('log loss for knn2: {}'.format(log_loss(y_pred = knn2_pred,y_true = y_val)))

knn4 = KNeighborsClassifier(n_jobs=4,n_neighbors=4,)
knn4.fit(X_train,y_train)
knn4_pred = knn4.predict_proba(X_val)
print('log loss for knn4: {}'.format(log_loss(y_pred = knn4_pred,y_true = y_val)))

knn8 = KNeighborsClassifier(n_jobs=8,n_neighbors=8,)
knn8.fit(X_train,y_train)
knn8_pred = knn8.predict_proba(X_val)
print('log loss for knn8: {}'.format(log_loss(y_pred = knn8_pred,y_true = y_val)))

knn16 = KNeighborsClassifier(n_jobs=4,n_neighbors=16,)
knn16.fit(X_train,y_train)
knn16_pred = knn16.predict_proba(X_val)
print('log loss for knn16: {}'.format(log_loss(y_pred = knn16_pred,y_true = y_val)))

knn32 = KNeighborsClassifier(n_jobs=4,n_neighbors=32,)
knn32.fit(X_train,y_train)
knn32_pred = knn32.predict_proba(X_val)
print('log loss for knn32: {}'.format(log_loss(y_pred = knn32_pred,y_true = y_val)))

knn64 = KNeighborsClassifier(n_jobs=4,n_neighbors=64,)
knn64.fit(X_train,y_train)
knn64_pred = knn64.predict_proba(X_val)
print('log loss for knn64: {}'.format(log_loss(y_pred = knn64_pred,y_true = y_val)))

knn128 = KNeighborsClassifier(n_jobs=4,n_neighbors=128,)
knn128.fit(X_train,y_train)
knn128_pred = knn128.predict_proba(X_val)
print('log loss for knn128: {}'.format(log_loss(y_pred = knn128_pred,y_true = y_val)))

log loss for knn2: 4.8441469936134425
log loss for knn4: 2.6644552310076475
log loss for knn8: 1.5318682781966835
log loss for knn16: 0.9965564078189585
log loss for knn32: 0.7943305611598044
log loss for knn64: 0.7002169581175647
log loss for knn128: 0.6943451757833374


In [101]:
class_weights = {0:1,1:1,2:1,3:10,4:1,5:1,6:1,7:1,8:1}
from sklearn.tree import DecisionTreeClassifier
dtc = DecisionTreeClassifier(class_weight=class_weights,max_depth=4,max_features=92,min_samples_split=2,random_state=12345)
dtc.fit(X_train,y_train)
tree_pred = dtc.predict_proba(X_val)
print('log loss for dtc: {}'.format(log_loss(y_pred = tree_pred,y_true = y_val)))

log loss for dtc: 1.647334289958738


In [102]:
# class_weights = {0:1,1:1,2:1,3:1,4:1,5:1,6:10,7:1,8:1}
# lets remove the class weights and check our score...
from sklearn.tree import DecisionTreeClassifier
dtc = DecisionTreeClassifier(max_depth=4,max_features=92,min_samples_split=2,random_state=12345)
dtc.fit(X_train,y_train)
tree_pred = dtc.predict_proba(X_val)
print('log loss for dtc: {}'.format(log_loss(y_pred = tree_pred,y_true = y_val)))

log loss for dtc: 1.4348667288258288


In [114]:
from sklearn.svm import SVC
svc = SVC(kernel='linear',C=0.1,max_iter=10000,random_state=12345,probability=True)
svc.fit(X_train,y_train)
svc_pred = svc.predict_proba(X_val)
print('log loss for svc: {}'.format(log_loss(y_pred = svc_pred,y_true = y_val)))



log loss for svc: 0.6653540066424208


In [None]:
from sklearn.svm import SVC
svc = SVC(kernel='rbf',C=0.1,max_iter=10000,random_state=12345,probability=True,)
svc.fit(X_train,y_train)
svc_pred = svc.predict_proba(X_val)
print('log loss for svc: {}'.format(log_loss(y_pred = svc_pred,y_true = y_val)))

In [109]:
from sklearn.ensemble import RandomForestClassifier
rfc = RandomForestClassifier(n_jobs=4,n_estimators=300)
rfc.fit(X_train,y_train)
rfc_pred = rfc.predict_proba(X_val)
print('log loss for RFC: {}'.format(log_loss(y_pred = rfc_pred,y_true = y_val)))

log loss for RFC: 0.5726116665922214


In [61]:
rfc_test_pred = rfc.predict_proba(te_data)

In [110]:
import xgboost as xgb

params = {'objective':'multi:softprob',
                    'learning_rate':0.2,
                    'subsample':0.9,
                    'colsample_bytree':0.9,
                    'colsample_bylevel':0.7,
                    'max_depth':5,
                    'nthread':4,
                    'eval_metric':'mlogloss',
                    'n_estimators':100,
                    'num_class':9,
                    'seed':1234}

bst = xgb.train(params=params,dtrain=xgb.DMatrix(X_train,label=y_train),num_boost_round=300)

In [111]:
xgb_pred = bst.predict(xgb.DMatrix(X_val))
print('log loss for XGB: {}'.format(log_loss(y_pred = xgb_pred,y_true = y_val)))

log loss for XGB: 0.46402587516456023


In [62]:
xgb_test_pred = bst.predict(xgb.DMatrix(te_data))
combined_pred = xgb_test_pred *0.85 + rfc_test_pred *0.15
subm = pd.DataFrame(combined_pred)
subm.columns = ['class_'+ str(x) for x in range(1,10)]
subm['id'] = pd.read_csv('../input/test.csv',usecols=['id'])
#subm.index_label = 'id'
subm.to_csv('../subm/rf_xgb_sub1.csv',index=False)