In [43]:
import pandas as pd
import numpy as np
import time

from sklearn.model_selection import train_test_split

from sklearn.neighbors import KNeighborsClassifier
from sklearn.decomposition import PCA
from sklearn.metrics import accuracy_score

In [44]:
# Read the labelled training set and the unlabelled test set from the working directory.
train_raw, test_raw = (pd.read_csv(csv_name) for csv_name in ('train.csv', 'test.csv'))
# Preview the first rows of the training data.
train_raw.head()

Unnamed: 0,label,pixel0,pixel1,pixel2,pixel3,pixel4,pixel5,pixel6,pixel7,pixel8,...,pixel774,pixel775,pixel776,pixel777,pixel778,pixel779,pixel780,pixel781,pixel782,pixel783
0,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,4,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [45]:
# Split the column index: the first column is kept as the target,
# every remaining column is treated as a feature.
columns = train_raw.columns
columns_y, columns_X = columns[:1], columns[1:]

### Выбор модели

In [46]:
# Hold out 20% of the labelled data for model selection.
# The seed is fixed so the split (and every accuracy computed on it) is
# identical after Restart Kernel -> Run All.
train, test = train_test_split(train_raw, test_size=0.2, random_state=42)

In [47]:
# Display (rows, columns) of the training part of the split.
train.shape

(33600, 785)

In [48]:
# Separate feature columns from the target column for both parts of the split.
# The targets are selected with a one-element column index, so they stay
# single-column DataFrames rather than Series.
train_X, train_y = train[columns_X], train[columns_y]
test_X, test_y = test[columns_X], test[columns_y]

In [49]:
# Print the dimensions of each feature/target frame, one per line.
for split_part in (train_X, train_y, test_X, test_y):
    print(split_part.shape)

(33600, 784)
(33600, 1)
(8400, 784)
(8400, 1)


In [50]:
# Hyper-parameter grid: PCA dimensionalities and k values for k-NN.
components_list = tuple(range(10, 101, 10))
neighbors_list = tuple(range(1, 6))

# One zero-initialised row per (components, neighbors) combination.
res_df = pd.DataFrame(
    np.zeros((len(components_list) * len(neighbors_list), 3)),
    columns=['components', 'neighbors', 'accuracy'],
)

In [53]:
# Sanity check of enumerate(): prints each position next to its component count.
# NOTE: in this cell `i` is the position and `i_num` is the value, which is the
# opposite of how the same two names are used in the grid-search loop.
for i, i_num in enumerate(components_list):
    print(i, i_num)

0 10
1 20
2 30
3 40
4 50
5 60
6 70
7 80
8 90
9 100


In [54]:
%%time
for i_num, i in enumerate(components_list):
    pca = PCA(n_components=i)
    pca.fit(train_X)
    transform_train = pca.transform(train_X)
    transform_test = pca.transform(test_X)
    
    for j_num, j in enumerate(neighbors_list):
        clf = KNeighborsClassifier(n_neighbors=j)
        clf.fit(transform_train, train_y)
        results=clf.predict(transform_test)
        print('Accuracy for %d components and %d neighbors: %f' % (i, j, accuracy_score(results,test_y)))
        res_df['components'].loc[j_num+(i_num-1)*len(neighbors_list)] = i
        res_df['neighbors'].loc[j_num+(i_num-1)*len(neighbors_list)] = j
        res_df['accuracy'].loc[j_num+(i_num-1)*len(neighbors_list)] = accuracy_score(results,test_y)



Accuracy for 10 components and 1 neighbors: 0.910595
Accuracy for 10 components and 2 neighbors: 0.903452
Accuracy for 10 components and 3 neighbors: 0.920833
Accuracy for 10 components and 4 neighbors: 0.923095
Accuracy for 10 components and 5 neighbors: 0.925595
Accuracy for 20 components and 1 neighbors: 0.960952
Accuracy for 20 components and 2 neighbors: 0.958333
Accuracy for 20 components and 3 neighbors: 0.965119
Accuracy for 20 components and 4 neighbors: 0.964286
Accuracy for 20 components and 5 neighbors: 0.964524
Accuracy for 30 components and 1 neighbors: 0.970595
Accuracy for 30 components and 2 neighbors: 0.967381
Accuracy for 30 components and 3 neighbors: 0.970595
Accuracy for 30 components and 4 neighbors: 0.972262
Accuracy for 30 components and 5 neighbors: 0.971190
Accuracy for 40 components and 1 neighbors: 0.970595
Accuracy for 40 components and 2 neighbors: 0.968810
Accuracy for 40 components and 3 neighbors: 0.971667
Accuracy for 40 components and 4 neighbors: 0.

KeyboardInterrupt: 

In [55]:
# Preview the first rows of the grid-search results table.
res_df.head()

Unnamed: 0,components,neighbors,accuracy
0,0.0,0.0,0.0
1,0.0,0.0,0.0
2,0.0,0.0,0.0
3,0.0,0.0,0.0
4,0.0,0.0,0.0


### Лучшая модель

In [35]:
# Convert the full labelled set (features and target) and the unlabelled
# test set to NumPy arrays for the final fit.
X_train, y = (np.array(train_raw[cols]) for cols in (columns_X, columns_y))
X_test = np.array(test_raw)

In [36]:
# Print the dimensions of the arrays used for the final model, one per line.
for final_array in (X_train, X_test, y):
    print(final_array.shape)

(42000, 784)
(28000, 784)
(42000, 1)


In [37]:
%%time
pca = PCA(n_components=40)
pca.fit(X_train)
transform_Xtrain = pca.transform(X_train)
transform_test = pca.transform(test_raw)

clf = KNeighborsClassifier(n_neighbors=5)
clf.fit(transform_Xtrain, y)
results=clf.predict(transform_test)



Wall time: 1min 28s


In [39]:
# Build the submission table: 1-based image ids paired with the predicted labels,
# then write it to '<form_name>.csv' without the DataFrame index.
form_name = 'results_comp'
cnt = np.arange(1, X_test.shape[0] + 1)
# The two arrays are laid out as rows labelled with the final column names,
# so transposing yields one row per image with the headers already in place.
res = pd.DataFrame([cnt, results], index=['ImageId', 'Label']).T
res.to_csv(form_name + '.csv', index=False)