In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Enumerate every file found under the Kaggle input directory and print its full path.
input_paths = (
    os.path.join(folder, name)
    for folder, _, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_paths:
    print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

### **Importing the necessary libraries**

In [2]:
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn import svm
from sklearn.model_selection import GridSearchCV

from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA as sklearnPCA

from matplotlib.pyplot import imshow

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

import cv2
%matplotlib inline 


### **Loading the testing and training data**

In [3]:
# Read the training CSV and the test CSV into two DataFrames.
train_data, test_data = (
    pd.read_csv(csv_path)
    for csv_path in (
        "/kaggle/input/digit-recognizer/train.csv",
        "/kaggle/input/digit-recognizer/test.csv",
    )
)

In [4]:
# Report (rows, columns) for the training frame first, then for the test frame.
for frame in (train_data, test_data):
    print(frame.shape)

In [5]:
# Preview the first five rows of the training data.
train_data.head(n=5)

In [6]:
# List the distinct values that occur in the label column of train_data.
train_data["label"].unique()

### **Viewing some example images**

In [7]:
# Draw the first rows*cols training rows as 28x28 images, each titled with its label.
rows = 3
cols = 3
axes = []

# Every column after the first one is treated as pixel data; select them once.
pixel_columns = train_data.columns[1:]

fig = plt.figure()
fig.set_size_inches(8, 10)
for idx in range(rows * cols):
    # Divide by 255 and fold the flat pixel row into a 28x28 grid.
    sample = (train_data[pixel_columns].iloc[idx].values / 255).reshape(28, 28)
    panel = fig.add_subplot(rows, cols, idx + 1)
    axes.append(panel)
    panel.set_title("Label : {}".format(train_data["label"].iloc[idx]))
    panel.imshow(sample, cmap='gray')
plt.show()

In [8]:
# Bar chart of how many training records carry each label.
plt.figure(figsize=(6, 7))
ax = sns.countplot(x='label', data=train_data)

plt.title("Records per label")

# Annotate every bar with its percentage of all records. The text is anchored
# at the bar's horizontal centre and at the bar's height.
total_records = len(train_data.label)
for bar in ax.patches:
    height = bar.get_height()
    share_text = f'{100 * height / total_records:.1f}%\n'
    centre_x = bar.get_x() + bar.get_width() / 2
    ax.annotate(share_text, (centre_x, height), ha='center', va='center')

### As we can see, all labels occur with roughly the same frequency in the training data, with '1' being the most frequent and '5' the least frequent.

In [9]:
# Separate the target column ('label') from the remaining feature columns.
labels = train_data['label']
feats = train_data.drop(columns='label')

# Show the shape of the features, then of the labels.
for part in (feats, labels):
    print(part.shape)

### **Preprocessing the data**

In [10]:
# Divide every value by 255 in both the training features and the test set.
# NOTE: both names are rebound to the scaled result, so re-running this cell
# divides by 255 a second time.
feats, test_data = feats / 255, test_data / 255

In [11]:
# Hold out 20% of the labelled rows for evaluation; random_state pins the split.
x_train, x_test, y_train, y_test = train_test_split(
    feats,
    labels,
    test_size=0.2,
    random_state=1,
)

In [12]:
# Print the shape of each of the four pieces produced by the split.
for caption, part in (
    ("Train Data Shape: ", x_train),
    ("Train Label Shape: ", y_train),
    ("Test Data Shape: ", x_test),
    ("Test Label Shape: ", y_test),
):
    print(caption, part.shape)

In [13]:
# Build binarized versions of the split: every value greater than 0 becomes 1,
# all other values are left unchanged.
#
# A plain assignment (bx_train = x_train) only creates a second name for the
# same DataFrame, so thresholding it in place would also overwrite x_train /
# x_test. DataFrame.mask returns a new frame, which keeps the scaled originals
# intact and avoids assigning into a frame derived from another one.
bx_train = x_train.mask(x_train > 0, 1)
bx_test = x_test.mask(x_test > 0, 1)

In [14]:
# Making the model: a support-vector classifier with library defaults and a
# fixed random_state.
clf = svm.SVC(random_state=42)
# get_params has to be called; printing the bare attribute only shows the
# bound-method repr instead of the parameter dictionary.
print(clf.get_params())

### **Converting the images to binary images and viewing them**

In [15]:
# For each of the labels 0-3, open one figure and draw four binarized training
# images of that label.
for digit in range(0, 4):
    digit_rows = bx_train[y_train == digit]
    # Only the samples at positions 1-4 of this label are displayed, so reshape
    # just those four rows instead of every row of the label.
    shown = [digit_rows.iloc[pos].values.reshape(28, 28) for pos in range(1, 5)]
    plt.figure(figsize=(25, 25))
    # A separate loop variable for the subplot slot avoids shadowing the
    # outer label variable.
    for slot, image in enumerate(shown, start=1):
        ax1 = plt.subplot(1, 20, slot)
        ax1.imshow(image, cmap='binary')

In [16]:
# Train the classifier on the binarized training split; labels are passed as a flat array.
clf.fit(bx_train, y_train.to_numpy().ravel())

In [17]:
# Evaluate the fitted classifier on the held-out binarized split.
score = clf.score(bx_test, y_test)
print("Accuracy for binary: ", score)

### **Standardizing the data and applying PCA**

In [18]:
# Standardize the binarized split; the scaler is fitted on the training part
# only and then applied to both parts.
sc = StandardScaler().fit(bx_train)
X_std_train, X_std_test = sc.transform(bx_train), sc.transform(bx_test)

# Fit a PCA without n_components to obtain the per-component variance ratios.
sklearn_pca = sklearnPCA().fit(X_std_train)

var_per = sklearn_pca.explained_variance_ratio_
cum_var_per = var_per.cumsum()

# Bar chart of the explained-variance ratio of each component.
plt.figure(figsize=(30, 10))
ind = np.arange(var_per.shape[0])
plt.bar(ind, var_per)
plt.xlabel('total components')
plt.ylabel('variance')

In [19]:
# Choose the smallest number of leading components whose cumulative explained
# variance exceeds 90%. Counting only the entries that are <= 0.90 stops one
# component short of the target, hence the +1 (capped at the number of
# components available).
n_comp = min(int((cum_var_per <= 0.90).sum()) + 1, len(cum_var_per))
print("Keeping 90% Info with ",n_comp," components")
sklearn_pca = sklearnPCA(n_components=n_comp)
# Fit the projection on the standardized training part, then apply the same
# projection to the standardized held-out part.
train_pca_b = sklearn_pca.fit_transform(X_std_train)
test_pca_b = sklearn_pca.transform(X_std_test)
print("Shape before PCA for Train: ",X_std_train.shape)
print("Shape after PCA for Train: ",train_pca_b.shape)
print("Shape before PCA for Test: ",X_std_test.shape)
print("Shape after PCA for Test: ",test_pca_b.shape)

In [20]:
# Refit the same classifier on the PCA-projected training features.
clf.fit(train_pca_b, y_train.to_numpy().ravel())

In [21]:
# Evaluate the classifier on the PCA-projected held-out features.
score = clf.score(test_pca_b, y_test)
# These features were derived from the binarized split (bx_train / bx_test),
# then standardized and PCA-projected, so the result is not a grayscale score.
print("Accuracy for binary + PCA: ", score)

### **Retraining the model on the full training data**

In [22]:
# Binarize the full training features and the test set: every value above 0
# becomes 1, all other values are left as they are.
feats = feats.mask(feats > 0, 1)
test_data = test_data.mask(test_data > 0, 1)

In [23]:
# Standardize the data: the scaler is fitted on the full training features and
# then applied to both the training features and the test set.
sc = StandardScaler().fit(feats)
X_std_train, X_std_test = sc.transform(feats), sc.transform(test_data)

# With n_components left unset, all components are stored.
sklearn_pca = sklearnPCA().fit(X_std_train)

# Percentage of variance explained by each component. Because all components
# are stored, the ratios sum to 1.0.
var_per = sklearn_pca.explained_variance_ratio_
cum_var_per = var_per.cumsum()


In [24]:
# Choose the smallest number of leading components whose cumulative explained
# variance exceeds 90%. Counting only the entries that are <= 0.90 stops one
# component short of the target, hence the +1 (capped at the number of
# components available).
n_comp = min(int((cum_var_per <= 0.90).sum()) + 1, len(cum_var_per))
print("Keeping 90% Info with ",n_comp," components")
sklearn_pca = sklearnPCA(n_components=n_comp)
# Fit the projection on the standardized training features, then apply the
# same projection to the standardized test set.
train_pca_b = sklearn_pca.fit_transform(X_std_train)
test_pca_b = sklearn_pca.transform(X_std_test)
print("Shape before PCA for Train: ",X_std_train.shape)
print("Shape after PCA for Train: ",train_pca_b.shape)
print("Shape before PCA for Test: ",X_std_test.shape)
print("Shape after PCA for Test: ",test_pca_b.shape)

In [25]:
# Fit the classifier on the PCA-projected full training set with all labels.
clf.fit(train_pca_b, labels.to_numpy().ravel())

In [None]:
# Predict a label for every PCA-projected test row and write the submission CSV.
answer = clf.predict(test_pca_b)
image_ids = test_data.index.values + 1  # ImageId is the test frame's index value plus one
submission = pd.DataFrame({'ImageId': image_ids, 'Label': answer})
submission.index = submission['ImageId'].values
# index=False: only the ImageId and Label columns are written to the file.
submission.to_csv('fin.csv', index=False)