# Reference: Python Data Science Handbook

In [None]:
import seaborn as sns
import chardet
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
%matplotlib inline

In [None]:
# List the names of the example datasets that seaborn can load by name;
# the returned list is shown as the cell output.
sns.get_dataset_names()

In [None]:
# Fetch the iris example dataset through seaborn.
iris = sns.load_dataset('iris')

# Keep a local CSV copy; index=False leaves the row index out of the file.
iris.to_csv('iris_local.csv', index=False)

# Preview the first rows as the cell output.
iris.head()

In [None]:
# if open the file from local, we can first check the encoding format
with open('iris_local.csv','rb') as rawdata:
    print(chardet.detect(rawdata.read(100000)))

In [None]:
# Read the local copy back, decoding the file as ASCII.
iris_local = pd.read_csv(
    'iris_local.csv',
    encoding='ascii',
)

# Preview the first rows to confirm the round trip.
iris_local.head()

In [None]:
# Summary statistics of the iris DataFrame, shown as the cell output.
iris.describe()

In [None]:
# Count the missing values (NaN) in every column, i.e. per feature.
# isnull() is the pandas alias of isna().
iris.isnull().sum()

In [None]:
# The feature matrix is usually two-dimensional, [n_samples, n_features]: X
# The label array is usually one-dimensional, of length n_samples: y
# Visualize every pair of features with seaborn, colored by species.
sns.set()  # apply seaborn's default styling before plotting
# `height` replaces the old `size` keyword, which seaborn renamed in 0.9
# and no longer accepts.
sns.pairplot(iris, hue="species", height=3)


In [None]:
# Feature matrix: every column of iris except the 'species' label column.
X_iris = iris.drop('species', axis='columns')

# Show the shape of the feature matrix.
X_iris.shape

In [None]:
# Label array: the 'species' column of iris.
y_iris = iris['species']
# Show the shape of the label array.
y_iris.shape

In [None]:
# Fix the seed of NumPy's global generator so the sample is reproducible.
np.random.seed(1)

# 50 x-values drawn uniformly from [0, 10).
x = np.random.rand(50) * 10

# Points on the line y = 2x - 1, perturbed by standard Gaussian noise.
noise = np.random.randn(50)
y = 2 * x - 1 + noise

In [None]:
# Scatter plot of the noisy sample on a 12x8 inch figure.
fig, ax = plt.subplots(figsize=(12, 8))
ax.scatter(x, y)

In [None]:
# Linear-regression walkthrough (model.fit / model.predict).
# Step 1: choose the model class.
from sklearn.linear_model import LinearRegression

# Step 2: create the model instance. fit_intercept=True keeps the
# intercept b in y = a*x + b, which we want to estimate.
model = LinearRegression(fit_intercept=True)

# Show the estimator as the cell output.
model

In [None]:
# Inspect the shape of x before turning it into a feature matrix.
x.shape

In [None]:
# Recall that the feature matrix must have two dimensions:
# [n_samples, n_features]. Passing -1 lets NumPy infer the number of
# samples from x instead of hard-coding 50, so this cell keeps working
# if the sample size changes.
X = x.reshape(-1, 1)
X.shape

In [None]:
# Fit the model to the feature matrix X and targets y; the learned
# parameters are stored on the model (read back below as coef_ / intercept_).
model.fit(X,y)

In [None]:
# Compare the fitted parameters with the ones used to generate the data.
slope = model.coef_
intercept = model.intercept_
print(slope)      # the true value is 2
print(intercept)  # the true value is -1

In [None]:
# Predict targets for new inputs: an evenly spaced grid from -1 to 11.
xfit = np.linspace(-1, 11)

# The model expects a 2-D feature matrix, so add a trailing feature axis.
Xfit = xfit[:, np.newaxis]
yfit = model.predict(Xfit)

In [None]:
# Overlay the fitted line on the training points.
fig, ax = plt.subplots(figsize=(12, 8))
ax.scatter(X, y)
ax.plot(Xfit, yfit)

In [None]:
# Gaussian naive Bayes is often a good baseline model for classification
# because it is fast and has no hyperparameters to choose.
# First, split the dataset into a training set and a test set.

In [None]:
# train_test_split lives in sklearn.model_selection; the old
# sklearn.cross_validation module was removed in scikit-learn 0.20.
from sklearn.model_selection import train_test_split

# Split features and labels into training and test sets; random_state
# makes the split reproducible.
Xtrain, Xtest, ytrain, ytest = train_test_split(X_iris, y_iris, random_state=1)

In [None]:
# Step 1: choose the model class.
from sklearn.naive_bayes import GaussianNB

# Step 2: create the model instance.
model = GaussianNB()

# Step 3: fit on the training split, then predict the held-out samples
# (fit returns the estimator itself, so the calls can be chained).
y_predict = model.fit(Xtrain, ytrain).predict(Xtest)

In [None]:
# Fraction of test samples whose predicted label matches the true label.
from sklearn.metrics import accuracy_score

accuracy_score(y_true=ytest, y_pred=y_predict)

In [None]:
# The iris data has four features, which makes it hard to visualize
# directly. Use unsupervised learning (PCA) to reduce it to two dimensions.
from sklearn.decomposition import PCA

# Create the model and learn its parameters from the feature matrix
# (fit returns the estimator itself).
model = PCA(n_components=2).fit(X_iris)

# Project the data onto the two principal components.
X_2D = model.transform(X_iris)

In [None]:
# Add the two PCA coordinates to the original dataset as new columns.
iris['PCA1'] = X_2D[:, 0]
iris['PCA2'] = X_2D[:, 1]

# Scatter the projection, colored by species (fit_reg=False disables the
# regression line). x and y are passed by keyword because current seaborn
# releases no longer accept them positionally.
sns.lmplot(x='PCA1', y='PCA2', hue='species', data=iris, fit_reg=False)

In [None]:
# Cluster the iris features with a Gaussian mixture model.
# GaussianMixture replaces sklearn.mixture.GMM, which was removed in
# scikit-learn 0.20.
from sklearn.mixture import GaussianMixture

# Three components, each with its own full covariance matrix.
model = GaussianMixture(n_components=3, covariance_type='full')
model.fit(X_iris)

# Cluster index assigned to every sample.
y_gmm = model.predict(X_iris)

In [None]:
# Display the predicted cluster label of every sample.
y_gmm

In [None]:
# Store the cluster assignment next to the PCA coordinates.
iris['cluster'] = y_gmm

# One panel per cluster, colored by the true species, to see how well the
# clusters line up with the species. x and y are passed by keyword because
# current seaborn releases no longer accept them positionally.
sns.lmplot(x='PCA1', y='PCA2', data=iris, hue='species', col='cluster',
           fit_reg=False)

In [None]:
# Load the handwritten-digits dataset bundled with scikit-learn and
# inspect the shape of its image array.
from sklearn.datasets import load_digits
digits = load_digits()
digits.images.shape

In [None]:
# Show the first 100 digit images in a 10x10 grid with tick marks removed.
fig, axes = plt.subplots(10, 10, figsize=(8, 8),
                         subplot_kw={'xticks': [], 'yticks': []})
for ax, image, target in zip(axes.flat, digits.images, digits.target):
    ax.imshow(image, cmap='binary')
    # transform=ax.transAxes makes (0.05, 0.05) a fraction of the axes, so
    # the label sits in the lower-left corner of every panel. Without it
    # the position is read in data (pixel) coordinates and the label ends
    # up against the top-left edge of the image.
    ax.text(0.05, 0.05, str(target), transform=ax.transAxes, color='green')

In [None]:
# Feature matrix and label array of the digits dataset.
X, y = digits.data, digits.target

# Print both shapes, features first.
for array in (X, y):
    print(array.shape)

In [None]:
# Dimensionality reduction with a manifold-learning method (Isomap)
# rather than PCA.
from sklearn.manifold import Isomap

# Fit a 2-component embedding (fit returns the estimator itself) ...
model = Isomap(n_components=2).fit(X)

# ... then project the data into it and check the resulting shape.
data_projected = model.transform(X)
data_projected.shape

In [None]:
# Scatter the 2-D projection, coloring every point by its digit label.
plt.figure(figsize=(12, 8))
plt.scatter(
    data_projected[:, 0],
    data_projected[:, 1],
    c=y,
    edgecolor='none',
    alpha=0.5,
    # plt.get_cmap replaces plt.cm.get_cmap, which is deprecated and has
    # been removed from recent matplotlib; 10 resamples the map to one
    # color per digit.
    cmap=plt.get_cmap('nipy_spectral', 10),
)
plt.colorbar(label='digit label', ticks=range(10))
# Center each of the ten colors on its integer tick.
plt.clim(-0.5, 9.5)

In [None]:
# Classify the digits with Gaussian naive Bayes.
# Hold out a test set; random_state makes the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)

# Fit on the training split and predict the held-out samples
# (fit returns the estimator itself, so the calls can be chained).
model = GaussianNB()
y_pred = model.fit(X_train, y_train).predict(X_test)

# Accuracy on the test set, shown as the cell output.
accuracy_score(y_test, y_pred)

In [None]:
# Confusion matrix of the digit classifier: rows are the true labels,
# columns are the predicted labels.
from sklearn.metrics import confusion_matrix

mat = confusion_matrix(y_test, y_pred)

# Annotated heatmap with square cells and no color bar.
ax = sns.heatmap(mat, square=True, annot=True, cbar=False)
ax.set_xlabel('predicted value')
ax.set_ylabel('true value')