# Data Modelling

## Loading the dataset 

In [11]:
from sklearn.datasets import load_iris

# Load the Iris dataset bundled with scikit-learn and split it into the
# feature matrix (X) and the integer class labels (Y).
iris = load_iris()
X, Y = iris.data, iris.target

# Show the feature/class names followed by the first two samples and labels.
for label, value in (
    ("Feature_Name", iris.feature_names),
    ("Target_Name", iris.target_names),
    ("Sample Features", X[:2]),
    ("Sample Target", Y[:2]),
):
    print(label, value)

Feature_Name ['sepal length (cm)', 'sepal width (cm)', 'petal length (cm)', 'petal width (cm)']
Target_Name ['setosa' 'versicolor' 'virginica']
Sample Features [[5.1 3.5 1.4 0.2]
 [4.9 3.  1.4 0.2]]
Sample Target [0 0]


### Splitting the dataset

In [5]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the samples for testing; the fixed random_state keeps the
# split identical between runs.
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    test_size=0.2,
    random_state=1,
)

# Report the shapes of the feature matrices first, then of the label vectors.
# NOTE: the same two labels are printed for both the features and the targets.
for train_part, test_part in ((X_train, X_test), (Y_train, Y_test)):
    print("Training dataset shape", train_part.shape)
    print("Testing dataset shape", test_part.shape)

Training dataset shape (120, 4)
Testing dataset shape (30, 4)
Training dataset shape (120,)
Testing dataset shape (30,)


### Training Model

In [12]:
from sklearn import metrics
from sklearn.neighbors import KNeighborsClassifier

# Build a 3-nearest-neighbours classifier and fit it on the training split
# (fit() returns the estimator itself, so the call can be chained).
classifier_knn = KNeighborsClassifier(n_neighbors=3).fit(X_train, Y_train)

# Evaluate on the held-out test split.
y_pred = classifier_knn.predict(X_test)
accuracy = metrics.accuracy_score(Y_test, y_pred)
print("Accuracy:", accuracy)

# Classify two hand-written samples and translate the predicted integer
# labels into species names.
sample = [
    [1, 2, 3, 4],
    [2, 4, 6, 8],
]
preds = classifier_knn.predict(sample)
pred_species = [iris.target_names[class_index] for class_index in preds]
print("Predictions:", pred_species)

Accuracy: 1.0
Predictions: ['versicolor', 'virginica']


### Model Persistence 

In [14]:
import joblib

# Save the fitted classifier to disk so it does not have to be retrained
# every time it is needed.
joblib.dump(classifier_knn, 'iris_classifier_knn.joblib')

# Load the persisted model back and bind it to a name so that it can actually
# be used for predictions (previously the loaded object was discarded).
# NOTE: joblib files are pickle-based -- only load files from a trusted source.
loaded_classifier_knn = joblib.load('iris_classifier_knn.joblib')

# Display the restored estimator as the cell's output.
loaded_classifier_knn

KNeighborsClassifier(n_neighbors=3)

## Preprocessing the Data

### Binarisation
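Convert numerical values to binary (0/1) values using a threshold: values above the threshold become 1 and all other values become 0.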

In [17]:
import numpy as np
from sklearn import preprocessing

# Small two-sample, four-feature example array.
dataset = np.array([
    [1, 1.1, 2.1, 3],
    [4, 0, -1, 2],
])

# Map every value above the 0.5 threshold to 1 and every other value to 0.
binarizer = preprocessing.Binarizer(threshold=0.5)
data_binarized = binarizer.transform(dataset)
print("Binarized Data:", data_binarized)

Binarized Data: [[1. 1. 1. 1.]
 [1. 0. 0. 1.]]


### Mean Removal
Remove the mean from each feature so that every feature is centered on zero; `preprocessing.scale` also rescales each feature to unit standard deviation.

In [19]:
import numpy as np
from sklearn import preprocessing

# Small two-sample, four-feature example array.
dataset = np.array([
    [1, 1.1, 2.1, 3],
    [4, 0, -1, 2],
])

# Per-feature (column-wise) mean and standard deviation before scaling.
print("Mean=", np.mean(dataset, axis=0))
print("Standard Deviation=", np.std(dataset, axis=0))

# Standardise each feature: remove its mean and rescale it to unit standard
# deviation, then confirm the result column by column.
data_scaled = preprocessing.scale(dataset)
print("Mean_Removed =", np.mean(data_scaled, axis=0))
print("StandardDeviation_Removed =", np.std(data_scaled, axis=0))


Mean= [2.5  0.55 0.55 2.5 ]
Standard Deviation= [1.5  0.55 1.55 0.5 ]
Mean_Removed = [0. 0. 0. 0.]
StandardDeviation_Removed = [1. 1. 1. 1.]


### Scaling

In [21]:
# Small two-sample, four-feature example array (np and preprocessing are
# imported by the earlier preprocessing cells).
dataset = np.array([
    [1, 1.1, 2.1, 3],
    [4, 0, -1, 2],
])

# Rescale every feature linearly into the [0, 1] range: learn the per-feature
# minimum and maximum first, then apply the transformation.
data_scaler_minmax = preprocessing.MinMaxScaler(feature_range=(0, 1))
data_scaler_minmax.fit(dataset)
data_scaled_minmax = data_scaler_minmax.transform(dataset)
print("\nMin max scaled data:\n", data_scaled_minmax)


Min max scaled data:
 [[0. 1. 1. 1.]
 [1. 0. 0. 0.]]
