## IMPORTING LIBRARIES

In [4]:
import mglearn as mg
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import sklearn as sl
import scipy as sp



 # IRIS Dataset

In [None]:
from sklearn.datasets import load_iris

# Load the Iris data set bundled with scikit-learn.
iris_dataset = load_iris()

# Only the last expression of a cell is echoed by the notebook, so the
# intermediate results are printed explicitly instead of being discarded.
print('Keys of iris_dataset : \n {}'.format(iris_dataset.keys()))
print(iris_dataset['DESCR'] + '\n')
print('First Five rows of data : \n {}'.format(iris_dataset['data'][:5]))
print('Shape of target : {}'.format(iris_dataset['target'].shape))
iris_dataset['target_names']


# Test Train Separation

In [None]:
from sklearn.model_selection import train_test_split

# Split the Iris samples into a training and a test part; the fixed
# random_state makes the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    iris_dataset['data'],
    iris_dataset['target'],
    random_state=0,
)
print("X_train shape = {}".format(X_train.shape))

# Wrap the training features in a DataFrame with one named column per feature.
df = pd.DataFrame(data=X_train, columns=iris_dataset['feature_names'])

# Anomaly Detection

In [None]:
## Plotting HIST graphs to test data quality

# Pairwise scatter plots of every column of df, with 20-bin histograms on the
# diagonal.
pd.plotting.scatter_matrix(
    df,
    figsize=(15, 15),
    diagonal='hist',
    hist_kwds=dict(bins=20),
    s=60,
)

In [None]:
# Anomaly Detection by finding outliers through SD,mean

def find_anomalies(random_data):
    """Return the values lying more than three standard deviations from the mean.

    Parameters
    ----------
    random_data : iterable of numbers
        The sample to inspect (list, NumPy array, pandas Series, ...).

    Returns
    -------
    tuple
        ``(anomalies, lower_limit, upper_limit)`` where ``anomalies`` is a
        list of the elements outside ``[lower_limit, upper_limit]`` and the
        limits are ``mean - 3 * std`` and ``mean + 3 * std``.
    """
    # Set upper and lower limit to 3 standard deviation
    random_data_std = np.std(random_data)
    random_data_mean = np.mean(random_data)
    anomaly_cut_off = random_data_std * 3

    lower_limit = random_data_mean - anomaly_cut_off
    upper_limit = random_data_mean + anomaly_cut_off

    # Build a fresh list on every call: collecting into a shared module-level
    # list made each call also return the outliers found by earlier calls.
    anomalies = [
        value for value in random_data
        if value > upper_limit or value < lower_limit
    ]
    return anomalies, lower_limit, upper_limit

# Check the sepal-length column for values more than three standard
# deviations away from its mean; the returned tuple is the cell's output.
find_anomalies(df['sepal length (cm)'])


In [None]:
# Anomaly Detection by plotting box plot

import seaborn as sns
import matplotlib.pyplot as plt

# Box plot of the petal-length column of df.
# NOTE(review): the column is passed positionally; which seaborn parameter
# that binds to (and therefore the plot orientation) may depend on the
# installed seaborn version — confirm.
sns.boxplot(df['petal length (cm)'])

# K-nearest neighbours

In [None]:
#Classifier KNN

from sklearn.neighbors import KNeighborsClassifier
knn = KNeighborsClassifier(n_neighbors = 1)
knn.fit(X_train,y_train)

# Making Predictions

X_new = np.array([[5,2.9,1,0.2],[7.1,5.0,1,2.9]])
prediction = knn.predict(X_new)
# Map the predicted class indices to their names. The result is printed
# because a bare expression in the middle of a cell is never displayed.
print('Predicted target names : {}'.format(iris_dataset['target_names'][prediction]))

# Testing accuracy of model

prediction = knn.predict(X_test)
# Fraction of test samples whose predicted label equals the true label.
print('Accuracy : {}'.format(np.mean(prediction == y_test)))
knn.score(X_test,y_test)

In [None]:
# Scatter plot of data

# Synthetic forge data from mglearn, split with a fixed seed.
X, y = mg.datasets.make_forge()
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)

# Circles are training points, triangles are test points; colour is the label.
plt.scatter(X_train[:, 0], X_train[:, 1], c=y_train, marker='o')
plt.scatter(X_test[:, 0], X_test[:, 1], c=y_test, marker='v')

# Fit a 3-neighbour classifier and report its accuracy on the test split.
knn = KNeighborsClassifier(n_neighbors=3)
knn.fit(X_train, y_train)
knn.score(X_test, y_test)

In [None]:
# Classification Decision Boundary

from matplotlib.colors import ListedColormap

# NOTE: create_decision_boundary is defined in the "Decision Boundaries Code"
# cell further down, so that cell has to be run before this one.
neighbor_counts = [1, 3, 9, 15]
light_colors = ListedColormap(['orange', 'cornflowerblue'])
bold_colors = ListedColormap(['darkorange', 'darkblue'])

# One panel per neighbour count.
fig, axes = plt.subplots(1, 4, figsize=(10, 3))
for n, ax in zip(neighbor_counts, axes):
    clf = KNeighborsClassifier(n)
    create_decision_boundary(
        clf=clf,
        X=X,
        y=y,
        cmap_light=light_colors,
        cmap_bold=bold_colors,
        ax=ax,
    )


In [None]:
# Regression KNN

from sklearn.neighbors import KNeighborsRegressor
from matplotlib.colors import ListedColormap

X,y = mg.datasets.make_wave(n_samples = 40)
X_train,X_test,y_train,y_test = train_test_split(X,y,random_state = 1)

# here score calculates R squared
# The scores get their own names: binding them to `y` replaced the regression
# targets with a single float and broke the clf.fit(X, y) call further down.
for i in range(1,10):
    knr = KNeighborsRegressor(i)
    knr.fit(X_train,y_train)
    test_score = knr.score(X_test,y_test)
    train_score = knr.score(X_train,y_train)
    # Blue circle = test score, red triangle = training score.
    plt.plot(i,test_score,'bo')
    plt.plot(i,train_score,'rv')

# Creating decision boundary for KNN regressor
# One panel per neighbour count: the fitted curve over a dense grid of inputs
# plus the samples as red dots.
fig, axes = plt.subplots(1,4,figsize = (10,3))
z = np.linspace(-3,3,1000).reshape(-1,1)
for n,ax in zip([1,3,9,15],axes):
    clf = KNeighborsRegressor(n)
    clf.fit(X,y)
    ax.plot(z,clf.predict(z))
    ax.plot(X,y,'ro')


# Decision Boundaries Code

In [44]:
def create_decision_boundary(clf,X,y,cmap_light,cmap_bold,ax,h=.02) :
    """Fit ``clf`` on the first two features of ``X`` and draw its decision regions.

    Parameters
    ----------
    clf : estimator exposing ``fit(X, y)`` and ``predict(X)``
        Fitted inside this function on the two-column slice of ``X``.
    X : array-like with at least two columns
        Only the first two columns are used.
    y : array-like
        Labels; also used to colour the scattered samples.
    cmap_light : colormap
        Passed to ``ax.pcolormesh`` for the predicted regions.
    cmap_bold : colormap
        Passed to ``ax.scatter`` for the samples.
    ax : axes object exposing ``pcolormesh`` and ``scatter``
        Everything is drawn on this object.
    h : float, optional
        Step size of the prediction mesh (default 0.02).

    Returns
    -------
    None
    """
    # np.asarray lets non-ndarray inputs be sliced the same way.
    X = np.asarray(X)[:, :2]
    clf.fit(X, y)

    # Plot the decision boundary. For that, we will assign a color to each
    # point in the mesh [x_min, x_max]x[y_min, y_max].
    x_min, x_max = X[:, 0].min() - 1, X[:, 0].max() + 1
    y_min, y_max = X[:, 1].min() - 1, X[:, 1].max() + 1
    xx, yy = np.meshgrid(np.arange(x_min, x_max, h),
                         np.arange(y_min, y_max, h))
    Z = clf.predict(np.c_[xx.ravel(), yy.ravel()])

    # Put the result into a color plot. All drawing goes to `ax`; the former
    # plt.figure() call only opened an additional empty figure per call.
    Z = Z.reshape(xx.shape)
    ax.pcolormesh(xx, yy, Z, cmap=cmap_light)

    # Plot also the training points
    ax.scatter(X[:, 0], X[:, 1], c=y, cmap=cmap_bold,
               edgecolor='k', s=20)


# Synthetic Dataset

In [None]:
#Creating synthetic classification data

X,y = mg.datasets.make_forge()
df = pd.DataFrame(X)
# Printed explicitly: a bare expression in the middle of a cell is not shown.
print(df.describe())

points = plt.scatter(df[0],df[1], c = y)
# A single scatter call is a single artist, so passing two labels alone does
# not yield one legend entry per class. legend_elements() supplies one handle
# per distinct value of `c`.
handles, _ = points.legend_elements()
plt.legend(handles,["class 0 ","class 1"],loc =4)
# plt.show without parentheses only referenced the function and never ran it.
plt.show()

In [None]:
#Creating synthetic regression data

# Inputs X and targets y of the mglearn wave data, 40 samples.
X,y = mg.datasets.make_wave(n_samples = 40)
# Plot every target against its input as unconnected markers.
plt.plot(X,y,'o')
# Look for input values outside mean +/- 3 standard deviations; as the last
# expression, the returned tuple is what the cell displays.
find_anomalies(X)
#sns.boxplot(X)

# Real world datasets

In [None]:
# Cancer dataset

from sklearn.datasets import load_breast_cancer
cancer = load_breast_cancer()

# Intermediate results are printed: only the last expression of a cell is
# displayed, so the bare attribute look-ups showed nothing.
print('Shape of cancer data : {}'.format(cancer.data.shape))
print('Feature names : \n {}'.format(cancer.feature_names))
df = pd.DataFrame(cancer.data,columns = cancer.feature_names)

# Number of samples per target class, keyed by class name.
class_counts = dict(zip(cancer.target_names,np.bincount(cancer.target)))
print('Sample counts per class : {}'.format(class_counts))

# Scatter two of the features against each other, coloured by class, and
# label the axes with the feature names so the figure can be read on its own.
x_feature = cancer.feature_names[29]
y_feature = cancer.feature_names[1]
plt.scatter(df[x_feature],df[y_feature],c = cancer.target)
plt.xlabel(x_feature)
plt.ylabel(y_feature)

In [None]:
# Applying kneighbors on breast cancer

from sklearn.model_selection import train_test_split
X_train,X_test,y_train,y_test = train_test_split(cancer.data,cancer.target,random_state = 0)

# Baseline with a single neighbour; printed because a mid-cell expression is
# not displayed.
knn = KNeighborsClassifier(n_neighbors = 1)
knn.fit(X_train,y_train)
print('Test accuracy with 1 neighbor : {}'.format(knn.score(X_test,y_test)))

# Accuracy for 1..9 neighbours. The scores use their own names so the
# notebook-wide `y` and `z` variables are not overwritten with floats.
for i in range(1,10):
    knn = KNeighborsClassifier(n_neighbors = i)
    knn.fit(X_train,y_train)
    test_score = knn.score(X_test,y_test)
    train_score = knn.score(X_train,y_train)
    # Blue circle = test accuracy, red triangle = training accuracy.
    plt.plot(i,test_score,'bo')
    plt.plot(i,train_score,'rv')



# Linear Regression

In [None]:
#mg.plots.plot_linear_regression_wave()
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
X,y = mg.datasets.make_wave(n_samples = 40)
X_train,X_test,y_train,y_test = train_test_split(X,y,random_state = 0)

LR = LinearRegression()
# The fitted estimator is LR itself; binding the result of fit() to `y`
# replaced the target array with the estimator.
LR.fit(X_train,y_train)
print('Coefficients : {}'.format(LR.coef_))
print('Intercept : {}'.format(LR.intercept_))

# Fitted line over a dense grid, with training (red) and test (blue) samples.
z = np.linspace(-5,5,1000).reshape(-1,1)
plt.plot(z,LR.predict(z),'b')
plt.plot(X_train,y_train,'ro')
plt.plot(X_test,y_test,'bo')

# Printed because only the final expression of the cell is displayed.
print('Test score : {}'.format(LR.score(X_test,y_test)))
LR.score(X_train,y_train)

In [49]:
# Loading Boston Housing Dataset

X,y = mg.datasets.load_extended_boston()
# Printed because a bare expression in the middle of a cell is not displayed.
print('X shape : {}, y shape : {}'.format(X.shape,y.shape))
X_train,X_test,y_train,y_test = train_test_split(X,y,random_state = 0)

# LR is the LinearRegression instance created in the Linear Regression cell
# above; it is refitted here on the Boston training split.
lrfit = LR.fit(X_train,y_train)
print('Intercept : {}'.format(lrfit.intercept_))
print('Train score : {}'.format(LR.score(X_train,y_train)))
LR.score(X_test,y_test)

0.607472195966589

In [3]:
# Ridge Regression

from sklearn.linear_model import Ridge
from sklearn.model_selection import train_test_split
ridge = Ridge()
X,y = mg.datasets.load_extended_boston()
X_train,X_test,y_train,y_test = train_test_split(X,y,random_state = 0)
z = ridge.fit(X_train,y_train)

# Printed because only the last expression of a cell is displayed.
print('Intercept : {}'.format(z.intercept_))
print('Test score : {}'.format(ridge.score(X_test,y_test)))
ridge.score(X_train,y_train)

0.8857966585170941

In [None]:
# Changing Alpha test

import math
# Lasso is imported here because this cell runs before the lasso cell that
# imports it further down; without it a fresh Run All stops with a NameError.
from sklearn.linear_model import Lasso, Ridge

# The list and the loop variable have different names, so re-running the cell
# does not try to iterate over the last scalar value.
alphas = [0.00001,0.01,1,10,100,1000]
for alpha in alphas:
    lasso = Lasso(alpha=alpha)
    ridge = Ridge(alpha=alpha)
    ridge.fit(X_train,y_train)
    lasso.fit(X_train,y_train)
    ridge_test = ridge.score(X_test,y_test)
    ridge_train = ridge.score(X_train,y_train)
    lasso_test = lasso.score(X_test,y_test)
    lasso_train = lasso.score(X_train,y_train)
    # x axis is log10(alpha). Blue = ridge, cyan = lasso;
    # circle = test score, star = training score.
    log_alpha = math.log10(alpha)
    plt.plot(log_alpha,ridge_test,'bo')
    plt.plot(log_alpha,ridge_train,'b*')
    plt.plot(log_alpha,lasso_test,'co')
    plt.plot(log_alpha,lasso_train,'c*')

In [None]:
# coefficients comparison

from sklearn.linear_model import LinearRegression

# Three ridge models with different regularisation strengths plus an
# unregularised linear model.
ridge1 = Ridge(alpha=1)
ridge10 = Ridge(alpha=10)
# NOTE(review): the name suggests alpha=0.1 but the value is 0.01 — confirm.
ridge0_1 = Ridge(alpha=0.01)
LR = LinearRegression()

a = ridge1.fit(X_train, y_train)
b = ridge10.fit(X_train, y_train)
c = ridge0_1.fit(X_train, y_train)
d = LR.fit(X_train, y_train)

# One marker colour per model: blue, red, green, cyan in the order fitted.
for fitted_model, marker in ((a, 'bo'), (b, 'ro'), (c, 'go'), (d, 'co')):
    plt.plot(fitted_model.coef_, marker)
plt.ylim(-20, 20)

In [15]:
# lasso Regression

from sklearn.linear_model import Lasso
ls = Lasso(alpha=0.01,max_iter=100000)
p = ls.fit(X_train,y_train)

# Printed because only the last expression of a cell is displayed.
print('Intercept : {}'.format(p.intercept_))
print('Train score : {}'.format(ls.score(X_train,y_train)))
ls.score(X_test,y_test)

0.7656571174549983

In [36]:
# Elastic Net Combination of lasso and ridge

from sklearn.linear_model import ElasticNet
EN = ElasticNet(alpha=1,max_iter=10000,l1_ratio=0.0001)
# Stray characters after this call made the whole cell a syntax error.
EN.fit(X_train,y_train)
EN.coef_
# Score on the held-out split; as the last expression it is the cell output.
EN.score(X_test,y_test)

0.2756452492089402

 # Logistic Regression

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split

# Forge data, split with a fixed seed.
X, y = mg.datasets.make_forge()
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)

# Elastic-net penalised logistic regression fitted with the saga solver.
LG = LogisticRegression(
    penalty='elasticnet',
    solver='saga',
    l1_ratio=0.5,
    C=0.1,
    max_iter=1000,
)
LG.fit(X_train, y_train)

# Score on the training split first, then on the test split.
for features, labels in ((X_train, y_train), (X_test, y_test)):
    print(LG.score(features, labels))

In [None]:
# Making Decision Boundary for LogisticRegression

# Refit the logistic model on all of X, y (not only the training split).
clf = LG.fit(X, y)

# Draw the separating line, then the samples on top of it.
separator_style = dict(fill=False, eps=0.5, alpha=0.7)
mg.plots.plot_2d_separator(clf, X, **separator_style)
mg.discrete_scatter(X[:, 0], X[:, 1], y)

# Linear Support Vector machines

In [None]:
from sklearn.svm import LinearSVC
SVC = LinearSVC(C = 1,max_iter = 10000)
SVC.fit(X_train,y_train)
# Printed because a bare expression in the middle of a cell is not displayed.
print('Train score : {}'.format(SVC.score(X_train,y_train)))
print('Test score : {}'.format(SVC.score(X_test,y_test)))

# Refit on all of X, y, then draw the separating line and the samples.
clf = SVC.fit(X,y)
mg.plots.plot_2d_separator(clf,X,fill = False, eps = 0.5,alpha =0.7)
mg.discrete_scatter(X[:,0],X[:,1],y)