In [None]:
from IPython.core.interactiveshell import InteractiveShell
# Show the value of every bare expression statement in a cell, not only the
# last one. Many later cells list several expressions and rely on this.
InteractiveShell.ast_node_interactivity = "all"

In [None]:
# Bind two throwaway integers, then echo each one as its own expression
# statement so the notebook displays both values.
x, y = 1, 2
x
y

# Load sample dataset

In [None]:
from sklearn.datasets import load_iris
from sklearn.datasets import load_diabetes

In [None]:
# Load the iris dataset; the cells below inspect the returned object and its
# DESCR, data, target and target_names attributes one at a time.
iris_dataset = load_iris()

In [None]:
# Echo the whole returned object.
iris_dataset

In [None]:
# Text description carried by the dataset object.
print(iris_dataset.DESCR)

In [None]:
# Feature values.
iris_dataset.data


In [None]:
# Labels.
iris_dataset.target

In [None]:

# Names associated with the label values.
iris_dataset.target_names

# Loading data from the web

In [None]:
# Import from the public package namespace. The private
# sklearn.datasets.california_housing module path was deprecated and later
# removed, so importing through it fails on current scikit-learn releases.
from sklearn.datasets import fetch_california_housing

In [None]:
# Fetch the California housing dataset (this section loads data from the web).
houses = fetch_california_housing()

In [None]:
# Text description carried by the dataset object.
print(houses.DESCR)

In [None]:
# Dimensions of the feature matrix and the names of its columns.
houses.data.shape
houses.feature_names

In [None]:
# load_diabetes was imported together with load_iris near the top.
data = load_diabetes()
print(data.target)

# Generate dataset

In [None]:
from sklearn.datasets import make_regression
# Synthetic regression problem: 100 samples, a single feature, light noise.
# random_state pins the generator so the sample (and the plot drawn from it
# below) is identical on every Restart & Run All.
x,y = make_regression(n_samples=100, n_features=1, noise=0.005, random_state=0)

In [None]:
# First generated sample: its feature row and its target value.
x[0], y[0]

In [None]:
import seaborn as sns
sns.set(color_codes=True)
# Scatter of the samples with a fitted regression line. The trailing semicolon
# is the usual notebook idiom for hiding the returned object's repr.
sns.regplot(x=x, y=y);

# Load data from openml.org

In [None]:
from sklearn.datasets import fetch_openml

In [None]:
# Request NumPy arrays explicitly. Recent scikit-learn returns a pandas
# DataFrame for dense OpenML datasets by default, which would make the
# positional row lookup mice.data[0] in the next cell raise a KeyError.
mice = fetch_openml(name='miceprotein', version=4, as_frame=False)

In [None]:
# First row of the feature matrix. Positional indexing like this assumes
# mice.data is a NumPy array; a DataFrame would need .iloc[0] — TODO confirm.
mice.data[0]

In [None]:
import numpy as np

# Random 3x10 array; presumably a quick illustration of what .shape reports.
d = np.random.rand(3,10)

In [None]:
d.shape

In [None]:
# Dimensions of the fetched feature matrix.
mice.data.shape

In [None]:
print(mice.DESCR)

In [None]:
mice.target_names

In [None]:
import pandas as pd

# 5.3. Preprocessing data

## Standardization, or mean removal and variance scaling

Standardization of datasets is a common requirement for many machine learning estimators implemented in scikit-learn; they might behave badly if the individual features do not more or less look like standard normally distributed data: Gaussian with zero mean and unit variance.

In practice we often ignore the shape of the distribution and just transform the data to center it by removing the mean value of each feature, then scale it by dividing non-constant features by their standard deviation.

In [None]:
import numpy as np
from sklearn import preprocessing

# Toy design matrix: each row is one sample (a patient, subject or flower) and
# each column is one feature measured on that sample.
X_train = np.array([
    [1.0, -1.0, 2.0],
    [2.0, 0.0, 0.0],
    [0.0, 1.0, -1.0],
])

# Standardize the matrix; the following cells check the resulting column
# means and standard deviations.
X_scaled = preprocessing.scale(X_train)
X_scaled


In [None]:
# Per-column (axis=0) means of the standardized matrix.
X_scaled.mean(axis=0)

In [None]:
# Same standardization applied to the iris features: show the raw column
# means, then the column means and standard deviations after scaling.
idata = iris_dataset.data
id_scaled = preprocessing.scale(idata)
idata.mean(axis=0)
id_scaled.mean(axis=0)
id_scaled.std(axis=0)

In [None]:
# Column means before and after scaling.
X_train.mean(axis=0)
X_scaled.mean(axis=0)

In [None]:
# Column standard deviations after scaling.
X_scaled.std(axis=0)

In [None]:
# axis=1 aggregates along each row instead of each column.
X_scaled.mean(axis=1)

In [None]:
X_scaled.std(axis=1)

# The fitted scaler can be saved and applied to test data

In [None]:
# Fit once on the training data so the same fitted scaler can be reused below.
scaler = preprocessing.StandardScaler().fit(X_train)
                                      
scaler.transform(X_train)      

In [None]:
# Transform unseen data with the scaler that was fitted on X_train.
X_test = [[-1., 1., 0.]]
scaler.transform(X_test)  

# Scaling features to a range

In [None]:
# Toy matrix again: rows are samples, columns are features.
X_train = np.array([
    [1.0, -1.0, 2.0],
    [2.0, 0.0, 0.0],
    [0.0, 1.0, -1.0],
])

# Fit the min-max scaler on the training matrix and transform it in one call,
# then echo the rescaled result.
min_max_scaler = preprocessing.MinMaxScaler()
X_train_minmax = min_max_scaler.fit_transform(X_train)
X_train_minmax


In [None]:
# Reuse the scaler fitted on X_train; only transform() is called here, so the
# test row is rescaled with the training fit.
X_test = np.array([[-3., -1.,  4.]])
X_test_minmax = min_max_scaler.transform(X_test)
X_test_minmax


# Pre-processing data - Non-linear transformation

In [None]:
import pandas as pd
%matplotlib inline
# Relative path: the CSV is looked up in the notebook's working directory.
df = pd.read_csv('international-airline-passengers.csv')
display(df)

In [None]:
# Column labels of the loaded frame.
df.keys()

In [None]:
# Histogram of the 'passengers' column using 20 bins.
df['passengers'].hist(bins=20)

In [None]:
import numpy as np
# Keep the log-transformed values in their own Series instead of overwriting
# df['passengers']. The in-place assignment made this cell non-idempotent
# (running it twice took the log of the log) and discarded the raw values
# that the histogram in the previous cell was drawn from.
passengers_log = np.log(df['passengers'])
passengers_log.hist(bins=20)

# 5.3.3. Normalization

In [None]:
from sklearn import preprocessing

# Background reading on the difference between the L1 and L2 norms:
# http://www.chioka.in/differences-between-the-l1-norm-and-the-l2-norm-least-absolute-deviations-and-least-squares/
X = [
    [1.0, -1.0, 2.0],
    [2.0, 0.0, 0.0],
    [0.0, 1.0, -1.0],
]

# Normalize with the L1 norm and echo the result.
X_normalized = preprocessing.normalize(X, norm='l1')
X_normalized

# The normalizer can be saved for future use

In [None]:
# Estimator-style normalizer. No norm argument is passed, so it uses the
# class default rather than the explicit 'l1' used with normalize() above.
normalizer = preprocessing.Normalizer().fit(X)  # fit does nothing
normalizer

In [None]:
normalizer.transform(X)    

In [None]:
# Normalize one new sample with the stored normalizer.
tmp = normalizer.transform([[2.,  1., 0.]]) 

In [None]:
tmp

In [None]:
tmp.mean()

In [None]:
# Sum of squared entries, i.e. the squared L2 norm of the normalized row.
(tmp*tmp).sum()

# Preprocessing data - Encoding

In [None]:
from sklearn import preprocessing
enc = preprocessing.OrdinalEncoder()
# Two samples with three categorical features each: gender, location, browser.
X = [['male', 'from US', 'uses Safari'], 
     ['female', 'from Europe', 'uses Firefox']]
# Fit the encoder on these two rows.
enc.fit(X)  

In [None]:
# Encode a new combination built from values that appear in X.
enc.transform([['female', 'from US', 'uses Safari']])

In [None]:
enc.transform([['male', 'from Europe', 'uses Safari']])

In [None]:
# Spell out the full category list for every feature, including values that do
# not occur in the two rows of X below.
genders = ['female', 'male']
locations = ['from Africa', 'from Asia', 'from Europe', 'from US']
browsers = ['uses Chrome', 'uses Firefox', 'uses IE', 'uses Safari']
enc = preprocessing.OneHotEncoder(categories=[genders, locations, browsers])
X = [['male', 'from US', 'uses Safari'], ['female', 'from Europe', 'uses Firefox']]
enc.fit(X) 
enc.categories_


In [None]:
# 'from Asia' and 'uses Chrome' are absent from X but present in the explicit
# category lists. .toarray() converts the transform result for display.
enc.transform([['female', 'from Asia', 'uses Chrome']]).toarray()

In [None]:
tmp = enc.transform([['female', 'from Asia', 'uses Chrome'],
                    ['male', 'from Europe', 'uses Safari']]).toarray()
tmp


In [None]:
# Hand-written 10-element vector (2 gender + 4 location + 4 browser slots).
# With the category order listed above it marks 'male', 'from Europe' and
# 'uses Safari' — presumably written out to compare with the second row of tmp.
[0, 1, 0, 0, 1, 0, 0, 0, 0, 1]

In [None]:
# Map the encoded rows back to their category labels.
enc.inverse_transform(tmp)

# 5.3.5. Discretization

Discretization (otherwise known as quantization or binning) provides a way to partition continuous features into discrete values. Certain datasets with continuous features may benefit from discretization, because discretization can transform the dataset of continuous attributes to one with only nominal attributes.

One-hot encoded discretized features can make a model more expressive, while maintaining interpretability. For instance, pre-processing with a discretizer can introduce nonlinearity to linear models.

In [None]:
# Three samples with three continuous features.
X = np.array([[ -3., 5., 15 ],
              [  0., 6., 14 ],
              [  6., 3., 11 ]])
# n_bins gives one bin count per feature column (3, 2 and 4); the result is
# requested in 'ordinal' encoding.
est = preprocessing.KBinsDiscretizer(n_bins=[3, 2, 4], encode='ordinal').fit(X)

In [None]:
# Discretize the same matrix the estimator was fitted on.
est.transform(X)

In [None]:
# Reference for this section (scikit-learn user guide, k-bins discretization):
#https://scikit-learn.org/stable/modules/preprocessing.html#k-bins-discretization

# 5.4.2. Univariate feature imputation

In [None]:
# Example 1
import numpy as np
from sklearn.impute import SimpleImputer
# Treat np.nan as the missing marker and fill it using the 'mean' strategy.
imp = SimpleImputer(missing_values=np.nan, strategy='mean')
# Fit data: the non-missing values are 1 and 7 in the first column and
# 2, 3 and 6 in the second column.
imp.fit([[1, 2],
         [np.nan, 3], 
         [7, 6]])  




# New data with one missing entry in each column; it is transformed with the
# imputer fitted above, not refitted.
X = [[np.nan, 2], 
     [6, np.nan], 
     [7, 6]]
print(imp.transform(X))      





In [None]:
# (2 + 3 + 6) / 3, the mean of the second fit column — presumably a hand check
# of the value imputed for the missing entry in X's second column.
11/3

In [None]:
import pandas as pd
# Categorical frame with one missing entry in each column.
df = pd.DataFrame([["a", "x"],
                   [np.nan, "y"],
                   ["a", np.nan],
                   ["b", "y"]], dtype="category")

# Impute using the 'most_frequent' strategy and print the filled array.
imp = SimpleImputer(strategy="most_frequent")
print(imp.fit_transform(df)) 

In [None]:
import pandas as pd
# Same as the previous cell except that the third row holds "c" instead of
# "a", so every non-missing value in the first column now occurs exactly once.
df = pd.DataFrame([["a", "x"],
                   [np.nan, "y"],
                   ["c", np.nan],
                   ["b", "y"]], dtype="category")

imp = SimpleImputer(strategy="most_frequent")
print(imp.fit_transform(df)) 

# Splitting data into Train and Test

In [None]:
import numpy as np
from sklearn.model_selection import train_test_split
# X holds the features (five samples, two columns built from 0..9);
# y holds one label per sample.
X = np.arange(10).reshape(5, 2)
y = [0, 1, 0, 0, 1]
X
list(y)

In [None]:
# No random_state is given here, so nothing in this call fixes the shuffle.
# The four pieces are echoed one after another.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2)

X_train

y_train

X_test

y_test

In [None]:
# Larger test share, with a fixed integer seed.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.33, random_state=43)

X_train

y_train

X_test

y_test

In [None]:
# Same call with a different integer seed, for comparison with the cell above.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.33, random_state=42)

X_train

y_train

X_test

y_test

In [None]:
# random_state is the np.random module itself rather than an integer, so the
# split is not pinned to a seed by this call.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.33, random_state=np.random)

X_train

y_train

X_test

y_test

In [None]:
# Identical to the previous cell; presumably repeated to compare two unseeded
# splits side by side.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.33, random_state=np.random)

X_train

y_train

X_test

y_test

In [None]:
import pandas as pd
from sklearn import datasets, linear_model
from sklearn.model_selection import train_test_split
from matplotlib import pyplot as plt

In [None]:
# Load the diabetes dataset and show the dimensions of its feature matrix.
diabetes = datasets.load_diabetes()
diabetes.data.shape

In [None]:
feature_names = diabetes.feature_names
feature_names

In [None]:
print(diabetes.DESCR)

In [None]:
# Wrap the feature matrix in a DataFrame labelled with the feature names and
# keep the target separately as y.
df = pd.DataFrame(diabetes.data, columns=feature_names)
y = diabetes.target
df
y

In [None]:
diabetes.target

In [None]:
# df.shape and len(y) are shown so the row counts can be compared.
df.shape

In [None]:
len(y)

In [None]:
# Hold out 20% of the diabetes rows for testing. random_state is the
# np.random module, so the split is not pinned to a seed by this call.
X_train, X_test, y_train, y_test = train_test_split(
    df, y, test_size=0.2, random_state=np.random)

X_train

y_train

X_test

y_test

In [None]:
# Sizes of the four pieces produced by the split above.
X_train.shape

len(y_train)

X_test.shape

len(y_test)

# Linear Regression 
Ref: https://www.kaggle.com/getting-started/59856

In [None]:
import numpy as np
from sklearn.linear_model import LinearRegression

# Four samples with two features; the targets follow
# y = 1 * x_0 + 2 * x_1 + 3 exactly.
X = np.array([[1, 1], [1, 2], [2, 2], [2, 3]])
y = X @ np.array([1, 2]) + 3

reg = LinearRegression().fit(X, y)

# Echoed in order: the score on the training data, the fitted coefficients,
# the intercept, and the prediction for one new point.
reg.score(X, y)
reg.coef_
reg.intercept_
reg.predict(np.array([[3, 5]]))


In [None]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn import metrics
from sklearn.metrics import r2_score

In [None]:
# Relative path: the CSV is looked up in the notebook's working directory.
dataset=pd.read_csv('Salary_Data.csv')

In [None]:
# First rows of the loaded frame.
dataset.head()

In [None]:
# (rows, columns) of the loaded frame.
dataset.shape

In [None]:
# Features are every column except the last; the target is the last column.
# Selecting the target with -1 rather than the hard-coded position 1 keeps it
# the exact complement of X, so the target can never also appear among the
# features. For a two-column frame this picks the same column as before.
X = dataset.iloc[:, :-1].values
y = dataset.iloc[:, -1].values

In [None]:
# Echo the feature array and the target array extracted in the previous cell.
X
y

In [None]:
from sklearn.model_selection import train_test_split
# Hold out a third of the rows for testing. random_state pins the shuffle so
# the RMSE, R^2 and plots computed below are reproducible across runs; the
# value 0 matches the seeded split used in the Naive Bayes section.
X_train, X_test, y_train, y_test=train_test_split(X, y, test_size=0.33, random_state=0)

In [None]:
from sklearn.linear_model import LinearRegression
# Fit a linear regression on the training portion only.
regressor=LinearRegression()
regressor.fit(X_train,y_train)

In [None]:
# Predict the held-out rows, echo predictions next to the true values, then
# report the root of the mean squared error.
y_pred=regressor.predict(X_test)
y_pred
y_test
np.sqrt(metrics.mean_squared_error(y_test, y_pred))

In [None]:
# R^2 of the predictions on the held-out rows.
r2_score(y_test, y_pred)

In [None]:
# Fitted coefficient(s) and intercept.
regressor.coef_

In [None]:
regressor.intercept_

In [None]:
# Training points (red) against the fitted regression line (blue).
plt.scatter(X_train,y_train,color='red')
plt.plot(X_train,regressor.predict(X_train),color='blue')
plt.title('Salary VS Experience (Training Data)')
# Axis label spelling corrected (was 'experiene').
plt.xlabel('Years of experience')
plt.ylabel('Salary')
plt.show()

In [None]:
# Held-out points (red) against the model's predictions for them (blue).
plt.scatter(X_test,y_test,color='red')
plt.plot(X_test,regressor.predict(X_test),color='blue')
plt.title('Salary VS Experience (Test Data)');
# Axis label spelling corrected (was 'experiene').
plt.xlabel('Years of experience');
plt.ylabel('Salary');
plt.show()

# Naive Bayes

In [None]:
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn import metrics
from sklearn.naive_bayes import GaussianNB

In [None]:
# Scatter plot of two iris features, coloured by class, with a labelled colorbar.
# Load the data
from sklearn.datasets import load_iris
iris = load_iris()

from matplotlib import pyplot as plt

# The indices of the features that we are plotting
x_index = 0
y_index = 1

# this formatter will label the colorbar with the correct target names
# (each tick value i is cast to int and used as an index into target_names)
formatter = plt.FuncFormatter(lambda i, *args: iris.target_names[int(i)])

plt.figure(figsize=(5, 4))
# One point per sample; c=iris.target colours each point by its label.
plt.scatter(iris.data[:, x_index], iris.data[:, y_index], c=iris.target)
# Ticks at 0, 1 and 2 are rendered through the formatter defined above.
plt.colorbar(ticks=[0, 1, 2], format=formatter)
# Axis labels come from the feature names at the chosen indices.
plt.xlabel(iris.feature_names[x_index])
plt.ylabel(iris.feature_names[y_index])

plt.tight_layout()
plt.show()

In [None]:
# Load iris again under a separate name for the Naive Bayes example.
irisdata = load_iris()

In [None]:
print(irisdata.DESCR)

In [None]:
irisdata.data

In [None]:
irisdata.target

In [None]:
# Features and labels used for the classifier below. Note the upper-case Y
# here, unlike the lower-case y used in earlier sections.
X = irisdata.data
Y = irisdata.target

In [None]:
# Hold out 20% of the samples for testing; random_state=0 pins the split.
X_train, X_test, y_train, y_test=train_test_split(X, Y, test_size=0.2, random_state=0)

In [None]:
# Fit a Gaussian naive Bayes classifier on the training portion.
gnb = GaussianNB()
gnb.fit(X_train, y_train)

In [None]:
# Predicted labels for the held-out samples.
y_pred = gnb.predict(X_test)

In [None]:
# Accuracy on the held-out samples, multiplied by 100 to read as a percentage.
100*metrics.accuracy_score(y_test, y_pred)

In [None]:
# NOTE(review): scratch arithmetic that uses nothing from the cells above.
# Presumably 400 samples made up of 380 of class "B" and 20 of class "M", with
# 380/400 being the accuracy of always predicting "B" — confirm or remove.
# 400 
380 # B
20 # M

380/400