# Scikit-learn


#Scikit-learning là một thư viện Python mã nguồn mở triển khai một loạt các thuật toán học máy, tiền xử lý, xác nhận chéo và trực quan hóa bằng cách sử dụng một giao diện thống nhất

# A Basic Example

In [1]:
from sklearn import neighbors, datasets, preprocessing
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
iris = datasets.load_iris()
X, y = iris.data[:, :2], iris.target
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=33)
scaler = preprocessing.StandardScaler().fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)
knn = neighbors.KNeighborsClassifier(n_neighbors=5)
knn.fit(X_train, y_train)
y_pred = knn.predict(X_test)
accuracy_score(y_test, y_pred)

0.631578947368421

# 1. Loading The Data

#Your data needs to be numeric and stored as NumPy arrays or SciPy sparse 
matrices. Other types that are convertible to numeric arrays, such as Pandas 
DataFrame, are also acceptable

In [2]:
import numpy as np
X = np.random.random((10,5))
y = np.array(['M','M','F','F','M','F','M','M','F','F','F'])
X[X < 0.7] = 0

# 2. Training And Test Data

In [3]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)

ValueError: Found input variables with inconsistent numbers of samples: [10, 11]

# 3. Preprocessing The Data

# 3.1 Standardization

In [4]:
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler().fit(X_train)
standardized_X = scaler.transform(X_train)
standardized_X_test = scaler.transform(X_test)

# 3.2 Normalization

In [5]:
from sklearn.preprocessing import Normalizer
scaler = Normalizer().fit(X_train)
normalized_X = scaler.transform(X_train)
normalized_X_test = scaler.transform(X_test)

# 3.3 Binarization

In [6]:
from sklearn.preprocessing import Binarizer
binarizer = Binarizer(threshold=0.0).fit(X)
binary_X = binarizer.transform(X)

# 3.4 Encoding Categorical Features

In [7]:
from sklearn.preprocessing import LabelEncoder
enc = LabelEncoder()
y = enc.fit_transform(y)

# 3.5 Imputing Missing Values

In [8]:
from sklearn.preprocessing import Imputer
imp = Imputer(missing_values=0, strategy='mean', axis=0)
imp.fit_transform(X_train)

ImportError: cannot import name 'Imputer' from 'sklearn.preprocessing' (C:\Users\Admin\anaconda3\lib\site-packages\sklearn\preprocessing\__init__.py)

# 3.6 Generating Polynomial Features

In [9]:
from sklearn.preprocessing import PolynomialFeatures
poly = PolynomialFeatures(5)
poly.fit_transform(X) 

array([[1.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [1.        , 0.        , 0.97985193, ..., 0.        , 0.        ,
        0.        ],
       [1.        , 0.85966937, 0.        , ..., 0.        , 0.        ,
        0.        ],
       ...,
       [1.        , 0.        , 0.        , ..., 0.8497102 , 0.82639196,
        0.80371364],
       [1.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [1.        , 0.        , 0.72522627, ..., 0.        , 0.        ,
        0.        ]])

# 4. Create Your Model

# 4.1 Supervised Learning Estimators

#Linear Regression

In [10]:
from sklearn.linear_model import LinearRegression
lr = LinearRegression(normalize=True)

#Support Vector Machines (SVM)

In [11]:
from sklearn.svm import SVC
svc = SVC(kernel='linear')

#Naive Bayes 

In [12]:
from sklearn.naive_bayes import GaussianNB
gnb = GaussianNB()

#KNN

In [13]:
from sklearn import neighbors
knn = neighbors.KNeighborsClassifier(n_neighbors=5)

# 4.2 Unsupervised Learning Estimators

# 4.2.1 Principal Component Analysis (PCA)

In [14]:
from sklearn.decomposition import PCA
pca = PCA(n_components=0.95)

# 4.2.2 K Means

In [15]:
from sklearn.cluster import KMeans
k_means = KMeans(n_clusters=3, random_state=0)

# 5. Model Fitting

# 5.1 Supervised learning

In [16]:
lr.fit(X, y) #Fit the model to the data
knn.fit(X_train, y_train)
svc.fit(X_train, y_train)

ValueError: Found input variables with inconsistent numbers of samples: [10, 11]

# 5.2 Unsupervised Learning

In [17]:
k_means.fit(X_train)                   #Fit the model to the data                                 
pca_model = pca.fit_transform(X_train) # Fit to data, then transform it

# 6. Prediction

# 6.1 Supervised Estimators

In [18]:
y_pred = svc.predict(np.random.random((2,5))) #Predict labels
y_pred = lr.predict(X_test) #Predict labels
y_pred = knn.predict_proba(X_test)#Predict labels

NotFittedError: This SVC instance is not fitted yet. Call 'fit' with appropriate arguments before using this estimator.

# 6.2 Unsupervised Estimators

In [19]:
y_pred = k_means.predict(X_test) #Predict labels in clustering algos

# 7. Evaluate Your Model’s Performance

# 7.1 Classification Metrics

#Accuracy Score

In [20]:
knn.score(X_test, y_test)   #Estimator score method
from sklearn.metrics import accuracy_score #Metric scoring functions 
accuracy_score(y_test, y_pred)

NotFittedError: This KNeighborsClassifier instance is not fitted yet. Call 'fit' with appropriate arguments before using this estimator.

#Classification Report

In [21]:
from sklearn.metrics import classification_report         #Precision, recall, f1-score
print(classification_report(y_test, y_pred))              #and suppor

              precision    recall  f1-score   support

           0       0.00      0.00      0.00         8
           1       0.00      0.00      0.00        11
           2       0.71      0.63      0.67        19

    accuracy                           0.32        38
   macro avg       0.24      0.21      0.22        38
weighted avg       0.35      0.32      0.33        38



#Confusion Matrix

In [22]:
from sklearn.metrics import confusion_matrix
print(confusion_matrix(y_test, y_pred))

[[ 0  8  0]
 [ 6  0  5]
 [ 7  0 12]]


# 7.2 Regression Metrics

#Mean Absolute Error

In [23]:
from sklearn.metrics import mean_absolute_error
y_true = [3, -0.5, 2]
mean_absolute_error(y_true, y_pred)

ValueError: Found input variables with inconsistent numbers of samples: [3, 38]

#Mean Squared Error

In [24]:
from sklearn.metrics import mean_squared_error
mean_squared_error(y_test, y_pred)

1.236842105263158

#R² Score

In [25]:
from sklearn.metrics import r2_score
r2_score(y_true, y_pred)

ValueError: Found input variables with inconsistent numbers of samples: [3, 38]

# 7.3 Clustering Metrics

#Adjusted Rand Index

In [26]:
from sklearn.metrics import adjusted_rand_score
adjusted_rand_score(y_true, y_pred)



ValueError: Found input variables with inconsistent numbers of samples: [3, 38]

#Homogeneity

In [27]:
from sklearn.metrics import homogeneity_score
homogeneity_score(y_true, y_pred) 



ValueError: Found input variables with inconsistent numbers of samples: [3, 38]

#V-measure

In [28]:
from sklearn.metrics import v_measure_score
metrics.v_measure_score(y_true, y_pred) 

NameError: name 'metrics' is not defined

# 7.4 Cross-Validation

In [29]:
from sklearn.cross_validation import cross_val_score
print(cross_val_score(knn, X_train, y_train, cv=4))
print(cross_val_score(lr, X, y, cv=2))

ModuleNotFoundError: No module named 'sklearn.cross_validation'

# 8. Tune Your Model

# 8.1 Grid Search

In [30]:
from sklearn.grid_search import GridSearchCV
params = {"n_neighbors": np.arange(1,3), "metric": ["euclidean", "cityblock"]}
grid = GridSearchCV(estimator=knn, 
param_grid=params)
grid.fit(X_train, y_train)
print(grid.best_score_)
print(grid.best_estimator_.n_neighbors)

ModuleNotFoundError: No module named 'sklearn.grid_search'

# 8.2 Randomized Parameter Optimization

In [31]:
from sklearn.grid_search import RandomizedSearchCV
params = {"n_neighbors": range(1,5), "weights": ["uniform", "distance"]}
rsearch = RandomizedSearchCV(estimator=knn, param_distributions=params, cv=4,n_iter=8, random_state=5)
rsearch.fit(X_train, y_train)
print(rsearch.best_score_)

ModuleNotFoundError: No module named 'sklearn.grid_search'