<a href="https://colab.research.google.com/github/mariuszkr33/dw_matrix/blob/master/sklearn3.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression


# Load the Iris bundle and pull out the feature matrix and the class labels.
data_raw = load_iris()
data, target = data_raw.data, data_raw.target

# Hold out 30% of the samples for evaluation; the fixed seed makes the
# shuffle (and therefore the split) reproducible across runs.
data_train, data_test, target_train, target_test = train_test_split(
    data,
    target,
    test_size=0.3,
    random_state=20,
)

In [None]:
# Build the classifier and train it in one step (fit() returns the estimator).
# max_iter is raised to 1000 — presumably so the solver has room to converge.
model = LogisticRegression(max_iter=1000).fit(data_train, target_train)

# Score the fitted model on the held-out split and show the value.
accuracy = model.score(data_test, target_test)
accuracy

0.9333333333333333

In [None]:
# Report the held-out accuracy formatted to four decimal places.
print(f'accuracy: {accuracy:.4f}')

accuracy: 0.9333


In [None]:
# Display the fitted estimator's repr to inspect the hyperparameters in effect.
model

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=1000,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

In [None]:
# Predict a class label for every held-out sample and print the label vector.
target_pred = model.predict(data_test)
print(target_pred)

[0 1 1 2 1 1 2 0 2 0 2 1 1 0 0 2 0 1 2 1 1 2 2 0 1 1 1 0 2 1 1 1 0 0 0 1 1
 0 1 2 1 2 0 1 1]


In [None]:
from sklearn.metrics import confusion_matrix
# Cross-tabulate true labels against predicted labels for the held-out split.
cm = confusion_matrix(y_true=target_test, y_pred=target_pred)
cm

array([[13,  0,  0],
       [ 0, 18,  0],
       [ 0,  3, 11]])

In [None]:
from sklearn.metrics import classification_report
# Per-class precision / recall / F1 summary for the held-out predictions.
print(classification_report(y_true=target_test, y_pred=target_pred))

              precision    recall  f1-score   support

           0       1.00      1.00      1.00        13
           1       0.86      1.00      0.92        18
           2       1.00      0.79      0.88        14

    accuracy                           0.93        45
   macro avg       0.95      0.93      0.93        45
weighted avg       0.94      0.93      0.93        45



In [None]:
import tensorflow as tf


# Record the installed TensorFlow version in the notebook output.
# NOTE(review): in the cells visible here, tensorflow is used only for this
# version print — confirm whether the import is needed at all.
print(tf.__version__)
import pandas as pd
from sklearn.preprocessing import LabelEncoder


# Small hand-written table used to demonstrate categorical encoding.
# NOTE(review): `data` is reused here as a plain dict, replacing the array
# bound to the same name earlier in the notebook.
data = {
    'size': ['XL', 'L', 'M', 'L', 'M'],
    'color': ['red', 'green', 'blue', 'green', 'red'],
    'gender': ['female', 'male', 'male', 'female', 'female'],
    'price': [199.0, 89.0, 99.0, 129.0, 79.0],
    'weight': [500, 450, 300, 380, 410],
    'bought': ['yes', 'no', 'yes', 'no', 'yes']
}

# Build the frame and fix the dtypes in a single pass: the four text columns
# become pandas categoricals and the integer weights are cast to float.
df = pd.DataFrame(data).astype({
    'size': 'category',
    'color': 'category',
    'gender': 'category',
    'bought': 'category',
    'weight': 'float',
})
df

2.2.0


Unnamed: 0,size,color,gender,price,weight,bought
0,XL,red,female,199.0,500.0,yes
1,L,green,male,89.0,450.0,no
2,M,blue,male,99.0,300.0,yes
3,L,green,female,129.0,380.0,no
4,M,red,female,79.0,410.0,yes


In [None]:
# Learn the label vocabulary of the 'bought' column first, then overwrite the
# column with the corresponding integer codes.
# NOTE(review): this replaces the column in place, so re-running the cell
# refits the encoder on the integer codes instead of the original labels.
le = LabelEncoder().fit(df['bought'])
df['bought'] = le.transform(df['bought'])
df

Unnamed: 0,size,color,gender,price,weight,bought
0,XL,red,female,199.0,500.0,1
1,L,green,male,89.0,450.0,0
2,M,blue,male,99.0,300.0,1
3,L,green,female,129.0,380.0,0
4,M,red,female,79.0,410.0,1


In [None]:
from sklearn.preprocessing import OneHotEncoder
# Build a one-hot encoder that returns a dense array rather than a sparse matrix.
# The keyword controlling this was renamed: `sparse` was deprecated in
# scikit-learn 1.2 and removed in 1.4 in favour of `sparse_output`. Try the
# current name first and fall back to the legacy one so the cell runs on both
# old and new releases.
try:
    encoder = OneHotEncoder(sparse_output=False)
except TypeError:
    # Older scikit-learn: the constructor does not know `sparse_output`.
    encoder = OneHotEncoder(sparse=False)

# Learn the categories of the 'size' column (passed as a one-column frame),
# then show the encoded rows followed by the learned category order.
encoder.fit(df[['size']])
print(encoder.transform(df[['size']]))
print(encoder.categories_)

[[0. 0. 1.]
 [1. 0. 0.]
 [0. 1. 0.]
 [1. 0. 0.]
 [0. 1. 0.]]
[array(['L', 'M', 'XL'], dtype=object)]


In [None]:
from sklearn.datasets import load_breast_cancer

# Load the breast-cancer dataset bundle and print its bundled description text.
raw_data = load_breast_cancer()
print(raw_data.DESCR)

.. _breast_cancer_dataset:

Breast cancer wisconsin (diagnostic) dataset
--------------------------------------------

**Data Set Characteristics:**

    :Number of Instances: 569

    :Number of Attributes: 30 numeric, predictive attributes and the class

    :Attribute Information:
        - radius (mean of distances from center to points on the perimeter)
        - texture (standard deviation of gray-scale values)
        - perimeter
        - area
        - smoothness (local variation in radius lengths)
        - compactness (perimeter^2 / area - 1.0)
        - concavity (severity of concave portions of the contour)
        - concave points (number of concave portions of the contour)
        - symmetry 
        - fractal dimension ("coastline approximation" - 1)

        The mean, standard error, and "worst" or largest (mean of the three
        largest values) of these features were computed for each image,
        resulting in 30 features.  For instance, field 3 is Mean Radius, f

In [None]:
import numpy as np

# Global NumPy display settings for the rest of the notebook.
np.set_printoptions(
    linewidth=100,   # characters per output line before wrapping
    precision=2,     # digits shown after the decimal point
    suppress=True,   # print small floats in fixed-point, not scientific, notation
)

In [None]:
# Pull the feature matrix and the label vector out of the dataset bundle.
data, target = raw_data.data, raw_data.target

# Peek at the first three feature rows.
print(data[:3])

[[  17.99   10.38  122.8  1001.      0.12    0.28    0.3     0.15    0.24    0.08    1.09    0.91
     8.59  153.4     0.01    0.05    0.05    0.02    0.03    0.01   25.38   17.33  184.6  2019.
     0.16    0.67    0.71    0.27    0.46    0.12]
 [  20.57   17.77  132.9  1326.      0.08    0.08    0.09    0.07    0.18    0.06    0.54    0.73
     3.4    74.08    0.01    0.01    0.02    0.01    0.01    0.     24.99   23.41  158.8  1956.
     0.12    0.19    0.24    0.19    0.28    0.09]
 [  19.69   21.25  130.   1203.      0.11    0.16    0.2     0.13    0.21    0.06    0.75    0.79
     4.58   94.03    0.01    0.04    0.04    0.02    0.02    0.     23.57   25.53  152.5  1709.
     0.14    0.42    0.45    0.24    0.36    0.09]]


In [None]:
# Append the label vector to the feature matrix as one extra trailing column.
all_data = np.column_stack((data, target))

# Peek at the first three combined rows (last value in each row is the label).
print(all_data[:3])

[[  17.99   10.38  122.8  1001.      0.12    0.28    0.3     0.15    0.24    0.08    1.09    0.91
     8.59  153.4     0.01    0.05    0.05    0.02    0.03    0.01   25.38   17.33  184.6  2019.
     0.16    0.67    0.71    0.27    0.46    0.12    0.  ]
 [  20.57   17.77  132.9  1326.      0.08    0.08    0.09    0.07    0.18    0.06    0.54    0.73
     3.4    74.08    0.01    0.01    0.02    0.01    0.01    0.     24.99   23.41  158.8  1956.
     0.12    0.19    0.24    0.19    0.28    0.09    0.  ]
 [  19.69   21.25  130.   1203.      0.11    0.16    0.2     0.13    0.21    0.06    0.75    0.79
     4.58   94.03    0.01    0.04    0.04    0.02    0.02    0.     23.57   25.53  152.5  1709.
     0.14    0.42    0.45    0.24    0.36    0.09    0.  ]]


In [None]:
import pandas as pd

# Limit how many columns pandas renders and widen the text display.
pd.set_option('display.max_columns', 10)
pd.set_option('display.width', 200)

# Wrap the combined array in a DataFrame: the dataset's own feature names
# label the feature columns and the appended label column is named 'target'.
df = pd.DataFrame(all_data, columns=[*raw_data['feature_names'], 'target'])
df.head()

Unnamed: 0,mean radius,mean texture,mean perimeter,mean area,mean smoothness,...,worst concavity,worst concave points,worst symmetry,worst fractal dimension,target
0,17.99,10.38,122.8,1001.0,0.1184,...,0.7119,0.2654,0.4601,0.1189,0.0
1,20.57,17.77,132.9,1326.0,0.08474,...,0.2416,0.186,0.275,0.08902,0.0
2,19.69,21.25,130.0,1203.0,0.1096,...,0.4504,0.243,0.3613,0.08758,0.0
3,11.42,20.38,77.58,386.1,0.1425,...,0.6869,0.2575,0.6638,0.173,0.0
4,20.29,14.34,135.1,1297.0,0.1003,...,0.4,0.1625,0.2364,0.07678,0.0


In [None]:
from sklearn.model_selection import train_test_split

# Hold out a quarter of the samples for testing; the seed fixes the shuffle.
# NOTE(review): the split is not stratified, so the class balance of the two
# parts can drift from the full dataset (see the proportions printed in the
# next cell) — consider passing stratify=target if that matters.
X_train, X_test, y_train, y_test = train_test_split(
    data,
    target,
    test_size=0.25,
    random_state=40,
)

# Sanity-check the sizes of the resulting partitions.
print(f'X_train shape {X_train.shape}')
print(f'y_train shape {y_train.shape}')
print(f'X_test shape {X_test.shape}')
print(f'y_test shape {y_test.shape}')

X_train shape (426, 30)
y_train shape (426,)
X_test shape (143, 30)
y_test shape (143,)


In [None]:
# For the full label vector and each split, print the share of samples that
# falls in each class (per-class counts divided by the array length).
for name, array in (('target', target), ('y_train', y_train), ('y_test', y_test)):
    print(f'{name.ljust(7)}:{np.unique(array, return_counts=True)[1] / len(array)}')

target :[0.37 0.63]
y_train:[0.39 0.61]
y_test :[0.31 0.69]


In [None]:
# NOTE(review): this cell repeats the class-proportion printout of the cell
# before it and produces the same numbers — presumably it was meant to follow
# a different (e.g. stratified) split; confirm or drop it.
label_arrays = {'target': target, 'y_train': y_train, 'y_test': y_test}
for name, array in label_arrays.items():
    print(f'{name.ljust(7)}:{np.unique(array, return_counts=True)[1] / len(array)}')

target :[0.37 0.63]
y_train:[0.39 0.61]
y_test :[0.31 0.69]
