# Подготовка данных

In [1]:
from sklearn import datasets

In [2]:
iris_data = datasets.load_iris()

In [3]:
for key in iris_data.keys():
    print( key )

data
target
target_names
DESCR
feature_names
filename


In [4]:
iris_data.target_names

array(['setosa', 'versicolor', 'virginica'], dtype='<U10')

In [5]:
iris_data.feature_names

['sepal length (cm)',
 'sepal width (cm)',
 'petal length (cm)',
 'petal width (cm)']

In [6]:
import numpy as np
import pandas as pd

In [7]:
df_iris = pd.DataFrame(iris_data.data,columns=iris_data.feature_names)
df_iris['target'] = pd.Series(iris_data.target)
df_iris.head()

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm),target
0,5.1,3.5,1.4,0.2,0
1,4.9,3.0,1.4,0.2,0
2,4.7,3.2,1.3,0.2,0
3,4.6,3.1,1.5,0.2,0
4,5.0,3.6,1.4,0.2,0


In [8]:
data_prep=df_iris.loc[(df_iris['target'] == 1) | (df_iris['target'] == 2)]

In [9]:
# т.к. исходный результат - это вероятность принадлежности к классу, то должны быть значения 0 и 1 
# (принадлежит к 1 классу и не принадлежит ко второму)
data_prep.target = data_prep.target.replace({2: 0})

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self[name] = value


In [10]:
data_prep

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm),target
50,7.0,3.2,4.7,1.4,1
51,6.4,3.2,4.5,1.5,1
52,6.9,3.1,4.9,1.5,1
53,5.5,2.3,4.0,1.3,1
54,6.5,2.8,4.6,1.5,1
55,5.7,2.8,4.5,1.3,1
56,6.3,3.3,4.7,1.6,1
57,4.9,2.4,3.3,1.0,1
58,6.6,2.9,4.6,1.3,1
59,5.2,2.7,3.9,1.4,1


In [11]:
Xtrain_prep=data_prep.drop("target", axis=1)
Ytrain_prep=data_prep["target"]

In [12]:
# Поделим на train и тест (чтобы потом проверить ответы)
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(Xtrain_prep, Ytrain_prep, test_size=0.2)

In [13]:
# т.к. метод - метрический, произведем масштабирование данных

from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
scaler.fit(X_train)
X_train_lin=scaler.transform(X_train)
X_test_lin=scaler.transform(X_test)

# Проверю, какие результаты выдаст метод из пакета

In [14]:
from sklearn.linear_model import LogisticRegression

In [15]:
model = LogisticRegression()

In [16]:
model.fit(X_train_lin, y_train)



LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='warn',
          n_jobs=None, penalty='l2', random_state=None, solver='warn',
          tol=0.0001, verbose=0, warm_start=False)

In [17]:
model.coef_

array([[ 0.15705382,  0.54034212, -1.89083537, -2.31490459]])

In [18]:
model.intercept_

array([-0.11290947])

In [19]:
from sklearn.model_selection import cross_val_score
scores_train = cross_val_score(model, X_train, y_train, cv=10)
scores_train



array([1.   , 1.   , 1.   , 1.   , 0.875, 0.875, 1.   , 1.   , 0.875,
       1.   ])

In [20]:
model.score(X_test_lin, y_test)

1.0

# А теперь программирую модель регрессии

In [21]:
#сигмоид
def sigmoid(X, theta):
    z = np.dot(X, theta[1:]) + theta[0]
    sigm = 1/( 1 + np.exp(-z))
    return sigm

In [22]:
# Градиентный спуск
def gradientDescent(X, y, theta, alpha, num_iter):
    for i in range(num_iter):
        h = sigmoid(X, theta)
        difference = h - y
        gradient = X.T.dot(difference)
        theta[0] = theta[0] - alpha*difference.sum()
        theta[1:] = theta[1:] - alpha*gradient
    return theta   

In [23]:
m, n = X_train_lin.shape

theta = np.zeros(1+n)

alpha = 0.0001
num_iter = 10000

final_theta = gradientDescent(X_train_lin, y_train, theta, alpha, num_iter)

In [24]:
final_theta

array([-0.18018753,  0.28585365,  0.83326302, -2.52887779, -3.20033577])

In [25]:
#предсказания

def predict_probs(X, theta):
    return sigmoid(X, theta)

def predict(X, theta, threshold=0.5):
    y_pred=[]
    y=predict_probs(X, theta) >= threshold
    for i in range(X.shape[0]):
        if y[i]==True:
            y_pred.append(1)
        else:
            y_pred.append(0)
        
    return y_pred

In [26]:
y_pred=predict(X_test_lin, theta)

In [27]:
#Чтобы посмотреть, не ерунду ли я сделала, решила определить долю верных ответов
def pred_accuracy(y_test, y_pred):
    ind=0
    right_answers=0
    for i, y in enumerate(y_test):
        if y_pred[ind]==y:
            right_answers+=1
        ind+=1
    return right_answers/y_test.shape[0]

In [28]:
#на тестовой выборке
pred_accuracy(y_test, y_pred)

1.0

In [29]:
#какой процент правильных ответов был при обучении
y_pred_train=predict(X_train_lin, theta)
pred_accuracy(y_train, y_pred_train)

0.95