# Logistic Regression with Multiple Classes (multiclass, fitted here one-vs-rest)
-----------------

In [1]:
import os
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score,confusion_matrix

### Step 1: Load the CSV file

In [3]:
# Folder that holds iris.csv. The author's local folder stays the default so the
# original setup keeps working; set the IRIS_DATA_DIR environment variable to
# point somewhere else without editing the notebook.
DATA_DIR = os.environ.get("IRIS_DATA_DIR", "E:/code/6.MachineLearning/3_Logistic_Regression")

# Change directory only when the folder exists. On any other machine the
# notebook keeps its current working directory, so an iris.csv placed next to
# the notebook is still found by the relative read in the next cell.
if os.path.isdir(DATA_DIR):
    os.chdir(DATA_DIR)

In [4]:
# Read the iris measurements from iris.csv (resolved against the current
# working directory) and preview the first five rows.
df = pd.read_csv(filepath_or_buffer="iris.csv")
df.head(n=5)

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,species
0,5.1,3.5,1.4,0.2,setosa
1,4.9,3.0,1.4,0.2,setosa
2,4.7,3.2,1.3,0.2,setosa
3,4.6,3.1,1.5,0.2,setosa
4,5.0,3.6,1.4,0.2,setosa


In [5]:
# Print the column names, non-null counts and dtypes of the loaded frame.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 150 entries, 0 to 149
Data columns (total 5 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   sepal_length  150 non-null    float64
 1   sepal_width   150 non-null    float64
 2   petal_length  150 non-null    float64
 3   petal_width   150 non-null    float64
 4   species       150 non-null    object 
dtypes: float64(4), object(1)
memory usage: 6.0+ KB


### Step 2: Preprocessing

### Step 3: Identify X and y

In [6]:
# Features: every column except the last, as a NumPy array.
# (Equivalent label-based selection: df.loc[:, df.columns != "species"].)
X = df.iloc[:, :-1].values
# Target: the last column (species). Take a copy so that later in-place edits
# of y neither write through to df nor trigger a SettingWithCopyWarning.
y = df.iloc[:, -1].copy()

In [7]:
#import matplotlib.pyplot as plt

#for col in df.columns:
    #if col!=["species"]:
    #plt.scatter(X[col],y)
    #plt.show()

In [8]:
# List the distinct target labels before they are encoded as integers.
y.unique()

array(['setosa', 'versicolor', 'virginica'], dtype=object)

In [9]:
# Encode the species names as integer class ids.
# The encoding is rebuilt from df (not from the current y) and assigned rather
# than applied in place, so the cell gives the same result every time it runs
# and never mutates the DataFrame column.
species_codes = {'setosa': 0, 'versicolor': 1, 'virginica': 2}
y = df.iloc[:, -1].map(species_codes)

# map() yields NaN for any label missing from the mapping; fail loudly instead
# of passing an unencoded label on to the model.
if y.isna().any():
    raise ValueError("unexpected species label(s) in the target column")
y = y.astype('int64')

In [10]:
# Check which distinct values the target holds after encoding.
y.unique()

array([0, 1, 2], dtype=int64)

In [11]:
# Convert the target to a plain NumPy array.
# np.asarray returns an existing ndarray unchanged, so this cell can be re-run
# (y.values would raise AttributeError on the second run).
y = np.asarray(y)

### Step 4: Split Data for training and testing

In [12]:
# Hold out 30% of the rows for testing: 105 training rows and 45 test rows.
seed = 42  # fixed so the split is reproducible
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=seed,
    stratify=y,  # keep the class proportions identical in both subsets
)
# With 50 rows per class in the input, stratifying gives 35 rows per class in
# the training set and 15 rows per class in the test set.

In [13]:
# Count of each class in the original data
unique, counts = np.unique(y, return_counts=True)
{label: count for label, count in zip(unique, counts)}

{0: 50, 1: 50, 2: 50}

In [14]:
# Count of each class in the training sample
unique, counts = np.unique(y_train, return_counts=True)
{label: count for label, count in zip(unique, counts)}

{0: 35, 1: 35, 2: 35}

In [15]:
# Count of each class in the test sample
unique, counts = np.unique(y_test, return_counts=True)
{label: count for label, count in zip(unique, counts)}

{0: 15, 1: 15, 2: 15}

### Step 5: Fit The Model

In [16]:
# Logistic-regression classifier for the three iris classes.
#   multi_class='ovr' -> one-vs-rest: a separate binary problem per class
#   C=1e5             -> very large C, i.e. very weak regularization
# NOTE(review): newer scikit-learn releases deprecate the `multi_class`
# argument; confirm against the installed version before upgrading.
log_model = LogisticRegression(
    solver='lbfgs',
    multi_class='ovr',
    C=1e5,
)

In [17]:
# Fit the classifier on the training split only; the test split stays unseen.
log_model.fit(X_train, y_train)

LogisticRegression(C=100000.0, multi_class='ovr')

### Step 5 (continued): Predict labels of the test data

In [18]:
# Predict a class id for every row of the held-out test set and display them.
y_test_pred = log_model.predict(X=X_test)
y_test_pred

array([2, 1, 1, 1, 2, 2, 1, 1, 0, 2, 0, 0, 2, 2, 0, 2, 1, 0, 0, 0, 1, 0,
       1, 2, 1, 1, 1, 1, 1, 0, 1, 2, 1, 0, 2, 0, 0, 0, 0, 1, 1, 0, 1, 2,
       1], dtype=int64)

In [19]:
# True class ids of the test rows, for comparison with the predictions above.
y_test

array([2, 1, 2, 1, 2, 2, 1, 1, 0, 2, 0, 0, 2, 2, 0, 2, 1, 0, 0, 0, 1, 0,
       1, 2, 2, 1, 1, 1, 1, 0, 2, 2, 1, 0, 2, 0, 0, 0, 0, 1, 1, 0, 2, 2,
       1], dtype=int64)

In [20]:
# Number of test rows predicted as each class: position i of the result holds
# the count for class id i.
np.bincount(y_test_pred)

array([15, 19, 11], dtype=int64)

### Step 6: Accuracy

In [21]:
from sklearn.metrics import accuracy_score, confusion_matrix,classification_report, roc_curve, roc_auc_score

# Fraction of test rows whose predicted class equals the true class.
accuracy_score(y_true=y_test, y_pred=y_test_pred)

0.9111111111111111

> **or**

In [22]:
# Same accuracy, obtained through the estimator's own score() method.
test_accuracy = log_model.score(X_test, y_test)
print(test_accuracy)

0.9111111111111111


### Confusion Matrix

In [23]:
# Confusion matrix: rows are the true classes, columns the predicted classes.
confusion_matrix(y_true=y_test, y_pred=y_test_pred)

array([[15,  0,  0],
       [ 0, 15,  0],
       [ 0,  4, 11]], dtype=int64)

In [24]:
# Per-class precision, recall and F1 score on the test set.
report = classification_report(y_true=y_test, y_pred=y_test_pred)
print(report)

              precision    recall  f1-score   support

           0       1.00      1.00      1.00        15
           1       0.79      1.00      0.88        15
           2       1.00      0.73      0.85        15

    accuracy                           0.91        45
   macro avg       0.93      0.91      0.91        45
weighted avg       0.93      0.91      0.91        45



### ROC curve