In [109]:
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn import preprocessing as pre
from sklearn.preprocessing import StandardScaler, MinMaxScaler
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

In [110]:
np.random.seed(42)

In [111]:
fish_train_dataset = pd.read_csv("train_fish.csv")
fish_train_dataset.head()

Unnamed: 0,Weight,Length1,Length2,Length3,Height,Width,Species
0,90.0,16.3,17.7,19.8,7.4052,2.673,Parkki
1,290.0,24.0,26.0,29.2,8.8768,4.4968,Roach
2,850.0,36.9,40.0,42.3,11.9286,7.1064,Perch
3,300.0,34.8,37.3,39.8,6.2884,4.0198,Pike
4,265.0,25.4,27.5,28.9,7.0516,4.335,Perch


In [112]:
fish_test_dataset = pd.read_csv("test_fish.csv")
fish_test_dataset.head()

Unnamed: 0,Weight,Length1,Length2,Length3,Height,Width,Species
0,250.0,25.9,28.0,29.4,7.8204,4.2042,Perch
1,273.0,23.0,25.0,28.0,11.088,4.144,Parkki
2,13.4,11.7,12.4,13.5,2.43,1.269,Smelt
3,390.0,27.6,30.0,35.0,12.67,4.69,Bream
4,40.0,13.8,15.0,16.0,3.824,2.432,Perch


In [113]:
classes = ['Bream', 'Roach', 'Whitefish', 'Parkki', 'Perch', 'Pike', 'Smelt']

In [114]:
def encoding_label(dataset):
    species = []
    
    for fish_type in dataset["Species"]:
        species.append(classes.index(fish_type))
    
    return pd.Series(species)

In [115]:
fish_train_dataset["Species"] = encoding_label(fish_train_dataset)
fish_test_dataset["Species"] = encoding_label(fish_test_dataset)

In [116]:
from sklearn.preprocessing import StandardScaler
 
X_train = fish_train_dataset.drop(["Species"], axis=1)
y_train = fish_train_dataset["Species"]
X_test = fish_test_dataset.drop(["Species"], axis=1)
y_test = fish_test_dataset["Species"]

In [117]:
scaler = StandardScaler()
transform = scaler.fit(X_train)
X_train = transform.transform(X_train)
X_test = transform.transform(X_test)

In [118]:
def softmax(Z):
    """
    Compute softmax values for each sets of scores in Z.
    each column of Z is a set of score.    
    """
    e_Z = np.exp(Z - np.max(Z, axis = 0, keepdims = True))
    A = e_Z / e_Z.sum(axis = 0)
    return A

In [119]:
def reshape_data(dataList):
    re = np.zeros((dataList.shape[1], dataList.shape[0]))
    
    for idx, data in enumerate(dataList):
        re[:, idx] = data
    
    return re.astype(np.float32)

In [120]:
# One hot coding 
def convert_to_one_hot(y, C):
    re = np.zeros((C, len(y)))
    
    for idx, label in enumerate(y):
        re[:, idx][label] = 1
    
    return re.astype(np.uint8)

In [121]:
def softmax_regression(X, y, W_init, eta, tol = 1e-4, max_count = 80000):
    W = [W_init]    
    C = W_init.shape[1]
    Y = convert_to_one_hot(y, C)
    it = 0
    N = X.shape[1]
    d = X.shape[0]
    
    count = 0
    check_w_after = 20
    while count < max_count:
        # mix data 
        mix_id = np.random.permutation(N)
        for i in mix_id:
            xi = X[:, i].reshape(d, 1)
            yi = Y[:, i].reshape(C, 1)
            ai = softmax(np.dot(W[-1].T, xi))
            W_new = W[-1] + eta*xi.dot((yi - ai).T)
            count += 1
            # stopping criteria
            if count%check_w_after == 0:                
                if np.linalg.norm(W_new - W[-check_w_after]) < tol:
                    return W
            W.append(W_new)
    return W

In [122]:
def pred(W, X):
    A = softmax(W.T.dot(X))
    return np.argmax(A, axis = 0)

In [123]:
C = 7

In [124]:
X_train = reshape_data(X_train)

In [125]:
eta = 0.5
d = X_train.shape[0]
W_init = np.random.randn(d, C)
W = softmax_regression(X_train, y_train, W_init, eta)
print(W[-1])

[[-9.29964203e+00 -6.93879485e+00 -1.29766350e+01  5.43935503e+00
   8.18783390e+00  3.29068572e+00  1.59372879e+01]
 [-4.76947186e+00  1.61796370e+00 -1.61912107e+01 -4.20567776e-02
   8.64167543e-01  1.13050213e+01  5.45564175e+00]
 [-3.56291422e+00 -3.58278384e+01  3.57877201e+00 -3.08800873e+00
   2.65904645e+01  9.51255709e+00 -1.04350035e+00]
 [ 1.66630792e+01  1.83595000e+01  4.05757149e+00 -1.44141840e+01
  -5.01212363e+01  1.15883086e+01  1.10752091e+01]
 [ 1.97759035e+01  3.15030361e+00  6.93128766e+00  2.04924467e+01
  -1.22704528e+01 -1.41997572e+01 -2.37701556e+01]
 [-5.25029012e+00  1.12543127e+01  1.55191486e+01 -9.91227964e+00
   1.67953769e+01 -1.90319228e+01 -1.25674858e+01]]


In [126]:
from sklearn.metrics import classification_report
from sklearn.linear_model import LogisticRegression

In [127]:
X = reshape_data(X_test)

In [128]:
test_pred = pred(W[-1], X)

In [129]:
#softmax regression
print(y_test)
print(test_pred)
print(classification_report(y_test, test_pred))

0     4
1     3
2     6
3     0
4     4
5     4
6     0
7     4
8     3
9     0
10    0
11    1
12    4
13    1
14    1
15    4
16    4
17    0
18    2
19    4
20    5
21    5
22    3
23    2
24    6
25    0
26    0
27    3
28    1
29    1
30    0
31    4
32    4
33    4
34    1
35    1
36    0
37    0
38    4
39    0
40    4
41    1
42    4
43    4
44    1
45    0
46    4
47    4
Name: Species, dtype: int64
[4 3 6 0 4 4 0 4 3 0 0 1 0 2 2 4 4 0 4 4 5 5 3 6 4 0 0 3 1 1 0 4 4 4 1 1 0
 0 4 0 4 1 4 4 4 0 4 4]
              precision    recall  f1-score   support

           0       0.92      1.00      0.96        12
           1       1.00      0.67      0.80         9
           2       0.00      0.00      0.00         2
           3       1.00      1.00      1.00         4
           4       0.84      0.94      0.89        17
           5       1.00      1.00      1.00         2
           6       0.50      0.50      0.50         2

    accuracy                           0.85        48
 