In [50]:
import pandas as pd
import numpy as np

In [51]:
from sklearn import linear_model
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC

In [52]:
from sklearn.preprocessing import StandardScaler, MinMaxScaler

In [53]:
def _print(statement, arguments, do_print = True):
    
    if do_print:
        print(statement.format(*arguments))
    
    return

In [54]:
def print_lb(character, num = 60):
    """Print a separator line made of `character` repeated `num` times.

    Parameters
    ----------
    character : str
        String to repeat.
    num : int, optional
        Number of repetitions (60 by default).

    Returns
    -------
    None
    """
    separator = character * num
    print(separator)
    return None

In [55]:
# Read the task dataset from the working directory and preview its first three rows.
data = pd.read_csv('task_b.csv')
_print(
    "Top {} rows of DataFrame : \n\n{}",
    [3, data.head(3)],
)

Top 3 rows of DataFrame : 

   index           f1            f2        f3    y
0      0  -195.871045 -14843.084171  5.532140  1.0
1      1 -1217.183964  -4068.124621  4.416082  1.0
2      2     9.138451   4413.412028  0.425317  0.0


In [56]:
# Pairwise correlation between every column of `data`; this includes the
# `index` column and the target `y`, not only the features f1, f2, f3.
data_corr = data.corr()
print(data_corr)

          index        f1        f2        f3         y
index  1.000000  0.178685  0.149585 -0.017404 -0.014203
f1     0.178685  1.000000  0.065468  0.123589  0.067172
f2     0.149585  0.065468  1.000000 -0.055561 -0.017944
f3    -0.017404  0.123589 -0.055561  1.000000  0.839060
y     -0.014203  0.067172 -0.017944  0.839060  1.000000


In [57]:
# Per-column standard deviation of `data`, used to compare how widely each
# feature is spread before any scaling is applied.
data_std = data.std()
print(data_std)

index       57.879185
f1         488.195035
f2       10403.417325
f3           2.926662
y            0.501255
dtype: float64


# What if our features have different variances?

<pre>
* <b>In this task you will observe how linear models behave when the features have very different variances.</b>
* <b>From the output of the cells above you can observe that var(f2) >> var(f1) >> var(f3).</b>

> <b>Task1</b>:
    1. Apply logistic regression (SGDClassifier with log loss) on 'data' and check the feature importance
    2. Apply SVM (SGDClassifier with hinge loss) on 'data' and check the feature importance

> <b>Task2</b>:
    1. Apply logistic regression (SGDClassifier with log loss) on 'data' after standardization,
       i.e. column-wise standardization: (column - mean(column)) / std(column), and check the feature importance
    2. Apply SVM (SGDClassifier with hinge loss) on 'data' after standardization,
       i.e. column-wise standardization: (column - mean(column)) / std(column), and check the feature importance

</pre>

In [58]:
# lr_clsfr: logistic-regression classifier (SGDClassifier trained on the log loss).
# NOTE(review): the fitted estimator reports learning_rate='optimal'; scikit-learn
# documents eta0 as unused by that schedule, so eta0 presumably has no effect
# here — confirm against the installed version.
lr_clsfr = linear_model.SGDClassifier(
    loss='log',
    penalty='l2',
    alpha=0.0001,
    eta0=0.0001,
    random_state=15,
    verbose=2,
    n_jobs=-1,
)

In [59]:
# Feature matrix: the three feature columns as a plain NumPy array.
d_train = data[['f1', 'f2', 'f3']].values
_print("1. Data Type of d_train: {}", [type(d_train)])
_print("2. Shape of d_train: {}\n\n", [d_train.shape])

# Target: the double-bracket selection keeps `y` two-dimensional at this point;
# it is flattened to a vector in a later cell before fitting.
y = data[['y']].values
_print("1. Data Type of y: {}", [type(y)])
# Fix: the template previously had no `{}` placeholder, so the shape argument
# was silently dropped and the line printed with no value.
_print("2. Shape of y: {}", [y.shape])

1. Data Type of d_train: <class 'numpy.ndarray'>
2. Shape of d_train: (200, 3)


1. Data Type of y: <class 'numpy.ndarray'>
2. Shape of y: 


In [60]:
# Flatten y from a 2-D array to a 1-D vector before fitting.
# Re-running this cell is harmless: reshaping an already flat array is a no-op.
y = y.reshape(-1)

In [61]:
# Task 1.1: fit the logistic-regression SGD model on the raw (unstandardized)
# f1, f2, f3 columns; verbose=2 on the estimator prints a per-epoch log.
lr_clsfr.fit(d_train, y)

-- Epoch 1
Norm: 89870.22, NNZs: 3, Bias: -155.051327, T: 200, Avg. loss: 227510618.879137
Total training time: 0.00 seconds.
-- Epoch 2
Norm: 43791.04, NNZs: 3, Bias: -144.898405, T: 400, Avg. loss: 200670830.244106
Total training time: 0.00 seconds.
-- Epoch 3
Norm: 21826.51, NNZs: 3, Bias: -167.580253, T: 600, Avg. loss: 220221238.438343
Total training time: 0.00 seconds.
-- Epoch 4
Norm: 44052.83, NNZs: 3, Bias: -220.577819, T: 800, Avg. loss: 179903697.744537
Total training time: 0.00 seconds.
-- Epoch 5
Norm: 43829.42, NNZs: 3, Bias: -246.905532, T: 1000, Avg. loss: 159178154.202827
Total training time: 0.00 seconds.
-- Epoch 6
Norm: 45138.87, NNZs: 3, Bias: -208.546352, T: 1200, Avg. loss: 142129650.458868
Total training time: 0.00 seconds.
-- Epoch 7
Norm: 27699.02, NNZs: 3, Bias: -168.679560, T: 1400, Avg. loss: 134776289.480256
Total training time: 0.00 seconds.
-- Epoch 8
Norm: 30003.76, NNZs: 3, Bias: -147.095138, T: 1600, Avg. loss: 122695625.336222
Total training time: 0.

SGDClassifier(alpha=0.0001, average=False, class_weight=None,
              early_stopping=False, epsilon=0.1, eta0=0.0001,
              fit_intercept=True, l1_ratio=0.15, learning_rate='optimal',
              loss='log', max_iter=1000, n_iter_no_change=5, n_jobs=-1,
              penalty='l2', power_t=0.5, random_state=15, shuffle=True,
              tol=0.001, validation_fraction=0.1, verbose=2, warm_start=False)

In [62]:
# Learned weight for each input column, in d_train's column order (f1, f2, f3);
# inspected here as the feature-importance signal for the unscaled data.
lr_clsfr.coef_

array([[  3925.14601273, -16033.05764291,  10502.94022174]])

In [64]:
# Learned intercept (bias) term of the fitted logistic-regression model.
lr_clsfr.intercept_

array([-239.12473731])

In [65]:
# svm_clsfr: linear support-vector-machine classifier (SGDClassifier trained on the hinge loss).
# NOTE(review): the fitted estimator reports learning_rate='optimal'; scikit-learn
# documents eta0 as unused by that schedule, so eta0 presumably has no effect
# here — confirm against the installed version.
svm_clsfr = linear_model.SGDClassifier(
    loss='hinge',
    penalty='l2',
    alpha=0.0001,
    eta0=0.0001,
    random_state=15,
    verbose=2,
    n_jobs=-1,
)

In [66]:
# Task 1.2: fit the hinge-loss SGD model on the same raw (unstandardized)
# f1, f2, f3 columns; verbose=2 on the estimator prints a per-epoch log.
svm_clsfr.fit(d_train, y)

-- Epoch 1
Norm: 50970.78, NNZs: 3, Bias: -139.425296, T: 200, Avg. loss: 237646740.055412
Total training time: 0.00 seconds.
-- Epoch 2
Norm: 48251.18, NNZs: 3, Bias: -107.540533, T: 400, Avg. loss: 198997629.606033
Total training time: 0.00 seconds.
-- Epoch 3
Norm: 47283.82, NNZs: 3, Bias: -98.147510, T: 600, Avg. loss: 221678301.804550
Total training time: 0.00 seconds.
-- Epoch 4
Norm: 42763.27, NNZs: 3, Bias: -203.499872, T: 800, Avg. loss: 184224165.832094
Total training time: 0.00 seconds.
-- Epoch 5
Norm: 46523.86, NNZs: 3, Bias: -208.473184, T: 1000, Avg. loss: 163252412.932515
Total training time: 0.00 seconds.
-- Epoch 6
Norm: 19449.28, NNZs: 3, Bias: -203.212119, T: 1200, Avg. loss: 144245644.752882
Total training time: 0.00 seconds.
-- Epoch 7
Norm: 28910.00, NNZs: 3, Bias: -180.696759, T: 1400, Avg. loss: 130092109.903065
Total training time: 0.01 seconds.
-- Epoch 8
Norm: 34971.40, NNZs: 3, Bias: -163.150693, T: 1600, Avg. loss: 121277810.479388
Total training time: 0.0

SGDClassifier(alpha=0.0001, average=False, class_weight=None,
              early_stopping=False, epsilon=0.1, eta0=0.0001,
              fit_intercept=True, l1_ratio=0.15, learning_rate='optimal',
              loss='hinge', max_iter=1000, n_iter_no_change=5, n_jobs=-1,
              penalty='l2', power_t=0.5, random_state=15, shuffle=True,
              tol=0.001, validation_fraction=0.1, verbose=2, warm_start=False)

In [67]:
# Learned weight for each input column, in d_train's column order (f1, f2, f3);
# inspected here as the feature-importance signal for the unscaled data.
svm_clsfr.coef_

array([[-1441.65036452, -3083.88512888, 10638.5348658 ]])

In [68]:
# Learned intercept (bias) term of the fitted hinge-loss model.
svm_clsfr.intercept_

array([-213.74703893])