In [106]:
import numpy as np
import pandas as pd
import plotly
import plotly.figure_factory as ff
import plotly.graph_objs as go
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot
init_notebook_mode(connected=True)

In [107]:
data = pd.read_csv('task_b.csv')
data = data.iloc[:,1:]

print(data.shape)
data.head()

(200, 4)


Unnamed: 0,f1,f2,f3,y
0,-195.871045,-14843.084171,5.53214,1.0
1,-1217.183964,-4068.124621,4.416082,1.0
2,9.138451,4413.412028,0.425317,0.0
3,363.824242,15474.760647,1.094119,0.0
4,-768.812047,-7963.932192,1.870536,0.0


In [108]:
data.corr()['y']

f1    0.067172
f2   -0.017944
f3    0.839060
y     1.000000
Name: y, dtype: float64

In [109]:
data[['f1', 'f2', 'f3']].std()

f1      488.195035
f2    10403.417325
f3        2.926662
dtype: float64

In [110]:
X = data[['f1','f2','f3']].values
Y = data['y'].values
print(X.shape)
print(Y.shape)

(200, 3)
(200,)


# What if our features are with different variance 

<pre>
* <b>As part of this task you will observe how linear models work in case of data having feautres with different variance</b>
* <b>from the output of the above cells you can observe that var(F2)>>var(F1)>>Var(F3)</b>

> <b>Task1</b>:
    1. Apply Logistic regression(SGDClassifier with logloss) on 'data' and check the feature importance
    2. Apply SVM(SGDClassifier with hinge) on 'data' and check the feature importance

> <b>Task2</b>:
    1. Apply Logistic regression(SGDClassifier with logloss) on 'data' after standardization 
       i.e standardization(data, column wise): (column-mean(column))/std(column) and check the feature importance
    2. Apply SVM(SGDClassifier with hinge) on 'data' after standardization 
       i.e standardization(data, column wise): (column-mean(column))/std(column) and check the feature importance

</pre>

<h3><font color='blue'> Make sure you write the observations for each task, why a particular feautre got more importance than others</font></h3>

## Task1

In [111]:
# import seaborn as sns
# import matplotlib.pyplot as plt

# data[['f1']].plot(kind='density');
# data[['f2']].plot(kind='density');
# data[['f3']].plot(kind='density');

In [112]:
# pp = sns.pairplot(data, hue='y', height=2.5, aspect=1.6)
# pp.fig.suptitle("Pair Plot", fontsize=18, fontfamily= 'serif', );
# pp.fig.subplots_adjust(top=0.92);

#### Logistic Regression before standardization

In [113]:
from sklearn import linear_model
lr_clf = linear_model.SGDClassifier(eta0=0.0001, alpha=0.0001, loss='log', random_state=15, penalty='l2', tol=1e-3, learning_rate='constant')
lr_clf.fit(X, Y)
coef = lr_clf.coef_[0]
print('Logistic Regression Weight vecotrs before standardization are:\nw1 = {}, w2 = {}, w3 = {}'.format(coef[0], coef[1], coef[2]))

Logistic Regression Weight vecotrs before standardization are:
w1 = 0.37170470951059437, w2 = -1.3446385278859918, w3 = 0.12669033333406268


#### SVM before standardization

In [114]:
svm_clf = linear_model.SGDClassifier(eta0=0.0001, alpha=0.0001, loss='hinge', random_state=15, penalty='l2', tol=1e-3, verbose=0, learning_rate='constant')
svm_clf.fit(X, Y)
coef = lr_clf.coef_[0]
print('SVM Weight vecotrs before standardization are:\nw1 = {}, w2 = {}, w3 = {}'.format(coef[0], coef[1], coef[2]))

SVM Weight vecotrs before standardization are:
w1 = 0.37170470951059437, w2 = -1.3446385278859918, w3 = 0.12669033333406268


## Observation
### feature importance :    f1 > f3 > f2
since correlation of input features (f1, f2, f3) with the output feature (y) is in the order f3 > f1 > f2, this order should also be the feature importance order, but the feature is not standardised, and hence the importance of feature is affected by the sacle of features, hence we are getting feature importance order as f1 > f3 > f2

## Task2

In [115]:
std_X = StandardScaler().fit_transform(X)
np.std(std_X, axis=0)

array([1., 1., 1.])

#### Logistic Regression after standardization

In [116]:
from sklearn import linear_model
lr_clf = linear_model.SGDClassifier(eta0=0.0001, alpha=0.0001, loss='log', random_state=15, penalty='l2', tol=1e-3, learning_rate='constant')
lr_clf.fit(std_X, Y)
coef = lr_clf.coef_[0]
print('Logistic Regression Weight vecotrs after standardization are:\nw1 = {}, w2 = {}, w3 = {}'.format(coef[0], coef[1], coef[2]))

Logistic Regression Weight vecotrs after standardization are:
w1 = 0.038512415063121384, w2 = -0.005531222889550023, w3 = 0.889633220769868


#### SVM after standardization

In [117]:
svm_clf = linear_model.SGDClassifier(eta0=0.0001, alpha=0.0001, loss='hinge', random_state=15, penalty='l2', tol=1e-3, verbose=0, learning_rate='constant')
svm_clf.fit(std_X, Y)
coef = lr_clf.coef_[0]
print('SVM Weight vecotrs after standardization are:\nw1 = {}, w2 = {}, w3 = {}'.format(coef[0], coef[1], coef[2]))

SVM Weight vecotrs after standardization are:
w1 = 0.038512415063121384, w2 = -0.005531222889550023, w3 = 0.889633220769868


## Observation
### feature importance :    f3 > f1 > f2
after standardization, data points are in the same range, hence there will not be effect of scale of data on feature importance, hence feature having higher correlation with output featur will be more important. here feature f3 having highest correlation with the output feature, so its importance is high as compared to feature f1 and f2.