In [2]:
import matplotlib.pyplot as plt
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot
import plotly.graph_objs as go
from plotly import tools
import numpy as np
import pandas as pd
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score

%load_ext autoreload
%autoreload 2

In [3]:
# Initialize notebook mode to display plots using plotly library
init_notebook_mode(connected=True)

## Read data from csv file

In [4]:
columns = ["Sepal length", "Sepal width", "Petal length", "Petal width", "Class"]
dataframe = pd.read_csv("./iris_data.csv", names=columns)

dataframe.head()

Unnamed: 0,Sepal length,Sepal width,Petal length,Petal width,Class
0,5.1,3.5,1.4,0.2,Iris-setosa
1,4.9,3.0,1.4,0.2,Iris-setosa
2,4.7,3.2,1.3,0.2,Iris-setosa
3,4.6,3.1,1.5,0.2,Iris-setosa
4,5.0,3.6,1.4,0.2,Iris-setosa


## Slice dataframe into 3 pieces

In [5]:
iris_setosa_df = dataframe[:50]
iris_versicoulour_df = dataframe[50:100]
iris_virginica_df = dataframe[100:]

## Data Visualisation

In [6]:
traces = []
colors = ["blue", "green", "yellow"]
for counter, col in enumerate(columns[:-1]):
    for i, color in zip(range(0, 150, 50), colors):
        traces.append(
            go.Scatter(
                x=np.linspace(1, 50),
                y=dataframe[i:i+50][col],
                legendgroup=dataframe.iloc[i]["Class"],
                name=dataframe.iloc[i]["Class"],
                mode="lines+markers",
                line=dict(
                    color=color
                ),
                showlegend=False
            )
        )

fig = tools.make_subplots(rows=2, cols=2, subplot_titles=columns[:-1])
coords = (
    (1, 1),
    (1, 2),
    (2, 1),
    (2, 2)
)
for cord, i in zip(coords, range(4)):
    for c in range(3):
        fig.append_trace(traces[i + c], cord[0], cord[1])
# dir(fig.layout)
fig['layout'].update(
    title='Data Comparision beetween Iris classes',
)
iplot(fig, filename='make-subplots-multiple-with-titles')


This is the format of your plot grid:
[ (1,1) x1,y1 ]  [ (1,2) x2,y2 ]
[ (2,1) x3,y3 ]  [ (2,2) x4,y4 ]



## Creating two datasets for training and testing

In [7]:
train_set = pd.concat([dataframe[i:i+50].sample(40) for i in range(0, 150, 50)])
test_set = pd.concat([dataframe, train_set]).drop_duplicates(keep=False)

## Creating labels for training and test datasets

In [9]:
train_labels, train_set = train_set[["Class"]], train_set[dataframe.columns[:4]]
test_labels, test_set = test_set[["Class"]], test_set[dataframe.columns[:4]]

## Conversion categorical data to numbers

In [10]:
train_labels["Class"] = pd.Categorical(train_labels["Class"])
train_labels["Class"] = train_labels["Class"].cat.codes

test_labels["Class"] = pd.Categorical(test_labels["Class"])
test_labels["Class"] = test_labels["Class"].cat.codes

## Training model using Linear Regression

In [11]:
model = LinearRegression()
model.fit(train_set, train_labels)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None,
         normalize=False)

## Results

In [12]:
predict = model.predict(test_set)
print('Coefficients: ', model.coef_)
# The mean squared error
print("Mean squared error: {:.2f}".format(mean_squared_error(test_labels, predict)))
# Explained variance score: 1 is perfect prediction
print('Variance score: {:.2f}'.format(r2_score(test_labels, predict)))

Coefficients:  [[-0.10639424 -0.0371786   0.24744708  0.56630192]]
Mean squared error: 0.04
Variance score: 0.93
