## scikit-learn: machine learning in Python (https://scikit-learn.org/stable/)

### Installing

Choose one of the following commands:

`conda install scikit-learn`

`pip install -U scikit-learn`


In [None]:
import sklearn

In [None]:
# Show the version string of the scikit-learn package that was just imported,
# to confirm the installation is the one this notebook is running against.
sklearn.__version__

### Initial imports

In [None]:
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
import seaborn as sns

### Loading dataset

In [None]:
# From csv
# NOTE(review): this assumes 'cal_housing.data' is the StatLib California
# housing file, which ships WITHOUT a header row (the column names live in the
# accompanying 'cal_housing.domain' file). With the default header handling,
# pandas would consume the first record as column labels and silently drop it
# from the data. Confirm against the local file before relying on the names.
cal_housing_columns = [
    'longitude',
    'latitude',
    'housingMedianAge',
    'totalRooms',
    'totalBedrooms',
    'population',
    'households',
    'medianIncome',
    'medianHouseValue',
]
csv_data = pd.read_csv('cal_housing.data', header=None, names=cal_housing_columns)
# Preview the first rows as the cell output.
csv_data.head()

In [None]:
# From sklearn: load one of the datasets bundled with the library.
from sklearn.datasets import load_iris

# `load_iris()` returns a dict-like container; its fields can be read either
# as attributes or by key.
iris = load_iris()
dataset_description = iris['DESCR']
print(dataset_description)

In [None]:
# Inspect the Python type of the feature container before wrapping it in a
# DataFrame in the next cell.
type(iris.data)

In [None]:
# Wrap the raw feature values in a DataFrame. No column names are supplied
# here, so pandas assigns default integer labels.
df = pd.DataFrame(data=iris.data)
# Preview the first five rows as the cell output.
df.head(n=5)

In [None]:
# Replace the default integer column labels with the feature names shipped
# with the dataset, then preview the relabelled frame.
df.columns = iris.feature_names
df.head()

In [None]:
# Append the dataset's target values as a new 'SPECIE' column so features and
# label live in a single frame (presumably one class code per row; see the
# dataset description printed above).
df['SPECIE'] = iris.target
df.head()

### Plotting data

In [None]:
# Checking pairwise relationships: draw a grid of plots over every pair of
# columns in `df` (this includes the 'SPECIE' label column added earlier).
sns.pairplot(df)

In [None]:
# Checking target variable distribution.
# 'SPECIE' holds discrete class labels, so a per-class count is the
# appropriate view. The previous `sns.distplot` call is deprecated in current
# seaborn releases, and its KDE overlay is not meaningful for a categorical
# target.
sns.countplot(x='SPECIE', data=df)

In [None]:
# Checking correlation.
# Compute the pairwise correlation matrix once and reuse it for both the
# printed table and the heatmap instead of recomputing it for each call.
corr_matrix = df.corr()
print(corr_matrix)
sns.heatmap(corr_matrix)

In [None]:
# Enlarge the default figure size for this and all later figures, then redraw
# the correlation heatmap with each cell annotated with its value.
plt.rcParams["figure.figsize"] = (12, 8)
sns.heatmap(data=df.corr(), annot=True)

### Splitting dataset

In [None]:
from sklearn.model_selection import train_test_split

# Both tasks use the same 70/30 split with a fixed seed so results are
# reproducible between runs.
split_options = {'test_size': 0.3, 'random_state': 100}

# Task 1 (classification): predict 'SPECIE' from every other column.
y1 = df['SPECIE']
X1 = df.drop(columns='SPECIE')
X1_train, X1_test, y1_train, y1_test = train_test_split(X1, y1, **split_options)

# Task 2 (regression): predict 'petal width (cm)' from every other column.
# Note that the 'SPECIE' label stays in X2 and is used as a predictor.
y2 = df['petal width (cm)']
X2 = df.drop(columns='petal width (cm)')
X2_train, X2_test, y2_train, y2_test = train_test_split(X2, y2, **split_options)

In [None]:
# Display the regression feature frame to verify which columns remain after
# dropping 'petal width (cm)'.
X2

### Supervised learning

#### Classification

In [None]:
from sklearn.svm import SVC
# Support-vector classifier with a degree-2 polynomial kernel, fitted on the
# classification training split. The fitted estimator is the cell output.
clf1 = SVC(
    C=1.0,
    kernel='poly',
    degree=2,
    gamma='auto',
)
clf1.fit(X1_train, y1_train)

In [None]:
# Predict a class label for every row of the held-out classification split
# and display the predictions.
y1_pred = clf1.predict(X1_test)
y1_pred

In [None]:
from sklearn.metrics import accuracy_score
# scikit-learn metrics take the ground truth first and the predictions second:
# accuracy_score(y_true, y_pred). Accuracy itself is symmetric, but keeping the
# documented order avoids wrong results if this is swapped for an asymmetric
# metric (e.g. precision or recall) later.
accuracy_score(y1_test, y1_pred)

#### Regression

In [None]:
from sklearn.linear_model import LinearRegression
# Linear regression with default settings, fitted on the regression training
# split (target: 'petal width (cm)'). The fitted estimator is the cell output.
reg2 = LinearRegression()
reg2.fit(X2_train, y2_train)

In [None]:
# Predict the target for every row of the held-out regression split and
# display the predicted values.
y2_pred = reg2.predict(X2_test)
y2_pred

In [None]:
# Predicted vs. actual values on the held-out regression split; points close
# to the diagonal are good predictions.
# `x`/`y` are passed by keyword: recent seaborn releases no longer accept them
# positionally (the first positional argument is `data`).
plot = sns.scatterplot(x=y2_pred, y=y2_test)
# Same range on both axes so the diagonal is a true 45-degree line, and
# explicit labels because the prediction array carries no name of its own.
plot.set(
    xlim=(0, 2.5),
    ylim=(0, 2.5),
    xlabel='Predicted petal width (cm)',
    ylabel='Actual petal width (cm)',
)

In [None]:
# Distribution of the residuals (prediction minus actual) on the held-out
# regression split; a well-behaved model gives a roughly bell-shaped curve
# centred on zero.
# `sns.distplot` is deprecated; `histplot` with a KDE overlay on a density
# scale is the axes-level replacement that draws the same kind of figure.
residuals = y2_pred - y2_test
sns.histplot(residuals, bins=40, kde=True, stat='density')