# Part 1: Perceptron

## 1) Data set

In [None]:
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from imblearn.over_sampling import SMOTE
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import precision_recall_fscore_support, accuracy_score

In [None]:
# Load the scikit-learn breast cancer dataset. Later cells read its
# `data` (feature matrix) and `target` (labels) attributes.
data = load_breast_cancer()

In [None]:
# Count the samples carrying each label to check how balanced the classes are.
n_positive = (data.target == 1).sum()
n_negative = (data.target == 0).sum()

print("positive class:", n_positive)
print("negative class:", n_negative)

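**Observation:** the two classes are imbalanced (see the counts printed above), so the cells below test whether oversampling the minority class with SMOTE helps.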

In [None]:
# First split, used only for the oversampling comparison below: hold out 25%
# of the samples as a test set. The fixed seed keeps the split repeatable.
features, labels = data.data, data.target
X_train, X_test, y_train, y_test = train_test_split(
    features,
    labels,
    test_size=0.25,
    random_state=22,
)

In [None]:
# Oversample only the minority class of the training split; the test split is
# left untouched. The resampled arrays get their own names so the original
# training split stays available for the side-by-side comparison.
#
# `fit_resample` is used instead of `fit_sample`: the latter was deprecated
# and later removed from imbalanced-learn, so it fails on current releases.
sm = SMOTE(random_state=22, sampling_strategy='minority')
X_tr_sm, y_tr_sm = sm.fit_resample(X_train, y_train)

In [None]:
# Baseline: logistic regression trained on the original training split.
lr = LogisticRegression(solver='liblinear')
lr.fit(X_train, y_train)

# Same configuration, trained on the SMOTE-oversampled training split.
lr_sm = LogisticRegression(solver='liblinear')
lr_sm.fit(X_tr_sm, y_tr_sm)

# Both models are scored on the same, untouched test split.
y_pred = lr.predict(X_test)
score = precision_recall_fscore_support(y_test, y_pred)

y_pr_sm = lr_sm.predict(X_test)
score_sm = precision_recall_fscore_support(y_test, y_pr_sm)

# Each score is the raw tuple returned by precision_recall_fscore_support.
print("performance of Logistic Regression without oversampling", score)
print("performance of Logistic Regression with oversampling", score_sm)

In [None]:
# Random forest trained with and without SMOTE oversampling.
#
# Both forests are seeded. Without a fixed `random_state` each run grows
# different trees, so the small score difference between the two variants
# could not be reproduced or told apart from run-to-run noise.
rf = RandomForestClassifier(
    n_estimators=100, n_jobs=None, random_state=22).fit(X_train, y_train)
rf_sm = RandomForestClassifier(
    n_estimators=100, n_jobs=None, random_state=22).fit(X_tr_sm, y_tr_sm)

# Both models are scored on the same, untouched test split.
y_pred = rf.predict(X_test)
y_pr_sm = rf_sm.predict(X_test)

score = precision_recall_fscore_support(y_test, y_pred)
score_sm = precision_recall_fscore_support(y_test, y_pr_sm)

print("performance of Random Forest without oversampling", score)
print("performance of Random Forest with oversampling", score_sm)

**Conclusion:** oversampling the training data with SMOTE results in only a minor improvement.

In [None]:
# Rescale every feature with min-max scaling (MinMaxScaler's default output
# range is [0, 1]) and store the result back on the dataset object, so every
# later cell that reads `data.data` sees the scaled features.
# NOTE(review): the scaler is fit on the full dataset before the train/test
# split, so test-set minima/maxima influence the training features. Fitting on
# the training split only would avoid this leakage.
scaler = MinMaxScaler()
data.data = scaler.fit_transform(data.data)

In [None]:
# Split again now that `data.data` has been rescaled. The split parameters are
# the same as for the first split (25% test set, seed 22).
split_options = dict(test_size=0.25, random_state=22)
X_train, X_test, y_train, y_test = train_test_split(
    data.data, data.target, **split_options)

In [None]:
# Oversample the minority class of the training split. The result replaces
# `X_train` / `y_train`, which the perceptron cells below train on; the test
# split is left untouched.
#
# `fit_resample` is used instead of `fit_sample`: the latter was deprecated
# and later removed from imbalanced-learn, so it fails on current releases.
sm = SMOTE(random_state=22, sampling_strategy='minority')
X_train, y_train = sm.fit_resample(X_train, y_train)

## 2) Building Model

In [None]:
# you can find implementation of perceptron in perceptron.py
import perceptron as pn

## 3) Run and Evaluate

In [None]:
# Train a perceptron with its default hyper-parameters on the oversampled
# training split. `fit` returns the object that is kept as `perc`.
perc = pn.Perceptron().fit(X_train, y_train)

# Show the last entry of the model's `errors` record.
# NOTE(review): presumably the error count of the final epoch; confirm in
# perceptron.py.
last_error = perc.errors[-1]
print(last_error)

In [None]:
# Score the trained perceptron on the held-out test split.
y_pred = perc.predict(X_test)
score = precision_recall_fscore_support(y_test, y_pred)

# The first three entries of `score` are precision, recall and f-score;
# zip stops after the three labels, so the remaining entry is not printed.
metric_labels = ("precision: ", "recall: ", "fscore: ")
for metric_label, metric_values in zip(metric_labels, score):
    print(metric_label, metric_values)

acc = accuracy_score(y_test, y_pred)
print("accuracy: ", acc)

In [None]:
import matplotlib.pyplot as plt
%matplotlib inline

## 4) Further optimization

### Using different train-test split factor

In [None]:
# Measure how the test-set fraction affects perceptron accuracy.
# Each iteration re-splits the (already scaled) data with the same seed,
# oversamples only the training part, trains a fresh perceptron and scores it
# on that iteration's own test part. The trailing-underscore names keep the
# main X_train / X_test split from being overwritten.
factors = [0.20, 0.25, 0.30, 0.35, 0.40]
accs = []
for f in factors:
    X_train_, X_test_, y_train_, y_test_ = train_test_split(
        data.data, data.target, test_size=f, random_state=22)
    # `sm` is the SMOTE sampler created in an earlier cell. `fit_resample`
    # replaces `fit_sample`, which was removed from imbalanced-learn.
    X_train_, y_train_ = sm.fit_resample(X_train_, y_train_)
    perc = pn.Perceptron()
    perc = perc.fit(X_train_, y_train_)
    y_pred_ = perc.predict(X_test_)
    acc = accuracy_score(y_test_, y_pred_)
    accs.append(acc)

plt.plot(factors, accs)
plt.xlabel("train-test split factor")
plt.ylabel("accuracy")
plt.show()



**Observation:** across the tested split factors, accuracy changes by 5 percent at most.

### Using larger epochs size

In [None]:
# Measure how the number of training epochs affects perceptron accuracy on the
# fixed train/test split.
epoch_sizes = [100, 200, 300, 400, 500]
accs = []

for e in epoch_sizes:
    perc = pn.Perceptron(epochs=e)
    perc = perc.fit(X_train, y_train)
    y_pred_ = perc.predict(X_test)
    # Score this iteration's predictions (`y_pred_`). Scoring the global
    # `y_pred` from the earlier evaluation cell would yield the same accuracy
    # for every epoch setting and a meaningless flat plot.
    acc = accuracy_score(y_test, y_pred_)
    accs.append(acc)

plt.plot(epoch_sizes, accs)
plt.xlabel("epochs")
plt.ylabel("accuracy")
plt.show()


### Using different learning rates

In [None]:
# Measure how the learning rate affects perceptron accuracy on the fixed
# train/test split.
learning_rates = [0.001, 0.01, 0.1, 0.5]
accs = []

# The loop variable is called `rate` rather than `lr` so it does not overwrite
# the LogisticRegression model stored in `lr` by an earlier cell.
for rate in learning_rates:
    perc = pn.Perceptron(lr=rate)
    perc = perc.fit(X_train, y_train)
    y_pred_ = perc.predict(X_test)
    # Score this iteration's predictions (`y_pred_`). Scoring the global
    # `y_pred` from the earlier evaluation cell would yield the same accuracy
    # for every learning rate and a meaningless flat plot.
    acc = accuracy_score(y_test, y_pred_)
    accs.append(acc)

plt.plot(learning_rates, accs)
plt.xlabel("learning rates")
plt.ylabel("accuracy")
plt.show()
