# Part 1: Perceptron

## 1) Data set

In [1]:
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from imblearn.over_sampling import SMOTE
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import precision_recall_fscore_support

In [2]:
# Load the breast cancer dataset that ships with scikit-learn.
# The cells below read the feature matrix from `data.data` and the 0/1 labels from `data.target`.
data = load_breast_cancer()

In [3]:
# Count the samples carrying each label to see how balanced the data set is.
n_positive = (data.target == 1).sum()
n_negative = (data.target == 0).sum()

print("positive class:", n_positive)
print("negative class:", n_negative)

positive class: 357
negative class: 212


**Observation:** the classes are imbalanced (357 positive vs. 212 negative samples).

In [4]:
# Hold out 25% of the samples for evaluation; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    data.data,
    data.target,
    random_state=22,
    test_size=0.25,
)

In [5]:
# Oversample the minority class of the *training* set only; the test set is
# left untouched so that evaluation reflects the original class distribution.
sm = SMOTE(random_state=22, sampling_strategy='minority')
# `fit_resample` is the supported API; the old `fit_sample` alias has been
# removed from recent imbalanced-learn releases and raises AttributeError there.
X_tr_sm, y_tr_sm = sm.fit_resample(X_train, y_train)

In [6]:
# Baseline: logistic regression trained on the original (imbalanced) training set.
lr = LogisticRegression(solver='liblinear')
lr.fit(X_train, y_train)
y_pred = lr.predict(X_test)
score = precision_recall_fscore_support(y_test, y_pred)

# Same model, trained on the SMOTE-resampled training set instead.
lr_sm = LogisticRegression(solver='liblinear')
lr_sm.fit(X_tr_sm, y_tr_sm)
y_pr_sm = lr_sm.predict(X_test)
score_sm = precision_recall_fscore_support(y_test, y_pr_sm)

# Each score is a (precision, recall, f-score, support) tuple of per-class arrays.
print("performance of Logistic Regression without oversampling", score)
print("performance of Logistic Regression with oversampling", score_sm)

performance of Logistic Regression without oversampling (array([0.97959184, 0.92553191]), array([0.87272727, 0.98863636]), array([0.92307692, 0.95604396]), array([55, 88]))
performance of Logistic Regression with oversampling (array([0.98      , 0.93548387]), array([0.89090909, 0.98863636]), array([0.93333333, 0.96132597]), array([55, 88]))


In [7]:
# Random forests are stochastic (bootstrap samples and feature sub-sampling).
# Both models get the same fixed seed so that (a) the cell is reproducible on
# Restart & Run All and (b) the with/without-oversampling comparison is not
# confounded by run-to-run randomness of the forest itself.
rf = RandomForestClassifier(
    n_estimators=100, n_jobs=None, random_state=22).fit(X_train, y_train)
rf_sm = RandomForestClassifier(
    n_estimators=100, n_jobs=None, random_state=22).fit(X_tr_sm, y_tr_sm)

# Both models are evaluated on the same untouched test set.
y_pred = rf.predict(X_test)
y_pr_sm = rf_sm.predict(X_test)

# Each score is a (precision, recall, f-score, support) tuple of per-class arrays.
score = precision_recall_fscore_support(y_test, y_pred)
score_sm = precision_recall_fscore_support(y_test, y_pr_sm)

print("performance of Random Forest without oversampling", score)
print("performance of Random Forest with oversampling", score_sm)

performance of Random Forest without oversampling (array([0.98      , 0.93548387]), array([0.89090909, 0.98863636]), array([0.93333333, 0.96132597]), array([55, 88]))
performance of Random Forest with oversampling (array([0.98076923, 0.95604396]), array([0.92727273, 0.98863636]), array([0.95327103, 0.97206704]), array([55, 88]))


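**Conclusion:** oversampling the training set with SMOTE yields a minor improvement in precision, recall and F-score for both Logistic Regression and Random Forest on this train/test split.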

In [8]:
# Scale every feature with min/max statistics learned from the training rows
# only. Fitting the scaler on the full data set would leak test-set minima and
# maxima into preprocessing.
#
# The split below uses the same test_size and random_state as the
# train_test_split call in the next cell, and the split depends only on the
# number of samples and the seed, so it selects exactly the same training rows.
X_fit = train_test_split(data.data, test_size=0.25, random_state=22)[0]
scaler = MinMaxScaler().fit(X_fit)

# Training rows end up in [0, 1]; test rows may fall slightly outside that
# range, which is expected when the scaler has not seen them.
data.data = scaler.transform(data.data)

In [9]:
# Split again now that the features are scaled; the seed and test fraction are
# the same as in the earlier split.
X_train, X_test, y_train, y_test = train_test_split(
    data.data,
    data.target,
    random_state=22,
    test_size=0.25,
)

In [10]:
# Oversample the minority class of the scaled training set. X_train / y_train
# are replaced by their resampled versions; the test set is left untouched.
sm = SMOTE(random_state=22, sampling_strategy='minority')
# `fit_resample` is the supported API; the old `fit_sample` alias has been
# removed from recent imbalanced-learn releases and raises AttributeError there.
X_train, y_train = sm.fit_resample(X_train, y_train)

## Building Model