In [1]:
import pandas as pd
import random
import numpy as np
from scipy.io.arff import loadarff 
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import balanced_accuracy_score, accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.decomposition import PCA
from sklearn.metrics import fbeta_score

In [2]:
# Read the ARFF file and wrap its first element (the record array) in a
# DataFrame; `raw_data` keeps the full return value of loadarff.
DATA_PATH = "./data/5year.arff"
raw_data = loadarff(DATA_PATH)
arff_records = raw_data[0]
year = pd.DataFrame(arff_records)

FileNotFoundError: [Errno 2] No such file or directory: './data/5year.arff'

In [None]:
# Encode the target column as 0/1: every row whose label equals the label of
# the first row becomes 0, everything else becomes 1.
# NOTE(review): this assumes the first row belongs to the negative class --
# confirm against the data set.
# The reference label is read once and positionally (`iloc`), so the lookup
# is not repeated per row and does not depend on an index label 0 existing.
reference_label = year['class'].iloc[0]
year['class'] = (year['class'] != reference_label).astype('int64')
# Drop every row that still contains a missing value.
year = year.dropna()

## Question 1

In [None]:
# Question 1: fit a logistic regression (C = 1.0) on the whole data set and
# score it on the very same rows it was fitted on.
X_train, y_train = year.drop(columns='class'), year['class']
question_1 = LogisticRegression(C = 1.0)
question_1.fit(X_train, y_train)
y_predict = question_1.predict(X_train)

# BER is reported as one minus the balanced accuracy.
accuracy = accuracy_score(y_train, y_predict)
balanced_error_rate = 1 - balanced_accuracy_score(y_train, y_predict)
print('Accuracy:', accuracy)
print("Balanced Error Rate (BER):", balanced_error_rate)

### Answer
* Accuracy: 0.9663477400197954
* Balanced Error Rate (BER): 0.4810749837661251

## Question 2

In [None]:
# Question 2: same fit-and-score-on-everything experiment, but the model is
# built with class_weight="balanced".
y_train = year['class']
X_train = year.drop(columns='class')
question_2 = LogisticRegression(class_weight="balanced",C=1.0)
question_2.fit(X_train, y_train)
y_predict = question_2.predict(X_train)

# Collect the two metrics first, then print them in the original order.
metrics = (
    ('Accuracy:', accuracy_score(y_train, y_predict)),
    ("Balanced Error Rate (BER):",
     1 - balanced_accuracy_score(y_train, y_predict)),
)
for label, value in metrics:
    print(label, value)

### Answer
* Accuracy: 0.7825800065984824
* Balanced Error Rate (BER): 0.20712081350122835

## Question 3

In [None]:
# Question 3: 50% train / 25% test / 25% validation split, then fit a
# class-weighted logistic regression on the training part only.
# The split is seeded so the numbers printed below can be reproduced after a
# kernel restart (an unseeded split gives different figures on every run).
SPLIT_SEED = 0
X_train, X_holdout, y_train, y_holdout = train_test_split(
    year.drop(columns='class'), year['class'], test_size=0.5,
    random_state=SPLIT_SEED)
# Halve the held-out rows into the test and validation sets.
X_test, X_vali, y_test, y_vali = train_test_split(
    X_holdout, y_holdout, test_size=0.5, random_state=SPLIT_SEED)

question_3 = LogisticRegression(class_weight="balanced")
question_3.fit(X_train,y_train)

y_predict_on_train = question_3.predict(X_train)
y_predict_on_test = question_3.predict(X_test)
y_predict_on_vali = question_3.predict(X_vali)

# One (title, true labels, predictions) entry per split; a separator line is
# printed between consecutive sections, not after the last one.
sections = [
    ('Training data set: ', y_train, y_predict_on_train),
    ('Test data set: ', y_test, y_predict_on_test),
    ('Validation data set: ', y_vali, y_predict_on_vali),
]
for position, (title, labels_true, labels_pred) in enumerate(sections):
    if position > 0:
        print('-----------------------------------------------------------')
    print(title)
    print('Accuracy:', accuracy_score(labels_true, labels_pred))
    print("Balanced Error Rate (BER):",
          1 - balanced_accuracy_score(labels_true, labels_pred))

### Answer
* Training data set: 
    * Accuracy: 0.7676567656765677
    * Balanced Error Rate (BER): 0.2640062089379829

* Test data set: 
    * Accuracy: 0.7559366754617414
    * Balanced Error Rate (BER): 0.31184321143337534
* Validation data set: 
    * Accuracy: 0.7651715039577837
    * Balanced Error Rate (BER): 0.2999074140018517

## Question 4

In [None]:
# Question 4: sweep the regularisation parameter C over 10^-4 ... 10^4 and
# record the balanced error rate (BER) on each split.
# The split is seeded so the resulting table is reproducible; without a
# seed the table (and the C chosen from it) changes on every run.
SPLIT_SEED = 0
X_train, X_holdout, y_train, y_holdout = train_test_split(
    year.drop(columns='class'), year['class'], test_size=0.5,
    random_state=SPLIT_SEED)
X_test, X_vali, y_test, y_vali = train_test_split(
    X_holdout, y_holdout, test_size=0.5, random_state=SPLIT_SEED)

report = []
for i in range(-4,5):
    question_4 = LogisticRegression(class_weight="balanced",C=10**i)
    question_4.fit(X_train,y_train)
    y_predict_on_train = question_4.predict(X_train)
    y_predict_on_test = question_4.predict(X_test)
    y_predict_on_vali = question_4.predict(X_vali)
    # BER = 1 - balanced accuracy.
    BER_train = 1 - balanced_accuracy_score(y_train, y_predict_on_train)
    BER_test = 1 - balanced_accuracy_score(y_test, y_predict_on_test)
    BER_vali = 1 - balanced_accuracy_score(y_vali, y_predict_on_vali)
    report.append([BER_train,BER_test,BER_vali])

# Rows: split name; columns: the exponent i of C = 10**i.
report = pd.DataFrame(report,
                      index=np.arange(-4,5),
                      columns=['train','test','vali']).T

In [None]:
# Display the BER table (rows: train/test/vali; columns: exponent i of C = 10**i).
report

### Answer
* I would choose C = 10^2 = 100. The BER is the average, over the classes, of the proportion of misclassified examples in each class, so the lower the BER, the better the model. In the table above, C = 100 gives the lowest balanced error rate on the validation data set. The training and test data sets also have their lowest BER at C = 100 among all the C options.

## Question 5

In [None]:
def f_score(beta,y_true,y_predict):
    """Compute the F-beta score of binary (0/1) predictions.

    Parameters
    ----------
    beta : number
        Weight of recall relative to precision.
    y_true : array-like
        True labels; 1 is the positive class, 0 the negative class.
    y_predict : array-like
        Predicted labels, same length as ``y_true``.

    Returns
    -------
    float
        ``(1 + beta**2) * P * R / (beta**2 * P + R)`` where ``P`` is the
        precision and ``R`` the recall. Returns ``0.0`` when there are no
        true positives (precision and/or recall are then zero or undefined).

    Raises
    ------
    ValueError
        If ``y_true`` and ``y_predict`` do not have the same shape.
    """
    # calculate TP, TN, FP, FN
    def matrix(true_val,pred_val):
        # Convert BOTH inputs to arrays so the comparison is positional; a
        # pandas Series would otherwise be aligned/looked up by index label.
        true_val = np.asarray(true_val)
        pred_val = np.asarray(pred_val)
        if true_val.shape != pred_val.shape:
            raise ValueError("y_true and y_predict must have the same shape")
        agree = true_val == pred_val
        is_pos = true_val == 1
        is_neg = true_val == 0
        TP = int(np.sum(agree & is_pos))
        TN = int(np.sum(agree & is_neg))
        FN = int(np.sum(~agree & is_pos))
        FP = int(np.sum(~agree & is_neg))
        return (TP, TN, FP, FN)
    TP, TN, FP, FN = matrix(y_true,y_predict)
    # With no true positives every term below is 0 or 0/0; report 0.0
    # instead of raising ZeroDivisionError.
    if TP == 0:
        return 0.0
    precision = TP / (TP + FP)
    recall = TP / (TP + FN)
    return ((1 + beta**2) * (precision * recall)) / ((beta**2) * precision + recall)

### Answer
* Fβ scores for β = 0.1 : 0.6362204724409449
* Fβ scores for β = 1 : 0.19512195121951217
* Fβ scores for β = 10 : 0.1152310325156874

## Question 7

In [None]:
# Question 7: re-create the 50/25/25 train/test/validation split and fit a
# PCA with as many components as there are features on the training part.
# The split is seeded so the printed component is reproducible.
SPLIT_SEED = 0
X_train, X_holdout, y_train, y_holdout = train_test_split(
    year.drop(columns='class'),year['class'], test_size=0.5,
    random_state=SPLIT_SEED)
X_test, X_vali, y_test, y_vali = train_test_split(
    X_holdout, y_holdout, test_size=0.5, random_state=SPLIT_SEED)

question_7 = PCA(n_components = X_train.shape[1])
question_7.fit(X_train)
# First row of components_, i.e. the first principal component.
print(question_7.components_[0])

## Question 8

In [None]:
# Question 8: train a class-weighted logistic regression on the first
# 5, 10, ..., 30 principal components and record the BER on the test and
# validation splits.
#
# `components_` stores one principal component per ROW, so slicing
# `components_[:, :i]` truncates every component vector instead of selecting
# the first i components. Project with the fitted PCA instead: the columns of
# the transformed data are the coordinates on the components in order, so the
# first i columns are exactly the first i principal components.
# The projection does not depend on i, so it is computed once up front.
Xpca_train_all = question_7.transform(X_train)
Xpca_valid_all = question_7.transform(X_vali)
Xpca_test_all = question_7.transform(X_test)

report = []
for i in range(5,31,5):
    Xpca_train = Xpca_train_all[:, :i]
    Xpca_valid = Xpca_valid_all[:, :i]
    Xpca_test = Xpca_test_all[:, :i]
    logistic_q8 = LogisticRegression(class_weight="balanced",C=1)
    logistic_q8.fit(Xpca_train,y_train)
    y_predict_on_test = logistic_q8.predict(Xpca_test)
    y_predict_on_vali = logistic_q8.predict(Xpca_valid)
    # BER = 1 - balanced accuracy.
    BER_test = 1 - balanced_accuracy_score(y_test, y_predict_on_test)
    BER_vali = 1 - balanced_accuracy_score(y_vali, y_predict_on_vali)
    report.append([BER_test,BER_vali])

# Rows: split name; columns: number of principal components kept.
report = pd.DataFrame(report,
                      index=np.arange(5,31,5),
                      columns=['test','vali']).T

In [None]:
# Display the BER table (rows: test/vali; columns: number of PCA components kept).
report