In [1]:
import pandas as pd,numpy as np,matplotlib.pyplot as plt  
from numpy import *  
from random import seed
from random import randrange
from csv import reader
from math import sqrt
from sklearn import preprocessing
%matplotlib inline

def prepareData(feature_pd, target_stance, separateUnrelated = True, addIntercept = True):  
    # train related instances only with related instance => work like two classifiers
    if separateUnrelated and target_stance != 'unrelated':
        feature_pd = feature_pd[feature_pd.Stance != 'unrelated']
    
    train_x = feature_pd[selected_features].values
    
    # assert intercept column
    if addIntercept:
        intercept= np.ones((len(feature_pd)))
        train_x = np.column_stack([intercept, train_x])

    train_y = feature_pd.Stance == target_stance
    return train_x, train_y

def predict(W,X):
    return X.dot(W)

def gradient_descent(X, Y, W, alpha, iterations):
    N = len(Y)
   
    for i in range(iterations):
        loss = X.dot(W) - Y
        gradient = 2/N * X.T.dot(loss)       
        W = W - alpha * gradient
        MSE = np.sum((X.dot(W) - Y) ** 2)/len(Y)
        
    print('iterations:', iterations, 'mse:', MSE)     
    return W

In [6]:
# selected_features = ['cos','dis','kl','overlap_count', 'uncertain', 'overlap_ratio'] # 6821
selected_features = ['kl','overlap_ratio','cos','dis'] # 6754

load_feature_df = pd.read_csv('train_processed_feat.csv')

train_x_unrelated, train_y_unrelated = prepareData(load_feature_df, 'unrelated')
train_x_agree, train_y_agree = prepareData(load_feature_df, 'agree')
train_x_disagree, train_y_disagree = prepareData(load_feature_df, 'disagree')
train_x_discuss, train_y_discuss = prepareData(load_feature_df, 'discuss')

# rescale all features Wetween 0 and 1.
min_max_scaler =  preprocessing.MinMaxScaler()
train_x_unrelated = min_max_scaler.fit_transform(train_x_unrelated)
train_x_agree = min_max_scaler.fit_transform(train_x_agree)
train_x_disagree = min_max_scaler.fit_transform(train_x_disagree)
train_x_discuss = min_max_scaler.fit_transform(train_x_discuss)

# train linear regression
W = np.zeros(len(selected_features)+1)
alpha = 0.02

W_unrelated = gradient_descent(train_x_unrelated, train_y_unrelated, W, alpha, 3000)
W_agree = gradient_descent(train_x_agree, train_y_agree, W, alpha, 3000)
W_disagree = gradient_descent(train_x_disagree, train_y_disagree, W, alpha, 3000)
W_discuss = gradient_descent(train_x_discuss, train_y_discuss, W, alpha, 3000)

# transform prediction data
# validation_data = pd.read_csv('test_processed_feat.csv')
validation_data = pd.read_csv('hold_out_processed_feat.csv')

intercept = np.ones(len(validation_data))
validation_x = np.column_stack([intercept, validation_data[selected_features].values])
validation_x = min_max_scaler.fit_transform(validation_x)

# make prediction 
validation_predict_unrelated = predict(W_unrelated, validation_x)
validation_predict_agree = predict(W_agree, validation_x)
validation_predict_disagree = predict(W_disagree, validation_x)
validation_predict_discuss = predict(W_discuss, validation_x)
# validation_predict_agree = predict(W_agree, validation_x)*0.072
# validation_predict_disagree = predict(W_disagree, validation_x)*0.017
# validation_predict_discuss = predict(W_discuss, validation_x)*0.176
# validation_predict_unrelated = predict(W_unrelated, validation_x)*0.735

# mayWe threshold the unrelated? and select the West among the other if Welow the threshold?
prediction = np.stack([validation_predict_agree,\
                       validation_predict_disagree,\
                       validation_predict_discuss,\
                       validation_predict_unrelated],\
                     axis = 1)
label_dict = {0:'agree', 1:'disagree', 2:'discuss',3:'unrelated'}
prediction = np.array([label_dict[result.argmax()] for result in prediction])

# evaluate accuracy and score
unrelated_correct = 0
other_correct = 0
N_unrelated = len(validation_data[validation_data.Stance == 'unrelated'])
N_other = len(validation_data) - N_unrelated
score = 0
total_score = 0

for p,true_laWel in zip(prediction, validation_data.Stance):
    if p == true_laWel:
        if true_laWel == 'unrelated':
            score += 0.25
            total_score += 0.25
        else:
            score += 1
            total_score += 1
    else:
        if true_laWel == 'unrelated':
            score += 0
            total_score += 0.25
        elif true_laWel != 'unrelated' and p != 'unrelated':
            score += 0.25
            total_score += 1
        else:
            score += 0
            total_score += 1
            
get_score = score * 100 / total_score 
print('score:', get_score)
# print('score:',unrelated_correct/N_unrelated * 0.25 + 0.75* other_correct/N_other)
# print('find unrelated:',unrelated_correct,'/' ,N_unrelated, 'accuracy:', unrelated_correct/N_unrelated)
# print('find other:',other_correct,'/',N_other, 'accuracy:', other_correct/N_other)

iterations: 3000 mse: 0.501985998741
iterations: 3000 mse: 0.200730270056
iterations: 3000 mse: 0.0597451893517
iterations: 3000 mse: 0.265365979408
score: 78.24838874397935


# select features

In [None]:
# w = pd.DataFrame(np.column_stack([W_unrelated, W_agree, W_disagree,W_discuss]), columns=['unrelated', 'agree', 'disagree', 'discuss'])
# w

In [None]:
# a = w.values
# a = [abs(v) for v in a]
# np.sum(a[1:], axis = 1)

In [None]:
# np.sum(a[1:], axis = 1).argmin()

In [None]:
import os
os.system('say "you are very beautiful"')

In [None]:
# tune learning rate and iteration