In [1]:
import pandas as pd,numpy as np,matplotlib.pyplot as plt  
from numpy import *  
from random import seed
from random import randrange
from csv import reader
from math import sqrt
from sklearn import preprocessing
%matplotlib inline

def prepareData(feature_pd, target_stance, separateUnrelated = True, addIntercept = True):  
    # train related instances only with related instance => work like two classifiers
    if separateUnrelated and target_stance != 'unrelated':
        feature_pd = feature_pd[feature_pd.Stance != 'unrelated']
    
    train_x = feature_pd[selected_features].values
    
    # assert intercept column
    if addIntercept:
        intercept= np.ones((len(feature_pd)))
        train_x = np.column_stack([intercept, train_x])

    train_y = feature_pd.Stance == target_stance
    return train_x, train_y

def predict(W,X):
    return X.dot(W)

def gradient_descent(X, Y, W, alpha, iterations):
    N = len(Y)
   
    for i in range(iterations):
        loss = X.dot(W) - Y
        gradient = 2/N * X.T.dot(loss)       
        W = W - alpha * gradient
        MSE = np.sum((X.dot(W) - Y) ** 2)/len(Y)
        
    print('iterations:', iterations, 'mse:', MSE)     
    return W

In [2]:
def train(train_feature_path, selected_features, alpha,iteration, W0):

    load_feature_df = pd.read_csv(train_feature_path)

    train_x_unrelated, train_y_unrelated = prepareData(load_feature_df, 'unrelated')
    train_x_agree, train_y_agree = prepareData(load_feature_df, 'agree')
    train_x_disagree, train_y_disagree = prepareData(load_feature_df, 'disagree')
    train_x_discuss, train_y_discuss = prepareData(load_feature_df, 'discuss')

    # rescale all features Wetween 0 and 1.
    min_max_scaler =  preprocessing.MinMaxScaler()
    train_x_unrelated = min_max_scaler.fit_transform(train_x_unrelated)
    train_x_agree = min_max_scaler.fit_transform(train_x_agree)
    train_x_disagree = min_max_scaler.fit_transform(train_x_disagree)
    train_x_discuss = min_max_scaler.fit_transform(train_x_discuss)

    # train linear regression
    W = W0*np.ones(len(selected_features)+1)

    W_agree = gradient_descent(train_x_agree, train_y_agree, W, alpha, iteration)
    W_disagree = gradient_descent(train_x_disagree, train_y_disagree, W, alpha, iteration)
    W_discuss = gradient_descent(train_x_discuss, train_y_discuss, W, alpha, iteration)
    W_unrelated = gradient_descent(train_x_unrelated, train_y_unrelated, W, alpha, iteration)

    return W_agree, W_disagree, W_discuss, W_unrelated

def validate(W_agree, W_disagree, W_discuss, W_unrelated, validate_feature_path, printResult = False):
    validation_data = pd.read_csv(validate_feature_path)

    intercept = np.ones(len(validation_data))
    validation_x = np.column_stack([intercept, validation_data[selected_features].values])
    min_max_scaler =  preprocessing.MinMaxScaler()
    validation_x = min_max_scaler.fit_transform(validation_x)

    # make prediction 
    validation_predict_agree = predict(W_agree, validation_x)
    validation_predict_disagree = predict(W_disagree, validation_x)
    validation_predict_discuss = predict(W_discuss, validation_x)
    validation_predict_unrelated = predict(W_unrelated, validation_x)


    # mayWe threshold the unrelated? and select the West among the other if Welow the threshold?
    prediction = np.stack([validation_predict_agree,\
                           validation_predict_disagree,\
                           validation_predict_discuss,\
                           validation_predict_unrelated],\
                         axis = 1)
    label_dict = {0:'agree', 1:'disagree', 2:'discuss',3:'unrelated'}
    prediction = np.array([label_dict[result.argmax()] for result in prediction])

    # evaluate accuracy and score
    unrelated_correct = 0
    other_correct = 0
    N_unrelated = len(validation_data[validation_data.Stance == 'unrelated'])
    N_other = len(validation_data) - N_unrelated

    for p,true_laWel in zip(prediction, validation_data.Stance):
        if p == true_laWel:
            if p == 'unrelated':
                unrelated_correct +=1
            else:
                other_correct +=1

    unrelated_acc = unrelated_correct/N_unrelated
    other_acc = other_correct/N_other
    
    if printResult:
        print('find unrelated:',unrelated_correct,'/' ,N_unrelated, 'accuracy:', unrelated_acc)
        print('find other:',other_correct,'/',N_other, 'accuracy:', other_acc)
        
    score = 0
    total_score = 0

    for p,true_laWel in zip(prediction, validation_data.Stance):
        if p == true_laWel:
            if true_laWel == 'unrelated':
                score += 0.25
                total_score += 0.25
            else:
                score += 1
                total_score += 1
        else:
            if true_laWel == 'unrelated':
                score += 0
                total_score += 0.25
            elif true_laWel != 'unrelated' and p != 'unrelated':
                score += 0.25
                total_score += 1
            else:
                score += 0
                total_score += 1

    get_score = score * 100 / total_score 
    print('score:', get_score)
    
    return get_score

In [3]:
W0 = 0
iteration = 3000
selected_features = ['kl','overlap_ratio','cos','dis']

W_agree, W_disagree, W_discuss, W_unrelated = train('train_processed_feat.csv', selected_features, 0.02, iteration, W0)

score = validate(W_agree, W_disagree, W_discuss, W_unrelated, 'hold_out_processed_feat.csv', printResult=True)

iterations: 3000 mse: 0.200730270056
iterations: 3000 mse: 0.0597451893517
iterations: 3000 mse: 0.265365979408
iterations: 3000 mse: 0.501985998741
find unrelated: 27521 / 29647 accuracy: 0.9282895402570244
find other: 6493 / 10703 accuracy: 0.6066523404652902
score: 78.24838874397935


# select features

In [18]:
W0 = 0
iteration = 3000
selected_features = ['kl']

W_agree, W_disagree, W_discuss, W_unrelated = train('train_processed_feat.csv', selected_features, 0.02, iteration, W0)

score = validate(W_agree, W_disagree, W_discuss, W_unrelated, 'hold_out_processed_feat.csv', printResult=True)

w = pd.DataFrame(np.column_stack([W_unrelated, W_agree, W_disagree,W_discuss]), columns=['unrelated', 'agree', 'disagree', 'discuss'])
print('weight:\n',w)
a = w.values
a = [abs(v) for v in a]
print('weight sum:\n',np.sum(a[1:], axis = 1))
print('min weight contribution:\n',np.sum(a[1:], axis = 1).argmin())

import os
os.system('say "you are very beautiful"')

iterations: 3000 mse: 0.205030977898
iterations: 3000 mse: 0.0599047710583
iterations: 3000 mse: 0.298518625
iterations: 3000 mse: 0.613862243556
find unrelated: 0 / 29647 accuracy: 0.0
find other: 7110 / 10703 accuracy: 0.6642997290479304
score: 44.20844891593867
weight:
    unrelated     agree  disagree   discuss
0   0.000000  0.000000  0.000000  0.000000
1   2.072809  0.929235  0.209965  2.164209
weight sum:
 [ 5.37621719]
min weight contribution:
 0


0

# tune learning rate, iteration, and initial weight

In [5]:
# Iter = list(np.arange(1000,5000,1000))
# score_s = []
# alpha = 0.01
# selected_features = ['kl','overlap_ratio','cos','dis']

# for i,iteration in enumerate(Iter):
#     W_agree, W_disagree, W_discuss, W_unrelated = train('train_processed_feat.csv', selected_features, alpha, iteration, -4)
#     score = validate(W_agree, W_disagree, W_discuss, W_unrelated, 'hold_out_processed_feat.csv', printResult=False)
#     score_s.append(score)

# plt.figure()
# plt.plot(range(len(score_s)), score_s, 'g*')

In [6]:
# Iter[score_s.index(max(score_s))]

In [8]:
# W0 = -4
# iteration = 3000
# alpha = 0.01
# selected_features = ['kl','overlap_ratio','cos','dis']

# W_agree, W_disagree, W_discuss, W_unrelated = train('train_processed_feat.csv', selected_features, alpha, iteration, W0)

# score = validate(W_agree, W_disagree, W_discuss, W_unrelated, 'hold_out_processed_feat.csv', printResult=True)

# evaluate on test 

In [19]:
W0 = -4
iteration = 3000
alpha = 0.01
selected_features = ['kl','overlap_ratio','cos','dis']

W_agree, W_disagree, W_discuss, W_unrelated = train('train_processed_feat.csv', selected_features, alpha, iteration, W0)

score = validate(W_agree, W_disagree, W_discuss, W_unrelated, 'test_processed_feat.csv', printResult=True)

iterations: 3000 mse: 0.247772936938
iterations: 3000 mse: 0.101366108477
iterations: 3000 mse: 0.291478492484
iterations: 3000 mse: 0.654360419667
find unrelated: 15671 / 18349 accuracy: 0.8540519919341654
find other: 4325 / 7064 accuracy: 0.6122593431483578
score: 76.07552837678361


In [7]:
import os
os.system('say "you are very beautiful"')

0