In [1]:
import pandas as pd,numpy as np,matplotlib.pyplot as plt, time  
from numpy import *  
from random import seed
from random import randrange
from csv import reader
from math import sqrt
from sklearn import preprocessing
%matplotlib inline

def sigmoid(x):  
    return 1.0 / (1 + np.exp(-x))  

def prepareData(train_pd, stance):  
    train_x = train_pd[selected_features].values
    train_y = train_pd['Stance'].values == stance
    return mat(train_x), mat(train_y).transpose()

def predict(weights, test_x):
    predict = sigmoid(test_x * weights)
    return predict

def gradient_descent(train_x, train_y, weights, alpha, iteration):  
    for k in range(iteration):  
        output = sigmoid(np.dot(train_x, weights))  
        loss = train_y - output
        weights = weights + alpha * train_x.transpose() * loss 
    
    mse = (sum(loss))**2/len(train_x)
    print('mse:',mse)

    return weights

In [11]:
def train(train_feat_path,alpha,iteration, W0):
    load_feature_df = pd.read_csv(train_feat_path)

    train_x_unrelated, train_y_unrelated = prepareData(load_feature_df, 'unrelated')
    train_x_agree, train_y_agree = prepareData(load_feature_df, 'agree')
    train_x_disagree, train_y_disagree = prepareData(load_feature_df, 'disagree')
    train_x_discuss, train_y_discuss = prepareData(load_feature_df, 'discuss')

    # train logistic regression
    W = W0*np.ones((len(selected_features),1))
    print('start train using logistic regression:')
    W_agree = gradient_descent(train_x_agree, train_y_agree, W, alpha, iteration)
    W_disagree = gradient_descent(train_x_disagree, train_y_disagree, W, alpha, iteration)
    W_discuss = gradient_descent(train_x_discuss, train_y_discuss, W, alpha, iteration)
    W_unrelated = gradient_descent(train_x_unrelated, train_y_unrelated, W, alpha, iteration)
    
    return W_agree, W_disagree, W_discuss, W_unrelated

def validate(W_agree, W_disagree, W_discuss, W_unrelated, validate_feat_path, printResult = False):
    # predict on validation 
    validation_data = pd.read_csv(validate_feat_path)
    validation_x = validation_data[selected_features].values

    validation_predict_agree = predict(W_agree, validation_x)*0.072
    validation_predict_disagree = predict(W_disagree, validation_x)*0.017
    validation_predict_discuss = predict(W_discuss, validation_x)*0.176
    validation_predict_unrelated = predict(W_unrelated, validation_x)*0.735

    prediction = np.column_stack([validation_predict_agree,\
                                  validation_predict_disagree,\
                                  validation_predict_discuss,\
                                  validation_predict_unrelated])
    label_dict = {0:'agree', 1:'disagree', 2:'discuss',3:'unrelated'}
    prediction = np.array([label_dict[result.argmax()] for result in prediction])

    # evaluate accuracy and score
    unrelated_correct = 0
    other_correct = 0
    N_unrelated = len(validation_data[validation_data.Stance == 'unrelated'])
    N_other = len(validation_data) - N_unrelated

    for p,true_laWel in zip(prediction, validation_data.Stance):
        if p == true_laWel:
            if p == 'unrelated':
                unrelated_correct +=1
            else:
                other_correct +=1

    unrelated_acc = unrelated_correct/N_unrelated
    other_acc = other_correct/N_other
    
    if printResult:
        print('find unrelated:',unrelated_correct,'/' ,N_unrelated, 'accuracy:', unrelated_acc)
        print('find other:',other_correct,'/',N_other, 'accuracy:', other_acc)
        
    score = 0
    total_score = 0

    for p,true_laWel in zip(prediction, validation_data.Stance):
        if p == true_laWel:
            if true_laWel == 'unrelated':
                score += 0.25
                total_score += 0.25
            else:
                score += 1
                total_score += 1
        else:
            if true_laWel == 'unrelated':
                score += 0
                total_score += 0.25
            elif true_laWel != 'unrelated' and p != 'unrelated':
                score += 0.25
                total_score += 1
            else:
                score += 0
                total_score += 1

    get_score = score * 100 / total_score 
    print('score:', get_score)
    
    return get_score

# select features

In [32]:
# selected_features = ['cos']
# iteration  = 5000
# w0 = 0
# alpha = 0.01

# W_agree, W_disagree, W_discuss, W_unrelated = train('train_processed_feat.csv', alpha, iteration, w0)

# score = validate(W_agree, W_disagree, W_discuss, W_unrelated, 'hold_out_processed_feat.csv', printResult = True)

start train using logistic regression:
mse: 5524.79939994
mse: 5170.75571854
mse: 7131.29494415
mse: 5132.17000378
find unrelated: 29550 / 29647 accuracy: 0.9967281681114447
find other: 4939 / 10703 accuracy: 0.4614594039054471
score: 71.54942795235927


In [33]:
# w = pd.DataFrame(np.column_stack([W_unrelated, W_agree, W_disagree,W_discuss]), \
#                  columns=['unrelated', 'agree', 'disagree', 'discuss'])
# print('w:',w)
# a = w.values
# a = [abs(v) for v in a]
# print('total contribute:',np.sum(a, axis = 1))
# print('best contributor:', np.sum(a, axis = 1).argmin())

# import os
# os.system('say "you are very beautiful"')

w:    unrelated     agree   disagree  discuss
0  -11.01687 -3.175632 -11.893544   -1.627
total contribute: [ 27.71304625]
best contributor: 0


0

# tune hypterparameter

In [14]:
# selected_features = ['kl','overlap_ratio','cos','dis', 'overlap_count', 'uncertain']
# iteration  = 5000
# w0 = 0
# alpha = 0.01

# W_agree, W_disagree, W_discuss, W_unrelated = train('train_processed_feat.csv', alpha, iteration, w0)

# score = validate(W_agree, W_disagree, W_discuss, W_unrelated, 'test_processed_feat.csv', printResult = True)

start train using logistic regression:


  # This is added back by InteractiveShellApp.init_path()


mse: 797.313110285
mse: 1549.45846344
mse: 6218.81191721
mse: 9603.45043371
find unrelated: 18196 / 18349 accuracy: 0.9916616709357459
find other: 3320 / 7064 accuracy: 0.46998867497168745
score: 72.38493723849372


In [5]:
# # ALPHA = list(np.arange(0,1,0.1))
# Iter = list(np.arange(1000,8000,1000))
# score_s = []
# iteration = 5000
# alpha = 0.a1
# w0 = -0.04

# for i,iteration in enumerate(Iter):
#     W_agree, W_disagree, W_discuss, W_unrelated = train('train_processed_feat.csv', alpha, iteration, w0)
#     score = validate(W_agree, W_disagree, W_discuss, W_unrelated, 'hold_out_processed_feat.csv', printInfo=False)
#     score_s.append(score)

# plt.figure()
# plt.plot(range(len(score_s)), score_s, 'g*')

In [None]:
# Iter[score_s.index(max(score_s))]
# import os
# os.system('say "you are very beautiful"')

# evaluate on test

In [38]:
W0 = 0
iteration = 5000
alpha = 0.01
selected_features = ['overlap_count','cos','kl']

W_agree, W_disagree, W_discuss, W_unrelated = train('train_processed_feat.csv', alpha, iteration, W0)

score = validate(W_agree, W_disagree, W_discuss, W_unrelated, 'test_processed_feat.csv', printResult=True)

start train using logistic regression:


  # This is added back by InteractiveShellApp.init_path()


mse: 1634.36242892
mse: 1831.04387237
mse: 378.579519692
mse: 1437.78693188
find unrelated: 14014 / 18349 accuracy: 0.7637473431794648
find other: 4406 / 7064 accuracy: 0.6237259343148358
score: 73.37195579873404


In [39]:
import os
os.system('say "you are very beautiful"')

0