In [597]:
import numpy as np
import pandas as pd

# reading positive and negative examples
pos_file = "Downloads/pos_feat_vec.csv"
neg_file = "Downloads/neg_feat_vec.csv"

pos = pd.read_csv(pos_file, header = 0)
neg = pd.read_csv(neg_file, header = 0)

# combine positive and negative training examples into a full training set
data = pd.concat([pos, neg])

# shuffling the data to avoid bias
from sklearn.utils import shuffle

data = shuffle(data, random_state=0)

In [624]:
# optimizing the classifiers
#size_train = round(0.7 * len(data))

#train = data.iloc[0:size_train + 1, :]
#tune = data.iloc[size_train + 1:, :]

In [598]:
names = data.ix[:,0]
#name = train.ix[:,0]
labels = data.ix[:, len(data.columns) - 1]
#labels = train.ix[:, len(data.columns) - 1]
features = data.ix[:, 1: len(data.columns) - 1]
#features = train.ix[:, 1: len(data.columns) - 1]

# convert pandas dataframes into numpy arrays (compatible with scikit-learn)
features = features.as_matrix()
labels = labels.as_matrix()

In [557]:
# training decision trees and performing cross-validation
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import KFold

fold = 10
threshold = 0.85

dt = DecisionTreeClassifier(criterion = 'entropy', max_depth = 10)
kf = KFold(n_splits = fold, shuffle = True)

accuracy = []
precision = []
recall = []

for train_index, test_index in kf.split(features):
    features_train, features_test = features[train_index], features[test_index]
    labels_train, labels_test = labels[train_index], labels[test_index]
    
    dt.fit(features_train, labels_train)
    result = dt.predict_proba(features_test)
    
    tp = 0
    fp = 0
    tn = 0
    fn = 0

    for k in range(0, len(result)):
        if labels_test[k] == 1:
            if result[k][1] > threshold:
                tp += 1
            else:
                fn += 1
        else:
            if result[k][1] > threshold:
                fp += 1
            else:
                tn += 1

    accuracy.append((tp + tn) / len(result))
    precision.append(tp / (tp + fp))
    recall.append(tp / (tp + fn))

print('Accuracy: ', sum(accuracy) / fold)
print('Precision: ', sum(precision) / fold)
print('Recall: ', sum(recall) / fold)

Accuracy:  0.8227997643035847
Precision:  0.9175089411333719
Recall:  0.5847183668759286


In [606]:
# training random forests and performing cross-validation
from sklearn.ensemble import RandomForestClassifier

fold = 10
threshold = 0.85

rf = RandomForestClassifier(criterion = 'gini', n_estimators = 10, max_depth = 15)
kf = KFold(n_splits = fold, shuffle = True)

accuracy = []
precision = []
recall = []

for train_index, test_index in kf.split(features):
    features_train, features_test = features[train_index], features[test_index]
    labels_train, labels_test = labels[train_index], labels[test_index]
    
    rf.fit(features_train, labels_train)
    result = rf.predict_proba(features_test)
    
    tp = 0
    fp = 0
    tn = 0
    fn = 0

    for k in range(0, len(result)):
        if labels_test[k] == 1:
            if result[k][1] > threshold:
                tp += 1
            else:
                fn += 1
        else:
            if result[k][1] > threshold:
                fp += 1
            else:
                tn += 1

    accuracy.append((tp + tn) / len(result))
    precision.append(tp / (tp + fp))
    recall.append(tp / (tp + fn))

print('Accuracy: ', sum(accuracy) / fold)
print('Precision: ', sum(precision) / fold)
print('Recall: ', sum(recall) / fold)

Accuracy:  0.8311853404557441
Precision:  0.9598585275461622
Recall:  0.5787101394836041


In [545]:
# training SVMs and performing cross-validation
from sklearn import svm
from sklearn.model_selection import cross_val_predict
from sklearn.model_selection import cross_val_score

s = svm.SVC()
fold = 10

s.fit(features, labels)

accuracy = cross_val_score(s, features, labels, cv = fold)
precision = cross_val_score(s, features, labels, cv = fold, scoring = 'precision')
recall = cross_val_score(s, features, labels, cv = fold, scoring = 'recall')

print('Accuracy: ', accuracy.sum() / fold)
print('Precision: ', precision.sum() / fold)
print('Recall: ', recall.sum() / fold)

Accuracy:  0.875603247587
Precision:  0.84368853349
Recall:  0.82533225393


In [546]:
# training linear regressors and performing cross-validation
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import KFold

fold = 10
threshold = 0.5
rg = LinearRegression()
kf = KFold(n_splits = fold, shuffle = True)

accuracy = []
precision = []
recall = []

for train_index, test_index in kf.split(features):
    features_train, features_test = features[train_index], features[test_index]
    labels_train, labels_test = labels[train_index], labels[test_index]
    
    rg.fit(features_train, labels_train)
    result = rg.predict(features_test)
    
    tp = 0
    fp = 0
    tn = 0
    fn = 0

    for k in range(0, len(result)):
        if labels_test[k] == 1:
            if result[k] > threshold:
                tp += 1
            else:
                fn += 1
        else:
            if result[k] > threshold:
                fp += 1
            else:
                tn += 1

    accuracy.append((tp + tn) / len(result))
    precision.append(tp / (tp + fp))
    recall.append(tp / (tp + fn))

print('Accuracy: ', sum(accuracy) / fold)
print('Precision: ', sum(precision) / fold)
print('Recall: ', sum(recall) / fold)

Accuracy:  0.8432959342368361
Precision:  0.7967262460717449
Recall:  0.787193239979846


In [593]:
# training logistic regressors and performing cross-validation
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_predict
from sklearn.model_selection import cross_val_score

lr = LogisticRegression()
fold = 10

lr.fit(features, labels)

accuracy = cross_val_score(dt, features, labels, cv = fold)
precision = cross_val_score(dt, features, labels, cv = fold, scoring='precision')
recall = cross_val_score(dt, features, labels, cv = fold, scoring='recall')

print('Accuracy: ', accuracy.sum() / fold)
print('Precision: ', precision.sum() / fold)
print('Recall: ', recall.sum() / fold)

Accuracy:  0.876534441622
Precision:  0.861631450873
Recall:  0.804422181745


In [615]:
# reading test examples
test_file = 'Downloads/test_feat_vec.csv'
test = pd.read_csv(test_file, header = 0)

test_names = test.ix[:,0]
#test_names = tune.ix[:,0]
test_positions = test.ix[:, 1: 3]
#test_positions = tune.ix[:, 1: 3]
test_features = test.ix[:, 3:]
#test_featuress = tune.ix[:, 3:]

test_names = test_names.as_matrix()
#test_names = test_names.as_matrix()
test_positions = test_positions.as_matrix()
#test_positions = test_positions.as_matrix()
test_features = test_features.as_matrix()
#test_features = test_features.as_matrix()

# making prediction on test examples
rf_result = rf.predict_proba(test_features)

In [616]:
# compile a list of product names in test documents (for computing precision and recall)
product_list = 'Downloads/test_product_list.txt'

with open(product_list, 'r') as f:
    ref = f.readlines()

ref_names = []
ref_positions = []

for i, product in enumerate(ref):
    ref[i] = product.split()
    product_name = ' '.join(word for word in ref[i][:-2])
    start_pos = int(ref[i][-2])
    end_pos = int(ref[i][-1])

    ref_names.append(product_name)
    ref_positions.append([start_pos, end_pos])

In [617]:
# computing precision and recall of the classifier on the test set
tp = 0
tn = 0
fp = 0
fn = 0

#threshold = 0.85

for i, test_name in enumerate(test_names):   
    found = False
    
    for j, ref_name in enumerate(ref_names):
        if test_name == ref_name and test_positions[i][0] == ref_positions[j][0] and test_positions[i][1] == ref_positions[j][1]:
            found = True
            
            #if rf_result[i][0] > threshold:
            if s_result[i] == 1:
                tp += 1
            else:
                fn += 1
            break
    
    if not found:
        #if rf_result[i][1] > threshold:
        if s_result[i] == 1:
            fp += 1
        else:
            tn += 1

print(tp, fp, tn, fn)
print('Total number of test examples: ', tp + fp + tn + fn)

accuracy = (tp + tn) / (tp + fp + tn + fn)
precision = tp / (tp + fp)
recall = tp / len(ref_names)

print('Accuracy: ', accuracy)
print('Precision: ', precision)
print('Recall: ', recall)

706 550 8073 190
Total number of test examples:  9519
Accuracy:  0.9222607416745456
Precision:  0.5621019108280255
Recall:  0.566158781074579


In [516]:
# print out all predictions
with open('Downloads/test_predict.txt', 'w') as f:
    for i, test_name in enumerate(test_names):
        print(test_name, rf_result[i][0], rf_result[i][1], file = f)

In [517]:
# print out examples predicted as true (as reference for improving precision)
predicted_true = []

with open('Downloads/test_predict_positives.txt', 'w') as f:
    for i, test_name in enumerate(test_names):
        if rf_result[i][1] > threshold:
            predicted_true.append([test_name, rf_result[i][0], rf_result[i][1]])
            print(test_name, rf_result[i][0], rf_result[i][1], file = f)

predicted_true = sorted(predicted_true, key=lambda prediction: prediction[2], reverse = True)

In [518]:
with open('Downloads/sorted_test_predict_positives.txt', 'w') as f:
    for tuple in predicted_true:
        found = False
        
        for j, ref_name in enumerate(ref_names):
            if tuple[0] == ref_name:
                print('true example: ', tuple[0], tuple[1], tuple[2], file = f)
                found = True
                break
        
        if not found:
            print('false example: ', tuple[0], tuple[1], tuple[2], file = f)

In [519]:
# print out examples predicted as false (as reference for improving recall)
predicted_false = []

with open('Downloads/test_predict_negatives.txt', 'w') as f:
    for i, test_name in enumerate(test_names):
        if rf_result[i][1] < threshold:
            predicted_false.append([test_name, rf_result[i][0], rf_result[i][1]])
            print(test_name, rf_result[i][0], rf_result[i][1], file = f)

predicted_false = sorted(predicted_false, key=lambda prediction: prediction[2], reverse = True)

In [520]:
with open('Downloads/sorted_test_predict_negatives_>0.7.txt', 'w') as f:
    for tuple in predicted_false:
        if tuple[2] < 0.7:
            break
        
        found = False
        
        for j, ref_name in enumerate(ref_names):
            if tuple[0] == ref_name:
                print('true example: ', tuple[0], tuple[1], tuple[2], file = f)
                found = True
                break
        
        if not found:
            print('false example: ', tuple[0], tuple[1], tuple[2], file = f)