In [1]:
import numpy as np
import pandas as pd

# reading positive and negative csv files into dataframe
pos_file = "Downloads/pos_feat_vec.csv"
neg_file = "Downloads/neg_feat_vec.csv"

pos = pd.read_csv(pos_file, header = 0)
neg = pd.read_csv(neg_file, header = 0)
data = pd.concat([pos, neg])

In [2]:
# shuffling the data frames
from sklearn.utils import shuffle

data = shuffle(data, random_state=0)

In [3]:
names = data.ix[:,0]
labels = data.ix[:, len(data.columns) - 1]
features = data.ix[:, 1: len(data.columns) - 1]
# features = features.drop('starts_with_brand', axis = 1)

features = features.as_matrix()
labels = labels.as_matrix()

In [4]:
# training decision trees and performing cross-validation
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import KFold

fold = 10
threshold = 0.85

dt = DecisionTreeClassifier(criterion = 'entropy', max_depth = 10)
kf = KFold(n_splits = fold, shuffle = True)

accuracy = []
precision = []
recall = []

for train_index, test_index in kf.split(features):
    features_train, features_test = features[train_index], features[test_index]
    labels_train, labels_test = labels[train_index], labels[test_index]
    
    dt.fit(features_train, labels_train)
    result = dt.predict_proba(features_test)
    
    tp = 0
    fp = 0
    tn = 0
    fn = 0

    for k in range(0, len(result)):
        if labels_test[k] == 1:
            if result[k][1] > threshold:
                tp += 1
            else:
                fn += 1
        else:
            if result[k][1] > threshold:
                fp += 1
            else:
                tn += 1

    accuracy.append((tp + tn) / len(result))
    precision.append(tp / (tp + fp))
    recall.append(tp / (tp + fn))

print('Accuracy: ', sum(accuracy) / fold)
print('Precision: ', sum(precision) / fold)
print('Recall: ', sum(recall) / fold)

Accuracy:  0.8499732558769365
Precision:  0.9085941457683895
Recall:  0.7414671588523094


In [5]:
# training random forests and performing cross-validation
from sklearn.ensemble import RandomForestClassifier

fold = 10
threshold = 0.85

rf = RandomForestClassifier(criterion = 'entropy', n_estimators = 10, max_depth = 15)
kf = KFold(n_splits = fold, shuffle = True)

accuracy = []
precision = []
recall = []

for train_index, test_index in kf.split(features):
    features_train, features_test = features[train_index], features[test_index]
    labels_train, labels_test = labels[train_index], labels[test_index]
    
    rf.fit(features_train, labels_train)
    result = rf.predict_proba(features_test)
    
    tp = 0
    fp = 0
    tn = 0
    fn = 0

    for k in range(0, len(result)):
        if labels_test[k] == 1:
            if result[k][1] > threshold:
                tp += 1
            else:
                fn += 1
        else:
            if result[k][1] > threshold:
                fp += 1
            else:
                tn += 1

    accuracy.append((tp + tn) / len(result))
    precision.append(tp / (tp + fp))
    recall.append(tp / (tp + fn))

print('Accuracy: ', sum(accuracy) / fold)
print('Precision: ', sum(precision) / fold)
print('Recall: ', sum(recall) / fold)

Accuracy:  0.8391263135088289
Precision:  0.9578731616657912
Recall:  0.6705862055814731


In [6]:
# training SVMs and performing cross-validation
from sklearn import svm
from sklearn.model_selection import cross_val_predict
from sklearn.model_selection import cross_val_score

s = svm.SVC()
fold = 10

s.fit(features, labels)

accuracy = cross_val_score(s, features, labels, cv = fold)
precision = cross_val_score(s, features, labels, cv = fold, scoring = 'precision')
recall = cross_val_score(s, features, labels, cv = fold, scoring = 'recall')

print('Accuracy: ', accuracy.sum() / fold)
print('Precision: ', precision.sum() / fold)
print('Recall: ', recall.sum() / fold)

Accuracy:  0.885640098581
Precision:  0.878578105693
Recall:  0.865109964245


In [7]:
# training linear regressors and performing cross-validation
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import KFold

fold = 10
threshold = 0.5
rg = LinearRegression()
kf = KFold(n_splits = fold, shuffle = True)

accuracy = []
precision = []
recall = []

for train_index, test_index in kf.split(features):
    features_train, features_test = features[train_index], features[test_index]
    labels_train, labels_test = labels[train_index], labels[test_index]
    
    rg.fit(features_train, labels_train)
    result = rg.predict(features_test)
    
    tp = 0
    fp = 0
    tn = 0
    fn = 0

    for k in range(0, len(result)):
        if labels_test[k] == 1:
            if result[k] > threshold:
                tp += 1
            else:
                fn += 1
        else:
            if result[k] > threshold:
                fp += 1
            else:
                tn += 1

    accuracy.append((tp + tn) / len(result))
    precision.append(tp / (tp + fp))
    recall.append(tp / (tp + fn))

print('Accuracy: ', sum(accuracy) / fold)
print('Precision: ', sum(precision) / fold)
print('Recall: ', sum(recall) / fold)

Accuracy:  0.8497921406131512
Precision:  0.8237657812216856
Recall:  0.8465529012084134


In [8]:
# training logistic regressors and performing cross-validation
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_predict
from sklearn.model_selection import cross_val_score

lr = LogisticRegression()
fold = 10

lr.fit(features, labels)

accuracy = cross_val_score(dt, features, labels, cv = fold)
precision = cross_val_score(dt, features, labels, cv = fold, scoring='precision')
recall = cross_val_score(dt, features, labels, cv = fold, scoring='recall')

print('Accuracy: ', accuracy.sum() / fold)
print('Precision: ', precision.sum() / fold)
print('Recall: ', recall.sum() / fold)

Accuracy:  0.894277773264
Precision:  0.878551620938
Recall:  0.890120083654


In [9]:
# reading test csv files into dataframe
test_file = 'Downloads/test_feat_vec.csv'
test = pd.read_csv(test_file, header = 0)

test_names = test.ix[:,0]
test_positions = test.ix[:, 1: 3]
test_features = test.ix[:, 3:]

test_names = test_names.as_matrix()
test_positions = test_positions.as_matrix()
test_features = test_features.as_matrix()

dt_result = dt.predict_proba(test_features)
rf_result = rf.predict_proba(test_features)
s.result = s.predict(test_features)
rg_result = rg.predict(test_features)
lr_result = lr.predict(test_features)

In [21]:
product_list = 'Downloads/test_product_list.txt'

with open(product_list, 'r') as f:
    ref = f.readlines()

ref_names = []
ref_positions = []

for i, product in enumerate(ref):
    ref[i] = product.split()
    product_name = ' '.join(word for word in ref[i][:-2])
    start_pos = int(ref[i][-2])
    end_pos = int(ref[i][-1])

    ref_names.append(product_name)
    ref_positions.append([start_pos, end_pos])

In [32]:
tp = 0
tn = 0
fp = 0
fn = 0

threshold = 0.85

for i, test_name in enumerate(test_names):
    found = False
    
    for j, ref_name in enumerate(ref_names):
        if test_name == ref_name and test_positions[i][0] == ref_positions[j][0] and test_positions[i][1] == ref_positions[j][1]:
            found = True
            
            if rf_result[i][1] > threshold:
                tp += 1
            else:
                fn += 1
            break
    
    if not found:
        if rf_result[i][1] > threshold:
            fp += 1
        else:
            tn += 1

print(tp, fp, tn, fn)
print('Total number of test examples: ', tp + fp + tn + fn)

accuracy = (tp + tn) / (tp + fp + tn + fn)
precision = tp / (tp + fp)
recall = tp / (tp + fn)

print('Accuracy: ', accuracy)
print('Precision: ', precision)
print('Recall: ', recall)

761 1014 5362 137
Total number of test examples:  7274
Accuracy:  0.841765191091559
Precision:  0.4287323943661972
Recall:  0.8474387527839644
