In [8]:
import numpy as np
import pandas as pd

# reading positive and negative csv files into dataframe
pos_file = "pos_feat_vec.csv"
neg_file = "neg_feat_vec.csv"

pos = pd.read_csv(pos_file, header = 0)
neg = pd.read_csv(neg_file, header = 0)
data = pd.concat([pos, neg])

In [9]:
# shuffling the data frames
from sklearn.utils import shuffle

data = shuffle(data, random_state=0)

In [10]:
names = data.ix[:,0]
labels = data.ix[:, len(data.columns) - 1]
features = data.ix[:, 1: len(data.columns) - 1]

features = features.as_matrix()
labels = labels.as_matrix()

In [11]:
# training decision trees and performing cross-validation
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import cross_val_predict
from sklearn.model_selection import cross_val_score

dt = DecisionTreeClassifier(criterion = 'gini', max_depth = 15)
fold = 10

accuracy = cross_val_score(dt, features, labels, cv = fold)
precision = cross_val_score(dt, features, labels, cv = fold, scoring='precision')
recall = cross_val_score(dt, features, labels, cv = fold, scoring='recall')

print('Accuracy: ', accuracy.sum() / fold)
print('Precision: ', precision.sum() / fold)
print('Recall: ', recall.sum() / fold)

('Accuracy: ', 0.90715083685407871)
('Precision: ', 0.88181900957726977)
('Recall: ', 0.91061694663698312)


In [23]:
np.shape(features)

(5439L, 24L)

In [29]:
train_set,set2,test_set = np.split(features,3,0)
train_set = np.append(train_set,set2,0)

dt2 = DecisionTreeClassifier(criterion = 'gini', max_depth = 10)

dt2.fit()
print('Accuracy: ', accuracy.sum() / fold)
print('Precision: ', precision.sum() / fold)
print('Recall: ', recall.sum() / fold)

In [5]:
# training linear regressors and performing cross-validation
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import KFold

fold = 10
threshold = 0.3
rg = LinearRegression()
kf = KFold(n_splits = fold, shuffle = True)

accuracy = []
precision = []
recall = []

for train_index, test_index in kf.split(features):
    features_train, features_test = features[train_index], features[test_index]
    labels_train, labels_test = labels[train_index], labels[test_index]
    
    rg.fit(features_train, labels_train)
    result = rg.predict(features_test)
    
    tp = 0
    fp = 0
    tn = 0
    fn = 0

    for k in range(0, len(result)):
        if labels_test[k] == 1:
            if result[k] > threshold:
                tp += 1
            else:
                fn += 1
        else:
            if result[k] > threshold:
                fp += 1
            else:
                tn += 1

    accuracy.append((tp + tn) / len(result))
    precision.append(tp / (tp + fn))
    recall.append(tp / (tp + fp))

print('Accuracy: ', sum(accuracy) / fold)
print('Precision: ', sum(precision) / fold)
print('Recall: ', sum(recall) / fold)

('Accuracy: ', 0)
('Precision: ', 0)
('Recall: ', 0)


In [6]:
# training logistic regressors and performing cross-validation
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_predict
from sklearn.model_selection import cross_val_score

lr = LogisticRegression()
fold = 10

accuracy = cross_val_score(dt, features, labels, cv = fold)
precision = cross_val_score(dt, features, labels, cv = fold, scoring='precision')
recall = cross_val_score(dt, features, labels, cv = fold, scoring='recall')

print('Accuracy: ', accuracy.sum() / fold)
print('Precision: ', precision.sum() / fold)
print('Recall: ', recall.sum() / fold)

('Accuracy: ', 0.90549608655616942)
('Precision: ', 0.88488615730188458)
('Recall: ', 0.91225629089927818)


In [7]:
# reading test csv files into dataframe
test_file = "test_feat_vec.csv"
test = pd.read_csv(test_file, header = 0)

names = test.ix[:,0]
positions = test.ix[:, 1: 2]
features = test.ix[:, 3:]

features = features.as_matrix()
labels = labels.as_matrix()

dt_result = dt.predict(features)
rg_result = rg.predict(features)
lr_result = lr.predict(features)

AttributeError: 'numpy.ndarray' object has no attribute 'as_matrix'