In [4]:
import os
import numpy as np
from scipy import sparse
from ast import literal_eval
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.metrics import classification_report, accuracy_score, precision_recall_fscore_support
from sklearn.svm import SVC, LinearSVC
import pickle

names = ["A?C.*",
       "A?U.*",
       "A?K.*",
       "G.*",
       "A?P.*",
       "R.*",
       "W.*",
       "Am.*", ]

In [2]:
with open("result_list", "rb") as f:
    classes = pickle.load(f)
for i, cla in enumerate(classes):
    print("Class {} has {} samples.".format(names[i], len(cla)))

Class A?C.* has 24121 samples.
Class A?U.* has 8379 samples.
Class A?K.* has 10574 samples.
Class G.* has 3505 samples.
Class A?P.* has 2661 samples.
Class R.* has 1060 samples.
Class W.* has 1062 samples.
Class Am.* has 599 samples.


In [2]:
def SVM_test(path):
    with open(os.path.join(path, "train_perm"), "r") as f:
        train_ans = literal_eval(f.read())
    with open(os.path.join(path, "test_perm"), "r") as f:
        test_ans = literal_eval(f.read())
    X_test = TfidfTransformer(use_idf=True).fit_transform(sparse.load_npz(os.path.join(path, "sparse_test.npz")))
    X_train = TfidfTransformer(use_idf=True).fit_transform(sparse.load_npz(os.path.join(path, "sparse_train.npz")))
    print(X_test.shape)
    print(X_train.shape)
    for i, name in enumerate(names):
        print(name)
        Y_train = np.array([1 if x == i else 0 for x in train_ans])
        Y_test = np.array([1 if x == i else 0 for x in test_ans])
        cla = SVC(kernel="linear", C=100)
        cla.fit(X_train, Y_train)
        Y_pre =cla.predict(X_test)
        print("accuracy_score") 
        print(accuracy_score(Y_test, Y_pre))
        print("classification_report")
        print(classification_report(Y_test, Y_pre))
        print("micro_report")
        print(precision_recall_fscore_support(Y_test, Y_pre, average='micro'))
        print("macro_report")
        print(precision_recall_fscore_support(Y_test, Y_pre, average='macro'))


In [3]:
SVM_test("trans")

(11647, 192519)
(34927, 192519)
A?C.*
accuracy_score
0.951489653988
classification_report
             precision    recall  f1-score   support

          0       0.95      0.96      0.96      6335
          1       0.95      0.95      0.95      5312

avg / total       0.95      0.95      0.95     11647

micro_report
(0.95148965398815144, 0.95148965398815144, 0.95148965398815144, None)
macro_report
(0.95115596069587993, 0.95105929836156, 0.95110727808649786, None)
A?U.*
accuracy_score
0.999141409805
classification_report
             precision    recall  f1-score   support

          0       1.00      1.00      1.00      9604
          1       1.00      1.00      1.00      2043

avg / total       1.00      1.00      1.00     11647

micro_report
(0.99914140980510002, 0.99914140980510002, 0.99914140980510002, None)
macro_report
(0.99813393075232049, 0.99890135412251746, 0.99851714368797451, None)
A?K.*
accuracy_score
0.99665149824
classification_report
             precision    recall  f1

In [4]:
SVM_test("norm")

(11647, 302030)
(34927, 302030)
A?C.*
accuracy_score
0.960934146132
classification_report
             precision    recall  f1-score   support

          0       0.97      0.96      0.96      6335
          1       0.95      0.96      0.96      5312

avg / total       0.96      0.96      0.96     11647

micro_report
(0.96093414613205119, 0.96093414613205119, 0.96093414613205119, None)
macro_report
(0.96029135334163818, 0.96109401596124044, 0.96066580188189965, None)
A?U.*
accuracy_score
0.999313127844
classification_report
             precision    recall  f1-score   support

          0       1.00      1.00      1.00      9604
          1       1.00      1.00      1.00      2043

avg / total       1.00      1.00      1.00     11647

micro_report
(0.99931312784408, 0.99931312784408, 0.99931312784408, None)
macro_report
(0.99958385351643786, 0.99804209495839458, 0.99881096715627282, None)
A?K.*
accuracy_score
0.997166652357
classification_report
             precision    recall  f1-scor

In [5]:
def lin_SVM_test(path):
    with open(os.path.join(path, "train_perm"), "r") as f:
        Y_train = literal_eval(f.read())
    with open(os.path.join(path, "test_perm"), "r") as f:
        Y_test = literal_eval(f.read())
    X_test = TfidfTransformer(use_idf=True).fit_transform(sparse.load_npz(os.path.join(path, "sparse_test.npz")))
    X_train = TfidfTransformer(use_idf=True).fit_transform(sparse.load_npz(os.path.join(path, "sparse_train.npz")))
    print(X_test.shape)
    print(X_train.shape)
    for i, name in enumerate(names):
        print(name)
        cla = LinearSVC(C=100)
        cla.fit(X_train, Y_train)
        Y_pre = cla.predict(X_test)
        print("accuracy_score") 
        print(accuracy_score(Y_test, Y_pre))
        print("classification_report")
        print(classification_report(Y_test, Y_pre))
        print("micro_report")
        print(precision_recall_fscore_support(Y_test, Y_pre, average='micro'))
        print("macro_report")
        print(precision_recall_fscore_support(Y_test, Y_pre, average='macro'))


In [7]:
lin_SVM_test("norm")

(11647, 302030)
(34927, 302030)
A?C.*
accuracy_score
0.967888726711
classification_report
             precision    recall  f1-score   support

          0       0.96      0.98      0.97      5312
          1       1.00      1.00      1.00      2043
          2       0.99      0.99      0.99      2402
          3       0.86      0.79      0.82       745
          4       0.97      0.93      0.95       561
          5       0.96      0.95      0.96       230
          6       0.98      0.94      0.96       246
          7       0.98      0.95      0.97       108

avg / total       0.97      0.97      0.97     11647

micro_report
(0.967888726710741, 0.967888726710741, 0.967888726710741, None)
macro_report
(0.96207765027784808, 0.94157367873483611, 0.95155207296648325, None)
A?U.*


KeyboardInterrupt: 

In [6]:
lin_SVM_test("trans")

(11647, 192519)
(34927, 192519)
A?C.*
accuracy_score
0.959474542801
classification_report
             precision    recall  f1-score   support

          0       0.96      0.96      0.96      5312
          1       1.00      1.00      1.00      2043
          2       0.99      1.00      0.99      2402
          3       0.77      0.74      0.76       745
          4       0.97      0.95      0.96       561
          5       0.95      0.92      0.94       230
          6       0.97      0.93      0.95       246
          7       0.95      0.95      0.95       108

avg / total       0.96      0.96      0.96     11647

micro_report
(0.9594745428007212, 0.9594745428007212, 0.9594745428007212, None)
macro_report
(0.94403062311090036, 0.93189794851959262, 0.93784550736366068, None)
A?U.*
accuracy_score
0.959474542801
classification_report
             precision    recall  f1-score   support

          0       0.96      0.96      0.96      5312
          1       1.00      1.00      1.00      2

KeyboardInterrupt: 