In [1]:
import pandas as pd
import numpy as np
import csv
from sklearn import svm
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import cross_val_score
from pythainlp.corpus import stopwords
from pythainlp.tokenize import word_tokenize
import re
from sklearn.metrics import recall_score, precision_score, f1_score

# Matches every character that must be dropped before tokenising: anything
# outside U+0E00-U+0E4C, plus U+0E46 and U+0E3F. Compiled once instead of on
# every call.
_NON_THAI_PATTERN = re.compile(r"[^\u0E00-\u0E4C]|^'|'$|''|\u0E46|\u0E3F")

# Number of tweets tokenised so far. Kept under a dedicated name because a
# generic global such as `i` is silently overwritten by any top-level
# `for i in ...` loop in the notebook, which corrupts the progress output.
_tokenizer_calls = 0


def tokenizer(tweet):
    """Strip unwanted characters from a tweet and split it into words.

    Every character matched by ``_NON_THAI_PATTERN`` is removed, then the
    remaining text is passed to ``word_tokenize`` with the ``deepcut`` engine.
    The running call count is printed after every 500th call as a progress
    indicator.

    Parameters
    ----------
    tweet : str
        Raw tweet text.

    Returns
    -------
    Whatever ``word_tokenize`` returns for the cleaned text.
    """
    global _tokenizer_calls
    _tokenizer_calls += 1
    if _tokenizer_calls % 500 == 0:
        print(_tokenizer_calls)
    # Every match of the pattern is a single character, so one substitution
    # pass removes exactly the characters the per-character filter removed.
    cleaned = _NON_THAI_PATTERN.sub('', tweet)
    return word_tokenize(cleaned, engine='deepcut')
        
#filename = "./dataset/forCLFnohash8000.csv"
filename = "./dataset/tweet1000.csv"
test_filename = "./dataset/forCLFnohash1label1000.csv"

## Read the training file: raw tweets plus one label array per emotion column
df = pd.read_csv(filename)
X = np.array(df['tweet'])
y1 = np.array(df['anger'])
y2 = np.array(df['anticipation'])
y3 = np.array(df['disgust'])
y4 = np.array(df['fear'])
y5 = np.array(df['joy'])
y6 = np.array(df['sadness'])
y7 = np.array(df['surprise'])
y8 = np.array(df['trust'])

# Replace non-text tweets (float entries) with an empty string so the
# vectorizer does not fail on them.
# The loop variable is deliberately not named `i`: a module-level `i` is the
# tokenizer's progress counter, and reusing the name here overwrote it.
# NOTE(review): only the anger label (y1) is reset for such rows; y2..y8 keep
# their original values - confirm whether they should be cleared as well.
for row in range(len(X)):
    if isinstance(X[row], float):
        print(row,"error!!!")
        X[row]=""
        y1[row]=0

## Read the held-out file: raw tweets and a single label column
df = pd.read_csv(test_filename)
Xt = np.array(df['tweet'])
yt = np.array(df['label'])


## Extract features from a tweet with TfIdf
v = TfidfVectorizer(tokenizer=tokenizer, stop_words=stopwords.words('thai'))
train_data_feature = v.fit_transform(X).toarray()
# NOTE(review): get_feature_names() was replaced by get_feature_names_out()
# in newer scikit-learn releases - confirm against the installed version.
print(v.get_feature_names())

# From here on `X` is the dense TF-IDF feature matrix, not the raw tweets.
X = train_data_feature

1000


Using TensorFlow backend.


1500
['ก', 'กก', 'กกก', 'กกกก', 'กกกกก', 'กกกกกกกก', 'กกกกกกกกกกก', 'กกกกกกกกแคท', 'กกกกกกกโมโห', 'กกกกกตกใจ', 'กกกกลัว', 'กกกตกใจ', 'กกฮ่าาา', 'กกิน', 'กฎ', 'กด', 'กดดัน', 'กดไลค์', 'กพปี', 'กยุวดี', 'กรรม', 'กรรมบังเซง', 'กรวด', 'กรอก', 'กรอคอย', 'กระจก', 'กระซิก', 'กระซิบ', 'กระทง', 'กระทะ', 'กระทู้', 'กระป๋อง', 'กระแดะ', 'กระแทก', 'กระแส', 'กระโปรง', 'กริ้ว', 'กรีด', 'กรีดร้อง', 'กรี่', 'กรี้ดห้อง', 'กรี๊ด', 'กรี๊ดด', 'กรี๊ดดดดด', 'กรุงเทพ', 'กรุมิไป', 'กรุ่น', 'กรุ๊ป', 'กรู', 'กรูหล่ะ', 'กรุ๊ป', 'กลอง', 'กลับคืน', 'กลัว', 'กลัว้', 'กลาง', 'กลาย', 'กล่อง', 'กล่าวหา', 'กล้อง', 'กล้า', 'กล้าย', 'กวน', 'กวนจ้าา', 'กวาด', 'กอง', 'กอด', 'กอฟตกใจ', 'กอีเหี้ยยยที', 'กะ', 'กะจิตกะใจดู', 'กะหรี่', 'กะเต่า', 'กะเรา', 'กะเสียง', 'กักขัง', 'กังมากกกโมโห', 'กังวล', 'กัซบั้ม', 'กัด', 'กันจังบัตร', 'กับข้าว', 'กาก', 'กางเกง', 'กามิจิแรง', 'กาย', 'การท่องเที่ยว', 'การบ้าน', 'การ์ด', 'การ์ดคยอง', 'การ์ตูน', 'กาล', 'กาลีบ้านกาลีเมือง', 'กาลเทศะ', 'กาเเลก', 'กาแฟ', 'กำลังใจ', 'กำเดา', 'กิง', 'กิจกรรม

In [4]:
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import recall_score, precision_score, f1_score

In [5]:
def train_model(X,y,n,C):
    """Cross-validate a class-balanced LinearSVC and print per-fold scores.

    The data is split with a shuffled ``StratifiedKFold`` (``random_state=30``).
    For every fold the predictions, the true labels and the recall /
    precision / F1 scores are printed; after the last fold the three scores
    averaged over ``n`` are printed.

    Parameters
    ----------
    X : feature matrix that supports indexing with an integer index array.
    y : label array, indexable the same way as ``X``.
    n : number of folds.
    C : regularisation parameter forwarded to ``LinearSVC``.

    Returns
    -------
    None - results are only printed.
    """
    splitter = StratifiedKFold(n_splits=n, random_state=30, shuffle=True)
    recall_sum, precision_sum, f1_sum = 0.0, 0.0, 0.0

    for fit_rows, held_out_rows in splitter.split(X, y):
        expected = y[held_out_rows]

        # Fresh model per fold so nothing carries over between folds.
        model = svm.LinearSVC(class_weight='balanced', C=C, random_state=0)
        model = model.fit(X[fit_rows], y[fit_rows])
        predicted = model.predict(X[held_out_rows])

        fold_recall = recall_score(expected, predicted)
        fold_precision = precision_score(expected, predicted)
        fold_f1 = f1_score(expected, predicted)

        recall_sum = recall_sum + fold_recall
        precision_sum = precision_sum + fold_precision
        f1_sum = f1_sum + fold_f1

        print('Prediction')
        print(predicted)
        print('y')
        print(expected)
        print('scores: recall, precision, f1')
        print(fold_recall, fold_precision, fold_f1)
        print()

    print()
    print('Average')
    print(recall_sum/n, precision_sum/n, f1_sum/n)

In [6]:
# 5-fold cross-validation of the anger labels (y1) with C=1.
train_model(X,y1,5,1)

Prediction
[0 0 0 1 1 1 0 0 1 1 1 1 1 0 0 1 0 1 1 1 0 1 1 0 1 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
y
[1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
scores: recall, precision, f1
0.6 1.0 0.75

Prediction
[1 1 0 1 1 0 0 1 1 1 1 1 0 0 1 1 0 0 1 0 1 0 1 0 1 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 

In [170]:
# Fit a LinearSVC for the anger labels on the first 2000 feature rows.
X_train = X[:2000]
y_train = y1[:2000]
# Held-out data: the first 250 raw tweets and labels from the test file.
# X_test holds raw text here; it is vectorised with `v` before prediction.
X_test = Xt[:250]
y_test = yt[:250]
clf = svm.LinearSVC().fit(X_train,y_train)
# Sanity check on three hand-written Thai phrases.
print(clf.predict(v.transform(["โกรธแล้ว","ดีใจมากเลย","หิวข้าว"]).toarray()))

16000
[1 0 1]


In [171]:
# Vectorise the raw held-out tweets with the fitted vectorizer and predict.
prediction = clf.predict(v.transform(X_test))
print(prediction)
print(y_test)
# Both lengths should match, one prediction per held-out label.
print(len(prediction),len(y_test))

[0 0 1 1 1 1 0 0 1 1 1 1 1 1 1 1 0 1 1 1 1 0 1 1 1 1 1 1 0 1 0 0 0 1 1 1 1
 1 0 1 1 1 1 0 1 1 0 0 1 1 1 0 1 0 0 1 1 0 1 1 0 0 1 0 1 0 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 0 1 0 1 1 0 1 1 1 0 1 0 0 0 1 0 1 1 1 1 1 1 0 1 1 0 1 0
 0 0 1 1 0 1 1 0 1 0 1 1 0 1 1 0 0 0 0 0 0 0 0 0 1 0 0 1 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 1 1 0 1 0 0 0 0 0 0 0 0 1 1 0 0 0 0 0 0 0 1 0 0 0 1 0 1 1 0 0 1
 0 0 0 1 0 0 0 0 0 1 1 0 1 0 0 0 1 0 1 1 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 1 0
 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 1 0 0 0 1 0 1 0 0 0 0]
[1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 1 1 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2

In [172]:
def recall_scoren(predict,correct,emo):
    """Recall of a binary prediction against one class of a multi-class label.

    A sample counts as relevant when ``correct[i] == emo`` and as a hit when,
    in addition, ``predict[i] == 1``.

    Parameters
    ----------
    predict : indexable sequence of binary predictions (1 = positive).
    correct : indexable sequence of true labels, same length as ``predict``.
    emo : the label value treated as the positive class.

    Returns
    -------
    ``[hits, relevant, hits / relevant]`` when at least one relevant sample
    exists, otherwise the hit count (which is then ``0``).
    """
    count, total = 0,0
    for i in range(len(correct)):
        if correct[i]==emo:
            total += 1
            if predict[i]==1:
                count += 1
    # Compare by value; `is not 0` tests object identity and only works by
    # accident of small-integer caching.
    if total != 0:
        return [count,total,count/total]
    return count


# Recall of the current `prediction` for label value 1 in y_test.
recall_scoren(prediction,y_test,1)
        

[86, 125, 0.688]

In [173]:
def precision_scoren(predict,correct,emo):
    """Precision of a binary prediction against one class of a multi-class label.

    A sample counts as selected when ``predict[i] == 1`` and as a hit when,
    in addition, ``correct[i] == emo``.

    Parameters
    ----------
    predict : indexable sequence of binary predictions (1 = positive).
    correct : indexable sequence of true labels, same length as ``predict``.
    emo : the label value treated as the positive class.

    Returns
    -------
    ``[hits, selected, hits / selected]`` when at least one sample was
    predicted positive, otherwise the hit count (which is then ``0``).
    """
    count, total = 0,0
    for i in range(len(correct)):
        if predict[i]==1:
            total += 1
            if correct[i]==emo:
                count += 1
    # Compare by value; `is not 0` tests object identity and only works by
    # accident of small-integer caching.
    if total != 0:
        return [count,total,count/total]
    return count

# Precision of the current `prediction` for label value 1 in y_test.
precision_scoren(prediction,y_test,1)

[86, 112, 0.7678571428571429]

In [174]:
# Probe the classifier with a single short input.
# NOTE(review): presumably a misspelt / out-of-vocabulary word - confirm intent.
print(clf.predict(v.transform(["โกด"]).toarray()))

[0]


In [175]:
# Overlap check: print the index of every test tweet whose text occurs inside
# some training tweet.
# `X` was rebound to the TF-IDF feature matrix in the first cell, so the raw
# training tweets are re-read from `filename`; non-text rows are skipped.
# The previous condition `Xt[k] in X[m] == True` was a chained comparison
# against feature rows and never compared tweet text.
train_tweets = [t for t in pd.read_csv(filename)['tweet'] if isinstance(t, str)]
for k in range(len(Xt)):
    # Each matching index is reported once, however many training tweets match.
    if isinstance(Xt[k], str) and any(Xt[k] in tweet for tweet in train_tweets):
        print(k)

  This is separate from the ipykernel package so we can avoid doing imports until


In [176]:
# Fit a LinearSVC for the anticipation labels (y2) on feature rows 1000-2999.
X_train2 = X[1000:3000]
y_train2 = y2[1000:3000]
clf2 = svm.LinearSVC().fit(X_train2,y_train2)
# Same three hand-written Thai phrases used to probe `clf`.
print(clf2.predict(v.transform(["โกรธแล้ว","ดีใจมากเลย","หิวข้าว"]).toarray()))

[0 1 0]


In [179]:
# Evaluate clf2 on held-out tweets 250-499 and score it against label value 2.
# NOTE(review): the recorded output shows yt[250:500] holding only labels 3
# and 4, so there are no positives for emo=2 in this slice - confirm the
# intended test range.
X_test2 = Xt[250:500]
y_test2 = yt[250:500]
prediction = clf2.predict(v.transform(X_test2))
print(prediction)
print(y_test2)
# Compare against this cell's labels (y_test2), not `y_test` left over from
# an earlier cell.
print(len(prediction),len(y_test2))
print(recall_scoren(prediction,y_test2,2))
print(precision_scoren(prediction,y_test2,2))

17000
[1 0 1 0 0 0 0 0 0 0 0 0 0 0 1 1 0 0 0 0 0 0 0 1 1 1 1 1 1 0 0 0 0 1 1 0 0
 0 1 0 1 0 0 0 0 0 0 0 1 0 0 1 0 0 0 0 1 0 0 0 0 0 0 0 0 0 1 0 0 0 1 0 0 0
 1 0 1 1 0 0 0 0 0 1 0 0 0 0 0 1 0 0 0 0 0 1 1 0 0 1 0 0 0 0 1 0 1 0 0 0 1
 1 0 0 0 0 1 0 0 0 1 0 0 0 1 1 1 1 0 0 0 1 0 1 1 1 1 1 0 0 1 0 1 1 1 0 0 0
 0 1 0 1 1 1 0 1 1 1 0 1 1 1 1 0 1 0 0 0 0 0 0 1 0 0 1 0 1 0 1 1 0 0 1 0 0
 1 0 1 1 1 1 1 0 0 1 1 1 0 1 1 1 1 0 1 1 0 0 1 0 1 1 1 0 1 0 0 0 1 0 1 1 1
 0 0 1 0 0 1 0 1 0 0 0 1 1 1 0 1 0 1 1 0 1 1 1 0 1 0 0 1]
[3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3
 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3
 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3
 3 3 3 3 3 3 3 3 3 3 3 3 3 3 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4
 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4
 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4
 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4

In [180]:
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import recall_score, precision_score, f1_score

# 5-fold stratified cross-validation of a LinearSVC for the anger labels (y1)
# on the first 2000 feature rows, plus a check of each fold's model against
# the first 200 held-out tweets.
skf = StratifiedKFold(n_splits=5)
# The return value (number of splits) is discarded.
skf.get_n_splits(X[:2000], y1[:2000])
for train_index, test_index in skf.split(X[:2000], y1[:2000]):
    #print("TRAIN:", train_index, "TEST:", test_index)
    # Fold indices are relative to the [:2000] slice; the slice starts at row 0,
    # so they index the full X / y1 arrays equally.
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y1[train_index], y1[test_index]
    tclf = svm.LinearSVC().fit(X_train,y_train)
    predict = tclf.predict(X_test)
    
    # In-fold results: predictions, true labels, then recall / precision / F1.
    print(tclf.predict(X_test))
    print(y_test)
    print(recall_score(y_test,predict),precision_score(y_test,predict),f1_score(y_test,predict))
    print("--- predict ---")
    # Held-out results: raw tweets are vectorised with `v`, scored for label 1.
    x_t,y_t = Xt[:200], yt[:200]
    prediction = tclf.predict(v.transform(x_t))
    print(prediction)
    print(y_t)
    print(recall_scoren(prediction,y_t,1))
    print(precision_scoren(prediction,y_t,1))
    print("----------------------------------------------------------")
# Keep the model fitted in the last fold.
clf1 = tclf

[1 1 1 0 1 1 1 1 1 1 1 1 1 1 1 0 1 1 1 1 0 1 0 1 0 0 1 0 1 1 1 0 0 1 1 0 1
 0 1 1 1 1 1 0 1 1 0 0 1 1 1 0 1 1 0 0 1 0 1 1 1 0 1 1 0 0 1 0 1 1 1 1 0 1
 0 1 0 1 1 1 1 1 1 0 1 1 1 1 1 0 0 1 1 1 1 1 1 1 1 1 0 1 0 1 0 1 0 1 1 1 0
 1 1 1 0 1 1 0 0 0 1 1 1 0 1 1 0 1 1 1 0 1 0 1 0 0 1 1 0 1 1 1 0 0 0 0 0 0
 0 0 0 1 0 1 0 1 1 1 0 0 0 1 1 0 1 1 1 1 0 1 1 1 0 1 1 0 1 0 1 1 1 1 0 1 1
 1 0 0 1 0 1 1 0 1 1 1 1 1 0 1 0 0 0 1 0 0 1 0 0 0 0 1 1 0 0 0 0 1 0 0 0 0
 0 0 0 1 1 0 0 0 0 0 0 0 1 0 0 0 1 0 1 0 1 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0
 0 0 0 1 1 0 0 1 0 0 0 1 1 0 1 0 1 0 0 0 1 0 1 1 0 0 0 0 1 0 0 1 0 1 0 0 0
 1 1 0 1 0 0 1 0 1 1 0 1 1 0 1 0 0 1 0 0 0 0 0 0 0 0 1 0 1 0 1 0 0 0 0 0 0
 1 0 1 0 0 0 0 0 0 0 0 1 1 0 0 0 0 1 0 1 1 0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 0
 0 0 0 0 0 1 0 0 0 0 1 0 0 1 0 0 0 0 0 0 1 0 1 0 0 0 0 0 0 0]
[1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1

[0 0 1 1 0 1 0 0 1 0 1 1 1 0 1 1 0 1 1 1 1 0 1 1 1 0 1 1 0 1 0 1 0 1 1 1 1
 1 0 1 1 1 0 0 1 0 0 0 1 1 1 0 1 0 0 1 1 0 1 1 1 0 1 0 1 0 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 0 1 0 0 1 1 1 1 1 1 0 0 1 0 1 1 0 1 1 1 0 1 1 0 1 0
 0 0 1 1 0 1 1 0 1 0 1 1 0 1 1 0 0 0 0 0 0 0 0 0 1 0 0 1 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 1 0 0 1 0 0 0 0 0 0 0 0 1 1 0 0 0 1 0 0 0 1 0 0 0 1 0 0 1 0 0 1
 0 0 0 0 0 1 0 0 0 1 1 0 1 0 0]
[1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 1 1 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2]
[83, 125, 0.664]
[83, 99, 0.8383838383838383]
----------------------------------------------------------
[0 1 0 1 1 0 1 1 0 1 1 1 1 1 1 1 0 1 1 1 0 1 1 0 1 1 1 1 1 1 1 1 1 1 1 1 0
 1 0 1

In [None]:
# 5-fold stratified cross-validation of a LinearSVC for the anticipation
# labels (y2) on feature rows 1000-2999, plus a check of each fold's model
# against held-out tweets 125-324.
X_train_2 = X[1000:3000]
y_train_2 = y2[1000:3000]
skf = StratifiedKFold(n_splits=5)
# The return value (number of splits) is discarded.
skf.get_n_splits(X_train_2, y_train_2)
for train_index, test_index in skf.split(X_train_2, y_train_2):
    #print("TRAIN:", train_index, "TEST:", test_index)
    # Fold indices are relative to the sliced arrays, so index those.
    X_train, X_test = X_train_2[train_index], X_train_2[test_index]
    y_train, y_test = y_train_2[train_index], y_train_2[test_index]
    tclf = svm.LinearSVC().fit(X_train,y_train)
    predict = tclf.predict(X_test)
    
    # In-fold results: predictions, true labels, then recall / precision / F1.
    print(tclf.predict(X_test))
    print(y_test)
    print(recall_score(y_test,predict),precision_score(y_test,predict),f1_score(y_test,predict))
    print("--- predict ---")
    # Held-out results: raw tweets are vectorised with `v`, scored for label 2.
    x_t,y_t = Xt[125:325], yt[125:325]
    prediction = tclf.predict(v.transform(x_t))
    print(prediction)
    print(y_t)
    print(recall_scoren(prediction,y_t,2))
    print(precision_scoren(prediction,y_t,2))
    print("----------------------------------------------------------")
# Keep the model fitted in the last fold (rebinds clf2 from the earlier cell).
clf2 = tclf

In [None]:
# Inspect the anticipation labels used for training above.
y2[1000:3000]

In [None]:
# clf1 predictions for held-out tweets 500-624.
clf1.predict(v.transform(Xt[500:625]))

In [None]:
# clf2 predictions for the same held-out tweets 500-624.
clf2.predict(v.transform(Xt[500:625]))

In [None]:
# clf1 decision-function values for held-out tweets 500-624.
clf1.decision_function(v.transform(Xt[500:625]))

In [None]:
# clf2 decision-function values for held-out tweets 500-624.
clf2.decision_function(v.transform(Xt[500:625]))

In [None]:
# 5-fold stratified cross-validation of a LinearSVC on feature rows 3000-4999.
# NOTE(review): the labels are y4 (the `fear` column) although the variables
# are suffixed `_3` and the result is stored as clf3 - confirm the naming.
X_train_3 = X[3000:5000]
y_train_3 = y4[3000:5000]
skf = StratifiedKFold(n_splits=5)
# The return value (number of splits) is discarded.
skf.get_n_splits(X_train_3, y_train_3)
for train_index, test_index in skf.split(X_train_3, y_train_3):
    #print("TRAIN:", train_index, "TEST:", test_index)
    # Fold indices are relative to the sliced arrays, so index those.
    X_train, X_test = X_train_3[train_index], X_train_3[test_index]
    y_train, y_test = y_train_3[train_index], y_train_3[test_index]
    tclf = svm.LinearSVC().fit(X_train,y_train)
    predict = tclf.predict(X_test)
    
    # In-fold results: predictions, true labels, then recall / precision / F1.
    print(tclf.predict(X_test))
    print(y_test)
    print(recall_score(y_test,predict),precision_score(y_test,predict),f1_score(y_test,predict))
    #print("--- predict ---")
    #x_t,y_t = Xt[125:325], yt[125:325]
    #prediction = tclf.predict(v.transform(x_t))
    #print(prediction)
    #print(y_t)
    #print(recall_scoren(prediction,y_t,2))
    #print(precision_scoren(prediction,y_t,2))
    #print("----------------------------------------------------------")
# Keep the model fitted in the last fold.
clf3 = tclf

In [None]:
# Compare the three classifiers' decision-function values on the first ten
# held-out tweets.
print(clf1.decision_function(v.transform(Xt[0:10])))
print(clf2.decision_function(v.transform(Xt[0:10])))
print(clf3.decision_function(v.transform(Xt[0:10])))

In [None]:
from sklearn.neighbors import KNeighborsClassifier
# 5-fold stratified cross-validation of a k-nearest-neighbours classifier on
# the held-out file, using its `label` column as the target.
# `Xt` holds raw tweet strings, which the classifier cannot be fitted on, so
# they are first converted to TF-IDF features with the fitted vectorizer `v`.
Xt_features = v.transform(Xt)
skf = StratifiedKFold(n_splits=5)
for train_index, test_index in skf.split(Xt_features, yt):
    #print("TRAIN:", train_index, "TEST:", test_index)
    X_train, X_test = Xt_features[train_index], Xt_features[test_index]
    y_train, y_test = yt[train_index], yt[test_index]
    tclf = KNeighborsClassifier().fit(X_train,y_train)
    predict = tclf.predict(X_test)

    print(predict)
    print(y_test)
    #print(recall_score(y_test,predict),precision_score(y_test,predict),f1_score(y_test,predict))

In [181]:
def train_8_clfs(X,y,clfs,block_size=1000):
    """Fit one LinearSVC per label array and append each to ``clfs``.

    Classifier ``r`` is trained on two consecutive blocks of ``block_size``
    rows: block ``r`` and the block after it. For the last label array the
    "next" block wraps around to the first block of the data.

    Parameters
    ----------
    X : sliceable feature rows (array or list).
    y : sequence of label arrays, one per classifier, each aligned with ``X``.
    clfs : list that receives the fitted classifiers, in order (mutated).
    block_size : number of rows per block. Defaults to 1000, which together
        with eight label arrays reproduces the original fixed layout.

    Returns
    -------
    None - the fitted classifiers are appended to ``clfs``.
    """
    n_labels = len(y)
    for r in range(n_labels):
        start = r*block_size
        end = ((r+1) % n_labels)*block_size + block_size

        if start < end:
            X_train = X[start:end]
            y_train = y[r][start:end]
        else:
            # Last classifier: its second block wraps to the start of the data.
            X_train = np.concatenate([X[start:], X[:end]])
            y_train = np.concatenate([y[r][start:], y[r][:end]])
        print(y_train)
        clfs.append(svm.LinearSVC().fit(X_train,y_train))

# Train one classifier per emotion; the fitted models are collected in `clfs`.
clfs=[]
# Rows 2000-2999 of the disgust labels are forced to 1 in place before training.
# NOTE(review): presumably a workaround for missing disgust labels in the
# source file - confirm, and note that re-running this cell mutates y3 again.
y3[2000:3000]=1
train_8_clfs(X,[y1,y2,y3,y4,y5,y6,y7,y8],clfs)

[1 1 1 ..., 0 0 0]
[1 1 1 ..., 0 0 0]
[1 1 1 ..., 0 0 0]
[1 1 1 ..., 0 0 0]
[1 1 1 ..., 0 0 0]
[1 1 1 ..., 0 0 0]
[1 1 1 ..., 0 0 0]
[1 1 1 ..., 0 0 0]


In [184]:
def predict_p(clfs,text):
    """Print every classifier's prediction and decision values for ``text``.

    ``text`` is vectorised once with the notebook-level vectorizer ``v``. For
    each classifier one line is printed: the emotion name, the output of
    ``predict`` and the output of ``decision_function``.

    Parameters
    ----------
    clfs : sequence of fitted classifiers, ordered as anger, anticipation,
        disgust, fear, joy, sadness, surprise, trust.
    text : iterable of raw tweets accepted by ``v.transform``.

    Returns
    -------
    None - results are only printed.
    """
    emo = ["anger","anticipation","disgust","fear","joy","sadness","surprise","trust"]
    text = v.transform(text)
    # Pair names with classifiers so a list shorter than eight does not raise.
    for name, clf in zip(emo, clfs):
        print(name,clf.predict(text),clf.decision_function(text))
              
# Show all eight classifiers' outputs for the first five held-out tweets.
predict_p(clfs,Xt[:5])

anger [0 0 1 1 1] [-0.38112054 -0.33662243  0.07682673  0.49406477  0.00117168]
anticipation [1 1 1 0 1] [ 0.05186482  0.38963827  0.168391   -0.14646513  0.06469307]
disgust [1 1 0 1 0] [ 0.26844661  0.23713558 -0.24899462  0.0898939  -0.4353576 ]
fear [0 1 0 0 1] [-0.26422217  0.17448407 -0.22957738 -0.59448457  0.42171424]
joy [1 0 1 1 0] [ 0.14253386 -0.36117701  0.28001916  0.46710313 -0.05160825]
sadness [0 0 1 0 1] [-0.46778431 -0.96715997  0.3392734  -0.16296313  0.33233261]
surprise [0 1 0 1 1] [-0.3959835   0.71359014 -0.03277569  0.41692955  0.11953487]
trust [1 0 0 0 0] [ 0.70364349 -0.05912671 -0.41388288 -0.6967801  -0.6986113 ]


In [188]:
from sklearn.multiclass import OneVsRestClassifier
# 5-fold cross-validation of a one-vs-rest LinearSVC on the anger labels (y1)
# over the whole feature matrix. Note that this rebinds `clf`.
clf = OneVsRestClassifier(svm.LinearSVC(random_state=0))
scores = cross_val_score(clf,X[:],y1[:],cv=5)
# Per-fold scores followed by their mean.
print(scores, scores.mean())

[ 0.873125  0.859375  0.869375  0.87125   0.87875 ] 0.870375
