In [1]:
import pandas as pd
import numpy as np

from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.feature_selection import SelectKBest, f_classif, chi2

from sklearn.model_selection import train_test_split

from lda_directed import DirectedLDA

import utils as u

from co_train import Co_Train

from sklearn.naive_bayes import MultinomialNB

In [2]:
df = pd.read_pickle('../pandas/lemma_delivered_merged_df.pkl')

In [3]:
labeled_corpus = df[(df['agg_label'] != -1) & (df['agg_label'] < 90)]['lemma_delivered']
#labeled_corpus = df[(df['agg_label'] != -1)]['lemma_delivered']
target = df[(df['agg_label'] != -1) & (df['agg_label'] < 90)]['agg_label']
#target = df[(df['agg_label'] != -1)]['agg_label']

unlabeled_corpus = df[(df['agg_label'] == -1)]['lemma_delivered']

In [4]:
train_X, validation_X, train_y, validation_y = train_test_split(
    labeled_corpus,
    target,
    test_size=0.33,
    random_state=1,
    stratify=target)

In [9]:
def prunor(X, y, v_X, v_y, prune_mask, best_prune_mask, best_acc, init_acc):
    
    vec = TfidfVectorizer(**{'tokenizer': lambda x: x.split(), 'lowercase': False, 'max_df': 0.3}) 
    vec_train_X = vec.fit_transform(X[prune_mask])
    vec_validation = vec.transform(v_X)

    clf = MultinomialNB(**{'alpha': 0.001})
    clf.fit(vec_train_X, y[prune_mask])

    train_prob = clf.predict_proba(vec_train_X)
    train_max_probs = np.amax(train_prob, axis=1)
    prune_mask = np.argwhere((train_max_probs > 0.5) & (train_max_probs < 1)).T[0]

    clf_preds = clf.predict(vec_validation)
    acc = np.mean(clf_preds == v_y)
    
    print(init_acc, acc, best_acc)
    
    if acc > best_acc:
        best_prune_mask = prune_mask
        best_acc = acc
    
    if acc < init_acc:
        return best_prune_mask
    else:
        return prunor(X, y, v_X, v_y, prune_mask, best_prune_mask, best_acc, init_acc)

In [5]:
vec = TfidfVectorizer(**{'tokenizer': lambda x: x.split(), 'lowercase': False, 'max_df': 0.3}) 
vec_train_X = vec.fit_transform(train_X)
vec_validation = vec.transform(validation_X)

clf = MultinomialNB(**{'alpha': 0.001})
clf.fit(vec_train_X, train_y)

train_prob = clf.predict_proba(vec_train_X)
train_max_probs = np.amax(train_prob, axis=1)
prune_mask = np.argwhere((train_max_probs > 0.5) & (train_max_probs < 1)).T[0]

clf_preds = clf.predict(vec_validation)
acc = np.mean(clf_preds == validation_y)

print(acc)

0.6900387834251888


In [10]:
init_prune_mask = np.ones(len(train_y), dtype=bool)

best_prune_mask = prunor(train_X, train_y, validation_X, validation_y, init_prune_mask, init_prune_mask, acc, acc)

0.6900387834251888 0.6900387834251888 0.6900387834251888
0.6900387834251888 0.69034496836089 0.6900387834251888
0.6900387834251888 0.6902429067156562 0.69034496836089
0.6900387834251888 0.6905490916513575 0.69034496836089
0.6900387834251888 0.6909573382322923 0.6905490916513575
0.6900387834251888 0.6904470300061237 0.6909573382322923
0.6900387834251888 0.6902429067156562 0.6909573382322923
0.6900387834251888 0.6906511532965911 0.6909573382322923
0.6900387834251888 0.6905490916513575 0.6909573382322923
0.6900387834251888 0.6909573382322923 0.6909573382322923
0.6900387834251888 0.6902429067156562 0.6909573382322923
0.6900387834251888 0.6906511532965911 0.6909573382322923
0.6900387834251888 0.6911614615227597 0.6909573382322923
0.6900387834251888 0.6901408450704225 0.6911614615227597
0.6900387834251888 0.6913655848132272 0.6911614615227597
0.6900387834251888 0.6904470300061237 0.6913655848132272
0.6900387834251888 0.6910593998775261 0.6913655848132272
0.6900387834251888 0.6913655848132272

array([    0,     1,     2, ..., 19159, 19160, 19161], dtype=int64)

In [11]:
testy = np.array([0,1,2,3,4])
testy2 = testy[[0,2,3]]

testy2

array([0, 2, 3])

In [114]:
for i in range(50):
    vec = TfidfVectorizer(**{'tokenizer': lambda x: x.split(), 'lowercase': False, 'max_df': 0.3}) 
    vec_train_X = vec.fit_transform(train_X[prune_mask])
    vec_validation = vec.transform(validation_X)

    clf = MultinomialNB(**{'alpha': 0.001})
    clf.fit(vec_train_X, train_y[prune_mask])

    train_prob = clf.predict_proba(vec_train_X)
    train_max_probs = np.amax(train_prob, axis=1)
    prune_mask = np.argwhere((train_max_probs > 0.5) & (train_max_probs < 1)).T[0]

    clf_preds = clf.predict(vec_validation)
    acc = np.mean(clf_preds == validation_y)

    print(i, acc)

0 0.69034496836089
1 0.6902429067156562
2 0.6905490916513575
3 0.6909573382322923
4 0.6904470300061237
5 0.6902429067156562
6 0.6906511532965911
7 0.6905490916513575
8 0.6909573382322923
9 0.6902429067156562
10 0.6906511532965911
11 0.6911614615227597
12 0.6901408450704225
13 0.6913655848132272
14 0.6904470300061237
15 0.6910593998775261
16 0.6913655848132272
17 0.6914676464584609
18 0.6914676464584609
19 0.6910593998775261
20 0.6906511532965911
21 0.6905490916513575
22 0.6899367217799551


KeyboardInterrupt: 

In [112]:
prune_mask.shape

(18388,)