In [1]:
from sklearn.model_selection import train_test_split
import pandas as pd

def get_external_data(file, train_size, val_size):
    """Load the external feather dataset and split it into train/validation sets.

    The ``text`` column supplies the samples. Each row is labelled ``True``
    when its ``positions`` entry is non-empty and ``False`` otherwise.

    Parameters
    ----------
    file : path of the feather file to read.
    train_size : forwarded to ``train_test_split`` as ``train_size``.
    val_size : forwarded to ``train_test_split`` as ``test_size``.

    Returns
    -------
    The ``train_test_split`` result, in the order
    ``X_train, X_val, y_train, y_val``.
    """
    frame = pd.read_feather(file)
    texts = frame['text'].values.tolist()
    # A row is positive as soon as it carries at least one position.
    labels = [len(positions) != 0 for positions in frame['positions']]

    # Stratify on the label so both splits share the same class balance;
    # the fixed seed keeps the split reproducible across runs.
    split = train_test_split(
        texts,
        labels,
        stratify=labels,
        train_size=train_size,
        test_size=val_size,
        random_state=2,
    )
    return split

In [2]:
# Location of the external dataset (feather format), relative to the notebook.
TRAINING_EXTERNAL_FILE = '../data/feather/external_feather'

# 3000 training samples and 1500 validation samples.
train_x, validation_x, train_y, validation_y = get_external_data(
    TRAINING_EXTERNAL_FILE,
    train_size=3000,
    val_size=1500,
)

In [4]:
from sklearn.feature_extraction.text import TfidfVectorizer
from scipy.sparse import hstack

# Vocabulary cap applied to *each* vectorizer (word-level and char-level).
max_features = 300000


def _feature_names(vectorizer):
    """Return the fitted vectorizer's terms as a plain Python list.

    Prefers ``get_feature_names_out`` and falls back to the older
    ``get_feature_names`` (removed in scikit-learn 1.2), so the cell runs
    on either side of that API change.
    """
    if hasattr(vectorizer, 'get_feature_names_out'):
        # This accessor returns an ndarray; convert it so that ``+`` below
        # concatenates two lists instead of attempting element-wise addition.
        return vectorizer.get_feature_names_out().tolist()
    return vectorizer.get_feature_names()


print('Fitting w/ test...')
train_text = train_x

# NOTE(review): vocabulary and IDF weights are fitted on train + validation
# text (hence the message above). This looks deliberate, but it means any
# score later computed on the validation split is optimistic - confirm.
all_text = train_x + validation_x

# Word-level unigrams and bigrams.
word_vectorizer = TfidfVectorizer(
    sublinear_tf=True,
    strip_accents='unicode',
    analyzer='word',
    token_pattern=r'\w{1,}',
    ngram_range=(1, 2),
    max_features=max_features
)
word_vectorizer.fit(all_text)
train_word_features = word_vectorizer.transform(train_text)

# Character-level 2- to 6-grams. ``stop_words`` is intentionally not passed:
# it only applies to analyzer='word' and has no effect on a char analyzer.
char_vectorizer = TfidfVectorizer(
    sublinear_tf=True,
    strip_accents='unicode',
    analyzer='char',
    ngram_range=(2, 6),
    max_features=max_features
)
char_vectorizer.fit(all_text)
train_char_features = char_vectorizer.transform(train_text)

# Keep the name list in the same order as the stacked columns (char first,
# then word) so coefficient index i lines up with feature_names[i].
feature_names = _feature_names(char_vectorizer) + _feature_names(word_vectorizer)
train_features = hstack([train_char_features, train_word_features])

Fitting w/ test...


In [8]:
from sklearn.linear_model import LogisticRegression
from sklearn.feature_selection import SelectFromModel
import numpy as np

# The 'sag' solver shuffles the data while optimising, so pin the seed;
# without it the coefficients (and hence the selected features) can differ
# from one run to the next. The seed matches the one used for the split.
logreg_model = LogisticRegression(C=2.0, solver='sag', random_state=2)

# Keep only the features whose absolute coefficient reaches the threshold.
sfm = SelectFromModel(logreg_model, threshold=0.3)

train_target = np.array(train_y)
train_sparse_matrix = sfm.fit_transform(train_features, train_target)
print('Coef shape:', sfm.estimator_.coef_.shape)
print('Coef:', sfm.estimator_.coef_)



Coef shape: (1, 600000)
Coef: [[-0.10787444  0.02737816 -0.00305356 ... -0.02152994  0.
   0.        ]]


In [9]:
# Flatten the (1, n_features) coefficient matrix into a 1-D vector.
coef = sfm.estimator_.coef_.ravel()

# Pair every rounded weight with its feature name, largest weight first
# (equal weights are ordered by name, also descending).
sorted_features = sorted(
    ((round(weight, 4), name) for weight, name in zip(coef, feature_names)),
    reverse=True,
)

In [10]:
# Features whose rounded coefficient is at least 0.3, still in descending order.
best = [(weight, name) for weight, name in sorted_features if weight >= 0.3]
len(best)

530

In [11]:
# Show only the strongest features; the full list is long enough to swamp
# the notebook output. Inspect `best` directly when every entry is needed.
best[:50]

[(0.8485, 'this movie'),
 (0.8441, 'begins'),
 (0.8368, 'version'),
 (0.7964, 'find'),
 (0.7313, 'reviews'),
 (0.7035, 'movie'),
 (0.6797, 'become'),
 (0.6675, 'young'),
 (0.6613, 'i was'),
 (0.65, 'first'),
 (0.6474, 'two'),
 (0.6458, 'saw'),
 (0.6384, 'people'),
 (0.6251, 'often'),
 (0.625, 'father'),
 (0.6227, 'years'),
 (0.622, 'to find'),
 (0.6177, 'review'),
 (0.6135, 'after'),
 (0.6079, 'one of'),
 (0.5892, 'reading'),
 (0.5881, 'off'),
 (0.5822, 'directed'),
 (0.582, 'story of'),
 (0.5762, 'days'),
 (0.5563, 'version of'),
 (0.5543, 'starring'),
 (0.5477, '- '),
 (0.5409, 'new'),
 (0.5403, 'interesting'),
 (0.5329, 'has been'),
 (0.5324, 'known'),
 (0.5295, 'films'),
 (0.5265, 'man'),
 (0.5263, 'has'),
 (0.5263, " '"),
 (0.5217, 'when'),
 (0.5168, 'many of'),
 (0.5154, 'based on'),
 (0.5142, 've'),
 (0.5127, 'being'),
 (0.5125, 'by'),
 (0.5106, 'rock'),
 (0.51, 'call'),
 (0.5086, 'many'),
 (0.5064, 'of his'),
 (0.5033, 'sees'),
 (0.501, 'every'),
 (0.4969, 'on a'),
 (0.4962, 'h