In [1]:
import warnings
warnings.filterwarnings('ignore')
from sklearn.pipeline import Pipeline

In [2]:
%run query_features.py
%run scripts/helper.py
%run scripts/model_train_plus_test.py

In [3]:
crowd_train = load_file('./data/train.csv/train.csv', None)
crowd_test = load_file('./data/test.csv/test.csv', None)

In [4]:
def get_stable_data():
    return crowd_train[crowd_train.relevance_variance < 0.5]

def get_unstable_data():
    return crowd_train[crowd_train.relevance_variance >= 0.5]

In [5]:
def get_stable_data_targets():
    return crowd_train[crowd_train.relevance_variance < 0.5].median_relevance

def get_unstable_data_targets():
    return crowd_train[crowd_train.relevance_variance >= 0.5].median_relevance

In [6]:
def shuffle_split_indices(target, train_size):
    return ssSplit(target, train_size=train_size, random_state=44)

In [29]:
def get_train_test(shuffle_split=False, stable=True):
    if shuffle_split:
        if stable:
            stable_data = get_stable_data()
            stable_target = get_stable_data_targets()
            train_index, test_index = shuffle_split_indices(stable_target, 500)
            
            Xt = stable_data.iloc[train_index]
            yt = stable_target[train_index]
            
            Xv = stable_data.iloc[test_index]
            yv = stable_target[test_index]
            
            return [(Xt, yt), (Xv, yv)]
        else:
            unstable_data = get_unstable_data()
            unstable_target = get_unstable_data_targets()
            train_index, test_index = shuffle_split_indices(unstable_target, 1000)
            
            Xt = unstable_data.iloc[train_index]
            yt = unstable_target[train_index]
            
            Xv = unstable_data.iloc[test_index]
            yv = unstable_target[test_index]
            
            
            return [(Xt, yt), (Xv, yv)]
    else:
        if stable:
            stable_data = get_stable_data()
            stable_target = get_stable_data_targets()
            Xt = stable_data
            yt = stable_target
            
            Xv = crowd_test
            
            return [(Xt, yt), (Xv)]
        else:
            unstable_data = get_unstable_data()
            unstable_target = get_unstable_data_targets()
            Xt = unstable_data
            yt = unstable_target
            
            Xv = crowd_test
            
            return [(Xt, yt), (Xv)]

In [30]:
def preprocessText(X):
    return tweak_text(X)

In [31]:
def preprocess_data(Xt, Xv):
    return (preprocessText(Xt), preprocessText(Xv))

In [32]:
def bag_of_words(X):
    Xfitted, tfv = TFIDF(X, None)
    return (Xfitted, tfv)

In [33]:
def fit_model(Xt, yt, Xt_fitted):
    svd = TruncatedSVD(n_components=200, algorithm='randomized', n_iter=5, random_state=None, tol=0.0)
    scl = StandardScaler(copy=True, with_mean=True, with_std=True)
    clf = SVC(C=10.0, kernel='rbf', degree=3, 
            gamma=0.0, coef0=0.0, shrinking=True, probability=False, 
            tol=0.001, cache_size=200, class_weight=None, 
            verbose=False, max_iter=-1, random_state=None)

    keywords = keyword_counter(Xt)
    features = stack([keywords, Xt_fitted])
    pipeline = Pipeline([('svd', svd), ('scl', scl), ('clf', clf)])
    pipeline.fit(features, yt)
    
    return pipeline

In [34]:
def predict_model(Xv, Xv_tweaked, tfv, pipeline):
    keywords_test = keyword_counter(Xv)
    Xtest = tfv.transform(Xv_tweaked)
    features_test = stack([keywords_test, Xtest])
    preds_new_model = pipeline.predict(features_test)
    
    return preds_new_model

In [47]:
def model_preparation(shuffle, stable):
    if shuffle:
        (Xt, yt), (Xv, yv) = get_train_test(shuffle, stable)
    else:
        (Xt, yt), (Xv) = get_train_test(shuffle, stable)
    
    Xt_processed, Xv_processed = preprocess_data(Xt, Xv)
    
    print 'Length of the training set %d and test set %d ' %(len(Xt_processed), len(Xv_processed))
    
    Xt_fitted, tfv = bag_of_words(Xt_processed)
    model = fit_model(Xt, yt, Xt_fitted)
    
    model_predictions = predict_model(Xv, Xv_processed, tfv, model)
    
    if shuffle:
        return (model_predictions, yv)
    else:
        return (model_predictions)

In [49]:
shuffle_stable_predictions, yv = model_preparation(True, True)

Length of the training set 500 and test set 780 


In [51]:
shuffle_stable_predictions

array([  4.,   1.,   4.,   4.,   4.,   3.,   4.,   2.,   1.,   4.,   4.,
         4.,   2.,   4.,   4.,   4.,   4.,   4.,   2.,   2.,   2.,   2.,
         4.,  nan,   4.,   4.,   4.,   4.,   4.,   3.,   3.,   4.,   2.,
         4.,   4.,   4.,   4.,   3.,   4.,   4.,   4.,   1.,   2.,   4.,
         4.,   4.,   4.,  nan,   4.,   4.,   4.,   4.,   4.,   4.,   4.,
         3.,   4.,   2.,   4.,   4.,   3.,   4.,   4.,   4.,   4.,   4.,
         4.,   2.,   4.,   1.,   1.,   4.,   4.,   4.,   2.,   4.,  nan,
         4.,   4.,   4.,   4.,   4.,   4.,   4.,   2.,   3.,   4.,   1.,
         1.,   2.,   4.,   4.,   4.,  nan,   4.,   4.,   4.,   4.,   4.,
        nan,  nan,   4.,   4.,   2.,   3.,   4.,   4.,   4.,   4.,   4.,
         4.,   2.,  nan,   2.,   4.,   4.,   2.,  nan,  nan,   4.,   2.,
         4.,   4.,   4.,   3.,   2.,   1.,   1.,   3.,   4.,   4.,   4.,
         1.,   4.,   4.,   4.,   4.,   4.,   4.,   4.,   4.,  nan,   4.,
         4.,   4.,   4.,   4.,   4.,   4.,  nan,   

In [None]:
print quadratic_weighted_kappa(get)