In [3]:
!pip install --upgrade --quiet pip 
!pip install -I --upgrade --quiet setuptools
!pip install --upgrade --quiet pymongo
!pip install --quiet lightgbm

In [None]:
% matplotlib inline
%load_ext autoreload
%autoreload 2
import warnings
warnings.filterwarnings('ignore')

In [817]:
import modelling.fetch as fetch
from modelling.models import *
from modelling.utils import get_articles
import matplotlib.pyplot as plt
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import *
from pymongo import MongoClient
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import fbeta_score, make_scorer
import lightgbm

In [6]:
import os

# Address of the MongoDB server that holds the `newsfilter` database. The
# previously hard-coded address is kept as the default; export
# NEWSFILTER_MONGO_HOST to point the notebook at another instance without
# editing the cell.
MONGO_HOST = os.environ.get('NEWSFILTER_MONGO_HOST', '209.177.92.45:80')

collection = MongoClient(MONGO_HOST)['newsfilter'].news
# NOTE(review): presumably selects labelled, de-duplicated articles from the
# 'tw' source — confirm against `modelling.utils.get_articles`.
arts = get_articles(collection, label=True, src='tw', unique=True)

In [7]:
# Materialise the fetched articles into a list and build the working frame.
# `create_df` is not imported by name in this notebook; presumably it comes
# from `from modelling.models import *` — TODO confirm.
df = create_df(list(arts))

In [32]:
def group_by_ctx(li):
    grouped = {}
    for i in li:
        try:
            nd.concat(grouped[i.context.device_id], i, dim=1)
        except KeyError:
            grouped[i.context.device_id] = i
    return grouped        

In [None]:
from modelling.utils import preprocessor

# Pairs of (two-letter code, field name) handed to `prepare_df`.
# NOTE(review): presumably each pair names an article source and the field to
# use as that source's text ('tw' also appears as `src='tw'` in the fetch
# cell) — confirm.
lookup = [
    ('ge', 'title'),
    ('tw', 'body'),
    ('fa', 'title') 
]

def prepare_df(df, preprocessor, lookup, out_key = 'body'):
    """Unfinished helper: meant to return a frame whose text column is `out_key`.

    NOTE(review): this function cannot run as written. Every statement that
    would define `unique` is commented out, so the `return` line raises
    `NameError`. None of `df`, `preprocessor` or `lookup` is used by live code.

    Args:
        df: input data (only referenced by the disabled draft below).
        preprocessor: text-cleaning callable (only referenced by the draft).
        lookup: sequence of tuples (only referenced by the draft).
        out_key: name the intermediate 'text' label is renamed to.

    Returns:
        Nothing at present — see the note above.
    """
    
    # Disabled draft, kept for reference. It indexes `t[2]`, which does not
    # exist for the 2-tuples in the module-level `lookup`.
    # [df[] for i,t in enumerate]
    # unique = pd.concat([df[i].assign(text = df.iloc[i][t[2]]) 
    #                     for i,t in enumerate(lookup)] )
    # unique['text'] = unique.text.map(preprocessor)
    # unique = unique[unique.text.str.len() > 8]
    # NOTE(review): if `unique` is a pandas DataFrame, a bare mapping passed
    # to `rename` is applied to index labels; renaming the 'text' column
    # would need `columns=` — confirm the intent.
    return unique.rename({ 'text': out_key })

In [None]:
def number(n):
    """Return True when ``float(n)`` succeeds, False otherwise.

    Only the errors that signal a failed conversion are treated as "not a
    number": ``TypeError`` (e.g. ``None``), ``ValueError`` (e.g. ``'abc'``)
    and ``OverflowError`` (an int too large for a float). Anything else —
    in particular ``KeyboardInterrupt`` and ``SystemExit`` — propagates
    instead of being swallowed by a bare ``except``.

    Args:
        n: any value.

    Returns:
        bool: whether ``n`` is convertible to ``float``.
    """
    try:
        float(n)
    except (TypeError, ValueError, OverflowError):
        return False
    return True

# Display the rows whose `cluster` value is convertible to float, ordered by
# the `added` column.
has_numeric_cluster = df.cluster.map(number)
df[has_numeric_cluster].sort_values(by='added')

In [78]:
# Replace every article body with its preprocessed form, then order the frame
# by the `added` column.
# NOTE: the `body` column is overwritten in place, so re-running this cell
# feeds already-preprocessed text through `preprocessor` again.
df['body'] = df['body'].map(preprocessor)
df = df.sort_values(by='added')

In [747]:
# Two feature representations built from the same frame; the word-count
# matrix is cast to 32-bit floats.
tfidf = create_tfidf(df)
word_count = create_word_count(df)
word_count = word_count.astype(np.float32)

In [158]:
from sklearn.model_selection import StratifiedKFold
# Five stratified, shuffled folds with a fixed seed. `evaluate` passes this
# splitter to `cross_val_predict` twice (once for scores, once for hard
# predictions); without a seed a shuffling splitter draws a new partition on
# each use, so the two results would come from different folds and no run
# could be reproduced.
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

def evaluate(model, tfidf, df, method = 'predict_proba', quiet = False):
    """Cross-validate `model`, report ranking/classification metrics, plot ROC.

    Labels equal to 'accepted' form the positive class (1); every other label
    is 0. Out-of-fold scores are produced with `method`, out-of-fold hard
    predictions with 'predict'; both use the module-level splitter `cv`.

    Args:
        model: estimator handed to `cross_val_predict`.
        tfidf: feature matrix (callers in this notebook pass either the
            TF-IDF or the word-count matrix).
        df: frame whose `label` column supplies the targets.
        method: estimator method used for the scores. For 'predict_proba'
            only the second column of the result is kept.
        quiet: when true, the two metric lines are not printed. The ROC curve
            is drawn regardless.

    Returns:
        tuple: (scores, targets, predictions).
    """
    targets = [int(label == 'accepted') for label in df.label]

    scores = cross_val_predict(model, tfidf, targets, cv=cv, method=method)
    predictions = cross_val_predict(model, tfidf, targets, cv=cv, method='predict')

    if method == 'predict_proba':
        # Keep only the column at index 1 of each probability row.
        scores = [row[1] for row in scores]

    if not quiet:
        avg_precision = average_precision_score(targets, scores)
        auc = roc_auc_score(targets, scores, average=None)
        print(avg_precision, auc)

        precision = precision_score(targets, predictions)
        recall = recall_score(targets, predictions)
        f_score = fbeta_score(targets, predictions, beta=1.5)
        print(precision, recall, f_score)

    # The ROC curve goes onto the current pyplot axes.
    fpr, tpr, _ = roc_curve(targets, scores)
    plt.step(fpr, tpr)
    return scores, targets, predictions

In [None]:
# Cross-validated F-beta (beta=1.5, positive label 'accepted') for one fixed
# LightGBM configuration. The grid holds a single empty dict, so there is
# nothing to tune: the estimator is scored exactly as constructed.
# NOTE(review): `pd` is not imported by name in this notebook; presumably it
# arrives through `from modelling.models import *` — confirm.
scorer = make_scorer(fbeta_score, beta=1.5, pos_label='accepted')
parameters = [{}]
grid = GridSearchCV(
    estimator=lightgbm.LGBMClassifier(n_estimators=100, reg_lambda=0.6, reg_alpha=0.2),
    param_grid=parameters,
    scoring=scorer,
)
grid.fit(word_count, df.label)
pd.DataFrame(grid.cv_results_).sort_values(by='rank_test_score')

In [None]:
# LightGBM with 10 estimators, evaluated on the word-count features. The
# returned tuple is bound to `_` so nothing is echoed below the plot.
model = lightgbm.LGBMClassifier(
    n_estimators=10,
    reg_lambda=0.6,
    reg_alpha=0.2,
)
_ = evaluate(model, word_count, df)

In [None]:
# `svm` is imported here because this cell needs it before the later cell that
# imports it; without this, a top-to-bottom run stops with NameError.
from sklearn import svm

# Cross-validated F-beta (beta=1.5, positive label 'accepted') for one fixed,
# class-balanced LinearSVC. The grid holds a single empty dict, so the
# estimator is scored exactly as constructed.
scorer = make_scorer(fbeta_score, beta = 1.5, pos_label = 'accepted')
parameters = [{ }]
# NOTE(review): `10e-6` equals 1e-5 — confirm that 1e-6 was not intended.
model = svm.LinearSVC(dual = False, class_weight = 'balanced', C = 4., tol = 10e-6, max_iter = 4000)
grid = GridSearchCV(model, parameters, scoring = scorer)
grid.fit(word_count, df.label)
pd.DataFrame(grid.cv_results_).sort_values('rank_test_score')

In [None]:
from sklearn import svm

# Linear SVM on the TF-IDF features, ranked by its decision-function values
# because `evaluate` is told to use 'decision_function' for the scores.
svc = svm.LinearSVC(
    dual=False,
    tol=1e-5,  # identical float to the literal 10e-6
    max_iter=10000,
)
g, a, p = evaluate(svc, tfidf, df, method='decision_function')

In [None]:
# Multinomial naive Bayes with fixed class priors, evaluated on the TF-IDF
# features with the default 'predict_proba' scoring.
# NOTE(review): `MultinomialNB` is not imported by name in this notebook;
# presumably it comes from `from modelling.models import *` — confirm.
class_priors = [0.4, 0.6]
nb = MultinomialNB(class_prior=class_priors)
g, ans, preds = evaluate(nb, tfidf, df)

In [None]:
from copy import deepcopy

def rolling_cross_val_predict(model, X, y, start, window, method='predict'):
    """Walk-forward, out-of-sample predictions for rows ``start`` onwards.

    The rows are processed in consecutive blocks of ``window`` rows, beginning
    at row ``start``. For each block a fresh deep copy of ``model`` is fitted
    on every row that precedes the block (an expanding training set) and then
    asked to predict the block. The caller's ``model`` is never fitted.

    NOTE(review): this fills in a stub that fitted the original model with no
    data and returned nothing; the expanding-window reading of ``start`` and
    ``window`` is an interpretation — confirm it matches the intended scheme.

    Args:
        model: estimator exposing ``fit(X, y)`` and the method named by
            ``method``.
        X: row-sliceable feature container (array, sparse matrix, list, ...).
        y: row-sliceable targets aligned with ``X``.
        start: index of the first row to predict; must be >= 1 so the first
            fit has training rows.
        window: number of rows predicted per refit; must be >= 1.
        method: name of the estimator method used to predict each block.

    Returns:
        numpy.ndarray: the per-block outputs concatenated in row order, empty
        when ``start`` is at or beyond the number of rows.

    Raises:
        ValueError: if ``start`` < 1 or ``window`` < 1.
    """
    if window < 1:
        raise ValueError('window must be a positive integer')
    if start < 1:
        raise ValueError('start must be at least 1 so the first fit has training rows')

    # Sparse matrices have no len(); prefer the shape when there is one.
    n_rows = X.shape[0] if hasattr(X, 'shape') else len(X)

    blocks = []
    for lo in range(start, n_rows, window):
        hi = min(lo + window, n_rows)
        m = deepcopy(model)  # fresh, unfitted copy for every step
        m.fit(X[:lo], y[:lo])
        blocks.append(np.asarray(getattr(m, method)(X[lo:hi])))

    return np.concatenate(blocks) if blocks else np.empty(0)

In [795]:
from datetime import datetime, timedelta

# Fetch the modelling data. `df` is rebound here to the frame returned by
# `get_prediction_data`, replacing the one built earlier in the notebook.
# NOTE(review): the second call presumably returns unlabelled articles added
# since `start` (three weeks ago, naive UTC) — confirm against the helper.
start = datetime.utcnow() - timedelta(days=21)
X, y, df = get_prediction_data(collection, label=True)
X_new, _, _ = get_prediction_data(collection, start=start, label=False)

In [818]:
# Positional hold-out split: rows [start, cutoff) train the model and rows
# [cutoff, end) are predicted. `start` is rebound to an int here (it held a
# datetime in the previous cell).
start, cutoff, end = 0, 3000, df.shape[0]

train_rows = slice(start, cutoff)
test_rows = slice(cutoff, end)

X_train, y_train = X[train_rows], y[train_rows]
X_test, y_test = X[test_rows], y[test_rows]
preds = train_and_predict(X_train, y_train, X_test)

In [819]:
# Threshold the hold-out scores at 0.5, print precision, recall and F-beta
# (beta=1.5), then show how many rows were predicted positive out of the
# total number of hold-out rows.
ans = y_test
p = preds > 0.5
holdout_metrics = (
    precision_score(ans, p),
    recall_score(ans, p),
    fbeta_score(ans, p, beta=1.5),
)
print(*holdout_metrics)
(p.sum(), preds.shape[0])

0.627906976744 0.662576687117 0.65150812065


(172, 667)