In [1]:
import pandas as pd
import jieba
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.manifold import TSNE
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from matplotlib import pyplot as plt
from stopwordsiso import stopwords
import re
from sklearn.decomposition import PCA
from sklearn.model_selection import train_test_split, GridSearchCV
import time
from sklearn.pipeline import Pipeline
from sklearn.metrics import mean_squared_error
import numpy as np
from sklearn.linear_model import SGDRegressor
from sklearn.base import TransformerMixin
from sklearn.ensemble import RandomForestRegressor
from sklearn.neural_network import MLPRegressor
from scipy.stats import boxcox
import warnings
warnings.filterwarnings("ignore")
from statsmodels.tsa.arima.model import ARIMA
from keras.models import Sequential
from keras.layers import LSTM
from keras.layers import Dense

In [2]:
df1 = pd.read_pickle('data/Official_Provincial_Weibo_From_20191201_To_20200816.pickle')
df2 = pd.read_excel('data/0820.xlsx', engine='openpyxl')

In [3]:
df2.columns = df2.columns.str.lower()
df2['date'] = pd.to_datetime(df2.time_var,format='%Y%m%d')
df2 = df2.rename(columns = {'share':'forward'})
df2.province = df2.province.str.replace('\'','')

In [4]:
df1_sc = ['province', 'content', 'date', 'like', 'comment', 'forward']
df2_sc = ['province', 'like', 'forward', 'comment', 'freq', 'conf', 'susp', 'cure', 'dead', 'pop', 'gdp', 'followers', 'date']

In [5]:
df = df1.loc[:, df1_sc].merge(df2.loc[:, df2_sc], on = ['province', 'date', 'like', 'comment', 'forward'], how = 'inner')

In [6]:
scaler = MinMaxScaler()
def transformFeature(feature):
    featureP = boxcox((df[feature] + 1) / df.followers, 0)
    scaled_feture= np.squeeze(scaler.fit_transform(featureP.reshape(featureP.shape[0],1)))
    return scaled_feture

In [7]:
df['likeP'] = transformFeature('like')
df['commentP'] = transformFeature('comment')
df['forwardP'] = transformFeature('forward')

In [8]:
df['content_re'] = df.content.apply(lambda x: re.sub(u"([^\u4e00-\u9fa5])","",x))
corpus = " ".join(jieba.cut(','.join(df.content_re)))

Building prefix dict from the default dictionary ...
Loading model from cache /var/folders/9m/b3fzzk753v9cswnwn8z8j7b80000gn/T/jieba.cache
Loading model cost 0.606 seconds.
Prefix dict has been built successfully.


## Static like/comment/forward prediction

In [9]:
X = np.concatenate((np.expand_dims(df.content_re.values, axis=1), 
                    pd.get_dummies(df.province).values), axis=1)
y = df.likeP

In [10]:
X_train, X_test, y_train, y_test= train_test_split(X, y, test_size=0.3, random_state=42)

In [11]:
vectorizer = TfidfVectorizer(tokenizer=jieba.cut, 
                             stop_words=stopwords(["zh"]) | set([w for w in corpus if len(w) == 1]) | set(['借傥', '唷', '啷']), 
                             max_features = 100,
                             max_df = 0.9,
                             use_idf = False)
scaler = StandardScaler()

vectorized_content_train = vectorizer.fit_transform(X_train[:,0])
scaled_content_train = scaler.fit_transform(vectorized_content_train.todense())
vectorized_content_test = vectorizer.transform(X_test[:,0])
scaled_content_test = scaler.transform(vectorized_content_test.todense())

In [12]:
def evaluate_engagement(models, features):
    X = np.concatenate((np.expand_dims(df.content_re.values, axis=1), pd.get_dummies(df.province).values), axis=1)
    for model in models:
        for feature in features:
            y = df[feature + 'P']
            X_train, X_test, y_train, y_test= train_test_split(X, y, test_size=0.3, random_state=42)
            gs = GridSearchCV(estimator=models[model]['model'],
                              param_grid=models[model]['params'],
                              scoring='neg_mean_squared_error',
                              n_jobs=6,
                              cv=5,
                              verbose=1)
            start = time.time()
            gs.fit(np.concatenate((scaled_content_train, X_train[:,1:]), axis=1), y_train) 
            end = time.time()
            print('Time to train model: %0.2fs' % (end -start))
            best_model = gs.best_estimator_
            y_pred = best_model.predict(np.concatenate((scaled_content_test, X_test[:,1:]), axis=1))
            print('label: {}, model: {}, MSE: {}'.format(feature, model, np.sqrt(mean_squared_error(y_test, y_pred))))

In [13]:
models = {'SGDRegressor': {'model': SGDRegressor(loss='squared_loss', penalty='l2', random_state=42, max_iter=100),
                           'params': {'penalty':['none','l2','l1'],'alpha':[1e-4, 5e-4, 1e-3, 5e-3, 1e-2, 5e-2, 0.1]}},
          'RandomForestRegressor': {'model':RandomForestRegressor(random_state=42),
                                    'params':{'n_estimators':[100,200],'max_depth':[10,20,50,100]}},
          'MLPRegressor': {'model': MLPRegressor(random_state=42, max_iter=500, early_stopping = True, verbose = 1),
                           'params':{'hidden_layer_sizes':[(100,50,10), (100,100,100)],}}}

In [14]:
evaluate_engagement(models, ['like', 'comment', 'forward'])

Fitting 5 folds for each of 21 candidates, totalling 105 fits


[Parallel(n_jobs=6)]: Using backend LokyBackend with 6 concurrent workers.
[Parallel(n_jobs=6)]: Done  38 tasks      | elapsed:    7.6s
[Parallel(n_jobs=6)]: Done 105 out of 105 | elapsed:   19.6s finished


Time to train model: 19.84s
label: like, model: SGDRegressor, MSE: 0.06757972062508834
Fitting 5 folds for each of 21 candidates, totalling 105 fits


[Parallel(n_jobs=6)]: Using backend LokyBackend with 6 concurrent workers.
[Parallel(n_jobs=6)]: Done  38 tasks      | elapsed:    6.6s
[Parallel(n_jobs=6)]: Done 105 out of 105 | elapsed:   17.0s finished


Time to train model: 17.15s
label: comment, model: SGDRegressor, MSE: 0.04335563881282568
Fitting 5 folds for each of 21 candidates, totalling 105 fits


[Parallel(n_jobs=6)]: Using backend LokyBackend with 6 concurrent workers.
[Parallel(n_jobs=6)]: Done  38 tasks      | elapsed:    6.2s
[Parallel(n_jobs=6)]: Done 105 out of 105 | elapsed:   16.8s finished


Time to train model: 17.02s
label: forward, model: SGDRegressor, MSE: 0.04335209361847269
Fitting 5 folds for each of 8 candidates, totalling 40 fits


[Parallel(n_jobs=6)]: Using backend LokyBackend with 6 concurrent workers.
[Parallel(n_jobs=6)]: Done  40 out of  40 | elapsed:  3.9min finished


Time to train model: 269.81s
label: like, model: RandomForestRegressor, MSE: 0.056190339437944326
Fitting 5 folds for each of 8 candidates, totalling 40 fits


[Parallel(n_jobs=6)]: Using backend LokyBackend with 6 concurrent workers.
[Parallel(n_jobs=6)]: Done  40 out of  40 | elapsed:  3.3min finished


Time to train model: 228.99s
label: comment, model: RandomForestRegressor, MSE: 0.03476865395096243
Fitting 5 folds for each of 8 candidates, totalling 40 fits


[Parallel(n_jobs=6)]: Using backend LokyBackend with 6 concurrent workers.
[Parallel(n_jobs=6)]: Done  40 out of  40 | elapsed:  3.2min finished


Time to train model: 227.03s
label: forward, model: RandomForestRegressor, MSE: 0.03460261653116925
Fitting 5 folds for each of 2 candidates, totalling 10 fits


[Parallel(n_jobs=6)]: Using backend LokyBackend with 6 concurrent workers.
[Parallel(n_jobs=6)]: Done  10 out of  10 | elapsed:   21.0s remaining:    0.0s
[Parallel(n_jobs=6)]: Done  10 out of  10 | elapsed:   21.0s finished


Iteration 1, loss = 0.00954109
Validation score: -0.351581
Iteration 2, loss = 0.00464073
Validation score: 0.151633
Iteration 3, loss = 0.00322088
Validation score: 0.536519
Iteration 4, loss = 0.00242619
Validation score: 0.509685
Iteration 5, loss = 0.00213551
Validation score: 0.565797
Iteration 6, loss = 0.00192414
Validation score: 0.586140
Iteration 7, loss = 0.00218813
Validation score: 0.565317
Iteration 8, loss = 0.00186763
Validation score: 0.573469
Iteration 9, loss = 0.00206212
Validation score: 0.544795
Iteration 10, loss = 0.00200077
Validation score: 0.570552
Iteration 11, loss = 0.00177571
Validation score: 0.580309
Iteration 12, loss = 0.00170364
Validation score: 0.623995
Iteration 13, loss = 0.00170102
Validation score: 0.580042
Iteration 14, loss = 0.00162668
Validation score: 0.615093
Iteration 15, loss = 0.00166051
Validation score: 0.540659
Iteration 16, loss = 0.00160061
Validation score: 0.571548
Iteration 17, loss = 0.00171015
Validation score: 0.585995
Itera

[Parallel(n_jobs=6)]: Using backend LokyBackend with 6 concurrent workers.
[Parallel(n_jobs=6)]: Done  10 out of  10 | elapsed:   25.2s remaining:    0.0s
[Parallel(n_jobs=6)]: Done  10 out of  10 | elapsed:   25.2s finished


Iteration 1, loss = 0.00856606
Validation score: -0.047836
Iteration 2, loss = 0.00364084
Validation score: 0.269082
Iteration 3, loss = 0.00212257
Validation score: 0.725253
Iteration 4, loss = 0.00139073
Validation score: 0.735812
Iteration 5, loss = 0.00110569
Validation score: 0.801493
Iteration 6, loss = 0.00089382
Validation score: 0.802563
Iteration 7, loss = 0.00171463
Validation score: 0.738686
Iteration 8, loss = 0.00105020
Validation score: 0.773989
Iteration 9, loss = 0.00084914
Validation score: 0.828634
Iteration 10, loss = 0.00091665
Validation score: 0.809990
Iteration 11, loss = 0.00072545
Validation score: 0.843505
Iteration 12, loss = 0.00086229
Validation score: 0.841304
Iteration 13, loss = 0.00078235
Validation score: 0.847980
Iteration 14, loss = 0.00072838
Validation score: 0.831316
Iteration 15, loss = 0.00065896
Validation score: 0.861016
Iteration 16, loss = 0.00058434
Validation score: 0.859954
Iteration 17, loss = 0.00066331
Validation score: 0.846836
Itera

[Parallel(n_jobs=6)]: Using backend LokyBackend with 6 concurrent workers.
[Parallel(n_jobs=6)]: Done  10 out of  10 | elapsed:   26.3s remaining:    0.0s
[Parallel(n_jobs=6)]: Done  10 out of  10 | elapsed:   26.3s finished


Iteration 1, loss = 0.00855778
Validation score: -0.073549
Iteration 2, loss = 0.00373459
Validation score: 0.349145
Iteration 3, loss = 0.00211250
Validation score: 0.733881
Iteration 4, loss = 0.00141292
Validation score: 0.760948
Iteration 5, loss = 0.00109642
Validation score: 0.809544
Iteration 6, loss = 0.00085750
Validation score: 0.831994
Iteration 7, loss = 0.00092987
Validation score: 0.735488
Iteration 8, loss = 0.00088916
Validation score: 0.759261
Iteration 9, loss = 0.00103788
Validation score: 0.734401
Iteration 10, loss = 0.00084268
Validation score: 0.784615
Iteration 11, loss = 0.00082262
Validation score: 0.801149
Iteration 12, loss = 0.00078879
Validation score: 0.855908
Iteration 13, loss = 0.00074066
Validation score: 0.860010
Iteration 14, loss = 0.00070855
Validation score: 0.845081
Iteration 15, loss = 0.00067917
Validation score: 0.847518
Iteration 16, loss = 0.00063765
Validation score: 0.864447
Iteration 17, loss = 0.00079994
Validation score: 0.847957
Itera

## Dynamic like/comment/forward prediction

In [15]:
def evaluate_arima_model(X, arima_order):
    # prepare training dataset
    train_size = int(len(X) * 0.66)
    train, test = X[0:train_size], X[train_size:]
    history = [x for x in train]
    # make predictions
    predictions = list()
    for t in range(len(test)):
        model = ARIMA(history, order=arima_order)
        model_fit = model.fit()
        yhat = model_fit.forecast()[0]
        predictions.append(yhat)
        history.append(test[t])
    # calculate out of sample error
    mse = mean_squared_error(test, predictions)
    
    return mse

    # evaluate combinations of p, d and q values for an ARIMA model
def evaluate_models(features, p_values, d_values, q_values):
    for feature in features:
        dataset = df.groupby('date')[feature + 'P'].mean().values.astype('float32')
        best_score, best_cfg = float("inf"), None
        for p in p_values:
            for d in d_values:
                for q in q_values:
                    order = (p,d,q)
                    mse = evaluate_arima_model(dataset, order)
                    if mse < best_score:
                        best_score, best_cfg = mse, order
                    print('ARIMA: %s MSE=%.3f' % (order,mse))
                    
        print('Label: %s Best ARIM: %s MSE=%.3f' % (feature, best_cfg, best_score))


In [16]:
p_values = [0, 1, 2, 4, 6, 8, 10, 15, 30]
d_values = range(0, 5)
q_values = range(0, 5)

evaluate_models(['like', 'comment', 'forward'], p_values, d_values, q_values)



ARIMA: (0, 0, 0) MSE=0.004




ARIMA: (0, 0, 1) MSE=0.002




ARIMA: (0, 0, 2) MSE=0.001
ARIMA: (0, 0, 3) MSE=0.001
ARIMA: (0, 0, 4) MSE=0.001




ARIMA: (0, 1, 0) MSE=0.000




ARIMA: (0, 1, 1) MSE=0.000
ARIMA: (0, 1, 2) MSE=0.000




ARIMA: (0, 1, 3) MSE=0.000




ARIMA: (0, 1, 4) MSE=0.000




ARIMA: (0, 2, 0) MSE=0.001




ARIMA: (0, 2, 1) MSE=0.000




ARIMA: (0, 2, 2) MSE=0.000






ARIMA: (0, 2, 3) MSE=0.000






ARIMA: (0, 2, 4) MSE=0.000




ARIMA: (0, 3, 0) MSE=0.004
ARIMA: (0, 3, 1) MSE=0.001




ARIMA: (0, 3, 2) MSE=0.000




ARIMA: (0, 3, 3) MSE=0.000




ARIMA: (0, 3, 4) MSE=0.000




ARIMA: (0, 4, 0) MSE=0.015




ARIMA: (0, 4, 1) MSE=0.004




ARIMA: (0, 4, 2) MSE=0.001




ARIMA: (0, 4, 3) MSE=0.001




ARIMA: (0, 4, 4) MSE=0.000




ARIMA: (1, 0, 0) MSE=0.001
ARIMA: (1, 0, 1) MSE=0.000




ARIMA: (1, 0, 2) MSE=0.000




ARIMA: (1, 0, 3) MSE=0.000




ARIMA: (1, 0, 4) MSE=0.000




ARIMA: (1, 1, 0) MSE=0.000




ARIMA: (1, 1, 1) MSE=0.000
ARIMA: (1, 1, 2) MSE=0.000






ARIMA: (1, 1, 3) MSE=0.000




ARIMA: (1, 1, 4) MSE=0.000




ARIMA: (1, 2, 0) MSE=0.001




ARIMA: (1, 2, 1) MSE=0.000






ARIMA: (1, 2, 2) MSE=0.000




ARIMA: (1, 2, 3) MSE=0.000




ARIMA: (1, 2, 4) MSE=0.000




ARIMA: (1, 3, 0) MSE=0.002




ARIMA: (1, 3, 1) MSE=0.001




ARIMA: (1, 3, 2) MSE=0.000






ARIMA: (1, 3, 3) MSE=0.000




ARIMA: (1, 3, 4) MSE=0.000




ARIMA: (1, 4, 0) MSE=0.006




ARIMA: (1, 4, 1) MSE=0.002




ARIMA: (1, 4, 2) MSE=0.001




ARIMA: (1, 4, 3) MSE=0.001




ARIMA: (1, 4, 4) MSE=0.000




ARIMA: (2, 0, 0) MSE=0.000






ARIMA: (2, 0, 1) MSE=0.000






ARIMA: (2, 0, 2) MSE=0.000






ARIMA: (2, 0, 3) MSE=0.000






ARIMA: (2, 0, 4) MSE=0.000
ARIMA: (2, 1, 0) MSE=0.000




ARIMA: (2, 1, 1) MSE=0.000
ARIMA: (2, 1, 2) MSE=0.000






ARIMA: (2, 1, 3) MSE=0.000








ARIMA: (2, 1, 4) MSE=0.000




ARIMA: (2, 2, 0) MSE=0.001






ARIMA: (2, 2, 1) MSE=0.000




ARIMA: (2, 2, 2) MSE=0.000






ARIMA: (2, 2, 3) MSE=0.000






ARIMA: (2, 2, 4) MSE=0.000




ARIMA: (2, 3, 0) MSE=0.001




ARIMA: (2, 3, 1) MSE=0.001






ARIMA: (2, 3, 2) MSE=0.000




ARIMA: (2, 3, 3) MSE=0.000






ARIMA: (2, 3, 4) MSE=0.001




ARIMA: (2, 4, 0) MSE=0.003




ARIMA: (2, 4, 1) MSE=0.001






ARIMA: (2, 4, 2) MSE=0.001




ARIMA: (2, 4, 3) MSE=0.001




ARIMA: (2, 4, 4) MSE=0.000




ARIMA: (4, 0, 0) MSE=0.000






ARIMA: (4, 0, 1) MSE=0.000






ARIMA: (4, 0, 2) MSE=0.000








ARIMA: (4, 0, 3) MSE=0.000








ARIMA: (4, 0, 4) MSE=0.000
ARIMA: (4, 1, 0) MSE=0.000






ARIMA: (4, 1, 1) MSE=0.000






ARIMA: (4, 1, 2) MSE=0.000






ARIMA: (4, 1, 3) MSE=0.000






ARIMA: (4, 1, 4) MSE=0.000
ARIMA: (4, 2, 0) MSE=0.000






ARIMA: (4, 2, 1) MSE=0.000






ARIMA: (4, 2, 2) MSE=0.000






ARIMA: (4, 2, 3) MSE=0.000








ARIMA: (4, 2, 4) MSE=0.000




ARIMA: (4, 3, 0) MSE=0.001






ARIMA: (4, 3, 1) MSE=0.000






ARIMA: (4, 3, 2) MSE=0.000






ARIMA: (4, 3, 3) MSE=0.000






ARIMA: (4, 3, 4) MSE=0.000




ARIMA: (4, 4, 0) MSE=0.002






ARIMA: (4, 4, 1) MSE=0.001






ARIMA: (4, 4, 2) MSE=0.001






ARIMA: (4, 4, 3) MSE=0.001






ARIMA: (4, 4, 4) MSE=0.001




ARIMA: (6, 0, 0) MSE=0.000








ARIMA: (6, 0, 1) MSE=0.000








ARIMA: (6, 0, 2) MSE=0.000








ARIMA: (6, 0, 3) MSE=0.000








ARIMA: (6, 0, 4) MSE=0.000




ARIMA: (6, 1, 0) MSE=0.000






ARIMA: (6, 1, 1) MSE=0.000








ARIMA: (6, 1, 2) MSE=0.000








ARIMA: (6, 1, 3) MSE=0.000








ARIMA: (6, 1, 4) MSE=0.000




ARIMA: (6, 2, 0) MSE=0.000






ARIMA: (6, 2, 1) MSE=0.000








ARIMA: (6, 2, 2) MSE=0.000






ARIMA: (6, 2, 3) MSE=0.000








ARIMA: (6, 2, 4) MSE=0.000




ARIMA: (6, 3, 0) MSE=0.001






ARIMA: (6, 3, 1) MSE=0.000






ARIMA: (6, 3, 2) MSE=0.000






ARIMA: (6, 3, 3) MSE=0.000








ARIMA: (6, 3, 4) MSE=0.000




ARIMA: (6, 4, 0) MSE=0.001






ARIMA: (6, 4, 1) MSE=0.001






ARIMA: (6, 4, 2) MSE=0.001






ARIMA: (6, 4, 3) MSE=0.001






ARIMA: (6, 4, 4) MSE=0.000




ARIMA: (8, 0, 0) MSE=0.000






ARIMA: (8, 0, 1) MSE=0.000








ARIMA: (8, 0, 2) MSE=0.000








ARIMA: (8, 0, 3) MSE=0.000








ARIMA: (8, 0, 4) MSE=0.000




ARIMA: (8, 1, 0) MSE=0.000






ARIMA: (8, 1, 1) MSE=0.000






ARIMA: (8, 1, 2) MSE=0.000








ARIMA: (8, 1, 3) MSE=0.000








ARIMA: (8, 1, 4) MSE=0.000




ARIMA: (8, 2, 0) MSE=0.000








ARIMA: (8, 2, 1) MSE=0.000








ARIMA: (8, 2, 2) MSE=0.000








ARIMA: (8, 2, 3) MSE=0.000








ARIMA: (8, 2, 4) MSE=0.000




ARIMA: (8, 3, 0) MSE=0.001








ARIMA: (8, 3, 1) MSE=0.000








ARIMA: (8, 3, 2) MSE=0.000








ARIMA: (8, 3, 3) MSE=0.000








ARIMA: (8, 3, 4) MSE=0.001




ARIMA: (8, 4, 0) MSE=0.001








ARIMA: (8, 4, 1) MSE=0.001








ARIMA: (8, 4, 2) MSE=0.001








ARIMA: (8, 4, 3) MSE=0.001






ARIMA: (8, 4, 4) MSE=0.001






ARIMA: (10, 0, 0) MSE=0.000






ARIMA: (10, 0, 1) MSE=0.000








ARIMA: (10, 0, 2) MSE=0.000








ARIMA: (10, 0, 3) MSE=0.000








ARIMA: (10, 0, 4) MSE=0.000
ARIMA: (10, 1, 0) MSE=0.000




ARIMA: (10, 1, 1) MSE=0.000






ARIMA: (10, 1, 2) MSE=0.000






ARIMA: (10, 1, 3) MSE=0.000






ARIMA: (10, 1, 4) MSE=0.000
ARIMA: (10, 2, 0) MSE=0.000








ARIMA: (10, 2, 1) MSE=0.000








ARIMA: (10, 2, 2) MSE=0.000








ARIMA: (10, 2, 3) MSE=0.000






ARIMA: (10, 2, 4) MSE=0.000




ARIMA: (10, 3, 0) MSE=0.001








ARIMA: (10, 3, 1) MSE=0.000








ARIMA: (10, 3, 2) MSE=0.001








ARIMA: (10, 3, 3) MSE=0.000








ARIMA: (10, 3, 4) MSE=0.000
ARIMA: (10, 4, 0) MSE=0.001








ARIMA: (10, 4, 1) MSE=0.001








ARIMA: (10, 4, 2) MSE=0.001








ARIMA: (10, 4, 3) MSE=0.001








ARIMA: (10, 4, 4) MSE=0.001






ARIMA: (15, 0, 0) MSE=0.000






ARIMA: (15, 0, 1) MSE=0.000






ARIMA: (15, 0, 2) MSE=0.000








ARIMA: (15, 0, 3) MSE=0.000








ARIMA: (15, 0, 4) MSE=0.000
ARIMA: (15, 1, 0) MSE=0.000






ARIMA: (15, 1, 1) MSE=0.000






ARIMA: (15, 1, 2) MSE=0.000








ARIMA: (15, 1, 3) MSE=0.000








ARIMA: (15, 1, 4) MSE=0.000




ARIMA: (15, 2, 0) MSE=0.000








ARIMA: (15, 2, 1) MSE=0.000








ARIMA: (15, 2, 2) MSE=0.000








ARIMA: (15, 2, 3) MSE=0.000








ARIMA: (15, 2, 4) MSE=0.000




ARIMA: (15, 3, 0) MSE=0.000








ARIMA: (15, 3, 1) MSE=0.000








ARIMA: (15, 3, 2) MSE=0.000








ARIMA: (15, 3, 3) MSE=0.000








ARIMA: (15, 3, 4) MSE=0.000




ARIMA: (15, 4, 0) MSE=0.001








ARIMA: (15, 4, 1) MSE=0.001






ARIMA: (15, 4, 2) MSE=0.001








ARIMA: (15, 4, 3) MSE=0.001






ARIMA: (15, 4, 4) MSE=0.001






ARIMA: (30, 0, 0) MSE=0.000








ARIMA: (30, 0, 1) MSE=0.000








ARIMA: (30, 0, 2) MSE=0.000








ARIMA: (30, 0, 3) MSE=0.000








ARIMA: (30, 0, 4) MSE=0.000






ARIMA: (30, 1, 0) MSE=0.000






ARIMA: (30, 1, 1) MSE=0.000






ARIMA: (30, 1, 2) MSE=0.000








ARIMA: (30, 1, 3) MSE=0.000








ARIMA: (30, 1, 4) MSE=0.000






ARIMA: (30, 2, 0) MSE=0.000








ARIMA: (30, 2, 1) MSE=0.000








ARIMA: (30, 2, 2) MSE=0.000








ARIMA: (30, 2, 3) MSE=0.000








ARIMA: (30, 2, 4) MSE=0.000






ARIMA: (30, 3, 0) MSE=0.000








ARIMA: (30, 3, 1) MSE=0.000






ARIMA: (30, 3, 2) MSE=0.000








ARIMA: (30, 3, 3) MSE=0.000






ARIMA: (30, 3, 4) MSE=0.000






ARIMA: (30, 4, 0) MSE=0.001








ARIMA: (30, 4, 1) MSE=0.001






ARIMA: (30, 4, 2) MSE=0.001








ARIMA: (30, 4, 3) MSE=0.001






ARIMA: (30, 4, 4) MSE=0.001
Label: like Best ARIM: (2, 1, 3) MSE=0.000




ARIMA: (0, 0, 0) MSE=0.002




ARIMA: (0, 0, 1) MSE=0.001




ARIMA: (0, 0, 2) MSE=0.001




ARIMA: (0, 0, 3) MSE=0.001




ARIMA: (0, 0, 4) MSE=0.001




ARIMA: (0, 1, 0) MSE=0.000




ARIMA: (0, 1, 1) MSE=0.000




ARIMA: (0, 1, 2) MSE=0.000




ARIMA: (0, 1, 3) MSE=0.000




ARIMA: (0, 1, 4) MSE=0.000




ARIMA: (0, 2, 0) MSE=0.001




ARIMA: (0, 2, 1) MSE=0.000






ARIMA: (0, 2, 2) MSE=0.000






ARIMA: (0, 2, 3) MSE=0.000






ARIMA: (0, 2, 4) MSE=0.000




ARIMA: (0, 3, 0) MSE=0.004




ARIMA: (0, 3, 1) MSE=0.001




ARIMA: (0, 3, 2) MSE=0.000




ARIMA: (0, 3, 3) MSE=0.000




ARIMA: (0, 3, 4) MSE=0.000




ARIMA: (0, 4, 0) MSE=0.016




ARIMA: (0, 4, 1) MSE=0.004




ARIMA: (0, 4, 2) MSE=0.001




ARIMA: (0, 4, 3) MSE=0.001




ARIMA: (0, 4, 4) MSE=0.000




ARIMA: (1, 0, 0) MSE=0.001




ARIMA: (1, 0, 1) MSE=0.000




ARIMA: (1, 0, 2) MSE=0.000






ARIMA: (1, 0, 3) MSE=0.000






ARIMA: (1, 0, 4) MSE=0.000
ARIMA: (1, 1, 0) MSE=0.000
ARIMA: (1, 1, 1) MSE=0.000




ARIMA: (1, 1, 2) MSE=0.000




ARIMA: (1, 1, 3) MSE=0.000






ARIMA: (1, 1, 4) MSE=0.000




ARIMA: (1, 2, 0) MSE=0.001




ARIMA: (1, 2, 1) MSE=0.000






ARIMA: (1, 2, 2) MSE=0.000




ARIMA: (1, 2, 3) MSE=0.000






ARIMA: (1, 2, 4) MSE=0.000




ARIMA: (1, 3, 0) MSE=0.001
ARIMA: (1, 3, 1) MSE=0.001




ARIMA: (1, 3, 2) MSE=0.000




ARIMA: (1, 3, 3) MSE=0.000






ARIMA: (1, 3, 4) MSE=0.000




ARIMA: (1, 4, 0) MSE=0.004




ARIMA: (1, 4, 1) MSE=0.001




ARIMA: (1, 4, 2) MSE=0.001




ARIMA: (1, 4, 3) MSE=0.000




ARIMA: (1, 4, 4) MSE=0.000




ARIMA: (2, 0, 0) MSE=0.000




ARIMA: (2, 0, 1) MSE=0.000






ARIMA: (2, 0, 2) MSE=0.000




ARIMA: (2, 0, 3) MSE=0.000






ARIMA: (2, 0, 4) MSE=0.000




ARIMA: (2, 1, 0) MSE=0.000




ARIMA: (2, 1, 1) MSE=0.000




ARIMA: (2, 1, 2) MSE=0.000




ARIMA: (2, 1, 3) MSE=0.000






ARIMA: (2, 1, 4) MSE=0.000




ARIMA: (2, 2, 0) MSE=0.000






ARIMA: (2, 2, 1) MSE=0.000




ARIMA: (2, 2, 2) MSE=0.000






ARIMA: (2, 2, 3) MSE=0.000






ARIMA: (2, 2, 4) MSE=0.000




ARIMA: (2, 3, 0) MSE=0.001




ARIMA: (2, 3, 1) MSE=0.000




ARIMA: (2, 3, 2) MSE=0.000




ARIMA: (2, 3, 3) MSE=0.000






ARIMA: (2, 3, 4) MSE=0.000




ARIMA: (2, 4, 0) MSE=0.002




ARIMA: (2, 4, 1) MSE=0.001






ARIMA: (2, 4, 2) MSE=0.000




ARIMA: (2, 4, 3) MSE=0.001




ARIMA: (2, 4, 4) MSE=0.000




ARIMA: (4, 0, 0) MSE=0.000






ARIMA: (4, 0, 1) MSE=0.000






ARIMA: (4, 0, 2) MSE=0.000






ARIMA: (4, 0, 3) MSE=0.000






ARIMA: (4, 0, 4) MSE=0.000
ARIMA: (4, 1, 0) MSE=0.000






ARIMA: (4, 1, 1) MSE=0.000






ARIMA: (4, 1, 2) MSE=0.000






ARIMA: (4, 1, 3) MSE=0.000








ARIMA: (4, 1, 4) MSE=0.000




ARIMA: (4, 2, 0) MSE=0.000






ARIMA: (4, 2, 1) MSE=0.000






ARIMA: (4, 2, 2) MSE=0.000








ARIMA: (4, 2, 3) MSE=0.000








ARIMA: (4, 2, 4) MSE=0.000




ARIMA: (4, 3, 0) MSE=0.001






ARIMA: (4, 3, 1) MSE=0.000






ARIMA: (4, 3, 2) MSE=0.000






ARIMA: (4, 3, 3) MSE=0.000






ARIMA: (4, 3, 4) MSE=0.000




ARIMA: (4, 4, 0) MSE=0.001






ARIMA: (4, 4, 1) MSE=0.001






ARIMA: (4, 4, 2) MSE=0.001






ARIMA: (4, 4, 3) MSE=0.000






ARIMA: (4, 4, 4) MSE=0.000




ARIMA: (6, 0, 0) MSE=0.000




ARIMA: (6, 0, 1) MSE=0.000






ARIMA: (6, 0, 2) MSE=0.000








ARIMA: (6, 0, 3) MSE=0.000






ARIMA: (6, 0, 4) MSE=0.000




ARIMA: (6, 1, 0) MSE=0.000








ARIMA: (6, 1, 1) MSE=0.000






ARIMA: (6, 1, 2) MSE=0.000






ARIMA: (6, 1, 3) MSE=0.000






ARIMA: (6, 1, 4) MSE=0.000
ARIMA: (6, 2, 0) MSE=0.000






ARIMA: (6, 2, 1) MSE=0.000








ARIMA: (6, 2, 2) MSE=0.000








ARIMA: (6, 2, 3) MSE=0.000






ARIMA: (6, 2, 4) MSE=0.000
ARIMA: (6, 3, 0) MSE=0.000








ARIMA: (6, 3, 1) MSE=0.000






ARIMA: (6, 3, 2) MSE=0.000








ARIMA: (6, 3, 3) MSE=0.000




LinAlgError: LU decomposition error.

In [None]:
train_size = int(len(X) * 0.66)
train, test = X[0:train_size], X[train_size:]

In [None]:
mlp_df = pd.concat([df.groupby('date').content_re.sum(), 
                     df.groupby('date').likeP.mean(),
                     df.groupby('date').commentP.mean(),
                     df.groupby('date').forwardP.mean()], axis = 1).reset_index()

In [None]:
size = int(len(lstm_df) * 0.8)

In [None]:
vectorizer = TfidfVectorizer(tokenizer=jieba.cut, 
                             stop_words=stopwords(["zh"]) | set([w for w in corpus if len(w) == 1]) | set(['借傥', '唷', '啷']), 
                             max_features = 20,
                             max_df = 0.9,
                             use_idf = False)
scaler = StandardScaler()

vectorized_content_train = vectorizer.fit_transform(mlp_df.content_re[:size])
scaled_content_train = scaler.fit_transform(vectorized_content_train.todense())
vectorized_content = vectorizer.transform(mlp_df.content_re)
scaled_content = scaler.transform(vectorized_content.todense())

In [None]:
def split_sequence(sequence, n_step):
    X, y = list(), list()
    for i in range(len(sequence)):
        # find the end of this pattern
        end_ix = i + n_step
        # check if we are beyond the sequence
        if end_ix > len(sequence)-1:
            break
        # gather input and output parts of the pattern
        seq_x, seq_y = np.concatenate((sequence[i:end_ix,1:].flatten(), sequence[i:end_ix - 1,0])), sequence[end_ix, [0]]
        X.append(seq_x)
        y.append(seq_y)
    return np.array(X), np.array(y)

In [None]:
def train_model(x_train, y_train, x_test, y_test, n_feature):
    model = Sequential()
    model.add(Dense(n_feature, activation='relu', input_shape=(n_feature,)))
    model.add(Dense(1))
    model.compile(optimizer='adam', loss='mse')
    model.fit(x_train, y_train, epochs=200, verbose=0)
    predictions = model.predict(x_test, verbose=1)
    mse = mean_squared_error(y_test, predictions)
    return mse

In [None]:
def evaluate_models(features, n_steps):
    for feature in features:
        X = np.concatenate((np.expand_dims(mlp_df[feature + 'P'].values, 1), scaled_content), axis = 1)
        train, test = X[0:size], X[size:len(X)]
        best_score, best_cfg = float("inf"), None
        for n_step in n_steps:
            n_feature = n_step * 20 + n_step - 1
            X_train, y_train = split_sequence(train, n_step)
            X_test, y_test = split_sequence(test, n_step)
            mse = train_model(X_train, y_train, X_test, y_test, n_feature)
            order = n_step
            if mse < best_score:
                best_score, best_cfg = mse, order
            #print('MLP: %s MSE=%.3f' % (n_step, mse))
        print()
        print('Label: %s Best MLP: %s MSE=%.3f' % (feature, best_cfg, best_score))
        print()

In [None]:
evaluate_models(['like', 'comment', 'forward'], [1, 2, 5])