In [1]:
import pandas as pd
import numpy as np
import datetime

#import xgboost as xgb

from keras.models import Sequential
from keras.layers import Dense
from keras import layers
from keras.layers import Dropout

import matplotlib.pyplot as plt
from tensorflow import keras

In [2]:

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import cross_val_score
import torch
import transformers as ppb
import warnings
warnings.filterwarnings('ignore')

In [3]:
from transformers import (
   BertTokenizerFast,
   AutoModelForMaskedLM,
   AutoModelForTokenClassification,
)

tokenizer = BertTokenizerFast.from_pretrained('bert-base-chinese')
model = AutoModelForMaskedLM.from_pretrained('ckiplab/albert-tiny-chinese') 

In [4]:
#this is the raw dataset
data16 = pd.read_csv(r'/Users/cairo/Google Drive/wechat data/016.csv', sep=',', error_bad_lines=False, index_col=False, dtype='unicode')
#data16 = pd.read_csv(r'/Users/Junhao/Google Drive/wechat data/016.csv', sep=',', error_bad_lines=False, index_col=False, dtype='unicode')

In [None]:
#topicdata = pd.read_csv(r'C:/Users/Junhao/Google Drive/wechat data/TopicOutcomeAll20Topic.csv', sep=',', error_bad_lines=False, index_col=False, dtype='unicode')

In [5]:
#this dataset is the raw data processed with cutting word package jieba(a popular word cutting tool for chinese)
#cutword = pd.read_pickle("C:/Users/Junhao/Google Drive/wechat data/cut001.pkl")
#cutword = pd.read_pickle("/Users/cairo/Google Drive/wechat data/cut001.pkl")

In [5]:
#this is the dataset with topic modeling output
#topicdata = pd.read_csv(r'/Users/cairo/Google Drive/wechat data/TopicOutcomeAll20Topic.csv', sep=',', error_bad_lines=False, index_col=False, dtype='unicode')
#topicdata = pd.read_csv(r'C:/Users/Junhao/Google Drive/wechat data/TopicOutcomeAll20Topic.csv', sep=',', error_bad_lines=False, index_col=False, dtype='unicode')

In [10]:
list(cutword.columns) 

['name',
 'account',
 'author',
 'type',
 'title',
 'summary',
 'likeCount',
 'clicksCount',
 'url',
 'publicTime',
 'orderNum',
 'originalFlag',
 'imageUrl',
 'sourceUrl',
 'videoUrl',
 'musicUrl',
 'audioUrl',
 'updateTime',
 'insertTime',
 'wordsCut']

In [11]:
list(topicdata.columns)

['name',
 'account',
 'author',
 'type',
 'title',
 'summary',
 'likeCount',
 'clicksCount',
 'url',
 'publicTime',
 'orderNum',
 'originalFlag',
 'imageUrl',
 'sourceUrl',
 'videoUrl',
 'musicUrl',
 'audioUrl',
 'updateTime',
 'insertTime',
 'topic0',
 'topic1',
 'topic2',
 'topic3',
 'topic4',
 'topic5',
 'topic6',
 'topic7',
 'topic8',
 'topic9',
 'topic10',
 'topic11',
 'topic12',
 'topic13',
 'topic14',
 'topic15',
 'topic16',
 'topic17',
 'topic18',
 'topic19']

In [37]:
list(data16.columns)

['name',
 'account',
 'author',
 'type',
 'title',
 'summary',
 'content',
 'likeCount',
 'clicksCount',
 'url',
 'publicTime',
 'orderNum',
 'originalFlag',
 'imageUrl',
 'sourceUrl',
 'videoUrl',
 'musicUrl',
 'audioUrl',
 'updateTime',
 'insertTime']

In [5]:
#cutword1 = cutword.head(1000).wordsCut
#cutword3 = cutword.head(1000).title  #i can also try to use only the article titles as bert input 

cutword2 = data16.head(500).content.dropna() #use 500 article content from rawdata as bert input


In [6]:
#I need to cut each article to the length of 100 characters because the tokenizer can only deal with a max length of 512
#the computation takes too long if the kept article length is long

cutword3 = []
for i in cutword2:
    dd = i[:100]
    cutword3.append(dd)

In [7]:
cutword3[0:5]

['今天最多人讨论的就是姚贝娜去世的新闻，满屏的悼念。但是她离开的杀手，大家是否足够了解？和梅艳芳同被乳腺癌夺取年轻的生命。今天转发一个视频希望更多女性重视这个疾病。［一分钟远离乳腺癌］VIDEO提起乳腺',
 '传递健康资讯 成就美好人生 今天不养生，明天养医生。请将健康观念传递给更多朋友。 更多精彩内容请添加小编个人微信：13861899065山有山的高度，水有水的深度，没必要攀比，每个人都有自己的长处；风',
 '↑↑↑巴西插画师 Butcher Billy 将世界各国政商圈的大爷们变成美漫里的大反派——而且看上去还挺符合他们的。他将这个系列命名为现实世界的“复仇者联盟”。Billy除了是个优秀的插画师，他还是',
 '传递健康资讯 成就美好人生 今天不养生，明天养医生。请将健康观念传递给更多朋友。 更多精彩内容请添加小编个人微信：13861899065（图为癌症患者绝望的流泪）所有人都想远离癌症、健康长寿，但你想知',
 '好贱，好贱的。哈哈！喜欢、请点底部，为一神打打气！']

In [8]:
cutword3 = pd.Series(cutword3)
tokenized = cutword3.apply((lambda x: tokenizer.encode(x, add_special_tokens=True)))

In [9]:
cutword3.shape

(498,)

In [10]:
#padding 
max_len = 0
for i in tokenized.values:
    if len(i) > max_len:
        max_len = len(i)

padded = np.array([i + [0]*(max_len-len(i)) for i in tokenized.values])
padded.shape

(498, 102)

In [11]:
#masking
attention_mask = np.where(padded != 0, 1, 0)
attention_mask.shape

(498, 102)

In [12]:
input_ids = torch.tensor(padded)  
input_ids = torch.tensor(input_ids).to(torch.int64)
attention_mask = torch.tensor(attention_mask)


In [13]:
attention_mask.shape

torch.Size([498, 102])

In [14]:
input_ids.shape

torch.Size([498, 102])

In [15]:
# torch.no_grad tells PyTorch not to construct the compute graph during this forward pass (since we won’t be running backprop here)–this just reduces memory consumption and speeds things up a little.

with torch.no_grad():
    last_hidden_states = model(input_ids, attention_mask=attention_mask)

In [16]:
#use only CLS (first element of each sentence) as the feature which captures all information of the sentence
features = last_hidden_states[0][:,0,:].numpy() 
features.shape
'''
The full set of hidden states for this model, stored in the object hidden_states, is a little dizzying. This object has four dimensions, in the following order:

The layer number (13 layers)
The batch number (1 sentence)
The word / token number (22 tokens in our sentence)
The hidden unit / feature number (768 features)
Wait, 13 layers? Doesn’t BERT only have 12? It’s 13 because the first element is the input embeddings, the rest is the outputs of each of BERT’s 12 layers.

That’s 219,648 unique values just to represent our one sentence!

The second dimension, the batch size, is used when submitting multiple sentences to the model at once; here, though, we just have one example sentence.

print ("Number of layers:", len(hidden_states), "  (initial embeddings + 12 BERT layers)")
layer_i = 0

print ("Number of batches:", len(hidden_states[layer_i]))
batch_i = 0

print ("Number of tokens:", len(hidden_states[layer_i][batch_i]))
token_i = 0

print ("Number of hidden units:", len(hidden_states[layer_i][batch_i][token_i]))
'''

'\nThe full set of hidden states for this model, stored in the object hidden_states, is a little dizzying. This object has four dimensions, in the following order:\n\nThe layer number (13 layers)\nThe batch number (1 sentence)\nThe word / token number (22 tokens in our sentence)\nThe hidden unit / feature number (768 features)\nWait, 13 layers? Doesn’t BERT only have 12? It’s 13 because the first element is the input embeddings, the rest is the outputs of each of BERT’s 12 layers.\n\nThat’s 219,648 unique values just to represent our one sentence!\n\nThe second dimension, the batch size, is used when submitting multiple sentences to the model at once; here, though, we just have one example sentence.\n\nprint ("Number of layers:", len(hidden_states), "  (initial embeddings + 12 BERT layers)")\nlayer_i = 0\n\nprint ("Number of batches:", len(hidden_states[layer_i]))\nbatch_i = 0\n\nprint ("Number of tokens:", len(hidden_states[layer_i][batch_i]))\ntoken_i = 0\n\nprint ("Number of hidden 

In [30]:
#another way to get the feature is compute the average of all token vectors in each sentence
dd = last_hidden_states[0][:,1:101,:].numpy()
features2 = dd.mean(axis=1)
features2.shape

(498, 21128)

In [32]:
likeCount = data16.head(500).likeCount #use likecount as dependent variable 

df = data16[["likeCount", "content"]].head(1000)

removeinds = pd.isnull(df).any(1).to_numpy().nonzero()[0]

likeCount = [i for j, i in enumerate(likeCount) if j not in removeinds] # remove the two likecount where content is empty


In [37]:
train_features, test_features, train_output, test_output = train_test_split(features, likeCount)
train_features2, test_features2, train_output2, test_output2 = train_test_split(features2, likeCount)

In [39]:
#training error
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score, mean_squared_error


reg = LinearRegression().fit(train_features, train_output)
#reg.score(train_features, train_labels)
preds = reg.predict(train_features)
rmse = np.sqrt(mean_squared_error(np.array(train_output).astype(float), preds))
print("RMSE: %f" % (rmse))
print("R square: %f" % (r2_score(np.array(train_output).astype(float), preds)))


reg2 = LinearRegression().fit(train_features2, train_output2)
#reg.score(train_features, train_labels)
preds2 = reg.predict(train_features2)
rmse2 = np.sqrt(mean_squared_error(np.array(train_output2).astype(float), preds2))
print("RMSE: %f" % (rmse2))
print("R square: %f" % (r2_score(np.array(train_output2).astype(float), preds2)))

RMSE: 78.991804
R square: 0.687679
RMSE: 397.117361
R square: -6.254265


In [35]:
from statsmodels.formula.api  import ols
import statsmodels.api as sm
results = sm.OLS(np.array(train_output).astype(float), train_features.astype(float)).fit()
results.summary().tables[0]

0,1,2,3
Dep. Variable:,y,R-squared:,0.965
Model:,OLS,Adj. R-squared:,0.007
Method:,Least Squares,F-statistic:,1.007
Date:,"Sat, 30 Jan 2021",Prob (F-statistic):,0.543
Time:,22:55:54,Log-Likelihood:,-1749.2
No. Observations:,373,AIC:,4218.0
Df Residuals:,13,BIC:,5630.0
Df Model:,359,,
Covariance Type:,nonrobust,,


In [38]:

results2 = sm.OLS(np.array(train_output2).astype(float), train_features2.astype(float)).fit()
results2.summary().tables[0]

0,1,2,3
Dep. Variable:,y,R-squared:,0.952
Model:,OLS,Adj. R-squared:,-0.114
Method:,Least Squares,F-statistic:,0.8929
Date:,"Sat, 30 Jan 2021",Prob (F-statistic):,0.666
Time:,22:58:33,Log-Likelihood:,-1825.2
No. Observations:,373,AIC:,4364.0
Df Residuals:,16,BIC:,5764.0
Df Model:,356,,
Covariance Type:,nonrobust,,


In [59]:
#test error
preds = reg.predict(test_features)

rmse = np.sqrt(mean_squared_error(np.array(test_output).astype(float), preds))
print("RMSE: %f" % (rmse))

print("R square: %f" % (r2_score(np.array(test_output).astype(float), preds)))

RMSE: 242.183767
R square: -0.650406
