In [1]:
from ssestm import SSESTM
from tqdm import tqdm
import pandas as pd
import numpy as np

# 1. Load data

In [2]:
# Load the source workbook (article_new.xlsx) into a DataFrame; the
# train/test split below is drawn from these rows.
df = pd.read_excel("./data/article_new.xlsx")

In [3]:
# Randomly assign each row to the training set (~70%) or the test set (~30%).
# A fixed seed makes the split -- and therefore train.xlsx, test.xlsx and
# every downstream result -- reproducible across kernel restarts.
SPLIT_SEED = 42
rng = np.random.default_rng(SPLIT_SEED)
msk = rng.random(len(df)) < 0.7
train = df[msk]
test = df[~msk]

In [4]:
# Persist the training split; model.train() later reads it from this path.
# index=False is not passed, so the DataFrame index is written as well
# (pandas default).
train.to_excel("./data/train.xlsx")

In [5]:
# Persist the held-out split; it is read back from this path with
# pd.read_excel before the prediction loop.
test.to_excel("./data/test.xlsx")

KeyboardInterrupt: 

# 2. Model initialization

In [None]:
# NOTE(review): the original comment here was cut off after "If you
# initialize at the first time, you need to". Presumably a first-time
# initialization has to generate the parameters instead of skipping that
# step (see skip_params_gen) -- TODO confirm against the SSESTM package.
model = SSESTM(
    alpha_plus=0.3,
    alpha_minus=0.3,
    kappa=3,
    reg=0.05,
    alpha_rate=0.001,
    max_iters=1000000,
    error=1e-8,
    skip_params_gen=True,
)

In [None]:
# Train the model on the training split that was written to disk above.
model.train(path="./data/train.xlsx")

In [None]:
# Reload the held-out split from disk for evaluation.
# NOTE: this rebinds `df`, replacing the full dataset loaded at the top of
# the notebook; every later cell that uses `df` sees only the test rows.
df = pd.read_excel("./data/test.xlsx")

In [None]:
# Score every usable test row and collect (return, sentiment) pairs.
model.load_params()
return_values = []
sentiments_scores = []
skipped_rows = []  # (index label, error repr) for rows whose prediction failed

for idx, row in tqdm(df.iterrows(), total=df.shape[0]):
    # Rows missing either the target or the text cannot be evaluated.
    if pd.isna(row["Return2"]) or pd.isna(row["Content"]):
        continue
    try:
        score = model.predict(row["Content"])
    except KeyboardInterrupt:
        # Manual stop: keep what has been collected so far. Nothing has been
        # appended for this row yet, so both lists stay the same length.
        break
    except Exception as exc:
        # e.g. math domain error due to the limitation of gradient descent.
        # Skip the row, but remember it instead of silently discarding it.
        skipped_rows.append((idx, repr(exc)))
        continue
    # Append only after a successful prediction so the lists stay aligned.
    return_values.append(row["Return2"])
    sentiments_scores.append(score)

if skipped_rows:
    print(f"Skipped {len(skipped_rows)} row(s) whose prediction raised an exception.")

# 3. Plot correlation

In [None]:
import matplotlib
import matplotlib.pyplot as plt
import statsmodels.api as sm 
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score

In [None]:
%matplotlib inline
matplotlib.style.use('ggplot')

return_values = np.array(return_values, dtype=float).reshape(-1, 1)
sentiments_scores = np.array(sentiments_scores, dtype=float).reshape(-1, 1)

sentiments_scores = np.where(np.isnan(sentiments_scores), sentiments_scores.mean(), sentiments_scores)

linear_regressor = LinearRegression()
linear_regressor.fit(sentiments_scores, return_values)

plt.scatter(sentiments_scores, return_values)

Y_pred = linear_regressor.predict(sentiments_scores)
plt.plot(sentiments_scores, Y_pred, color='blue')
plt.show()

In [None]:
# Export the (sentiment, return) pairs. Both arrays have shape (n, 1) after
# the reshape in the plotting cell, and pandas rejects 2-D arrays as dict
# columns, so flatten them to 1-D first.
pd.DataFrame({
    "sentiment": np.ravel(sentiments_scores),
    "return": np.ravel(return_values),
}).to_excel("./data/result.xlsx")

In [None]:
# R^2 of the fitted regression line (Y_pred) against the observed return values.
r2_score(return_values, Y_pred)

In [None]:
# Test the significance of the sentiment -> return relationship.
# sm.OLS takes (endog, exog): the dependent variable is the observed return
# and the regressor is the sentiment score. statsmodels does not add an
# intercept on its own, so add_constant() is needed to match the
# LinearRegression fit above (which fits an intercept by default).
mod = sm.OLS(return_values, sm.add_constant(sentiments_scores))
fii = mod.fit()
# tables[1] is the coefficient table; its 'P>|t|' column holds the p-values
# (one row for the intercept, one for the sentiment slope).
p_values = fii.summary2().tables[1]['P>|t|']

In [None]:
# Display the p-values extracted from the OLS coefficient table.
p_values