In [1]:
import pandas as pd

from companies import earning_calls_id_ticker_map

In [2]:
# Enable IPython's autoreload extension in mode 2 so that edits to imported
# project modules (e.g. companies) are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [4]:
# Load the three text-derived sentiment tables (earning calls, news, reviews).
df_earning_calls_sentiment = pd.read_csv("data/text_results/earning_calls_sentiment.csv")
df_news_sentiment = pd.read_csv("data/text_results/news_sentiment.csv")
# index_col=0: the first CSV column of the reviews file is used as the row index.
df_reviews_sentiment = pd.read_csv("data/text_results/reviews_sentiment.csv", index_col=0)

In [5]:
# Load the train/test splits of the two base tables (CAR5 and REV files) that the
# sentiment features are merged into below.
df_train_car = pd.read_csv("data/train_data_CAR5.csv")
df_test_car = pd.read_csv("data/test_data_CAR5.csv")
df_train_rev = pd.read_csv("data/train_data_REV.csv")
df_test_rev = pd.read_csv("data/test_data_REV.csv")

## Join Earning Calls

In [6]:
# Preview the first rows of the raw earning-calls sentiment table.
df_earning_calls_sentiment.head()

Unnamed: 0,company,quarter,year,sentiment_score,confidence,complexity
0,1038351,Q1,2010,-0.518244,0.748589,-2.029088
1,1038351,Q2,2010,-0.236819,0.603098,-2.044513
2,1038351,Q3,2008,0.20191,0.57131,-0.693308
3,1038351,Q4,2009,0.345938,0.664661,-2.092329
4,1038352,Q1,2024,-0.792844,0.885112,0.532956


In [7]:
# List the distinct year values; the output shows they are strings and include
# the placeholder 'Unknown'.
df_earning_calls_sentiment["year"].unique()

array(['2010', '2008', '2009', '2024', '2023', '2022', '2011', '2012',
       '2013', '2014', '2015', '2016', '2007', '2017', '2018', '2019',
       '2020', '2021', 'Unknown', '2006'], dtype=object)

In [8]:
# Display the raw earning-calls table (head and tail of the truncated rendering).
df_earning_calls_sentiment

Unnamed: 0,company,quarter,year,sentiment_score,confidence,complexity
0,1038351,Q1,2010,-0.518244,0.748589,-2.029088
1,1038351,Q2,2010,-0.236819,0.603098,-2.044513
2,1038351,Q3,2008,0.201910,0.571310,-0.693308
3,1038351,Q4,2009,0.345938,0.664661,-2.092329
4,1038352,Q1,2024,-0.792844,0.885112,0.532956
...,...,...,...,...,...,...
3863,98045865,Q4,2012,0.875903,0.926368,-0.975581
3864,98045865,Q4,2013,0.634635,0.806643,0.580773
3865,98045865,Q4,2014,-0.194173,0.419272,-0.896915
3866,98045865,Q4,2015,-0.047659,0.511664,0.580773


In [9]:
# Translate each company id into a ticker symbol through earning_calls_id_ticker_map
# (imported from companies) and store it in a new "tic" column, the key used for
# the merges below.
df_earning_calls_sentiment["tic"] = df_earning_calls_sentiment["company"].map(earning_calls_id_ticker_map)

In [10]:
# Count missing values per column (also confirms whether every company id was mapped to a ticker).
df_earning_calls_sentiment.isna().sum()

company            0
quarter            0
year               0
sentiment_score    0
confidence         0
complexity         0
tic                0
dtype: int64

In [11]:
# Build the period key by gluing the year text onto the quarter label
# (e.g. "2010" and "Q1" give "2010Q1").
df_earning_calls_sentiment["datacqtr"] = (
    df_earning_calls_sentiment["year"].astype(str).str.cat(df_earning_calls_sentiment["quarter"])
)

In [12]:
# Keep only the two join keys and the three feature columns, as an independent copy.
df_earning_calls_sentiment = df_earning_calls_sentiment[["datacqtr", "tic", "sentiment_score", "confidence", "complexity"]].copy()

In [13]:
# Prefix the three feature columns with "earning_calls_" so they stay
# distinguishable from the news and review features merged later. The join keys
# ("datacqtr", "tic") keep their names, so they need no entry in the mapping.
# The result is rebound instead of using inplace=True, which keeps the cell a
# plain expression of its input.
df_earning_calls_sentiment = df_earning_calls_sentiment.rename(
    columns={
        "sentiment_score": "earning_calls_sentiment",
        "confidence": "earning_calls_confidence",
        "complexity": "earning_calls_complexity",
    }
)

In [14]:
# Order the rows by ticker and, within each ticker, by period label (both ascending,
# which is the sort_values default).
df_earning_calls_sentiment = df_earning_calls_sentiment.sort_values(by=["tic", "datacqtr"])

In [15]:
# Keep only rows whose period label is a well-formed "<4-digit year>Q<1-4>" string.
# Comparing against the single literal "UnknownUnknown" would let half-unknown
# labels (an unknown year with a known quarter, or the reverse) through; such
# labels can never match a real quarter in the merges and would distort the
# per-ticker ordering used when the features are lagged.
has_valid_quarter_label = df_earning_calls_sentiment["datacqtr"].str.match(r"^\d{4}Q[1-4]$", na=False)
df_earning_calls_sentiment = df_earning_calls_sentiment[has_valid_quarter_label].copy()

In [16]:
# Display the cleaned, renamed and sorted earning-calls table.
df_earning_calls_sentiment

Unnamed: 0,datacqtr,tic,earning_calls_sentiment,earning_calls_confidence,earning_calls_complexity
416,2007Q3,AFCL,0.827717,0.891771,0.423441
419,2007Q4,AFCL,0.694979,0.839299,-2.374602
412,2008Q1,AFCL,0.717063,0.842497,-1.210036
414,2008Q2,AFCL,0.840870,0.910443,-2.217270
417,2008Q3,AFCL,0.747791,0.862192,-2.451725
...,...,...,...,...,...
2245,2023Q4,ZION,0.722900,0.845726,0.408016
2227,2024Q1,ZION,0.792808,0.883398,-0.943189
2233,2024Q2,ZION,0.774977,0.876247,0.392591
2239,2024Q3,ZION,0.702208,0.839332,-0.881490


In [17]:
# Collapse multiple rows for the same (tic, datacqtr) into one by averaging the
# feature columns; the keys become the index of the result.
df_earning_calls_sentiment = df_earning_calls_sentiment.groupby(by=["tic", "datacqtr"]).mean()

In [18]:
# Turn the (tic, datacqtr) group index back into ordinary columns.
df_earning_calls_sentiment = df_earning_calls_sentiment.reset_index()

In [19]:
# Shift every earning-call observation forward by exactly one calendar quarter,
# so the features stored under quarter t describe the call of quarter t-1.
#
# The label is advanced arithmetically instead of borrowing the label of the
# ticker's next available row. The call history is sparse (the raw preview shows
# company 1038351 with 2010Q1, 2010Q2, 2008Q3 and 2009Q4 only), so a row-based
# shift would file the 2008Q3 call under 2009Q4 and silently attach stale
# features. As a consequence the most recent call of each ticker is kept and
# labelled with the following quarter rather than being dropped.
quarter_parts = df_earning_calls_sentiment["datacqtr"].str.extract(r"^(?P<year>\d{4})Q(?P<quarter>[1-4])$")
# Rows whose label is not "<year>Q<1-4>" cannot be advanced and are left out.
has_valid_label = quarter_parts.notna().all(axis=1)
quarter_numbers = quarter_parts[has_valid_label].astype(int)
# Position of the *following* quarter on a running quarter axis: year * 4 + (quarter - 1) + 1.
next_quarter_index = quarter_numbers["year"] * 4 + quarter_numbers["quarter"]
next_quarter_label = (next_quarter_index // 4).astype(str) + "Q" + (next_quarter_index % 4 + 1).astype(str)
df_earning_calls_sentiment_lagged = (
    df_earning_calls_sentiment.loc[has_valid_label]
    .assign(datacqtr=next_quarter_label)
    .dropna()
    .sort_values(by=["tic", "datacqtr"])
)

In [20]:
# Display the lagged earning-call features.
df_earning_calls_sentiment_lagged

Unnamed: 0,tic,datacqtr,earning_calls_sentiment,earning_calls_confidence,earning_calls_complexity
0,AFCL,2007Q4,0.827717,0.891771,0.423441
1,AFCL,2008Q1,0.694979,0.839299,-2.374602
2,AFCL,2008Q2,0.717063,0.842497,-1.210036
3,AFCL,2008Q3,0.840870,0.910443,-2.217270
4,AFCL,2008Q4,0.747791,0.862192,-2.451725
...,...,...,...,...,...
3837,ZION,2023Q4,0.702880,0.843617,0.267651
3838,ZION,2024Q1,0.722900,0.845726,0.408016
3839,ZION,2024Q2,0.792808,0.883398,-0.943189
3840,ZION,2024Q3,0.774977,0.876247,0.392591


In [21]:
# Attach the lagged earning-call features to the CAR training rows. Rows without
# a matching (tic, datacqtr) get fallback values: 0 for sentiment and complexity,
# 0.5 for confidence.
df_train_car_new = df_train_car.merge(
    df_earning_calls_sentiment_lagged,
    how="left",
    on=["tic", "datacqtr"],
).fillna(
    {
        "earning_calls_sentiment": 0,
        "earning_calls_confidence": 0.5,
        "earning_calls_complexity": 0,
    }
)

In [22]:
# Attach the lagged earning-call features to the CAR test rows. Rows without a
# matching (tic, datacqtr) get fallback values: 0 for sentiment and complexity,
# 0.5 for confidence.
df_test_car_new = df_test_car.merge(
    df_earning_calls_sentiment_lagged,
    how="left",
    on=["tic", "datacqtr"],
).fillna(
    {
        "earning_calls_sentiment": 0,
        "earning_calls_confidence": 0.5,
        "earning_calls_complexity": 0,
    }
)

In [23]:
# Attach the lagged earning-call features to the REV training rows. Rows without
# a matching (tic, datacqtr) get fallback values: 0 for sentiment and complexity,
# 0.5 for confidence.
df_train_rev_new = df_train_rev.merge(
    df_earning_calls_sentiment_lagged,
    how="left",
    on=["tic", "datacqtr"],
).fillna(
    {
        "earning_calls_sentiment": 0,
        "earning_calls_confidence": 0.5,
        "earning_calls_complexity": 0,
    }
)

In [24]:
# Attach the lagged earning-call features to the REV test rows. Rows without a
# matching (tic, datacqtr) get fallback values: 0 for sentiment and complexity,
# 0.5 for confidence.
df_test_rev_new = df_test_rev.merge(
    df_earning_calls_sentiment_lagged,
    how="left",
    on=["tic", "datacqtr"],
).fillna(
    {
        "earning_calls_sentiment": 0,
        "earning_calls_confidence": 0.5,
        "earning_calls_complexity": 0,
    }
)

In [25]:
# check length
print(len(df_train_car), len(df_train_car_new))
print(len(df_test_car), len(df_test_car_new))
print(len(df_train_rev), len(df_train_rev_new))
print(len(df_test_rev), len(df_test_rev_new))

8019 8019
1483 1483
7832 7832
1392 1392


## Join News Sentiment

In [26]:
# Preview the first rows of the news sentiment table.
df_news_sentiment.head()

Unnamed: 0,tickers,year,quarter,sentiment_score,confidence,complexity_score
0,AMAL,2024,4,-0.999975,0.99998,0.566283
1,AMTB,2024,4,-0.994198,0.995858,0.243747
2,BAC,2024,2,-0.447598,0.852171,0.394211
3,BAC,2024,3,-0.498395,0.934875,0.24792
4,BAC,2024,4,-0.421286,0.975798,0.316521


In [28]:
# Build the "<year>Q<quarter>" period key. The preview shows the news quarter as a
# bare number (e.g. 4), so the "Q" has to be inserted explicitly here.
df_news_sentiment["datacqtr"] = df_news_sentiment["year"].astype(str) + "Q" + df_news_sentiment["quarter"].astype(str)

In [30]:
# Align the ticker column name with the other tables and prefix the feature
# columns with "news_".
news_column_names = {
    "tickers": "tic",
    "sentiment_score": "news_sentiment",
    "confidence": "news_confidence",
    "complexity_score": "news_complexity_score",
}
df_news_sentiment = df_news_sentiment.rename(columns=news_column_names)

In [32]:
# Keep only the two join keys and the three news feature columns, as an independent copy.
df_news_sentiment = df_news_sentiment[["tic", "datacqtr", "news_sentiment", "news_confidence", "news_complexity_score"]].copy()

In [37]:
# News feature columns and the value used when a row has no matching
# (tic, datacqtr) in df_news_sentiment.
news_fill_values = {"news_sentiment": 0, "news_confidence": 0.5, "news_complexity_score": 0}
# Dropping news columns left by a previous run keeps this cell re-runnable:
# without it, a second execution merges again, pandas suffixes the clashing
# columns with _x/_y, and the fill step fails on the missing plain names.
df_train_car_new = (
    df_train_car_new.drop(columns=list(news_fill_values), errors="ignore")
    .merge(df_news_sentiment, on=["tic", "datacqtr"], how="left")
    .fillna(news_fill_values)
)

In [39]:
# News feature columns and the value used when a row has no matching
# (tic, datacqtr) in df_news_sentiment.
news_fill_values = {"news_sentiment": 0, "news_confidence": 0.5, "news_complexity_score": 0}
# Dropping news columns left by a previous run keeps this cell re-runnable:
# without it, a second execution merges again, pandas suffixes the clashing
# columns with _x/_y, and the fill step fails on the missing plain names.
df_test_car_new = (
    df_test_car_new.drop(columns=list(news_fill_values), errors="ignore")
    .merge(df_news_sentiment, on=["tic", "datacqtr"], how="left")
    .fillna(news_fill_values)
)

In [40]:
# News feature columns and the value used when a row has no matching
# (tic, datacqtr) in df_news_sentiment.
news_fill_values = {"news_sentiment": 0, "news_confidence": 0.5, "news_complexity_score": 0}
# Dropping news columns left by a previous run keeps this cell re-runnable:
# without it, a second execution merges again, pandas suffixes the clashing
# columns with _x/_y, and the fill step fails on the missing plain names.
df_train_rev_new = (
    df_train_rev_new.drop(columns=list(news_fill_values), errors="ignore")
    .merge(df_news_sentiment, on=["tic", "datacqtr"], how="left")
    .fillna(news_fill_values)
)

In [41]:
# News feature columns and the value used when a row has no matching
# (tic, datacqtr) in df_news_sentiment.
news_fill_values = {"news_sentiment": 0, "news_confidence": 0.5, "news_complexity_score": 0}
# Dropping news columns left by a previous run keeps this cell re-runnable:
# without it, a second execution merges again, pandas suffixes the clashing
# columns with _x/_y, and the fill step fails on the missing plain names.
df_test_rev_new = (
    df_test_rev_new.drop(columns=list(news_fill_values), errors="ignore")
    .merge(df_news_sentiment, on=["tic", "datacqtr"], how="left")
    .fillna(news_fill_values)
)

In [42]:
# check length
print(len(df_train_car), len(df_train_car_new))
print(len(df_test_car), len(df_test_car_new))
print(len(df_train_rev), len(df_train_rev_new))
print(len(df_test_rev), len(df_test_rev_new))

8019 8019
1483 1483
7832 7832
1392 1392


## Join Reviews Sentiment

In [43]:
# Summary statistics of the numeric review-sentiment columns.
df_reviews_sentiment.describe()

Unnamed: 0,reviews_rating,text_blob_reviews_sentiment,vader_reviews_sentiment_neg,vader_reviews_sentiment_pos,bert_reviews_label,bert_reviews_score
count,1250.0,1250.0,1250.0,1250.0,1250.0,1250.0
mean,0.606557,0.2197,0.065272,0.248015,0.590013,0.687629
std,0.249097,0.198478,0.057149,0.12875,0.251946,0.098038
min,0.0,-0.695,0.0,0.0,0.0,0.256037
25%,0.46132,0.106081,0.031357,0.17657,0.45,0.654208
50%,0.590971,0.208471,0.059839,0.232642,0.583333,0.690745
75%,0.777083,0.328077,0.087447,0.301083,0.75,0.733813
max,1.0,1.0,0.756,1.0,1.0,0.982982


In [44]:
# Review feature columns and the value used when a row has no matching
# (tic, datacqtr) in df_reviews_sentiment.
# NOTE(review): 0.5 for the two vader_* columns sits far above their observed
# means (about 0.07 and 0.25 in the describe() output) - confirm it is the
# intended "no information" value.
reviews_fill_values = {
    "reviews_rating": 0.5,
    "text_blob_reviews_sentiment": 0,
    "vader_reviews_sentiment_neg": 0.5,
    "vader_reviews_sentiment_pos": 0.5,
    "bert_reviews_label": 0.5,
    "bert_reviews_score": 0.5,
}
# Dropping review columns left by a previous run keeps this cell re-runnable:
# without it, a second execution merges again, pandas suffixes the clashing
# columns with _x/_y, and the fill step fails on the missing plain names.
df_train_car_new = (
    df_train_car_new.drop(columns=list(reviews_fill_values), errors="ignore")
    .merge(df_reviews_sentiment, on=["tic", "datacqtr"], how="left")
    .fillna(reviews_fill_values)
)

In [45]:
# Review feature columns and the value used when a row has no matching
# (tic, datacqtr) in df_reviews_sentiment.
# NOTE(review): 0.5 for the two vader_* columns sits far above their observed
# means (about 0.07 and 0.25 in the describe() output) - confirm it is the
# intended "no information" value.
reviews_fill_values = {
    "reviews_rating": 0.5,
    "text_blob_reviews_sentiment": 0,
    "vader_reviews_sentiment_neg": 0.5,
    "vader_reviews_sentiment_pos": 0.5,
    "bert_reviews_label": 0.5,
    "bert_reviews_score": 0.5,
}
# Dropping review columns left by a previous run keeps this cell re-runnable:
# without it, a second execution merges again, pandas suffixes the clashing
# columns with _x/_y, and the fill step fails on the missing plain names.
df_test_car_new = (
    df_test_car_new.drop(columns=list(reviews_fill_values), errors="ignore")
    .merge(df_reviews_sentiment, on=["tic", "datacqtr"], how="left")
    .fillna(reviews_fill_values)
)

In [46]:
# Review feature columns and the value used when a row has no matching
# (tic, datacqtr) in df_reviews_sentiment.
# NOTE(review): 0.5 for the two vader_* columns sits far above their observed
# means (about 0.07 and 0.25 in the describe() output) - confirm it is the
# intended "no information" value.
reviews_fill_values = {
    "reviews_rating": 0.5,
    "text_blob_reviews_sentiment": 0,
    "vader_reviews_sentiment_neg": 0.5,
    "vader_reviews_sentiment_pos": 0.5,
    "bert_reviews_label": 0.5,
    "bert_reviews_score": 0.5,
}
# Dropping review columns left by a previous run keeps this cell re-runnable:
# without it, a second execution merges again, pandas suffixes the clashing
# columns with _x/_y, and the fill step fails on the missing plain names.
df_train_rev_new = (
    df_train_rev_new.drop(columns=list(reviews_fill_values), errors="ignore")
    .merge(df_reviews_sentiment, on=["tic", "datacqtr"], how="left")
    .fillna(reviews_fill_values)
)

In [47]:
# Review feature columns and the value used when a row has no matching
# (tic, datacqtr) in df_reviews_sentiment.
# NOTE(review): 0.5 for the two vader_* columns sits far above their observed
# means (about 0.07 and 0.25 in the describe() output) - confirm it is the
# intended "no information" value.
reviews_fill_values = {
    "reviews_rating": 0.5,
    "text_blob_reviews_sentiment": 0,
    "vader_reviews_sentiment_neg": 0.5,
    "vader_reviews_sentiment_pos": 0.5,
    "bert_reviews_label": 0.5,
    "bert_reviews_score": 0.5,
}
# Dropping review columns left by a previous run keeps this cell re-runnable:
# without it, a second execution merges again, pandas suffixes the clashing
# columns with _x/_y, and the fill step fails on the missing plain names.
df_test_rev_new = (
    df_test_rev_new.drop(columns=list(reviews_fill_values), errors="ignore")
    .merge(df_reviews_sentiment, on=["tic", "datacqtr"], how="left")
    .fillna(reviews_fill_values)
)

In [48]:
# check length
print(len(df_train_car), len(df_train_car_new))
print(len(df_test_car), len(df_test_car_new))
print(len(df_train_rev), len(df_train_rev_new))
print(len(df_test_rev), len(df_test_rev_new))

8019 8019
1483 1483
7832 7832
1392 1392


In [49]:
# countna
print(df_train_car_new.isna().sum().sum())
print(df_test_car_new.isna().sum().sum())
print(df_train_rev_new.isna().sum().sum())
print(df_test_rev_new.isna().sum().sum())

0
0
0
0


In [50]:
# Show up to 50 randomly chosen rows of the enriched REV training frame.
# A fixed random_state makes the displayed sample identical on every run, and
# drawing the rows directly avoids shuffling the whole frame just to keep 50.
df_train_rev_new.sample(n=min(50, len(df_train_rev_new)), random_state=42)

Unnamed: 0,datacqtr,tic,Total Current Operating Revenue,GDP CHANGE (-1 to 1),UNEMPLOYMENT RATE (0 to 1),PRIME LOAN RATE (0 to 1),DEPOSITS CHANGE (-1 to 1),CONSUMER PRICE INDEX (0 to 1),SAVINGS PER GROSS INCOME (-1 to 1),Net Interest Income,...,earning_calls_complexity,news_sentiment,news_confidence,news_complexity_score,reviews_rating,text_blob_reviews_sentiment,vader_reviews_sentiment_neg,vader_reviews_sentiment_pos,bert_reviews_label,bert_reviews_score
2770,2013Q1,FISI,0.321949,0.566057,0.439716,0.0,0.162342,0.565214,0.742857,0.266611,...,0.0,0.0,0.5,0.0,0.5,0.0,0.5,0.5,0.5,0.5
7222,2011Q4,VLY,0.489488,0.557621,0.535461,0.0,0.065829,0.485931,0.6,0.459933,...,-1.021855,0.0,0.5,0.0,0.5,0.0,0.5,0.5,0.5,0.5
2566,2012Q1,FFBC,0.437451,0.568577,0.496454,0.0,0.145928,0.557918,0.757143,0.385361,...,-1.085096,0.0,0.5,0.0,0.5,0.0,0.5,0.5,0.5,0.5
3623,2009Q1,HBAN,0.625605,0.413267,0.496454,0.0,0.132365,0.727719,0.014286,0.593415,...,-1.052704,0.0,0.5,0.0,0.5,0.0,0.5,0.5,0.5,0.5
5548,2007Q2,PNBK,0.249771,0.561612,0.095745,1.0,0.148962,0.840176,0.785714,0.109055,...,0.0,0.0,0.5,0.0,0.5,0.0,0.5,0.5,0.5,0.5
4302,2012Q1,MBFI,0.441909,0.568577,0.496454,0.0,0.145928,0.557918,0.757143,0.415241,...,1.837886,0.0,0.5,0.0,0.5,0.0,0.5,0.5,0.5,0.5
3562,2011Q1,HAFC,0.345449,0.500676,0.578014,0.0,0.082293,0.154854,0.357143,0.277172,...,0.830653,0.0,0.5,0.0,0.0,0.111012,0.169,0.146,0.0,0.505848
1836,2003Q3,DCOM,0.369837,0.616419,0.269504,0.15,0.201673,0.598968,0.728571,0.092336,...,0.0,0.0,0.5,0.0,0.5,0.0,0.5,0.5,0.5,0.5
747,2009Q4,BNCN,0.288013,0.566848,0.673759,0.0,0.139093,0.280827,0.157143,0.20203,...,0.0,0.0,0.5,0.0,0.5,0.0,0.5,0.5,0.5,0.5
7571,2012Q1,WSBC,0.396754,0.568577,0.496454,0.0,0.145928,0.557918,0.757143,0.337167,...,-0.849098,0.0,0.5,0.0,0.5,0.0,0.5,0.5,0.5,0.5


In [51]:
# Column count of the enriched REV frame minus two - presumably excluding the
# datacqtr/tic key columns; TODO confirm what the 2 stands for.
len(df_train_rev_new.columns) - 2

55

In [52]:
# Column count of the enriched CAR frame minus two - presumably excluding the
# datacqtr/tic key columns; TODO confirm what the 2 stands for.
len(df_train_car_new.columns) - 2

38

In [53]:
# check length
df_train_car_new.to_csv("data/train_data_CAR5_with_text.csv", index=False)
df_test_car_new.to_csv("data/test_data_CAR5_with_text.csv", index=False)
df_train_rev_new.to_csv("data/train_data_REV_with_text.csv", index=False)
df_test_rev_new.to_csv("data/test_data_REV_with_text.csv", index=False)