In [1]:
# Import packages
import ast
from collections import Counter, defaultdict
import numpy as np
import pandas as pd

# Combine Comments Dataframe

In [4]:
# Read the three comment feature files. They share the same layout
# (semicolon-delimited, UTF-8), so the reader options are declared once.
comment_csv_options = dict(delimiter=";", encoding="utf-8")
df_comments_pratik = pd.read_csv("data/feature_comments_pratik.csv", **comment_csv_options)
df_comments_rakib = pd.read_csv("data/feature_comments_rakib.csv", **comment_csv_options)
df_comments_dami = pd.read_csv("data/feature_comments_dami.csv", **comment_csv_options)

In [5]:
# Merge three comment dataframes to create a single comment dataframe.
# The frames are stacked row-wise (axis=0) in the order rakib, pratik, dami;
# ignore_index=True discards the per-file row labels and renumbers rows 0..n-1.
df_comments_combined = pd.concat([df_comments_rakib, df_comments_pratik, df_comments_dami], axis=0, ignore_index=True)

In [9]:
def agg_counter_objs(rows):
    """Merge stringified ``{token: count}`` dictionaries into one stringified dictionary.

    Every entry of ``rows`` is parsed with ``ast.literal_eval``. Each key is
    cleaned by removing single quotes, double quotes, ``]`` and backslashes and
    stripping surrounding whitespace. Keys that clean to the same text have
    their counts added together; keys that clean to the empty string are
    dropped. The per-row counts are then summed across all rows.

    Parameters
    ----------
    rows : iterable of str
        String representations of dictionaries mapping tokens to counts.
        Missing entries (``None`` or ``NaN``) are skipped instead of raising.

    Returns
    -------
    str
        ``str(dict(...))`` of the summed counts, keys in first-seen order.
        As with ``Counter`` addition, a key whose running total is not
        positive after a row has been added is discarded.

    Raises
    ------
    ValueError, SyntaxError
        Propagated from ``ast.literal_eval`` when an entry is not a valid
        Python literal.
    """
    agg_counter_obj = Counter()
    for row in rows:
        # A missing CSV cell is read back as None/NaN; it contributes no counts.
        # NaN is the only float that is not equal to itself.
        if row is None or (isinstance(row, float) and row != row):
            continue

        row_counts = Counter()
        for key, count in ast.literal_eval(row).items():
            new_key = key.replace("'", "").replace('"', "").replace("]", "").replace("\\", "").strip()
            # Keys made up only of stripped characters carry no information.
            if new_key:
                row_counts[new_key] += count

        # In-place Counter addition also drops non-positive totals.
        agg_counter_obj += row_counts

    return str(dict(agg_counter_obj))

In [7]:
# Collapse the combined comments to one row per date ("created_utc_x").
df_comments_grouped = df_comments_combined.groupby("created_utc_x")

# Numeric features are summed per date; the stringified count dictionaries are
# merged per date with agg_counter_objs.
summed_columns = ["body_polarity", "body_subjectivity", "count", "score_x"]
counter_columns = ["emojis", "sp500_stocks", "non_sp500_stocks", "sp500_lingos", "non_sp500_lingos"]

per_date_series = [df_comments_grouped[column].sum() for column in summed_columns]
per_date_series += [df_comments_grouped[column].apply(agg_counter_objs) for column in counter_columns]

# Every series is indexed by the same group keys, so a column-wise concat lines
# them up; reset_index() turns the date key back into the first column.
df_comments = pd.concat(per_date_series, axis=1).reset_index()

# Drop the merge suffixes ("_x") from the date and score columns.
df_comments.columns = ["created_utc", "body_polarity", "body_subjectivity", "count", "score", "emojis",
                       "sp500_stocks", "non_sp500_stocks", "sp500_lingos", "non_sp500_lingos"]
df_comments.head()

Unnamed: 0,created_utc,body_polarity,body_subjectivity,count,score,emojis,sp500_stocks,non_sp500_stocks,sp500_lingos,non_sp500_lingos
0,2020-01-01,130.939615,820.633747,2543,3526,"{'🙏': 7, '👌': 5, '🤦': 2, '🏻': 6, 'u': 11, '2':...","{'DD': 18, 'K': 1, 'AMZN': 1, 'MU': 5, 'BA': 7...","{'OGI': 1, 'PTON': 4, 'TSLA': 14, 'ON': 5, 'SP...","{'DD': 18, 'buying': 5, 'go': 6, 'high': 4, 'B...","{'buying': 3, 'DD': 2, 'go': 4, 'holding': 1, ..."
1,2020-01-02,532.971197,4045.896734,12594,28751,"{'🔥': 39, '😘': 11, '⚠': 1, '️': 131, '♻': 1, '...","{'MSFT': 55, 'M': 25, 'A': 96, 'D': 23, 'AMD':...","{'BYND': 5, 'TSLA': 94, 'LMAO': 4, 'CRON': 7, ...","{'DD': 30, 'go': 51, 'moon': 18, 'holding': 29...","{'DD': 3, 'go': 22, 'buying': 9, 'GO': 19, 'YO..."
2,2020-01-03,572.975674,5150.85453,16276,87883,"{'👈': 54, '👍': 69, '🏻': 131, '👋': 4, '🙄': 8, '...","{'ALL': 15, 'A': 98, 'GT': 1, 'LEG': 5, 'AMD':...","{'LOVE': 5, 'WW': 56, 'PT': 7, 'TSLA': 148, 'T...","{'go': 62, 'high': 15, 'DD': 37, 'holding': 31...","{'go': 18, 'high': 10, 'holding': 5, 'buying':..."
3,2020-01-04,275.415978,1741.325526,5419,26025,"{'🙏': 14, '😐': 2, '🇺': 59, '🇸': 59, '👀': 6, '💋...","{'IP': 1, 'VZ': 1, 'AAPL': 31, 'NVDA': 4, 'AMD...","{'LULU': 2, 'TSLA': 36, 'PTON': 8, 'BBQ': 2, '...","{'buying': 9, 'high': 13, 'go': 19, 'squeeze':...","{'buying': 4, 'go': 8, 'holding': 2, 'TENDIES'..."
4,2020-01-05,253.327217,1925.347894,6162,36107,"{'😂': 42, '🚀': 99, '🤏': 1, '👌': 18, '🏽': 11, '...","{'A': 57, 'GD': 7, 'LMT': 54, 'BA': 43, 'K': 1...","{'JD': 1, 'SLS': 1, 'UK': 10, 'AAL': 2, 'TSLA'...","{'holding': 12, 'YOLO': 2, 'go': 24, 'high': 9...","{'high': 2, 'holding': 1, 'YOLO': 1, 'go': 6, ..."


In [6]:
# Save the combined comments dataframe.
# Written semicolon-separated and without the row index, so "created_utc" is an
# ordinary column in the file.
df_comments.to_csv("data/feature_comments.csv", sep=";", index=False)

# Create Features Dataframe with Submissions and Comments

In [2]:
# Read the submission and comment feature files back in, using the
# "created_utc" column as the index of both frames.
feature_csv_options = dict(delimiter=";", index_col="created_utc", encoding="utf-8")
df_submissions = pd.read_csv("data/feature_submissions.csv", **feature_csv_options)
df_comments = pd.read_csv("data/feature_comments.csv", **feature_csv_options)

In [3]:
# Check dtypes and non-null counts for a selection of submission columns.
submission_info_columns = [
    "title_polarity", "title_subjectivity", "body_polarity", "body_subjectivity",
    "emojis", "category", "sp500_stocks", "sp500_lingos",
    "count", "score", "upvote_ratio",
]
df_submissions.loc[:, submission_info_columns].info()

<class 'pandas.core.frame.DataFrame'>
Index: 443 entries, 2020-01-01 to 2021-03-31
Data columns (total 11 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   title_polarity      443 non-null    float64
 1   title_subjectivity  443 non-null    float64
 2   body_polarity       443 non-null    float64
 3   body_subjectivity   443 non-null    float64
 4   emojis              443 non-null    object 
 5   category            443 non-null    object 
 6   sp500_stocks        443 non-null    object 
 7   sp500_lingos        443 non-null    object 
 8   count               443 non-null    int64  
 9   score               443 non-null    int64  
 10  upvote_ratio        443 non-null    float64
dtypes: float64(5), int64(2), object(4)
memory usage: 41.5+ KB


In [5]:
# Check dtypes and non-null counts for a selection of comment columns.
comment_info_columns = [
    "body_polarity", "body_subjectivity",
    "emojis", "sp500_stocks", "sp500_lingos",
    "count", "score",
]
df_comments.loc[:, comment_info_columns].info()

<class 'pandas.core.frame.DataFrame'>
Index: 438 entries, 2020-01-01 to 2021-03-31
Data columns (total 7 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   body_polarity      438 non-null    float64
 1   body_subjectivity  438 non-null    float64
 2   emojis             438 non-null    object 
 3   sp500_stocks       438 non-null    object 
 4   sp500_lingos       438 non-null    object 
 5   count              438 non-null    int64  
 6   score              438 non-null    int64  
dtypes: float64(2), int64(2), object(3)
memory usage: 27.4+ KB


In [6]:
# Build the numeric part of the feature frame: submission columns first, then
# comment columns, aligned on the shared "created_utc" index. Columns that exist
# in both sources are renamed with a "submission_" / "comment_" prefix so they
# stay distinguishable after the merge.
submission_numeric = df_submissions[["title_polarity", "title_subjectivity", "body_polarity", "body_subjectivity",
                                     "count", "score", "upvote_ratio"]].rename(columns={
    "body_polarity": "submission_body_polarity",
    "body_subjectivity": "submission_body_subjectivity",
    "count": "submission_count",
    "score": "submission_score",
})
comment_numeric = df_comments[["body_polarity", "body_subjectivity", "count", "score"]].rename(columns={
    "body_polarity": "comment_body_polarity",
    "body_subjectivity": "comment_body_subjectivity",
    "count": "comment_count",
    "score": "comment_score",
})
df_features = pd.concat([submission_numeric, comment_numeric], axis=1)
df_features

Unnamed: 0_level_0,title_polarity,title_subjectivity,submission_body_polarity,submission_body_subjectivity,submission_count,submission_score,upvote_ratio,comment_body_polarity,comment_body_subjectivity,comment_count,comment_score
created_utc,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
2020-01-01,7.303436,21.868398,6.478874,33.971240,104,104,0.00,130.939615,820.633747,2543.0,3526.0
2020-01-02,4.819957,36.665525,0.440766,52.279186,139,2192,0.00,532.971197,4045.896734,12594.0,28751.0
2020-01-03,4.492971,28.560375,5.997937,41.947068,115,36580,0.00,572.975674,5150.854530,16276.0,87883.0
2020-01-04,1.844444,4.022222,0.093232,7.623026,28,16584,0.00,275.415978,1741.325526,5419.0,26025.0
2020-01-05,1.537381,15.433770,4.989659,22.903422,67,460,0.00,253.327217,1925.347894,6162.0,36107.0
...,...,...,...,...,...,...,...,...,...,...,...
2021-03-27,8.446676,33.257374,6.096034,24.403107,165,183,164.13,80.916034,363.794892,1156.0,2870.0
2021-03-28,32.854605,109.840038,16.024446,59.274937,444,764,431.25,1155.309506,5202.869285,15497.0,17001.0
2021-03-29,56.175202,301.651219,38.316436,178.934077,1187,1253,1183.16,3854.146727,24189.232163,77539.0,195332.0
2021-03-30,63.219496,262.440205,37.621248,171.155115,1180,1200,1178.24,4048.448978,23129.645792,74152.0,278081.0


In [7]:
# Check dtypes and non-null counts of the combined numeric feature frame.
df_features.info()

<class 'pandas.core.frame.DataFrame'>
Index: 443 entries, 2020-01-01 to 2021-03-31
Data columns (total 11 columns):
 #   Column                        Non-Null Count  Dtype  
---  ------                        --------------  -----  
 0   title_polarity                443 non-null    float64
 1   title_subjectivity            443 non-null    float64
 2   submission_body_polarity      443 non-null    float64
 3   submission_body_subjectivity  443 non-null    float64
 4   submission_count              443 non-null    int64  
 5   submission_score              443 non-null    int64  
 6   upvote_ratio                  443 non-null    float64
 7   comment_body_polarity         438 non-null    float64
 8   comment_body_subjectivity     438 non-null    float64
 9   comment_count                 438 non-null    float64
 10  comment_score                 438 non-null    float64
dtypes: float64(9), int64(2)
memory usage: 41.5+ KB


In [10]:
def _expand_counter_feature(column):
    """Merge one count-dictionary column of submissions and comments per date.

    The stringified dictionaries in ``df_submissions[column]`` and
    ``df_comments[column]`` are stacked, merged per index value with
    ``agg_counter_objs`` and expanded so that every dictionary key becomes its
    own column (NaN where a key does not occur for a date).

    The resulting frame is indexed by the group keys themselves rather than by
    ``df_submissions.index``. Rows therefore stay attached to the correct date
    even when a date appears only in the comments or when the submissions
    index is not sorted.
    """
    stacked = pd.concat([df_submissions[column], df_comments[column]], axis=0)
    merged_per_date = stacked.groupby(stacked.index).apply(agg_counter_objs)
    # agg_counter_objs returns str(dict); parse it back before expanding.
    expanded = pd.DataFrame(merged_per_date.apply(ast.literal_eval).tolist(),
                            index=merged_per_date.index)
    return expanded.rename_axis("created_utc")


# Combine emojis feature for submissions and comments
df_emojis = _expand_counter_feature("emojis")

# Combine sp500_stocks feature for submissions and comments
df_sp500_stocks = _expand_counter_feature("sp500_stocks")

# Combine non_sp500_stocks feature for submissions and comments
df_non_sp500_stocks = _expand_counter_feature("non_sp500_stocks")

# Combine sp500_lingos feature for submissions and comments
df_sp500_lingos = _expand_counter_feature("sp500_lingos")

# Combine non_sp500_lingos feature for submissions and comments
df_non_sp500_lingos = _expand_counter_feature("non_sp500_lingos")

# For submissions category
# Parse only values that are still strings, so re-running this cell after the
# column has already been converted does not fail inside ast.literal_eval.
submission_categories = df_submissions["category"].apply(
    lambda value: ast.literal_eval(value) if isinstance(value, str) else value
)
# The parsed dictionaries are written back to df_submissions, as before.
df_submissions["category"] = submission_categories
df_submissions_category = pd.DataFrame(submission_categories.tolist(), index=df_submissions.index)
df_submissions_category = df_submissions_category.rename_axis("created_utc")

In [11]:
# Create the feature dataframe that will be fed into the machine learning model.
# The expanded emoji, category, S&P 500 stock and S&P 500 lingo frames are
# appended column-wise to the numeric features, aligned on the date index.
# df_non_sp500_stocks and df_non_sp500_lingos are intentionally left out.
# NOTE(review): a label such as 'DD' shows up in both the stock and the lingo
# sample outputs, so the result may contain duplicate column names. Confirm
# that downstream code copes with that.
feature_blocks = [
    df_features,
    df_emojis,
    df_submissions_category,
    df_sp500_stocks,
    df_sp500_lingos,
]
df_features = pd.concat(feature_blocks, axis=1)
df_features

Unnamed: 0_level_0,title_polarity,title_subjectivity,submission_body_polarity,submission_body_subjectivity,submission_count,submission_score,upvote_ratio,comment_body_polarity,comment_body_subjectivity,comment_count,...,DD,buying,high,BUY,holding,GO,squeeze,BUYING,TENDIES,BAGHOLDER
created_utc,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2020-01-01,7.303436,21.868398,6.478874,33.971240,104,104,0.00,130.939615,820.633747,2543.0,...,23.0,7.0,5.0,1.0,2.0,,,,,
2020-01-02,4.819957,36.665525,0.440766,52.279186,139,2192,0.00,532.971197,4045.896734,12594.0,...,32.0,31.0,15.0,6.0,31.0,9.0,2.0,3.0,,
2020-01-03,4.492971,28.560375,5.997937,41.947068,115,36580,0.00,572.975674,5150.854530,16276.0,...,44.0,27.0,20.0,7.0,32.0,9.0,2.0,2.0,2.0,
2020-01-04,1.844444,4.022222,0.093232,7.623026,28,16584,0.00,275.415978,1741.325526,5419.0,...,25.0,10.0,14.0,,7.0,,2.0,,1.0,
2020-01-05,1.537381,15.433770,4.989659,22.903422,67,460,0.00,253.327217,1925.347894,6162.0,...,21.0,11.0,9.0,2.0,13.0,1.0,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2021-03-27,8.446676,33.257374,6.096034,24.403107,165,183,164.13,80.916034,363.794892,1156.0,...,21.0,10.0,5.0,1.0,8.0,,12.0,,,
2021-03-28,32.854605,109.840038,16.024446,59.274937,444,764,431.25,1155.309506,5202.869285,15497.0,...,144.0,59.0,30.0,6.0,41.0,2.0,38.0,1.0,2.0,
2021-03-29,56.175202,301.651219,38.316436,178.934077,1187,1253,1183.16,3854.146727,24189.232163,77539.0,...,447.0,193.0,89.0,42.0,139.0,19.0,178.0,10.0,7.0,
2021-03-30,63.219496,262.440205,37.621248,171.155115,1180,1200,1178.24,4048.448978,23129.645792,74152.0,...,376.0,152.0,85.0,34.0,120.0,43.0,127.0,4.0,9.0,


In [12]:
# Move the "created_utc" index into a regular column so it is written to the
# CSV (which is saved with index=False). The guard keeps the cell idempotent:
# once the column exists, re-running the cell no longer resets the index again,
# which would otherwise insert a spurious "index" column.
if "created_utc" not in df_features.columns:
    df_features = df_features.reset_index(level=0)

In [33]:
# Preview the first five rows, skipping the first ten columns.
df_features.head().iloc[:, 10:]

Unnamed: 0,comment_count,comment_score,🙏,🐂,👅,🇺,🇸,🐻,👌,🤦,...,DD,buying,high,BUY,holding,GO,squeeze,BUYING,TENDIES,BAGHOLDER
0,2543.0,3526.0,8.0,1.0,1.0,1.0,1.0,6.0,5.0,2.0,...,23.0,7.0,5.0,1.0,2.0,,,,,
1,12594.0,28751.0,79.0,7.0,1.0,73.0,69.0,39.0,26.0,3.0,...,32.0,31.0,15.0,6.0,31.0,9.0,2.0,3.0,,
2,16276.0,87883.0,36.0,13.0,1.0,448.0,418.0,99.0,30.0,3.0,...,44.0,27.0,20.0,7.0,32.0,9.0,2.0,2.0,2.0,
3,5419.0,26025.0,14.0,6.0,,59.0,59.0,8.0,17.0,3.0,...,25.0,10.0,14.0,,7.0,,2.0,,1.0,
4,6162.0,36107.0,8.0,2.0,2.0,164.0,151.0,13.0,20.0,1.0,...,21.0,11.0,9.0,2.0,13.0,1.0,,,,


In [34]:
# Summarize the final feature frame (row count, column count, dtypes, memory).
df_features.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 443 entries, 0 to 442
Columns: 1856 entries, created_utc to BAGHOLDER
dtypes: float64(1851), int64(4), object(1)
memory usage: 6.3+ MB


In [48]:
# Save the final features dataframe.
# Written semicolon-separated and without the row index.
df_features.to_csv("data/features_sp500.csv", sep=";", index=False)