# Data extraction procedures

## StockTwits

We adapt the methods from https://github.com/p-hiroshige/stockTwitsAPI to extract the StockTwits data.

In [None]:
import os
import json
import pandas as pd
sc = Collector()

# Download the message history of the first ticker and save it to disk in
# files split per calendar month, going back to the 'start' date.
tickers = ['CRO.X', 'SOL.X', 'ALGO.X']
# 'symbols' has to be a list: Collector.get_data loops over it, so a bare
# string would be treated as one symbol per character ('C', 'R', 'O', ...).
chunk = sc.save_history({'symbols': [tickers[0]], 'start': '2021-11-01T00:00:00Z',  'chunk': 'month'})

In [None]:
# Load every monthly JSON dump (history.YYYYMM01.json) into a pandas
# DataFrame and gather the frames in one dictionary keyed by 'YYYY_MM'.
STOCKTWITS_MONTHS = [
    '2022_06', '2022_05', '2022_04', '2022_03',
    '2022_02', '2022_01', '2021_12', '2021_11',
]

# Fields passed to json_normalize as metadata (nested keys given as paths).
STOCKTWITS_META = [
    'id', 'body', 'created_at',
    ['user', 'id'],
    ['user', 'username'],
    ['entities', 'sentiment', 'basic'],
]

# Flattened columns kept for each month.
STOCKTWITS_COLUMNS = ['id', 'body', 'created_at', 'user.username', 'entities.sentiment.basic']

stocktwits_monthly = {}

for month_key in STOCKTWITS_MONTHS:
    # '2022_06' -> 'history.20220601.json': files are named after the first
    # day of the month they contain.
    filename = 'history.' + month_key.replace('_', '') + '01.json'
    with open(filename, 'r', encoding='utf-8') as f:
        data = json.load(f)
    df = pd.json_normalize(data, meta=STOCKTWITS_META)
    stocktwits_monthly[month_key] = df[STOCKTWITS_COLUMNS]

In [None]:
# Save the dictionary to excel, with one dataframe per sheet
import xlsxwriter
# NOTE(review): `path` is not defined in the cells shown here; it has to be
# set earlier in the session (output directory, including trailing separator).
fn_save = path + 'StockTwits_CRO_M.xlsx'
# The engine must be selected when the ExcelWriter is created: to_excel only
# uses its own `engine` argument when it is handed a file path, not a writer.
with pd.ExcelWriter(fn_save, engine="xlsxwriter") as writer:
    for df_name, df in stocktwits_monthly.items():
        print(df_name)
        # One sheet per month, named after the dictionary key.
        df.to_excel(writer, sheet_name=df_name)

## Reddit

To extract data from Reddit, we use the reddit-data-collector package, sourced from https://github.com/nicovandenhooff/reddit-data-collector.

In [None]:
!pip install praw
!pip install reddit-data-collector

In [None]:
import pandas as pd
import praw
import tqdm

In [None]:
# Personal information to be filled by user.
# Each value is read from an environment variable when one is set, so real
# secrets do not have to be saved inside the notebook; otherwise the
# placeholder on the right is used and must be replaced by hand.
import os

personal_script = os.environ.get('REDDIT_CLIENT_ID', 'personal-script')
secret_reddit = os.environ.get('REDDIT_CLIENT_SECRET', 'secret-reddit')
username = os.environ.get('REDDIT_USERNAME', 'user-name')
pw = os.environ.get('REDDIT_PASSWORD', 'pass-word')
user_agent = os.environ.get('REDDIT_USER_AGENT', 'MyAPI/0.0.1')

In [None]:
import reddit_data_collector as rdc
# Gather the Reddit credentials under the keyword names DataCollector
# expects, then build the collector from them in a single call.
_reddit_auth = {
    "client_id": personal_script,
    "client_secret": secret_reddit,
    "user_agent": user_agent,
    "username": username,
    "password": pw,
}
data_collector = rdc.DataCollector(**_reddit_auth)

In [None]:
# Subreddits to collect from.
_target_subreddits = [
    "WallStreetBetsCrypto",
    "CryptoMoonShots",
    "CryptoCurrency",
    "Bitcoin",
    "SHIBArmy",
    "Shibainucoin",
    "ethtrader",
]

# Request the "top" posts filtered by "year", together with their comments
# and replies, returned as DataFrames (dataframe=True).
_collected = data_collector.get_data(
    subreddits=_target_subreddits,
    post_filter="top",
    top_post_filter="year",
    comment_data=True,
    replies_data=True,
    replace_more_limit=16,
    dataframe=True,
)
posts, comments = _collected

In [None]:
# Sanity check: show the (rows, columns) size of the collected comments table.
_comment_rows, _comment_cols = comments.shape
print((_comment_rows, _comment_cols))

In [None]:
# Write posts and comments to one workbook, each on its own sheet.
df_reddit_posts, df_reddit_comments = posts, comments
_reddit_sheets = {
    'Reddit_Posts': df_reddit_posts,
    'Reddit_Comments': df_reddit_comments,
}
with pd.ExcelWriter('Reddit_Corpora.xlsx') as writer:
    for _sheet_title, _frame in _reddit_sheets.items():
        _frame.to_excel(writer, sheet_name=_sheet_title)

## The following classes are used for extracting stocktwits data 

In [None]:
import requests
import json

class twitStreamer():
    """Minimal client for the StockTwits ``streams`` endpoints (API v2).

    Every public method performs one HTTP GET and returns the decoded JSON
    body; a response status other than 200 raises ``Exception``.
    """

    def __init__(self):
        self.url = "https://api.stocktwits.com/api/2/"
        self.headers = {'Content-Type': 'application/json'}
        # Seconds to wait for the server before giving up, so that a stalled
        # connection cannot block a long download forever.
        self.timeout = 30

    def _stream_url(self, stream, resource_id):
        """Return the URL of one stream ('user', 'symbol' or 'conversation')."""
        # str() lets callers pass numeric IDs as well as names.
        return self.url + 'streams/' + stream + '/' + str(resource_id) + '.json'

    @staticmethod
    def _paging_params(since_id, max_id, limit):
        """Return the paging query parameters, each rendered as a string."""
        return {
            'since': '{}'.format(since_id),
            'max': '{}'.format(max_id),
            'limit': '{}'.format(limit),
        }

    def _fetch(self, url, params):
        """GET ``url`` with ``params`` and return the decoded JSON body.

        Raises:
            Exception: if the response status code is not 200.
        """
        r = requests.get(url, headers=self.headers, params=params,
                         timeout=self.timeout)
        if r.status_code != 200:
            raise Exception('Unable to Return Request {}'
                            .format(r.status_code))
        return r.json()

    def get_user_msgs(self, user_id, since=0, max=0, limit=0, callback=None, filter=None):

        """Returns the most recent 30 messages for the specified user.

        Args:
            user_id (int or str) = User ID or Username of the stream's user
                                   you want to show (Required)
            since (int) = Returns results with an ID greater than (
                          more recent than) the specified ID.
            max (int) = Returns results with an ID less than
                        (older than) or equal to the specified ID.
            limit (int) = Default and max limit is 30.
                          This limit must be a number under 30.
            callback = Define your own callback function name.
                       Accepted for signature compatibility; it is not
                       sent with the request.
            filter (string) = Filter messages by links, charts, or videos.
                              (Optional)

        Return:
            raw_json (dict) = The JSON output unparsed

        Raises:
            Exception: if the response status code is not 200.

        """

        url = self._stream_url('user', user_id)

        data = self._paging_params(since, max, limit)
        # NOTE(review): an unset filter is transmitted as the text 'None',
        # exactly as before; confirm the API ignores that value.
        data['filter'] = '{}'.format(filter)

        return self._fetch(url, data)

    def get_symbol_msgs(self, symbol_id, since=0, max=0, limit=0, callback=None, filter=None):

        '''Returns the most recent 30 messages for the specified symbol.

        Args:
            symbol_id:	Ticker symbol, Stock ID, or
                        RIC code of the symbol (Required)
            since:	Returns results with an ID greater than (more recent than)
                    the specified ID.
            max:	Returns results with an ID less than (older than) or
                    equal to the specified ID.
            limit:	Default and max limit is 30. This limit must be a
                    number under 30.
            callback:	Define your own callback function name.
                        Accepted for signature compatibility; it is not
                        sent with the request.
            filter:	Filter messages by links, charts, videos,
                    or top. (Optional)

        Return:
            raw_json (dict) = The JSON output unparsed

        Raises:
            Exception: if the response status code is not 200.

        '''

        url = self._stream_url('symbol', symbol_id)
        print("url to get msgs:" +url)

        data = self._paging_params(since, max, limit)
        # NOTE(review): an unset filter is transmitted as the text 'None',
        # exactly as before; confirm the API ignores that value.
        data['filter'] = '{}'.format(filter)

        return self._fetch(url, data)

    def get_specified_conversation_msgs(self, conversation_id, since=0, max=0, limit=0, callback=None):

        '''Returns the messages of the specified conversation.

        Args:
            conversation_id:	The message ID of the parent message
                                to a conversation. (Required)
            since:	Returns results with an ID greater than (more recent than)
                    the specified ID.
            max:	Returns results with an ID less than (older than) or equal
                    to the specified ID.
            limit:	Default and max limit is 30. This limit must be a
                    number under 30.
            callback:	Define your own callback function name.
                        Accepted for signature compatibility; it is not
                        sent with the request.

        Return:
            raw_json (dict) = The JSON output unparsed

        Raises:
            Exception: if the response status code is not 200.

        '''

        url = self._stream_url('conversation', conversation_id)

        return self._fetch(url, self._paging_params(since, max, limit))


In [None]:
"""The class for collecting twits of Stocktwits

    A collection of methods to simplify your downloading
"""
import sys
import warnings
from contextlib import contextmanager
from io import StringIO
import json
from datetime import datetime, timedelta
#import stockTwitFetchAPI.stocktwitapi as st

class Collector():
    """Collect StockTwits messages and save them in JSON files split by chunk.

    A "chunk" is a calendar day, week (Monday to Sunday) or month. Dates are
    exchanged as strings in the format ``%Y-%m-%dT%H:%M:%SZ``. Message lists
    are assumed to be ordered newest first: ``get_cursor`` reads index 0 as
    the most recent message and index -1 as the oldest one.
    """

    # API client; replaced by a twitStreamer instance in __init__.
    ts = None

    def __init__(self):
        """
        the core of API is the package stockTwitFetchAPI with the class twitStreamer
        """
        self.ts = twitStreamer()

    @staticmethod
    def _as_list(value):
        """
        wrap a single string in a list, return any other value unchanged

        Iterating a bare string would yield its characters, so a lone
        symbol or user name is treated as a one-element list.
        """
        return [value] if isinstance(value, str) else value

    @contextmanager
    def hold_output(self):
        """
        hold output

        Temporarily replaces sys.stdout and sys.stderr with in-memory
        buffers and restores them when the block ends, even on error.

        This method is temporary until PR approval:
        https://github.com/p-hiroshige/stockTwitsAPI/pull/1

            Example:
                with hold_output() as (out, err):
                    method_with_a_print()
                captured_output = out.getvalue().strip()

        """
        new_out, new_err = StringIO(), StringIO()
        old_out, old_err = sys.stdout, sys.stderr
        try:
            sys.stdout, sys.stderr = new_out, new_err
            yield sys.stdout, sys.stderr
        finally:
            sys.stdout, sys.stderr = old_out, old_err

    def there_is_symbol(self, symbols_fetched, symbols_target):
        """
        check if in the message there are the symbols target

            Arguments:
                :symbols_fetched (list of dict): list of symbols, each with a "symbol" key
                :symbols_target (list of string): list of symbols names
            Returns:
                a boolean, True if there is at least one symbol of target in the symbols fetched
        """
        return any(symbol["symbol"] in symbols_target for symbol in symbols_fetched)

    def clean_data(self, messages, event):
        """
        remove duplicated messages and, on request, keep only symbol/user combinations

            Arguments:
                :messages (list of dict): list of messages
                :event (dict): dictionary fully described in save_history()
                    symbols (list of str): names of symbols to fetch
                    users (list of str): names of users to fetch
                    only_combo (bool): if True, fetches only messages of those symbols posted from those users
            Returns:
                list of unique dictionaries cleaned
        """
        # Keyed by message ID: a repeated ID keeps its first position and the
        # content of its last occurrence.
        messages = list({message["id"]: message for message in messages}.values())

        wants_combo = event.get("only_combo") == True  # noqa: E712 - keeps the original comparison
        if wants_combo and "symbols" in event and "users" in event:
            symbols = self._as_list(event["symbols"])
            users = self._as_list(event["users"])
            messages = [
                message for message in messages
                if self.there_is_symbol(message["symbols"], symbols)
                and message["user"]["username"] in users
            ]
        return messages

    def get_data(self, event):
        """
        get data from Stocktwits, default last 30 messages

        Missing "min", "max" and "limit" keys are written into ``event``
        with their defaults, so the dictionary passed in is modified.

            Arguments:
                :event (dict): dictionary fully described in save_history()
                    symbols (list of str): names of symbols to fetch (a single string is accepted)
                    users (list of str): names of users to fetch (a single string is accepted)
                    min (int): optional, min ID
                    max (int): optional, max ID
                    limit (int): optional, default 30 messages
            Returns:
                list of messages
            Raises:
                Exception: if a request for a symbol fails
        """
        messages = []

        event.setdefault("min", 0)
        event.setdefault("max", 0)
        event.setdefault("limit", 30)

        if "users" in event:
            for user in self._as_list(event["users"]):
                response = self.ts.get_user_msgs(user_id=user, since=event["min"], max=event["max"], limit=event["limit"], callback=None, filter=None)
                messages.extend(response["messages"])

        if "symbols" in event:
            for symbol in self._as_list(event["symbols"]):
                # get_symbol_msgs prints the request URL; keep it off the console.
                with self.hold_output():
                    try:
                        response = self.ts.get_symbol_msgs(symbol_id=symbol, since=event["min"], max=event["max"], limit=event["limit"], callback=None, filter=None)
                    except Exception as error:
                        # Chain the cause so the original traceback is kept.
                        raise Exception(error) from error
                messages.extend(response["messages"])

        return self.clean_data(messages, event)

    def is_younger(self, first_date, second_date):
        """
        compare a date with a second date

        Despite its name, the check is "not later than".

            Argument:
                :first_date (str): datetime with format %Y-%m-%dT%H:%M:%SZ
                :second_date (str): another date with format %Y-%m-%dT%H:%M:%SZ
            Returns:
                a boolean, True if first date is earlier than or equal to the second one
        """
        first = datetime.strptime(first_date, "%Y-%m-%dT%H:%M:%SZ")
        second = datetime.strptime(second_date, "%Y-%m-%dT%H:%M:%SZ")
        return first <= second

    def is_same_chunk(self, first_date, second_date, chunk = "day"):
        """
        tell whether two dates fall in the same day, week or month

            Argument:
                :first_date (str): datetime with format %Y-%m-%dT%H:%M:%SZ
                :second_date (str): another date with format %Y-%m-%dT%H:%M:%SZ
                :chunk (str): day, week or month, default day
            Returns:
                a boolean, True if the dates are of the same chunk
                (always False for an unknown chunk name)
        """
        first = datetime.strptime(first_date, "%Y-%m-%dT%H:%M:%SZ")
        second = datetime.strptime(second_date, "%Y-%m-%dT%H:%M:%SZ")

        if chunk == "day":
            return first.date() == second.date()
        if chunk == "week":
            # ISO (year, week) pair: weeks start on Monday like get_date(),
            # the same week number of two different years does not match,
            # and a week spanning New Year stays in one piece.
            return tuple(first.isocalendar())[:2] == tuple(second.isocalendar())[:2]
        if chunk == "month":
            return (first.year, first.month) == (second.year, second.month)
        return False

    def get_cursor(self, messages):
        """
        get cursor with oldest date, min ID and max ID

        The last message is read as the oldest one and the first message
        as the most recent one; the list must not be empty.

            Arguments:
                :messages (list[dict]): list of messages
            Returns:
                a dictionary with oldest_date, min ID, earliest_date and max (ID)
        """
        return {
            "oldest_date": messages[-1]["created_at"],
            "min": messages[-1]["id"],
            "earliest_date": messages[0]["created_at"],
            "max": messages[0]["id"]
        }

    def clean_history(self, cursor, history, chunk = "day"):
        """
        clean history from messages with different chunk

        When every message belongs either to the chunk of the oldest date
        or to the chunk of the most recent date, the messages of the
        oldest chunk are removed from ``history`` in place. When the
        messages spread over further chunks, nothing is removed and a
        warning is issued.

            Arguments:
                :cursor (dict): dictionary with the keys oldest_date, min ID, earliest_date and max (ID)
                :history (list[dict]): list of messages
                :chunk (str): day, week or month, default day
            Returns:
                history cleaned
        """
        def in_oldest_chunk(message):
            return self.is_same_chunk(message["created_at"], cursor["oldest_date"], chunk)

        def in_earliest_chunk(message):
            return self.is_same_chunk(message["created_at"], cursor["earliest_date"], chunk)

        history_length = len(history)
        same_oldest_date = sum(1 for message in history if in_oldest_chunk(message))
        same_earliest_date = sum(1 for message in history if in_earliest_chunk(message))

        if history_length == (same_oldest_date + same_earliest_date):
            # Compare with the requested chunk size (not the "day" default),
            # so that week and month runs drop the whole older chunk.
            history[:] = [message for message in history if not in_oldest_chunk(message)]
        elif not history_length == same_oldest_date and not history_length == same_earliest_date:
            warnings.warn(f"method clean_history, messages amount: {history_length}, {same_oldest_date} messages of {cursor['oldest_date']} and {same_earliest_date} messages of {cursor['earliest_date']}")

        return history

    def walk(self, event, cursor, history):
        """
        walk along the messages like a shrimp

        Fetches the next batch using the current minimum ID as the new
        "max" of the event, and appends it to ``history``.

            Arguments:
                :event (dict): dictionary fully described in save_history()
                :cursor (dict): dictionary with the keys oldest_date, min ID, earliest_date and max (ID)
                :history (list[dict]): list of messages
            Returns:
                cursor, history
        """
        event["max"] = cursor["min"]
        messages = self.get_data(event)
        cursor = self.get_cursor(messages)
        chunk = event.get("chunk", "day")

        # A batch that crosses a chunk border is pruned before it is added.
        if not self.is_same_chunk(cursor["oldest_date"], cursor["earliest_date"], chunk):
            messages = self.clean_history(cursor, messages, chunk)
            cursor = self.get_cursor(messages)
        history.extend(messages)

        return cursor, history

    def get_history(self, event):
        """
        get history from Stocktwist, default last 30 messages

        With a "start" in the event, batches are fetched until the oldest
        message of the cursor is no longer after that start date.

            Arguments:
                :event (dict): dictionary fully described in save_history()
                    start (str): optional, min datetime
                    is_verbose (bool): optional, if True comments will be printed
            Returns:
                list of messages
        """
        history = list(self.get_data(event))

        if "start" in event:
            is_verbose = event.get("is_verbose") is True
            cursor = self.get_cursor(history)
            while self.is_younger(event["start"], cursor["oldest_date"]) and not event["start"] == cursor["oldest_date"]:
                if is_verbose:
                    print(f"method get_history, start: {event['start']}, cursor: {cursor['oldest_date']}")
                cursor, history = self.walk(event, cursor, history)

        return history

    def get_date(self, chunk = "day", date = None, jump_chunk = False):
        """
        get date at midnight about chunk

            Arguments:
                :chunk (str): day, week or month, default day
                :date (str): datetime with format %Y-%m-%dT%H:%M:%SZ,
                    default the current UTC time
                :jump_chunk (bool): True if you want to jump one chunk
            Returns:
                string of date at midnight about that chunk or the previous one
        """
        if date is None:
            # The result is labelled "Z" and compared with UTC timestamps,
            # so "now" has to be taken in UTC rather than local time.
            from datetime import timezone
            current = datetime.now(timezone.utc)
        else:
            current = datetime.strptime(date, "%Y-%m-%dT%H:%M:%SZ")

        jump = 1
        start = current
        if chunk == "week":
            jump = 7
            # Monday of the current week.
            start = current - timedelta(days=current.weekday())
        elif chunk == "month":
            jump = 0
            start = current.replace(day=1)

        if jump_chunk:
            start = start - timedelta(days=jump)
            if chunk == "month":
                # Months have no fixed length: step back by calendar month.
                month = start.month - 1
                year = start.year
                if month == 0:
                    month = 12
                    year = start.year - 1
                start = start.replace(month=month, year=year)
        return start.strftime("%Y-%m-%dT00:00:00Z")

    def update_event(self, key, value, event):
        """
        update a specific key of event

        The event passed in is left untouched; a shallow copy is changed.

            Arguments:
                :key (str): attribute name of event
                :value (mix): value you want to replace on that key
                :event (dict): dictionary fully described in save_history()
            Returns:
                dictionary with the attribute named key changed with value
        """
        chunk = dict(event)
        chunk[key] = value
        return chunk

    def get_temporary_event(self, messages, current_chunk, event):
        """
        get temporary chunk event from messages

            Arguments:
                :messages (list[dict]): list of messages
                :current_chunk (dict): dictionary fully described in save_history()
                :event (dict): dictionary fully described in save_history()
            Returns:
                the temporary chunk event updated with the partial start and new max
        """
        cursor = self.get_cursor(messages)
        oldest_date = self.get_date(event["chunk"], cursor["oldest_date"])
        # The oldest message already sits in the chunk that begins at the
        # requested start: move one chunk further back.
        if event["start"] == oldest_date:
            oldest_date = self.get_date(event["chunk"], oldest_date, True)
        next_chunk = self.update_event("start", oldest_date, current_chunk)
        if self.is_younger(next_chunk["start"], event["start"]):
            if event["start"] == current_chunk["start"]:
                # Returned without a new "max"; its start is not after the
                # requested start.
                return next_chunk
            else:
                # Never go back beyond the requested start.
                next_chunk["start"] = event["start"]
        next_chunk["max"] = cursor["min"]
        return next_chunk

    def get_file_name(self, history, current_chunk, event):
        """
        get filename

        The name is prefix + date (%Y%m%d) + suffix, where the date is the
        start of the current chunk, or the start of the next temporary
        chunk when that one shares its chunk with the most recent message.

           Arguments:
                :history (list[dict]): list of messages
                :current_chunk (dict): dictionary like event
                :event (dict): dictionary fully described in save_history()
            Returns:
                the file name
        """
        chunk = datetime.strptime(current_chunk["start"], "%Y-%m-%dT%H:%M:%SZ")
        next_chunk = self.get_temporary_event(history, current_chunk, event)
        cursor = self.get_cursor(history)
        if self.get_date(event["chunk"], next_chunk["start"]) == self.get_date(event["chunk"], cursor["earliest_date"]):
            chunk = datetime.strptime(next_chunk["start"], "%Y-%m-%dT%H:%M:%SZ")
        return f'{event["filename_prefix"]}{chunk.strftime("%Y%m%d")}{event["filename_suffix"]}'

    def save_data(self, history, current_chunk, event):
        """
        save data

        Writes ``history`` as JSON to the file named by get_file_name().
        File name and next chunk are derived before the history is pruned.

            Arguments:
                :history (list[dict]): list of messages
                :current_chunk (dict): dictionary like event
                :event (dict): dictionary fully described in save_history()
            Returns:
                the temporary chunk event updated with the partial start and new max
        """
        filename = self.get_file_name(history, current_chunk, event)
        next_chunk = self.get_temporary_event(history, current_chunk, event)
        cursor = self.get_cursor(history)
        if not self.is_same_chunk(cursor["oldest_date"], cursor["earliest_date"], event["chunk"]):
            history = self.clean_history(cursor, history, event["chunk"])
            cursor = self.get_cursor(history)
            # Resume the next download from the oldest message that was kept.
            next_chunk["max"] = cursor["min"]
        with open(filename, "w") as fh:
            json.dump(history, fh)
        return next_chunk

    def save_history(self, event):
        """
        save history from Stocktwist on files splitted by chunk per day, week or month

        Missing optional keys are written into ``event`` with their
        defaults, so the dictionary passed in is modified.

            Arguments:
                :event (dict):
                    symbols (list[str]): names of symbols to fetch
                    users (list[str]): names of users to fetch
                    only_combo (bool): optional, if True, fetches only messages of those symbols posted from those users
                    min (int): optional, min ID
                    max (int): optional, max ID
                    limit (int): optional, default 30 messages
                    start (str): optional, min datetime
                    chunk (str): optional (day, week or month), default day
                    filename_prefix (str): optional, default "history."
                    filename_suffix (str): optional, default ".json"
                    is_verbose (bool): optional, if True comments will be printed
            Returns:
                last temporary chunk event discarded
        """
        event.setdefault("chunk", "day")
        if "start" not in event:
            event["start"] = self.get_date(event["chunk"])
        event.setdefault("filename_prefix", "history.")
        event.setdefault("filename_suffix", ".json")
        is_verbose = event.get("is_verbose") is True

        # The first request only locates the chunk of the newest messages.
        messages = self.get_data(event)
        cursor = self.get_cursor(messages)
        oldest_date = self.get_date(event["chunk"], cursor["oldest_date"])
        chunk = self.update_event("start", oldest_date, event)

        if is_verbose:
            print(f"method save_history, start: {event['start']}, cursor: {cursor['oldest_date']}, next chunk: {chunk['start']}")
        # One file per iteration, until the next chunk would begin before
        # the requested start.
        while self.is_younger(event["start"], chunk["start"]):
            history = self.get_history(chunk)
            chunk = self.save_data(history, chunk, event)
            if is_verbose:
                print(f"method save_history, start: {event['start']}, cursor: {cursor['oldest_date']}, next chunk: {chunk['start']}")

        return chunk

# Data Cleaning

In [None]:
# The cleaning function is defined as follows:
import regex
def remove_wallets(text):
    """Drop every whitespace-separated token of 40 or more characters.

    The input is converted with str() first, and the surviving tokens are
    joined again with single spaces.
    """
    kept_tokens = []
    for token in str(text).split():
        if len(token) < 40:
            kept_tokens.append(token)
    return ' '.join(kept_tokens)
def clean_df(df):
    """Clean a collection of posts and return the cleaned collection.

    NOTE(review): `df` is presumably a pandas Series of strings - the `.str`
    accessor is used directly on it; confirm against the callers.

    Steps, in order: drop duplicates and nulls, apply the regex replacements
    listed below, lower-case, strip '&quot;', drop long tokens with
    remove_wallets(), drop duplicates again and keep only the posts that
    have at least 4 words.
    """
    # (pattern, replacement) pairs, applied one after the other in this order.
    substitutions_before_lowercase = [
        (r'[\u4e00-\u9fff]+', ''),       # CJK ideographs (U+4E00-U+9FFF)
        (r'http\S+', ''),                # links
        (r'www\S+', ''),
        (r'[@][A-Za-z0-9_]+', ''),       # @mentions
        (r'[#][A-Za-z0-9_]+', ''),       # #hashtags
        (r'[$][A-Za-z0-9_ ]+', ''),      # text following a '$'
        (r'[/][A-Za-z0-9_ ]+', ''),      # text following a '/'
        (r'RT : ', ''),
        (r'&amp;', 'and'),               # HTML entities and mis-decoded quotes
        (r'&amp', 'and'),
        (r'â€™', '\''),
        (r'&#39;', '\''),
        (r'&#x200B;', ''),
        (r'&;', '\''),
        (r'\.X', ''),                    # ticker suffix
        (r'\.x', ''),
        (r'  ', ' '),                    # runs of spaces
        (r'   ', ' '),
        (r'    ', ' '),
        (r'@', ''),
        (r' \| ', ''),
        (r'\|', ''),
        (r'\.\.+', "..."),               # two or more dots become an ellipsis
    ]

    cleaned = df.drop_duplicates()
    cleaned = cleaned[cleaned.notnull()]

    for pattern, replacement in substitutions_before_lowercase:
        cleaned = cleaned.replace(pattern, replacement, regex=True)

    cleaned = cleaned.str.lower()
    cleaned = cleaned.replace(r'&quot;', '', regex=True)
    cleaned = cleaned.apply(remove_wallets)
    cleaned = cleaned.drop_duplicates()

    # Remove all the posts shorter than 4 words
    word_counts = cleaned.str.split().str.len()
    return cleaned[word_counts.ge(4)]