# Identifying Precise Forecasters on r/Wallstreetbets
**BrainStation Data Science Bootcamp - Capstone Project**

**Author: L Gavrilova**

**Date: 6 November 2023**

# Notebook 2 - Labelled dataset - Text Cleaning 

## 2.0. Table of Contents

1. [Introduction](#1.-Introduction)

Removing website links <br>
Filtering out emojis by creating a new column <br>
2. Spellcheck.

. [Conclusion](#5.-Conclusion)

## 2.1. Cleaning Text

I am 

### 2.1.1. Data Loading and Basic Checks

In [1]:
# Standard Libraries for data manipulation
import pandas as pd
import numpy as np

# Regular Expressions Library
import re

# Emoji Handling Library
import emoji

In [2]:
df = pd.read_csv('../data/annotation file 3600 done 1142022.csv')

In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5020 entries, 0 to 5019
Data columns (total 6 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   link_id    5001 non-null   object
 1   parent_id  5001 non-null   object
 2   User       5001 non-null   object
 3   Text       5001 non-null   object
 4   Intent     5001 non-null   object
 5   Support    5001 non-null   object
dtypes: object(6)
memory usage: 235.4+ KB


In [4]:
df.sample(5)

Unnamed: 0,link_id,parent_id,User,Text,Intent,Support
2178,t3_l6u011,t3_l6u011,BestFill,I like the stock. GME holding $40k worth. \n\n...,y,y
3444,t3_l2ljpt,t3_l2ljpt,TenCity,BLESS RC BLESS GME BLESS WSB,u,y
4899,t3_kx3ja5,t3_kx3ja5,palmallamakarmafarma,GME struggles over 38.5,u,u
4507,t3_l6zvko,t3_l6zvko,YouLookLikeACuck,"LOL, earn your money like the rest of us, shou...",u,y
3923,t3_k9cx1r,t3_k9cx1r,CuckBike,WHAT STOCK DO I YOLO FOR TODAY\n\nLAZR? GME?,u,u


In [5]:
df.describe()

Unnamed: 0,link_id,parent_id,User,Text,Intent,Support
count,5001,5001,5001,5001,5001,5001
unique,1948,3153,4662,4952,6,4
top,t3_ladzdt,t3_ladzdt,AutoModerator,GME,u,y
freq,66,46,14,22,3246,2473


In [6]:
df['link_id'].nunique() == df.shape[0]

False

In [7]:
df.isna().sum()/df.shape[0]

link_id      0.003785
parent_id    0.003785
User         0.003785
Text         0.003785
Intent       0.003785
Support      0.003785
dtype: float64

In [8]:
df[ df['Text'].isna() ]

Unnamed: 0,link_id,parent_id,User,Text,Intent,Support
5001,,,,,,
5002,,,,,,
5003,,,,,,
5004,,,,,,
5005,,,,,,
5006,,,,,,
5007,,,,,,
5008,,,,,,
5009,,,,,,
5010,,,,,,


In [9]:
# Dropping rows that have NaN values
df = df.dropna()

In [10]:
df[ df['Text'].isna() ]

Unnamed: 0,link_id,parent_id,User,Text,Intent,Support


In [11]:
df['Intent'].value_counts()

Intent
u     3246
y      983
m      370
i      318
n       83
 u       1
Name: count, dtype: int64

In [12]:
# Replacing ' u' with 'u' in the 'Intent' column
df['Intent'] = df['Intent'].str.replace(' u', 'u', regex=False)
# checking again:
value_counts = df['Intent'].value_counts() 
print(value_counts)

Intent
u    3247
y     983
m     370
i     318
n      83
Name: count, dtype: int64


In [13]:
df_clean = df.copy() 

In [14]:
# Function to clean text
def purge_content(text):
    text_without_urls = re.sub(r'https?://\S+|www\.\S+', '', text)
    text_without_hashtags = re.sub(r'#\S+', '', text_without_urls)
    text_without_mentions = re.sub(r'@\S+', '', text_without_hashtags)
    clean_text = re.sub(r'\n+', ' ', text_without_mentions)

    return clean_text

for i in range(len(df_clean['Text'])):
    df_clean['Text'][i] = purge_content(df_clean['Text'][i])

In [15]:
pd.set_option('display.max_colwidth', None)

In [16]:
# sanity check
df_clean[df_clean['Text'] == '']

Unnamed: 0,link_id,parent_id,User,Text,Intent,Support
402,t3_l66caa,t1_gkyxvml,EllipticalOrbitMan,,i,i
1264,t3_l0mc06,t1_gju8jei,wolfiasty,,i,i
2187,t3_l6kqyk,t1_gl17oj6,EconomicallyLiterate,,i,i
4157,t3_khq3x2,t1_ggo6fi4,JonBoy82,,i,i
4265,t3_lat43j,t1_glq69gz,Free_Joty,,i,i


In [17]:
# Drop rows where the 'Text' column is an empty string
df_clean = df_clean[df_clean['Text'] != '']

In [18]:
# Recording the cleaned dataset as a new csv file to be used in other notebooks:
# Save the DataFrame to a CSV file
df.to_csv('../data/labelled_dataset_cleaned.csv', index=False)

### 2.1.2. Filtering out `emojis` by creating a new column

In [19]:
# Function to map emojis to their descriptions
def emoji_description(emoji):
    emoji_map = {
        "🚀": " super optimistic, ",
        "🦍": " brotherhood, ",
        "🤞": " hope, ",
        "🌙": " very optimistic, ",
        "🌕": " very optimistic, ",
        "💎🤚🏼": " patient investors, ",
        "💎🖐": " patient investors, ",
        "💎🙌": " patient investors, ",
        "🙌": " patient investors, ",
        "💎": " patient investors, ",
        "🧻🤚🏼": " impatient investors, ",
        "🧻🖐": " impatient investors, ",
        # Add more mappings as needed
    }
    # If the full emoji is in the map, return the description
    if emoji in emoji_map:
        return emoji_map[emoji]
    # If not, split any combined emojis and look up their individual descriptions
    else:
        return ''.join([emoji_map.get(char, '') for char in emoji])  # Default to empty string if not in mapping

def extract_and_replace_emojis(df, text_column_name='Text', emoji_column_name='emoji_text'):
    # Initialize an empty column for extracted emojis if a column name is provided
    if emoji_column_name:
        df[emoji_column_name] = ''

    # Function to extract and replace emojis in a text
    def process_text(text):
        emoji_pattern = re.compile(r'[\U0001F600-\U0001F64F\U0001F300-\U0001F5FF\U0001F680-\U0001F6FF\U0001F700-\U0001F77F\U0001F780-\U0001F7FF\U0001F800-\U0001F8FF\U0001F900-\U0001F9FF\U0001FA00-\U0001FA6F\U0001FA70-\U0001FAFF\U0001FB00-\U0001FBFF\U0001F004]+')

        # Find all emojis in the text using the regex pattern
        emoji_matches = emoji_pattern.findall(text)
        emojis_extracted = ''
        text_with_replaced_emojis = text

        # Iterate over the found emojis
        for emoji_str in emoji_matches:
            # For each emoji in the emoji string
            for emoji_char in emoji_str:
                emoji_desc = emoji_description(emoji_char)  # Get description for individual emoji
                text_with_replaced_emojis = text_with_replaced_emojis.replace(emoji_char, emoji_desc, 1)
                emojis_extracted += emoji_char + ' '  # Add space to separate emojis

        # Return the modified text and the extracted emojis
        return text_with_replaced_emojis, emojis_extracted.strip()

    # Apply the processing function to the specified column and create new columns for text and emojis
    result = df[text_column_name].apply(process_text)
    df[text_column_name] = result.apply(lambda x: x[0])
    
    if emoji_column_name:
        df[emoji_column_name] = result.apply(lambda x: x[1])

    return df

In [20]:
# Applying the function to extract and replace emojis from 'Text' column
df_clean = extract_and_replace_emojis(df_clean, text_column_name='Text', emoji_column_name='emoji_text')

In [21]:
# Checking the new column with emojis extracted from the text
df_clean.sample().T
df_clean['emoji_text'].value_counts()

emoji_text
                                                                     4254
🚀 🚀 🚀                                                                  66
🚀                                                                      50
🚀 🚀                                                                    28
🚀 🚀 🚀 🚀                                                                26
                                                                     ... 
🧻 🤚 🏼 🧻 🤚 🏼 🧻 🤚 🏼 🧻 🤚 🏼 💎 🤚 🏼 💎 🤚 🏼 💎 🤚 🏼 💎 🤚 🏼 💎 🚀 🚀 🚀 🚀 🚀 🚀 🚀 🚀       1
💎 🚀 🚀 🌙                                                                 1
🚀 💪 🏋 💎                                                                 1
📝 👋 💎 👏 🚀 🚀 🚀 🌈 🐻 📉 🚀 🚀 🚀 🌕 🔥 🔥                                         1
🚀 🚀 🚀 🖐 💎 🖐 💵 🖐 🍿 🍗 🚀 🦍 🌚 🚀 🚀                                           1
Name: count, Length: 383, dtype: int64

In [22]:
df_clean.sample(3)

Unnamed: 0,link_id,parent_id,User,Text,Intent,Support,emoji_text
4434,t3_lisb55,t1_gn59vi0,TV_PartyTonight,&gt; even without a squeeze i see gme around 150 2 by end of year lmfao,u,y,
17,t3_kxsd2p,t3_kxsd2p,stevenpaperwork,"Someone explain to me why the fee is so high but there's more shares available this morning, are shorts giving up for today?",u,u,
4456,t3_l70sjp,t1_gl46zwe,GrowBeyond,didnt td block gme?,u,u,


### 2.1.3. Replacing slang with custom made "WSB Dictionary"

In [23]:
# Load the WSB lingo dictionary
wsb_dict_df = pd.read_csv('../data/WSB_dictionary.csv')

# Convert the DataFrame to a dictionary
wsb_dict = dict(zip(wsb_dict_df['WSB lingo'], wsb_dict_df['English']))

# Function to replace WSB lingo with English
def replace_wsb_lingo(text):
    # Use a regex pattern to match only whole words
    pattern = r'\b(' + '|'.join(re.escape(key) for key in wsb_dict.keys()) + r')\b'
    # Replace occurrences of each lingo with the English equivalent
    return re.sub(pattern, lambda x: wsb_dict[x.group()], text)

# Apply the function to the 'Text' column
df_clean['Text'] = df_clean['Text'].apply(replace_wsb_lingo)

### 2.1.4 Examples of texts before and after the cleaning steps

In [24]:
original_with_index = df.loc[2026]
print(original_with_index)

clean_with_index = df_clean.loc[2026]
print(clean_with_index)

link_id                                                                                                                                                                                                                                                                                                                                             t3_l6cb1x
parent_id                                                                                                                                                                                                                                                                                                                                           t3_l6cb1x
User                                                                                                                                                                                                                                                                                                        

In [25]:
original_with_index = df.loc[3986]
print(original_with_index)

clean_with_index = df_clean.loc[3986]
print(clean_with_index)

link_id                                            t3_l8ynt4
parent_id                                          t3_l8ynt4
User                                       wowexcellentstuff
Text         did NOT read. $GME to mf Andromeda 🚀🚀🚀🌌🌌\n\n💎🤲💎
Intent                                                     u
Support                                                    y
Name: 3986, dtype: object
link_id                                                                                                                                   t3_l8ynt4
parent_id                                                                                                                                 t3_l8ynt4
User                                                                                                                              wowexcellentstuff
Text          did NOT read. $GME to mf Andromeda  super optimistic,  super optimistic,  super optimistic,   patient investors,  patient investors, 
Intent          

In [26]:
original_with_index = df.loc[3386]
print(original_with_index)

clean_with_index = df_clean.loc[3386]
print(clean_with_index)

link_id                                    t3_kkwy50
parent_id                                  t3_kkwy50
User                                SnooMacarons1548
Text         GME🚀🚀🚀\n\nIt's a money printing company
Intent                                             u
Support                                            y
Name: 3386, dtype: object
link_id                                                                                        t3_kkwy50
parent_id                                                                                      t3_kkwy50
User                                                                                    SnooMacarons1548
Text          GME super optimistic,  super optimistic,  super optimistic,  It's a money printing company
Intent                                                                                                 u
Support                                                                                                y
emoji_text                

## 2.2. Conclusion

In [27]:
# Recording the cleaned dataset as a new csv file to be used in future:
# Save the DataFrame to a CSV file
df_clean.to_csv('../data/labelled_dataset_wo_emoji.csv', index=False)

# Optional 

In [28]:
# punctuation and anything except for letters is stripped away, also empty spaces go away. 

# Emojis are stripped off!!!! NB! 

if False:
    
    cleaned_df = df.copy()
    # to replace any character that is not a lowercase or uppercase letter with a single space
    # then to replace one or more whitespace characters (\s+) with a single space
    # then to replace '\n' with empty spaces
    # then to remove all types of whitespace characters at the ends of the string
    # cleaned_df["Text"] = cleaned_df["Text"].replace("\n", "").str.replace(r"[^a-zA-Z]", " ").str.replace(r"\s+", " ")


    # First, replace newline characters with an empty string for each element
    cleaned_df["Text"] = cleaned_df["Text"].str.replace("\n", "", regex=False)

    # Then, replace non-alphabetic characters with a space for each element
    cleaned_df["Text"] = cleaned_df["Text"].str.replace(r"[^a-zA-Z]", " ", regex=True)

    # Then, replace multiple spaces with a single space for each element
    cleaned_df["Text"] = cleaned_df["Text"].str.replace(r"\s+", " ", regex=True)

    # Finally, strip leading and trailing spaces from each element
    cleaned_df["Text"] = cleaned_df["Text"].str.strip()

    df=cleaned_df.copy()

In [30]:
# dfdfd

if False:
        df_clean = df.copy()

    # Function to clean text
    def purge_content(text):
        # Define patterns for URLs, hashtags, mentions, and newlines
        url_pattern = r'https?://\S+|www\.\S+'
        hashtag_pattern = r'#\S+'
        mention_pattern = r'@\S+'
        newline_pattern = r'\n+'
        
        # Remove URLs
        purged_text = re.sub(url_pattern, '', text)
        # Remove hashtags
        purged_text = re.sub(hashtag_pattern, '', purged_text)
        # Remove mentions
        purged_text = re.sub(mention_pattern, '', purged_text)
        # Remove newlines
        purged_text = re.sub(newline_pattern, ' ', purged_text)
        
        return purged_text

    # Clean the 'Text' column
    df_clean['Text'] = df_clean['Text'].apply(purge_content)

IndentationError: unindent does not match any outer indentation level (<tokenize>, line 7)