In [1]:
import numpy as np
import pandas as pd
import re
import string


# Contents
- Download datasets
- Data cleaning
    - Check labels contain only 0/1
    - Remove duplicates
    - Check number of NaN/empty cells
    - Remove / Normalize
        - HTML tags
        - links
        - URL
        - phone numbers
        - emojis using Emojis library
        - special unicode characters
        - repeated occurences of punctuations and whitespaces
        - short examples, where number of characters is smaller than 10
    - lowercase characters to decrease token library size
- Download cleaned dataframes as csv files

## Download datasets
- Reddit combi
- Reddit title
- Twitter full
- Twitter non-advert

In [2]:
reddit_combi_df = pd.read_csv(
    "data/Reddit_Combi.csv",
    sep=';',    
    )

In [3]:
print("Number of rows: ", len(reddit_combi_df))
reddit_combi_df.head()

Number of rows:  3123


Unnamed: 0,title,body,Body_Title,label
0,Envy to other is swallowing me,"Im from developingcountry, Indonesia , and for...",Envy to other is swallowing me Im from develop...,1
1,Nothin outta the ordinary. Paradise. Job stres...,Um hello ....well many can relate im sure. Aft...,Nothin outta the ordinary. Paradise. Job stres...,1
2,Almost 49 and the chasm of emptiness has never...,I’ve been diagnosed severe bi polar where you ...,Almost 49 and the chasm of emptiness has never...,1
3,I’m happy again,"After my closest friend left me in April, I ha...",I’m happy again After my closest friend left m...,0
4,Is it possible to recover from such a traumati...,"I am only 15, and yet I feel my life is alread...",Is it possible to recover from such a traumati...,1


In [4]:
reddit_combi_df.describe()

Unnamed: 0,label
count,3123.0
mean,0.878963
std,0.326223
min,0.0
25%,1.0
50%,1.0
75%,1.0
max,1.0


In [5]:
reddit_title_df = pd.read_csv(    
    "data/Reddit_Title.csv",    
    sep=';',    
)

In [6]:
print("Number of rows: ", len(reddit_title_df))
reddit_title_df.head()

Number of rows:  5556


Unnamed: 0,title,label
0,My aunt and uncle scoring their first gig as p...,0
1,How do I stop stressing about work when I'm at...,1
2,Meeting a fellow suicidal student in middle sc...,1
3,My brain feels literally numb. Is this depress...,1
4,A mother's reaction after seeing her son has p...,0


In [7]:
reddit_title_df.describe()

Unnamed: 0,label
count,5556.0
mean,0.49406
std,0.50001
min,0.0
25%,0.0
50%,0.0
75%,1.0
max,1.0


In [8]:
twitter_full_df = pd.read_csv(
    "data/Twitter_Full.csv",
    sep=';',    
    )

In [9]:
print("Number of rows: ", len(twitter_full_df))
twitter_full_df.head()

Number of rows:  8900


Unnamed: 0,text,hashtags,labels
0,Being s mom is cleaning 24/7 the same shit ove...,"['momlife', 'kids', 'tired']",1
1,And now we have been given the walkthru book b...,['walkthru'],0
2,Wishing YOU Peace Joy & Love! JoyTrain MentalH...,"['Peace', 'Joy', 'Love', 'JoyTrain', 'MentalHe...",0
3,speak-no-evil monkey Can I Be Honest With You...,"['therapy', 'help', 'NLP', 'CBT', 'hypnotherap...",1
4,Psy Do u hv any regrets? Me No Psy Are you hap...,[],0


In [10]:
twitter_full_df.describe()

Unnamed: 0,labels
count,8900.0
mean,0.509438
std,0.499939
min,0.0
25%,0.0
50%,1.0
75%,1.0
max,1.0


Rename column labels to match other dataframes' label column

In [11]:
twitter_full_df = twitter_full_df.rename(columns={"labels": "label"})
twitter_full_df.head()

Unnamed: 0,text,hashtags,label
0,Being s mom is cleaning 24/7 the same shit ove...,"['momlife', 'kids', 'tired']",1
1,And now we have been given the walkthru book b...,['walkthru'],0
2,Wishing YOU Peace Joy & Love! JoyTrain MentalH...,"['Peace', 'Joy', 'Love', 'JoyTrain', 'MentalHe...",0
3,speak-no-evil monkey Can I Be Honest With You...,"['therapy', 'help', 'NLP', 'CBT', 'hypnotherap...",1
4,Psy Do u hv any regrets? Me No Psy Are you hap...,[],0


In [12]:
twitter_non_advert = pd.read_csv(
    "data/Twitter_Non-Advert.csv",
    sep=';',    
    )

In [13]:
print("Number of rows: ", len(twitter_non_advert))
twitter_non_advert.head()

Number of rows:  2051


Unnamed: 0,text,label
0,speak-no-evil monkey Can I Be Honest With You...,1
1,Frau Goebbels early signs of psychosis psychot...,1
2,A lot of work and unfulfilled tasks plunge you...,1
3,Private health insurance delivers value for yo...,1
4,XpertOnline offers you the convenience of view...,1


In [14]:
twitter_non_advert.describe()

Unnamed: 0,label
count,2051.0
mean,0.618235
std,0.485938
min,0.0
25%,0.0
50%,1.0
75%,1.0
max,1.0


In [15]:
# Collect all dataframes into list
df_list = [reddit_combi_df, reddit_title_df, twitter_full_df, twitter_non_advert]
df_names = ["Reddit combi", "Reddit title", "Twitter full", "Twitter non advert"]

## Data cleaning

### Check that label column contains only 0 or 1

In [16]:
for i in range(len(df_list)):
    count = (~np.isin(df_list[i]['label'], [0, 1])).sum()
    print(df_names[i], "Number of invalid labels: ", count, "\n")

Reddit combi Number of invalid labels:  0 

Reddit title Number of invalid labels:  0 

Twitter full Number of invalid labels:  0 

Twitter non advert Number of invalid labels:  0 



### Remove duplicates

In [17]:
for i in range(len(df_list)):
    old_len = len(df_list[i])
    df_list[i] = df_list[i].drop_duplicates()
    new_len = len(df_list[i])
    if new_len < old_len:
        print(df_names[i], "had ", old_len - new_len, "duplicates")

Reddit title had  24 duplicates
Twitter full had  375 duplicates
Twitter non advert had  79 duplicates


### Check number of rows containing NaN or are empty

In [18]:
for i in range(len(df_list)):
    empty_or_nan = df_list[i].isnull() | df_list[i].apply(lambda col: col.astype(str).str.strip().eq(''))
    count = empty_or_nan.sum()
    print(df_names[i], "NaN or empty examples: ", count, "\n")

Reddit combi NaN or empty examples:  title         0
body          7
Body_Title    0
label         0
dtype: int64 

Reddit title NaN or empty examples:  title    0
label    0
dtype: int64 

Twitter full NaN or empty examples:  text        0
hashtags    6
label       0
dtype: int64 

Twitter non advert NaN or empty examples:  text     0
label    0
dtype: int64 



Number of NaN or empty examples is low for Reddit combi and Twitter full and zero to others. Print rows containing NaN or empty column to see if they contain enough information to be condidered as a sample.

In [19]:
mask = reddit_combi_df.isnull() | reddit_combi_df.apply(lambda col: col.astype(str).str.strip().eq(''))
rows_with_empty_or_nan = reddit_combi_df[mask.any(axis=1)]
print(rows_with_empty_or_nan)

                                                  title body  \
86    Dealing With A Stressful Situation When All Alone  NaN   
215   Everyone must get off their screens and indulg...  NaN   
518   In order to deal with stress, it's important t...  NaN   
540   Recognizing your own self-worth exists outside...  NaN   
847   My depression is giving me negative thoughts a...  NaN   
1755  I got caught doing something wired by my sibli...  NaN   
2557  Taking a Stab on defining "Being Realistic vs ...  NaN   

                                             Body_Title  label  
86    Dealing With A Stressful Situation When All Al...      1  
215   Everyone must get off their screens and indulg...      0  
518   In order to deal with stress, it's important t...      1  
540   Recognizing your own self-worth exists outside...      0  
847   My depression is giving me negative thoughts a...      1  
1755  I got caught doing something wired by my sibli...      1  
2557  Taking a Stab on defining 

In [20]:
mask = twitter_full_df.isnull() | twitter_full_df.apply(lambda col: col.astype(str).str.strip().eq(''))
rows_with_empty_or_nan = twitter_full_df[mask.any(axis=1)]
print(rows_with_empty_or_nan)

                                                   text hashtags  label
663                   ['stress', 'studying', 'anxiety']      NaN      1
1508                               nature world arbaz73      NaN      1
2678                               nature world arbaz73      NaN      1
4602  ['nature world arbaz73', 'arbaz 73', 'nature',...      NaN      0
4607  DidYouKnow that laughing lowers levels of stre...      NaN      0
4791                Is Your Stress Bothering You Today?      NaN      1
7292  ['nature world arbaz73', 'arbaz 73', 'nature',...      NaN      0
8757  ['DidYouKnow', 'laughing', 'stress', 'ImmuneSy...      NaN      0


The rows containing Nan have enough information, so keep those examples without modifying.

## Remove/Normalize following 
- HTML tags
- links
- URL
- phone numbers
- emojis
- special unicode characters
- repeated occurences of punctuations and whitespaces
- short examples, where number of characters is smaller than 10
- lowercase characters to decrease token library size


### HTML tags

In [None]:
# See how many rows contain HTML tag

# HTML tag must start with < and end with > and have at least one character (not >) inside it
html_tags = re.compile(r'<[^>]+>')
for i in range(len(df_list)):
    count = df_list[i].apply(lambda row: row.astype(str).str.contains(html_tags).any(), axis=1).sum()
    print(df_names[i], "contains: ", count, "HTML tags\n")


Reddit combi contains:  0 HTML tags

Reddit title contains:  0 HTML tags

Twitter full contains:  0 HTML tags

Twitter non advert contains:  0 HTML tags



No HTML tages to clear

### URLs

In [22]:
# See how many rows contain URLs

# Assume URL must contain https:// or http://
url = re.compile(r'https?://[^\s"]+')

for i in range(len(df_list)):
    count = df_list[i].apply(lambda row: row.astype(str).str.contains(url).any(), axis=1).sum()
    print(df_names[i], "contains: ", count, "URLs\n")

Reddit combi contains:  0 URLs

Reddit title contains:  0 URLs

Twitter full contains:  0 URLs

Twitter non advert contains:  0 URLs



No URLs to remove

### Phone numbers

In [None]:
# See how many rows contain phone numbers

# Regex is taken from: https://www.geeksforgeeks.org/dsa/validate-phone-numbers-with-country-code-extension-using-regular-expression/
# edited it, so it doesn't require + at start 

phone = re.compile(r'^[+]?(?:[0-9\-\(\)\/\.]\s?){6,15}[0-9]$')

for i in range(len(df_list)):
    count = df_list[i].apply(lambda row: row.astype(str).str.contains(phone).any(), axis=1).sum()
    print(df_names[i], "contains: ", count, "phone numbers\n")

Reddit combi contains:  0 phone numbers

Reddit title contains:  0 phone numbers

Twitter full contains:  0 phone numbers

Twitter non advert contains:  0 phone numbers



No phone numbers to remove

### Convert emojis using Emoji library

In [24]:
import emoji

In [25]:
for i in range(len(df_list)):
    count = 0
    for col in df_list[i].columns:
        if col != "label":
            count += df_list[i][col].map(lambda x: len(emoji.emoji_list(str(x)))).sum()
            df_list[i][col] = df_list[i][col].map(lambda x: emoji.demojize(str(x)))
    print(df_names[i], "contains:", count, "emojis")

Reddit combi contains: 0 emojis


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_list[i][col] = df_list[i][col].map(lambda x: emoji.demojize(str(x)))


Reddit title contains: 0 emojis
Twitter full contains: 6 emojis
Twitter non advert contains: 0 emojis


### Special unicode characters

In [26]:
# See how many rows contain special unicde characters

# find non-ascii characters and non-emoji 
special = re.compile(r'[\u200a\u200b\u202f\u2060\u2063\u2066\u2069\ufeff]')

for i in range(len(df_list)):
    count = df_list[i].apply(lambda row: row.astype(str).str.contains(special).any(), axis=1).sum()
    print(df_names[i], "contains: ", count, "special unicode characters\n")

Reddit combi contains:  63 special unicode characters

Reddit title contains:  0 special unicode characters

Twitter full contains:  43 special unicode characters

Twitter non advert contains:  6 special unicode characters



There are quite many special unicode characters. Next, see what they look like. 

In [27]:
special_cases = {}
for i in range(len(df_list)):
    
    # see special characters
    row_special_chars = []
    for row in df_list[i].astype(str).itertuples(index=False):
        for cell in row:
            matches = special.findall(cell)
            row_special_chars.extend(matches)

    special_cases[df_names[i]] = row_special_chars
    print(f"{df_names[i]}: {len(row_special_chars)} special unicode characters (excluding emojis)")

    # Remove special chars
    df_list[i] = df_list[i].map(lambda x: special.sub('', str(x)))

for name, chars in special_cases.items():
    print(f"\n{name} unique special characters:")
    print(set(chars))



Reddit combi: 298 special unicode characters (excluding emojis)
Reddit title: 0 special unicode characters (excluding emojis)
Twitter full: 146 special unicode characters (excluding emojis)
Twitter non advert: 9 special unicode characters (excluding emojis)

Reddit combi unique special characters:
{'\u200b', '\ufeff'}

Reddit title unique special characters:
set()

Twitter full unique special characters:
{'\u200b', '\u2060', '\u2063', '\u202f', '\u2066', '\u2069', '\u200a'}

Twitter non advert unique special characters:
{'\u2060', '\u2063', '\u202f', '\u2066', '\u2069'}


### Repeated occurences of punctuations and whitespaces

In [28]:
# See how many

import warnings
# punctuations include more than one character of !, ? ., , :, ; in a row
punc = re.compile(r'([!?.,:;])\1+')

# whitespaces include all whitespaces
whitespace = re.compile(r'\s+')

# ignore unexpected warning
with warnings.catch_warnings():
    warnings.simplefilter("ignore")
    for i in range(len(df_list)):
        count = df_list[i].apply(lambda row: row.astype(str).str.contains(punc).any(), axis=1).sum()
        print(df_names[i], "contains: ", count, "special unicode characters\n")

for i in range(len(df_list)):
    count = df_list[i].apply(lambda row: row.astype(str).str.contains(whitespace).any(), axis=1).sum()
    print(df_names[i], "contains: ", count, "whitespaces\n")



Reddit combi contains:  615 special unicode characters

Reddit title contains:  327 special unicode characters

Twitter full contains:  911 special unicode characters

Twitter non advert contains:  224 special unicode characters

Reddit combi contains:  3123 whitespaces

Reddit title contains:  5466 whitespaces

Twitter full contains:  8525 whitespaces

Twitter non advert contains:  1972 whitespaces



In [29]:
# Find all punctuation and whitespace characters
punc_d = {}
white = {}
for i in range(len(df_list)):
    
    row_punc = []
    row_white = []
    for row in df_list[i].astype(str).itertuples(index=False):
        for cell in row:
            matches = punc.findall(cell)
            row_punc.extend(matches)

            matches2 = whitespace.findall(cell)
            row_white.extend(matches2)

    punc_d[df_names[i]] = row_punc
    print(f"{df_names[i]}: {len(row_punc)} punctuations")

    white[df_names[i]] = row_white
    print(f"{df_names[i]}: {len(row_white)} whitespaces")

for name, chars in punc_d.items():
    print(f"\n{name} punctuations:")
    print(set(chars))

for name, chars in white.items():
    print(f"\n{name} whitespaces:")
    print(set(chars))



Reddit combi: 2684 punctuations
Reddit combi: 1088513 whitespaces
Reddit title: 370 punctuations
Reddit title: 93369 whitespaces
Twitter full: 1116 punctuations
Twitter full: 259526 whitespaces
Twitter non advert: 255 punctuations
Twitter non advert: 49671 whitespaces

Reddit combi punctuations:
{',', '!', '?', '.'}

Reddit title punctuations:
{'!', '?', '.'}

Twitter full punctuations:
{',', '!', '?', '.'}

Twitter non advert punctuations:
{',', '!', '?', '.'}

Reddit combi whitespaces:
{'  ', '\t\t ', ' ', '   ', '\xa0 ', '\xa0 \xa0 ', ' \xa0', '\xa0', '\xa0\xa0 ', '    '}

Reddit title whitespaces:
{' '}

Twitter full whitespaces:
{'  ', ' ', ' \xa0', '\xa0 ', '\xa0 \xa0 ', '\xa0\xa0', '   ', '\xa0\xa0\xa0 ', ' \xa0 ', '\xa0', '\xa0\xa0 ', '    '}

Twitter non advert whitespaces:
{'  ', ' ', '   ', '\xa0 ', '\xa0 \xa0 ', ' \xa0 ', '\xa0', ' \xa0'}


In [30]:
# Remove those above

for i in range(len(df_list)):
    # convert many characters to one
    df_list[i] = df_list[i].map(lambda x: punc.sub(r'\1', str(x))) 

    # convert all whitespaces to ' '
    df_list[i] = df_list[i].map(lambda x: whitespace.sub(' ', str(x)).strip())

Check that punctuations are removed, for example !! -> !

In [31]:
df_list[2].iloc[1, 0]

'And now we have been given the walkthru book by and to base our whole school PD on! grinning face instructionalcoaching excited'

### Examples with small number of characters (< 10)

In [32]:
threshold = 10

for i in range(len(df_list)):
    count = df_list[i].apply(lambda row: len(" ".join(row.astype(str))) < threshold, axis=1).sum()
    print(df_names[i], "contains: ", count, "rows with under 10 characters\n")

Reddit combi contains:  0 rows with under 10 characters

Reddit title contains:  52 rows with under 10 characters

Twitter full contains:  0 rows with under 10 characters

Twitter non advert contains:  0 rows with under 10 characters



Check are the 61 rows in Reddit title enough informaive to keep.

In [33]:
rows_to_drop = []
for idx, row in df_list[1].iterrows():
    row_text = " ".join(str(x) for x in row)  
    if len(row_text) < threshold:
        print(row)
        rows_to_drop.append(idx)

title    Past
label       1
Name: 130, dtype: object
title    Done
label       1
Name: 165, dtype: object
title    Advice
label         1
Name: 310, dtype: object
title    CTE?
label       1
Name: 407, dtype: object
title    Leaving
label          1
Name: 466, dtype: object
title    Help?
label        1
Name: 560, dtype: object
title    Whyyyyy
label          1
Name: 589, dtype: object
title    Hi
label     1
Name: 593, dtype: object
title    I can't
label          1
Name: 598, dtype: object
title    lonely
label         1
Name: 648, dtype: object
title    Alone
label        1
Name: 674, dtype: object
title    My day.
label          1
Name: 794, dtype: object
title    b
label    1
Name: 854, dtype: object
title    Anyone
label         1
Name: 996, dtype: object
title    idk
label      1
Name: 1181, dtype: object
title    Test
label       1
Name: 1454, dtype: object
title    fuck
label       1
Name: 1558, dtype: object
title    School
label         1
Name: 1730, dtype: object
title    s

The above examples are quite non-informative. Remove those from the dataframe.

In [34]:
df_list[1] = df_list[1].drop(rows_to_drop)
len(df_list[1])

5480

### Lowercase characters

In [35]:
for i in range(len(df_list)):
    for col in df_list[i].columns:
        if col != "label":  
            df_list[i][col] = df_list[i][col].map(lambda x: str(x).lower())


## Save cleaned dataframes as csv files

In [None]:

df_list[0].to_csv('Reddit_Combi_cleaned.csv', index=False)
df_list[1].to_csv('Reddit_Title_cleaned.csv', index=False)
df_list[2].to_csv('Twitter_Full_cleaned.csv', index=False)
df_list[3].to_csv('Twitter_Non-Advert_cleaned.csv', index=False)


"\ndf_list[0].to_csv('Reddit_Combi_cleaned.csv', index=False)\ndf_list[1].to_csv('Reddit_Title_cleaned.csv', index=False)\ndf_list[2].to_csv('Twitter_Full_cleaned.csv', index=False)\ndf_list[3].to_csv('Twitter_Non-Advert_cleaned.csv', index=False)\n"