# 3 Data Preprocessing for Comments data

<h3>Including two parts</h3>
<h4>3.1 Basic Processing</h4>
<h4>3.2 Split comments into sentences</h4>

<h2>3.1 Basic Processing</h2>

reference:
https://towardsdatascience.com/nlp-text-preprocessing-a-practical-guide-and-template-d80874676e79


What we need to do is:

(1) Remove bot's comments and some useless comments, such as "[delete]"


(2) Remove some unecessary information: html url, punctuations

In [1]:
import os
import glob
import pandas as pd
import en_core_web_sm
from bs4 import BeautifulSoup
import spacy
import unidecode
from word2number import w2n
import contractions
import numpy as np
import nltk
import re
from nltk.corpus import stopwords
nltk.download('stopwords')
from nltk.tokenize import word_tokenize

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/simonewang/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [18]:
# Load all the comments collected from Football Subreddit
dff1 = pd.read_csv("comments17-18.csv")
dff2 = pd.read_csv("comments18-19.csv")
dff3 = pd.read_csv("comments19-20.csv")
dff4 = pd.read_csv("comments16-17.csv")
dff5 = pd.read_csv("comments15-16.csv")
dff6 = pd.read_csv("comments14-15.csv")
dff7 = pd.read_csv("comments13-14.csv")
dff8 = pd.read_csv("comments12-13.csv")
dff9 = pd.read_csv("comments11-12.csv")
dff10 = pd.read_csv("comments10-11.csv")
dff11 = pd.read_csv("comments20-21.csv")
print(len(dff10))
print(len(dff9))
print(len(dff8))
print(len(dff7))
print(len(dff6))
print(len(dff5))
print(len(dff4))
print(len(dff1))
print(len(dff2))
print(len(dff3))
print(len(dff11))

70
1387
7599
11565
8401
5739
7773
7482
19364
44575
40721


In [29]:
# step 1: delete comments from AutoModerator and deleted comments
def delete_invalid_comments(df):
    df.drop(df[df["Author"] == "AutoModerator"].index, inplace=True)
    df.drop(df[(df["Author"] == "[deleted]") & (df["Comment"] == "[deleted]")].index, inplace=True)
    df.drop(df[(df["Author"] == "[deleted]") & (df["Comment"] == "[removed]")].index, inplace=True)
    df.reset_index(drop=True, inplace=True)
    return df

dff1 = delete_invalid_comments(dff1)
dff2 = delete_invalid_comments(dff2)
dff3 = delete_invalid_comments(dff3)
dff4 = delete_invalid_comments(dff4)
dff5 = delete_invalid_comments(dff5)
dff6 = delete_invalid_comments(dff6)
dff7 = delete_invalid_comments(dff7)
dff8 = delete_invalid_comments(dff8)
dff9 = delete_invalid_comments(dff9)
dff10 = delete_invalid_comments(dff10)
dff11 = delete_invalid_comments(dff11)
dff11

Unnamed: 0,ComID,Comment,Author,Publish Date
0,g29wax1,I'd usually cheer for the underdog but having ...,WubOfDoom,2020-08-21 02:21:02
1,g29whx0,"ST isn't a real position, CF is the correct term.",WubOfDoom,2020-08-21 02:22:52
2,g29x7kv,Sule has played less than 90 minutes of footba...,WubOfDoom,2020-08-21 02:29:29
3,g29ydew,"I've watched my fair share of Porto, Benfica a...",Estartes2,2020-08-21 02:39:57
4,g2a15g7,"La Gantoise is fairly known actually, they pas...",47Yamaha,2020-08-21 03:04:35
...,...,...,...,...
32382,gz32mmm,DL has a history of being a dirty player. Defi...,PresidenteClint,2021-05-22 21:20:48
32383,gz32ycm,I think Raul scoring on Chelsea and seeing him...,LincolnCoHo,2021-05-22 21:23:38
32384,gz33yy9,"Yeah. I hope DL gets Zuñiga'd someday, just to...",PresidenteClint,2021-05-22 21:32:30
32385,gz34cfi,Primeira Liga--Europe's *real* 5th-best league.,PresidenteClint,2021-05-22 21:35:47


In [20]:
# step 2: combine comments dataset
# this is used for combining comments from different seasons
frames = [dff1, dff2, dff3, dff4, dff5, dff6, dff7, dff8, dff9, dff10, dff11]
final_df = pd.concat(frames)
final_df = final_df.reset_index(drop=True)
final_df.to_csv("NewFinalComments10-21.csv", index=False)
final_df = delete_invalid_comments(final_df)
final_df

Unnamed: 0,ComID,Comment,Author,Publish Date
0,dl4xjfx,The transfer that will give birth to many craz...,v-d-c,2017-08-04 02:41:47
1,dl4zg3i,Money isn't the reason Neymar left. Barca is f...,djb2spirit,2017-08-04 03:21:27
2,dl51noz,sock,vnfootball1,2017-08-04 04:07:19
3,dl53ej5,This is always the most ignorant argument made...,jesonnier,2017-08-04 04:45:50
4,dl562z6,A very sad day in the history of transfers. No...,404randomguy404,2017-08-04 05:49:08
...,...,...,...,...
130940,gz32mmm,DL has a history of being a dirty player. Defi...,PresidenteClint,2021-05-22 21:20:48
130941,gz32ycm,I think Raul scoring on Chelsea and seeing him...,LincolnCoHo,2021-05-22 21:23:38
130942,gz33yy9,"Yeah. I hope DL gets Zuñiga'd someday, just to...",PresidenteClint,2021-05-22 21:32:30
130943,gz34cfi,Primeira Liga--Europe's *real* 5th-best league.,PresidenteClint,2021-05-22 21:35:47


In [21]:
def strip_html_tags(df):
    """remove urls from comment"""
    procomments = []
    for i in range(df.shape[0]):
        soup = BeautifulSoup(str(df.iloc[i]["Comment"]), "html.parser")
        stripped_text = soup.get_text(separator=" ")
        procomments.append(stripped_text)
    df["ProComment"] = procomments
    return df

def remove_whitespace(df):
    """remove extra whitespaces from comment"""
    procomments = []
    for i in range(df.shape[0]):
        text = df.iloc[i]["ProComment"].strip()
        procomments.append(" ".join(text.split()))
    df["ProComment"] = procomments
    return df

def expand_contractions(df):
    """expand shortened words, e.g. don't to do not"""
    procomments = []
    for i in range(df.shape[0]):
        text = contractions.fix(df.iloc[i]["ProComment"])
        procomments.append(text)
    df["ProComment"] = procomments
    return df

# step 8: remove emojis
def remove_emojis(df):
    """remove emojis from comments"""
    emoji_pattern = re.compile("["
        u"\U0001F600-\U0001F64F"  # emoticons
        u"\U0001F300-\U0001F5FF"  # symbols & pictographs
        u"\U0001F680-\U0001F6FF"  # transport & map symbols
        u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
                           "]+", flags=re.UNICODE)
    procomments = []
    for i in range(df.shape[0]):
        procomments.append(emoji_pattern.sub(r'', df.iloc[i]["ProComment"]))
    df["ProComment"] = procomments
    return df

In [22]:
# step 3: remove html tags, whitespaces, emojis and expand contractions
final_df = pd.read_csv("NewFinalComments10-21.csv")
final_df = delete_invalid_comments(final_df)
final_df = strip_html_tags(final_df)
final_df = remove_whitespace(final_df)
final_df = remove_emojis(final_df)
final_df = expand_contractions(final_df)

#choubirdz" looks like a URL. Beautiful Soup is not an HTTP client. You should probably use an HTTP client like requests to get the document behind the URL, and feed that document to Beautiful Soup.


" looks like a URL. Beautiful Soup is not an HTTP client. You should probably use an HTTP client like requests to get the document behind the URL, and feed that document to Beautiful Soup.
" looks like a URL. Beautiful Soup is not an HTTP client. You should probably use an HTTP client like requests to get the document behind the URL, and feed that document to Beautiful Soup.



Ibra!" looks like a URL. Beautiful Soup is not an HTTP client. You should probably use an HTTP client like requests to get the document behind the URL, and feed that document to Beautiful Soup.
" looks like a URL. Beautiful Soup is not an HTTP client. You should probably use an HTTP client like requests to get the document behind the URL, and feed that document to Beautiful Soup.


Hahahahahah" looks like a URL. Beautiful Soup is not an HTTP client. You should probably use an HTTP client like requests to get the document behind the URL, and feed that document to Beautiful Soup.



Almost" looks like a URL. Beautiful Soup is not an HTTP client. You should probably use an HTTP client like requests to get the document behind the URL, and feed that document to Beautiful Soup.





https://www.youtube.com/watch?v=5upGH6dV0qA" looks like a URL. Beautiful Soup is not an HTTP client. You should probably use an HTTP client like requests to get the document behind the URL, and feed that document to Beautiful Soup.
" looks like a URL. Beautiful Soup is not an HTTP client. You should probably use an HTTP client like requests to get the document behind the URL, and feed that document to Beautiful Soup.
" looks like a URL. Beautiful Soup is not an HTTP client. You should probably use an HTTP client like requests to get the document behind the URL, and feed that document to Beautiful Soup.
" looks like a URL. Beautiful Soup is not an HTTP client. You should probably use an HTTP client like requests to get the document behind the URL, and feed that document to Beautiful Soup.
" looks like a URL. Beautiful Soup is not an HTTP client. You should probably use an HTTP client like requests to get the document behind the URL, and feed that document to Beautiful Soup.
" looks lik

Confirmed." looks like a URL. Beautiful Soup is not an HTTP client. You should probably use an HTTP client like requests to get the document behind the URL, and feed that document to Beautiful Soup.


" looks like a URL. Beautiful Soup is not an HTTP client. You should probably use an HTTP client like requests to get the document behind the URL, and feed that document to Beautiful Soup.

?" looks like a URL. Beautiful Soup is not an HTTP client. You should probably use an HTTP client like requests to get the document behind the URL, and feed that document to Beautiful Soup.
" looks like a URL. Beautiful Soup is not an HTTP client. You should probably use an HTTP client like requests to get the document behind the URL, and feed that document to Beautiful Soup.




" looks like a URL. Beautiful Soup is not an HTTP client. You should probably use an HTTP client like requests to get the document behind the URL, and feed that document to Beautiful Soup.




" looks like a URL. Beautiful Soup is not an HTTP client. You should probably use an HTTP client like requests to get the document behind the URL, and feed that document to Beautiful Soup.

http://www.footytube.com" looks like a URL. Beautiful Soup is not an HTTP client. You should probably use an HTTP client like requests to get the document behind the URL, and feed that document to Beautiful Soup.



*Enjoy!" looks like a URL. Beautiful Soup is not an HTTP client. You should probably use an HTTP client like requests to get the document behind the URL, and feed that document to Beautiful Soup.
" looks like a URL. Beautiful Soup is not an HTTP client. You should probably use an HTTP client like requests to get the document behind the URL, and feed that document to Beautiful Soup.

Story" looks like a URL. Beautiful Soup is not an HTTP client. You should probably use an HTTP client like requests to get the document behind the URL, and feed that document to Beautiful Soup.


" looks like a URL. Beautiful Soup is not an HTTP client. You should probably use an HTTP client like requests to get the document behind the URL, and feed that document to Beautiful Soup.



https://www.google.com/amp/s/www.businessinsider.com/alex-morgan-ronaldo-rape-allegations-sports-illustrated-interview-2019-9%3famp?espv=1" looks like a URL. Beautiful Soup is not an HTTP client. You should probably use an HTTP client like requests to get the document behind the URL, and feed that document to Beautiful Soup.


https://images.app.goo.gl/3i5nHQ7WS6QpaeaV9" looks like a URL. Beautiful Soup is not an HTTP client. You should probably use an HTTP client like requests to get the document behind the URL, and feed that document to Beautiful Soup.



https://www.thatstatfootball.com/tempimg2" looks like a URL. Beautiful Soup is not an HTTP client. You should probably use an HTTP client like requests to get the document behind the URL, and feed that document to Beautiful Soup.

https://podcasts.apple.com/gb/podcast/dink-outside-the-box-a-football-podcast/id1550521497" looks like a URL. Beautiful Soup is not an HTTP client. You should probably use an HTTP client like requests to get the document behind the URL, and feed that document to Beautiful Soup.




In [23]:
# Save this csv used for coreference resolution
final_df.to_csv("NewFinalFinalComments10-21.csv", index=False)
final_df

Unnamed: 0,ComID,Comment,Author,Publish Date,ProComment
0,dl4xjfx,The transfer that will give birth to many craz...,v-d-c,2017-08-04 02:41:47,The transfer that will give birth to many craz...
1,dl4zg3i,Money isn't the reason Neymar left. Barca is f...,djb2spirit,2017-08-04 03:21:27,Money is not the reason Neymar left. Barca is ...
2,dl51noz,sock,vnfootball1,2017-08-04 04:07:19,sock
3,dl53ej5,This is always the most ignorant argument made...,jesonnier,2017-08-04 04:45:50,This is always the most ignorant argument made...
4,dl562z6,A very sad day in the history of transfers. No...,404randomguy404,2017-08-04 05:49:08,A very sad day in the history of transfers. No...
...,...,...,...,...,...
130942,gz32mmm,DL has a history of being a dirty player. Defi...,PresidenteClint,2021-05-22 21:20:48,DL has a history of being a dirty player. Defi...
130943,gz32ycm,I think Raul scoring on Chelsea and seeing him...,LincolnCoHo,2021-05-22 21:23:38,I think Raul scoring on Chelsea and seeing him...
130944,gz33yy9,"Yeah. I hope DL gets Zuñiga'd someday, just to...",PresidenteClint,2021-05-22 21:32:30,"Yeah. I hope DL gets Zuñiga'd someday, just to..."
130945,gz34cfi,Primeira Liga--Europe's *real* 5th-best league.,PresidenteClint,2021-05-22 21:35:47,Primeira Liga--Europe's *real* 5th-best league.


In [26]:
# Load comments dataset processed by coreference resolution
# cr_df = pd.read_csv("NewCRFinalComments10-21.csv")
# cr_df

Unnamed: 0,ComID,Comment,Author,Publish Date,ProComment
0,dl4xjfx,The transfer that will give birth to many craz...,v-d-c,2017-08-04 02:41:47,The transfer that will give birth to many craz...
1,dl4zg3i,Money isn't the reason Neymar left. Barca is f...,djb2spirit,2017-08-04 03:21:27,Money is not the reason Neymar left. Barca is ...
2,dl51noz,sock,vnfootball1,2017-08-04 04:07:19,sock
3,dl53ej5,This is always the most ignorant argument made...,jesonnier,2017-08-04 04:45:50,This is always the most ignorant argument made...
4,dl562z6,A very sad day in the history of transfers. No...,404randomguy404,2017-08-04 05:49:08,A very sad day in the history of transfers. No...
...,...,...,...,...,...
130942,gz32mmm,DL has a history of being a dirty player. Defi...,PresidenteClint,2021-05-22 21:20:48,DL has a history of being a dirty player. Defi...
130943,gz32ycm,I think Raul scoring on Chelsea and seeing him...,LincolnCoHo,2021-05-22 21:23:38,I think Raul scoring on CRaullsea and seeing D...
130944,gz33yy9,"Yeah. I hope DL gets Zuñiga'd someday, just to...",PresidenteClint,2021-05-22 21:32:30,"Yeah. I hope DL gets Zuñiga'd someday, just to..."
130945,gz34cfi,Primeira Liga--Europe's *real* 5th-best league.,PresidenteClint,2021-05-22 21:35:47,Primeira Liga--Europe's *real* 5th-best league.


In [27]:
# step 4: split comments into sentences
def split_comments(df):
    df = df.assign(ProComment=df['ProComment'].str.split('.')).explode('ProComment')
    # There are blank or emplty cell values after above process. Removing them
    df["ProComment"].replace('', np.nan, inplace=True)
    df.dropna(inplace=True)
    # reset index
    df = df.reset_index(drop=True)
    for i in range(df.shape[0]):
        df.iloc[i]["ProComment"] = df.iloc[i]["ProComment"].strip()
    return df

# sentences 98560 -> 227790
# sents_df = split_comments(final_df)
# sents_df
crsents_df = split_comments(cr_df)
crsents_df

Unnamed: 0,ComID,Comment,Author,Publish Date,ProComment
0,dl4xjfx,The transfer that will give birth to many craz...,v-d-c,2017-08-04 02:41:47,The transfer that will give birth to many craz...
1,dl4zg3i,Money isn't the reason Neymar left. Barca is f...,djb2spirit,2017-08-04 03:21:27,Money is not the reason Neymar left
2,dl4zg3i,Money isn't the reason Neymar left. Barca is f...,djb2spirit,2017-08-04 03:21:27,Barca is financially capable of paying whateve...
3,dl4zg3i,Money isn't the reason Neymar left. Barca is f...,djb2spirit,2017-08-04 03:21:27,"Neymar wants to get out of Messi's shadow, and..."
4,dl4zg3i,Money isn't the reason Neymar left. Barca is f...,djb2spirit,2017-08-04 03:21:27,Whereas the alternative is waiting years for M...
...,...,...,...,...,...
309222,gz33yy9,"Yeah. I hope DL gets Zuñiga'd someday, just to...",PresidenteClint,2021-05-22 21:32:30,Yeah
309223,gz33yy9,"Yeah. I hope DL gets Zuñiga'd someday, just to...",PresidenteClint,2021-05-22 21:32:30,"I hope DL gets Zuñiga'd someday, just to humbl..."
309224,gz34cfi,Primeira Liga--Europe's *real* 5th-best league.,PresidenteClint,2021-05-22 21:35:47,Primeira Liga--Europe's *real* 5th-best league
309225,gz3b729,Lykke til bror. Kom til Mjøndalen!,20mcan20,2021-05-22 22:33:57,Lykke til bror


In [28]:
# step 5: export the processed dataset for name entity recognition
# sents_df.to_csv("FinalCommentsSents10-21.csv", index=False)
# sents_df
crsents_df.to_csv("NewCRFinalCommentsSents10-21.csv", index=False)
crsents_df