# Transformation of scrapped data

### Sommaire

1- Loading data

2- Removing for duplicates

3- Making a Join on comments with posts

4- Add a feature sentiment analysis

5- Exporting dataframe with all comments

6- Making calculation by post (grouping by information of comments)

In [16]:
import glob
import pandas as pd
pd.set_option("display.max_columns", 50)
import warnings
import re
import spacy
from spacy.lang.fr.stop_words import STOP_WORDS
en = spacy.load('en_core_web_sm')
import datetime

from textblob import Blobber
from textblob_fr import PatternTagger, PatternAnalyzer

# Parameters

In [17]:
subreddit = 'france'
update = True

# Loading data

## Comments & posts

In [18]:
warnings.simplefilter(action='ignore', category=FutureWarning)

# getting csv files from the folder
path = "../scrapping/exports/" + subreddit + "/comments"

# read all the files with extension .csv
filenames = glob.glob(path + "\*.csv")
all_comments = pd.DataFrame()
# for loop to iterate all csv files
for file in filenames:
   # reading csv files
   print("\nReading file = ",file)
   all_comments = all_comments.append(pd.read_csv(file))

all_comments = all_comments.reset_index(drop=True)


Reading file =  ../scrapping/exports/france/comments\france_comments_x67dzf.csv

Reading file =  ../scrapping/exports/france/comments\france_comments_x6ppsd.csv

Reading file =  ../scrapping/exports/france/comments\france_comments_xci5m1.csv

Reading file =  ../scrapping/exports/france/comments\france_comments_xhtzdy.csv

Reading file =  ../scrapping/exports/france/comments\france_comments_yg3dj6.csv


In [19]:
# getting csv files from the folder
path = "../scrapping/exports/" + subreddit + "/posts"

# read all the files with extension .csv
filenames = glob.glob(path + "\*.csv")
print('File names:', filenames)
all_titres = pd.DataFrame()
# for loop to iterate all csv files
for file in filenames:
   # reading csv files
   print("\nReading file = ",file)
   all_titres = all_titres.append(pd.read_csv(file))

all_titres = all_titres.reset_index(drop=True)

File names: ['../scrapping/exports/france/posts\\france_20220901_20221030.csv']

Reading file =  ../scrapping/exports/france/posts\france_20220901_20221030.csv


## Previous transformed data

In [20]:
previous_df = pd.read_parquet('exports/' + subreddit + '/' + subreddit + '_comments_merged.parquet', engine='pyarrow')

## Removing duplicates

In [21]:
all_comments = all_comments.drop_duplicates()
all_comments.rename(columns = {'commentId':'comment_id', 'parent_commentId':'parent_comment_id'}, inplace = True)

all_titres = all_titres.drop_duplicates()
all_titres.rename(columns = {'postId':'post_id'}, inplace = True)

Checking if we still have duplicated on the IDs of both dataframes

In [22]:
assert len(all_comments[all_comments.duplicated(['comment_id'])]) == 0, "Meh, I found some duplicated comments IDs in the dataframe"
assert len(all_titres[all_titres.duplicated(['post_id'])]) == 0, "Meh, I found some duplicated post IDs in the dataframe"

# Joining comments & posts

In [23]:
comments = all_titres.merge(all_comments, on="post_id", how = "left", suffixes=['_post', '_comment'])
columns_name = {'authors':'author_comment',
    'author':'author_post',
    'body':'text_post',
    'text':'text_comment'}
comments.rename(columns = columns_name, inplace = True)

## Preprocess 
### Date

In [24]:
comments['created_comment'] = pd.to_datetime(comments['created_comment'], format= '%Y/%m/%d')
comments['created_post'] = pd.to_datetime(comments['created_post'], format= '%Y/%m/%d')

In [25]:
comments['year_comment']= comments['created_comment'].dt.year
comments['month_comment']= comments['created_comment'].dt.month
comments['day_comment']= comments['created_comment'].dt.day
comments['year_post']= comments['created_post'].dt.year
comments['month_post']= comments['created_post'].dt.month
comments['day_post']= comments['created_post'].dt.day
del comments['created_comment']
del comments['created_post']

### Add a feature sentiment analysis

a- Construction of a NLP pipeline to clean the comments

In [26]:
def nlp_pipeline(comment) -> str:
    comment = str(comment).lower()
    comment = comment.replace('\n', ' ').replace('\r', '')
    comment = ' '.join(comment.split())
    comment = re.sub(r"[A-Za-z\.]*[0-9]+[A-Za-z%°\.]*", "", comment)
    comment = re.sub(r"(\s\-\s|-$)", "", comment)
    comment = re.sub(r"[,\!\?\%\(\)\/\"]", "", comment)
    comment = re.sub(r"\&\S*\s", "", comment)
    comment = re.sub(r"\&", "", comment)
    comment = re.sub(r"\+", "", comment)
    comment = re.sub(r"\#", "", comment)
    comment = re.sub(r"\$", "", comment)
    comment = re.sub(r"\£", "", comment)
    comment = re.sub(r"\%", "", comment)
    comment = re.sub(r"\:", "", comment)
    comment = re.sub(r"\@", "", comment)
    comment = re.sub(r"\-", "", comment)

    return comment

In [27]:
stop_words_fr = set(STOP_WORDS)
deselect_stop_words = ['ne','pas','plus','personne','aucun','ni','aucune','rien']
for w in deselect_stop_words:
    if w in stop_words_fr:
        stop_words_fr.remove(w)
    else:
        continue
stop_words_en = en.Defaults.stop_words
stop_words = stop_words_fr.union(stop_words_en)

### Applying pipeline and removing stopwords from our dataframe

In [28]:
comments['text_processed'] = comments['text_comment'].apply(nlp_pipeline)
comments['text_processed'] = comments['text_processed'].apply(lambda x: ' '.join([word for word in x.split() if word not in (stop_words)]))

comments['title_processed'] = comments['title'].apply(nlp_pipeline)
comments['title_processed'] = comments['title_processed'].apply(lambda x: ' '.join([word for word in x.split() if word not in (stop_words)]))

### Using the commentBlob library to get sentiment analysis

In [29]:
tb = Blobber(pos_tagger=PatternTagger(), analyzer=PatternAnalyzer())

senti_num_list = []
senti_cat_list = []
for i in comments["text_processed"]:
    vs = tb(i).sentiment[0]
    senti_num_list.append(vs)
    if (vs > 0.08):
        senti_cat_list.append('Positive')
    elif (vs < -0.08):
        senti_cat_list.append('Negative')
    else:
        senti_cat_list.append('Neutral')

comments['sentiment_num'] = senti_num_list
comments['sentiment_cat'] = senti_cat_list

In [30]:
print('## Original comment ##')
print(comments['text_comment'][100])
print('## Score comment ##')
print(comments['sentiment_num'][100])

## Original comment ##
Oui et le crime de génocide ont été jugés par le TPI-R pour le Rwanda et le TPI-Y pour les crimes durant l'ex-Yougoslavie (Srbrenica a été un élément déclencheur). Ce n'était pas parfait notamment pour le Rwanda où beaucoup de responsables n'ont pas été jugés mais Carla Del Ponte a fait un boulot énorme.

Elle est partie après avoir bataillé pour tenter de stopper les atrocités de la guerre en Syrie mais le Conseil de Sécurité de l'ONU est trop puissant, avec son fonctionnement des États membres avec droit de veto.
## Score comment ##
0.11


## Posts aggregating

In [31]:
# We separate posts we no comments for later calculations and set sentiment_mean and nb_comment to 0.
posts_no_com = comments[comments['comment_id'].isna()]
posts_no_com['sentiment_mean'] = 0
posts_no_com['nb_comment'] = 0
assert len(posts_no_com[posts_no_com.duplicated(['post_id'])]) == 0, "Meh, I found some duplicated post IDs in the dataframe"

#We drop rows of posts where we had 0 comment, we had to keep it before to calculate the number of comments per posts.
comments = comments[comments['comment_id'].notna()]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  posts_no_com['sentiment_mean'] = 0
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  posts_no_com['nb_comment'] = 0


In [32]:
#Grouping by all columns and calculating mean of metrics
posts = comments.groupby(['post_id','title', 'text_post', 'url', 'author_post', 'permalink_post','flair', 'year_post', 'month_post', 'day_post', 'title_processed'], as_index=False, dropna=False).agg(
            {
                'sentiment_num':'mean',
                'comment_id':'size'
            })
posts["comment_id"] = posts["comment_id"].astype('int64')
#We now append with posts with no comment
posts = posts.append(posts_no_com[posts.columns], ignore_index=True)
assert len(posts) == len(all_titres), "We don't have the same amount of posts than at the beginning."

posts.rename(columns = {'sentiment_num':'sentiment_mean','comment_id':'nb_comment'}, inplace = True)
#Append nb_comment is Nan so we're filling with 0
posts['nb_comment'] = posts['nb_comment'].fillna(0)

In [33]:
def save_comments_transform(df:pd.DataFrame):
    file_name = subreddit + '_comments_merged'
    path_csv = 'exports/france/' + file_name + '.csv'
    df.to_csv(path_csv, index = False, encoding = 'utf-8')
    print('Saved csv df in : ' + path_csv)
    path_parquet = 'exports/france/' + file_name + '.parquet'
    df.to_parquet(path_parquet, index = False, engine='pyarrow')
    print('Saved parquet df in : ' + path_parquet)

def save_posts_transform(df:pd.DataFrame):
    file_name = subreddit + '_posts_merged'
    path_csv = 'exports/france/' + file_name + '.csv'
    df.to_csv(path_csv, index = False, encoding = 'utf-8')
    print('Saved csv df in : ' + path_csv)
    path_parquet = 'exports/france/' + file_name + '.parquet'
    df.to_parquet(path_parquet, index = False, engine='pyarrow')
    print('Saved parquet df in : ' + path_parquet)

save_comments_transform(comments)
save_posts_transform(posts)

Saved csv df in : exports/france/france_comments_merged.csv
Saved parquet df in : exports/france/france_comments_merged.parquet
Saved csv df in : exports/france/france_posts_merged.csv
Saved parquet df in : exports/france/france_posts_merged.parquet


In [34]:
posts.head(2)

Unnamed: 0,post_id,title,text_post,url,author_post,permalink_post,flair,year_post,month_post,day_post,title_processed,sentiment_mean,nb_comment
0,x2q6xn,"Toi qui liras ça, si tu as un ami fidèle",ne le laisse jamais tomber. \n\nSi cet ami te...,https://www.reddit.com/r/france/comments/x2q6x...,JeuDeLaVie,/r/france/comments/x2q6xn/toi_qui_liras_ça_si_...,,2022,9,1,liras ami fidèle,0.061429,7
1,x2qxry,[THREAD] - Comment refaire de la France une su...,[removed],https://www.reddit.com/r/france/comments/x2qxr...,Wonderful-Excuse4922,/r/france/comments/x2qxry/thread_comment_refai...,Économie,2022,9,1,[thread]comment refaire france superpuissance ...,0.09546,39
