# How do mothers and fathers talk about parenting to different audiences? 

### Import modules

In [None]:
## Load needed modules
import re
import string
import nltk
import requests as rq
import json
import time
import sys ## for printing only
import tqdm ## This is for a progress bar
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import scipy.sparse as sp
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.utils import check_array
from sklearn.preprocessing import normalize
from sklearn.feature_extraction import text

# 2. Cleaning and preprocessing data

In [None]:
# Load the four comment exports into data frames.
# The two dad workbooks are read with header=None, so their column names are
# supplied explicitly; the two mom workbooks are read with pandas' defaults.
# NOTE(review): presumably the mom workbooks already carry a header row with
# the same four column names -- confirm against the files.
_daddit_columns = ['author', 'body', 'created_utc', 'subreddit']
selected_daddit = pd.read_excel(
    r'comments_daddit.xlsx', header=None, names=_daddit_columns
)
selected_daddit_parenting = pd.read_excel(
    r'comments_parenting_dads.xlsx', header=None, names=_daddit_columns
)
mommit_selection = pd.read_excel('comments_mommit.xlsx')
parenting_selection_mommit = pd.read_excel('comments_parenting_moms.xlsx')

In [None]:
# Report, one value per line: the number of rows in the r/Mommit export, the
# number of rows in the mothers' r/Parenting export, and the number of
# distinct authors in the r/Mommit export.
mommit_author_count = len(set(mommit_selection['author']))
print(
    len(mommit_selection),
    len(parenting_selection_mommit),
    mommit_author_count,
    sep='\n',
)

In [None]:
# Report, one value per line: the number of rows in the r/Daddit export, the
# number of rows in the fathers' r/Parenting export, and the number of
# distinct authors in the fathers' r/Parenting export.
daddit_parenting_author_count = len(set(selected_daddit_parenting['author']))
print(
    len(selected_daddit),
    len(selected_daddit_parenting),
    daddit_parenting_author_count,
    sep='\n',
)

In [None]:
# Stack the four exports into two data frames, one per community; the row
# index is renumbered from 0 in each result.
fatherhood = pd.concat((selected_daddit, selected_daddit_parenting), ignore_index=True)
motherhood = pd.concat((mommit_selection, parenting_selection_mommit), ignore_index=True)

In [None]:
# Tag every row with the community it came from, then stack both communities
# into one frame holding all authors.
for frame, label in ((fatherhood, 'father'), (motherhood, 'mother')):
    frame['gender'] = label
all_parents = pd.concat((fatherhood, motherhood), ignore_index=True)
all_parents.shape

In [None]:
# Authors who appear in both the r/Mommit and the r/Daddit exports.
mommit_authors = set(mommit_selection["author"])
daddit_authors = set(selected_daddit["author"])
common_authors = mommit_authors & daddit_authors
len(common_authors)

In [None]:
# Build the list of authors that are NOT in common_authors, i.e. the authors
# that appear on only one of the two community subreddits.
no_common = [
    name for name in set(all_parents["author"]) if name not in common_authors
]
len(no_common)

In [None]:
# Sanity check: single-community authors plus twice the shared authors.
# NOTE(review): the factor 2 presumably counts each shared author once per
# community -- confirm which "initial number of authors" this is compared to.
len(no_common) + len(common_authors)*2

In [None]:
# Drop every row with a missing value, keep only authors listed in no_common
# (this removes the authors found on both subreddits), and renumber the rows.
all_parents = all_parents.dropna()
keep_author = all_parents['author'].isin(no_common)
all_parents = all_parents[keep_author].reset_index(drop=True)

In [None]:
# Remove moderator notices ("your submission has been removed") from the dataset.
# The match is case-insensitive and done on the whole column at once; bodies are
# cast to str first so a non-text cell (e.g. a number read from Excel) cannot
# raise, and the selection no longer depends on the frame having a 0..n-1 index.
removed_mask = (
    all_parents['body']
    .astype(str)
    .str.lower()
    .str.contains("your submission has been removed", regex=False)
)
# Positional row numbers of the matching comments, kept for inspection.
removed_indices = [position for position, hit in enumerate(removed_mask) if hit]
removed_comment = len(removed_indices)
print(removed_comment)
print(removed_indices)

all_parents = all_parents[~removed_mask].reset_index(drop=True)

In [None]:
# Number of rows left in all_parents at this point (displayed as the cell output).
len(all_parents)

In [None]:
# Drop rows whose author is the "[deleted]" placeholder or the AutoModerator
# account, then display the resulting shape.
excluded_authors = ["[deleted]", "AutoModerator"]
all_parents = all_parents[~all_parents["author"].isin(excluded_authors)]
all_parents.shape

In [None]:
# Collect every author name containing "bot" (case-insensitive); these are
# treated as candidate bot accounts rather than real authors.
bots_parents = {
    name for name in all_parents['author'] if "bot" in name.lower()
}
print(bots_parents)
print(f"{len(bots_parents)} bots will be removed")

In [None]:
print(f"{len(all_parents)} comments in parenthood before removing the bots")
# Account names that merely contain the letters "bot" and must be kept.
not_bots = {"Phlebotanist", "BotchedUpElia", "redbottleofshampoo"}
# Build the final bot list by filtering instead of calling remove() while
# iterating the same list: removing during iteration skips the element that
# follows each removal, so a whitelisted name could stay in the bot list.
new_bots_parents = [author for author in bots_parents if author not in not_bots]
#Removing the bots
all_parents = all_parents[~all_parents['author'].isin(new_bots_parents)]
print(f"{len(all_parents)} comments left after removing the ones written by bots.")

In [None]:
# Renumber the rows from 0, then drop any row that still has a missing value.
all_parents = all_parents.reset_index(drop=True)
all_parents = all_parents.dropna()

In [None]:
# Save the current all_parents frame (after the bot-removal step above) to a pickle file.
all_parents.to_pickle('all_parents_nobots.pkl')

In [None]:
# Work on a copy so all_parents stays untouched: drop rows with missing
# values and renumber the remaining rows from 0.
data_clean = all_parents.copy().dropna().reset_index(drop=True)

In [None]:
# first round of text cleaning techniques

def clean_text_round1(text):
    '''Apply the first cleaning pass to one comment.

    Steps, in order: lowercase the text, delete anything enclosed in square
    brackets, delete ASCII punctuation, delete every word that contains a
    digit, and delete characters outside the code-point ranges
    U+0000-U+05C0 and U+2100-U+214F.

    Parameters: text -- the comment as a str.
    Returns: the cleaned str (whitespace is left as is, so removed words can
    leave consecutive spaces behind).
    '''
    text = text.lower()
    # Raw strings keep "\[", "\w" and "\d" as regex escapes instead of
    # invalid Python string escapes (a SyntaxWarning on recent versions).
    text = re.sub(r'\[.*?\]', '', text)
    text = re.sub('[%s]' % re.escape(string.punctuation), '', text)
    text = re.sub(r'\w*\d\w*', '', text)
    text = re.sub("[^\u0000-\u05C0\u2100-\u214F]+", '', text)
    return text

# Alias used with Series.apply; a direct reference replaces the lambda wrapper.
round1 = clean_text_round1

In [None]:
# Store the first cleaning pass of every comment body in a new
# 'preprocessed' column; the original 'body' column is left unchanged.
data_clean['preprocessed'] = data_clean['body'].apply(round1)

In [None]:
# second round of cleaning
def clean_text_round2(text):
    '''Apply the second cleaning pass to one comment.

    Deletes curly quotes and the ellipsis character, and turns each run of
    newlines into a single space.

    Parameters: text -- the comment as a str.
    Returns: the cleaned str.
    '''
    text = re.sub('[‘’“”…]', '', text)
    # Replace line breaks with a space instead of deleting them; deleting
    # them glued the last word of a line to the first word of the next one.
    text = re.sub(r'\n+', ' ', text)
    return text

# Alias used with Series.apply; a direct reference replaces the lambda wrapper.
round2 = clean_text_round2

In [None]:
# Apply the second round of cleaning to the 'preprocessed' column in place
# (the 'body' column is not modified by this step).
data_clean['preprocessed'] = data_clean['preprocessed'].apply(round2)

In [None]:
# Save the cleaned data frame (original 'body' plus the cleaned
# 'preprocessed' column) to a pickle file.
data_clean.to_pickle('data_clean.pkl')

In [None]:
#create a function to tokenize the data
from nltk.tokenize import sent_tokenize
from nltk.tokenize import word_tokenize

def tokenize(text):
    '''Return the list of tokens produced by NLTK's word_tokenize for *text*.'''
    return word_tokenize(text)

Create a version of the data set with only nouns

In [None]:
# create a version of the dataset with only nouns
# Set-up for the noun extraction function defined below: import the tagger and
# lemmatizer, and download the NLTK resources requested here.
from nltk import pos_tag
from nltk.stem import WordNetLemmatizer
# This call works before the `import nltk` line below only because nltk is
# already imported in the first cell of the notebook.
nltk.download('wordnet')
import nltk
nltk.download('averaged_perceptron_tagger')
# NOTE(review): word_tokenize usually needs its own NLTK tokenizer resource,
# which is not downloaded in this cell -- confirm it is installed.

# Single lemmatizer instance shared by the extraction functions.
lemmatizer = WordNetLemmatizer()

def nouns(text):
    '''Return the nouns of *text* as one space-separated string.

    The text is tokenized, every token is lemmatized with the shared
    lemmatizer, the lemmas are POS-tagged, and only words whose tag starts
    with 'NN' are kept, in their original order.
    '''
    lemmas = [lemmatizer.lemmatize(token) for token in word_tokenize(text)]
    kept = []
    for word, tag in pos_tag(lemmas):
        if tag.startswith('NN'):
            kept.append(word)
    return ' '.join(kept)

In [None]:
# Copy of data_clean whose 'body' column holds only the nouns of each comment.
# NOTE(review): the extraction runs on the raw 'body' column, not on the
# cleaned 'preprocessed' column -- confirm this is intended.
data_nouns = data_clean.assign(body=data_clean['body'].apply(nouns))

In [None]:
# Copy of data_nouns whose 'body' column holds a list of tokens per comment.
tokenized_nouns = data_nouns.assign(body=data_nouns['body'].apply(tokenize))

In [None]:
# Count the comments whose token list is empty (no noun was found).
empty_comments = sum(1 for tokens in tokenized_nouns['body'] if len(tokens) == 0)
print(empty_comments)

In [None]:
# Keep only rows whose token list is not empty (an empty list renders as
# "[]" once converted to a string), then renumber the rows from 0.
has_tokens = tokenized_nouns['body'].astype(str) != "[]"
tokenized_nouns = tokenized_nouns[has_tokens].reset_index(drop=True)

In [None]:
#stop words
# Extra stop words added to scikit-learn's English list. The definition was
# commented out, which made the union() call below fail with a NameError on a
# fresh run; it is restored here unchanged.
add_stop_words = ["i", "kid", "thing", "im", "wa", "youre", "lot", "dont", "thats", "shes", "ha", "anything", "everything", "bit", "part", "everyone", "one", "doesnt", "theyre", "etc", "u", "didnt", "mine", "anyone", "isnt", "well", "yeah", "get", "yes", "while", "whats", "amount", "youve", "youll", "haha", "cant", "le", "lo", "kiddo", "arent", "ive", "wouldnt", "op", "top", "half", "let", "wont", "set", "wasnt", "none", "yours", "weve", "ask", "couldnt", "theyll", "yo", "reddit", "ours", "go", "gon", "gt", "wish", "app", "tell", "come", "want", "itll", "ok", "yep", "bc", "youd", "theyve", "okay", "nope", "thread", "oh", "aspect", "kiddos", "omg", "shouldnt", "take", "yr", "v", "till", "push", "fine", "x", "d", "mo", "hi", "b", "hers", "theyd", "yup", "hahaha", "er", "boy", "baby", "child", "parent", "way", "girl", "son", "daughter", "mom", "dad", "husband", "woman", "lol", "husband", "wife", "brother", "sister", "mother", "father"]
stop_words = text.ENGLISH_STOP_WORDS.union(add_stop_words)
#remove stopwords from tokenized dataset
tokenized_nouns['body'] = tokenized_nouns['body'].apply(
    lambda tokens: [item for item in tokens if item not in stop_words]
)

In [None]:
# Save the tokenized noun-only data set to a pickle file.
tokenized_nouns.to_pickle('tokenized_nouns.pkl')

In [None]:
# Also pickle the noun-only data in its untokenized form (one string of nouns
# per comment). No document-term matrix or CountVectorizer is saved in this cell.
data_nouns.to_pickle('data_nouns.pkl')

Create a version of the data set with nouns and verbs

In [None]:
# create a version of the dataset with nouns and verbs
# Set-up for the noun-and-verb extraction function defined below: import the
# tagger and lemmatizer, and download the NLTK resources requested here.
from nltk import pos_tag
from nltk.stem import WordNetLemmatizer
# This call works before the `import nltk` line below only because nltk is
# already imported in the first cell of the notebook.
nltk.download('wordnet')
import nltk
nltk.download('averaged_perceptron_tagger')

# Single lemmatizer instance used by the extraction function.
lemmatizer = WordNetLemmatizer()

def nouns_verbs(text):
    '''Return the nouns and verbs of *text* as one space-separated string.

    The text is tokenized, every token is lemmatized with the shared
    lemmatizer, the lemmas are POS-tagged, and only words whose tag starts
    with 'NN' or 'VB' are kept, in their original order.
    '''
    lemmas = [lemmatizer.lemmatize(token) for token in word_tokenize(text)]
    kept = []
    for word, tag in pos_tag(lemmas):
        if tag[:2] in ('NN', 'VB'):
            kept.append(word)
    return ' '.join(kept)

In [None]:
# Copy of data_clean whose 'body' column holds only the nouns and verbs of
# each comment.
# NOTE(review): the extraction runs on the raw 'body' column, not on the
# cleaned 'preprocessed' column -- confirm this is intended.
data_nouns_verbs = data_clean.assign(body=data_clean['body'].apply(nouns_verbs))

In [None]:
# Copy of data_nouns_verbs whose 'body' column holds a list of tokens per comment.
tokenized_nouns_verbs = data_nouns_verbs.assign(
    body=data_nouns_verbs['body'].apply(tokenize)
)

In [None]:
# Count the comments whose token list is empty (no noun or verb was found).
empty_comments = sum(
    1 for tokens in tokenized_nouns_verbs['body'] if len(tokens) == 0
)
print(empty_comments)

In [None]:
# Keep only rows whose token list is not empty (an empty list renders as
# "[]" once converted to a string), then renumber the rows from 0.
has_tokens = tokenized_nouns_verbs['body'].astype(str) != "[]"
tokenized_nouns_verbs = tokenized_nouns_verbs[has_tokens].reset_index(drop=True)

In [None]:
#stop words
# Extra stop words added to scikit-learn's English list. The definition was
# commented out, which made the union() call below fail with a NameError on a
# fresh run; it is restored here unchanged.
add_stop_words = ["i", "kid", "thing", "im", "wa", "youre", "lot", "dont", "thats", "shes", "ha", "anything", "everything", "bit", "part", "everyone", "one", "doesnt", "theyre", "etc", "u", "didnt", "mine", "anyone", "isnt", "well", "yeah", "get", "yes", "while", "whats", "amount", "youve", "youll", "haha", "cant", "le", "lo", "kiddo", "arent", "ive", "wouldnt", "op", "top", "half", "let", "wont", "set", "wasnt", "none", "yours", "weve", "ask", "couldnt", "theyll", "yo", "reddit", "ours", "go", "gon", "gt", "wish", "app", "tell", "come", "want", "itll", "ok", "yep", "bc", "youd", "theyve", "okay", "nope", "thread", "oh", "aspect", "kiddos", "omg", "shouldnt", "take", "yr", "v", "till", "push", "fine", "x", "d", "mo", "hi", "b", "hers", "theyd", "yup", "hahaha", "er", "boy", "baby", "child", "parent", "way", "girl", "son", "daughter", "mom", "dad", "husband", "woman", "lol", "husband", "wife", "brother", "sister", "mother", "father"]
stop_words = text.ENGLISH_STOP_WORDS.union(add_stop_words)
#remove stopwords from tokenized dataset
# The filter targets the nouns+verbs frame: the copy-pasted original filtered
# tokenized_nouns again, so this data set was saved with its stop words intact.
tokenized_nouns_verbs['body'] = tokenized_nouns_verbs['body'].apply(
    lambda tokens: [item for item in tokens if item not in stop_words]
)

In [None]:
# Save the tokenized noun-and-verb data set to a pickle file.
tokenized_nouns_verbs.to_pickle('tokenized_nouns_verbs.pkl')