# Reddit Data Cleaning

In [15]:
import pandas as pd
import json
import re
import os

In [16]:
def flatten_deep(arr: list):
    """ Flattens arbitrarily-nested list `arr` into single-dimensional. """

    while arr:
        if isinstance(arr[0], list) and not isinstance(arr[0], str):  # Checks whether first element is a list
            arr = arr[0] + arr[1:]  # If so, flattens that first element one level
        else:
            yield arr.pop(0)  # Otherwise yield as part of the flat array

def intersection(lst1, lst2):
    lst3 = [value for value in lst1 if value in lst2]
    return lst3

def get_curl_pattern(c, cp_reddit=''):
    curlpattern = ""
    my_split = re.split('[\W\:\(\)]+',c)

    curlpattern = " ".join(intersection(['1a','1b','1c','2a','2b','2c','3a','3b','3c','4a','4b','4c'], my_split))
    
    if curlpattern == "" and cp_reddit == 'wavyhair':
        curlpattern = "2"
    elif curlpattern == "" and cp_reddit == 'curlyhair':
        curlpattern = "3"
        
    return(curlpattern)



def get_products(c):
    # types of products
    product_types = ['shampoo', 'conditioner', 'gel', 'mousse', 'leave in', 'leave-in', 'cream', 'wash']
    
    # all lines in comment
    product_lists = [x for x in c.split("\n") if len(intersection(x.split(), product_types)) > 0]
    
    return(product_lists)
    
def parse_comment(c, cp_reddit=""):
    comment_dict = get_hair_stats(c, cp_reddit)
    comment_dict['products'] = get_products(c)
    
    return(comment_dict)

In [17]:
def get_stats(c):
    stopwords = ['porosity','density','thickness']
    my_split = re.split('[\W\:\(\)]+',c)
    
    # get indices for all stopwords in comment
    indices = pd.DataFrame.from_dict([{'key':x, 'val':my_split.index(x)}  for x in stopwords if x in my_split])
    info_dict = {}
    
    if (len(indices) > 0):
        indices = indices.sort_values(by='val')
    
        # get substrings immediately before the descriptor
        prev_idx = -1
        for idx, row in indices.iterrows():
            info_dict[row['key']] = ' '.join([x for x in my_split[max(prev_idx+1, row['val']-4):row['val']] if x in ['low', 'med', 'medium', 'high', 'average']])
            prev_idx=row['val']
    # set non_descriptors to null string
    for desc in [x for x in stopwords if x not in info_dict]:
        info_dict[desc] = ""
        
    if info_dict['thickness'] == '':
        if 'fine' in c:
            info_dict['thickness'] = 'fine'
        elif 'coarse' in c or 'thick' in c:
            info_dict['thickness'] = 'coarse'
    return(info_dict)

## Combine Data
First, we'll combine data from our three subreddits

In [18]:
all_data = []
datapath='../output/comments/wavyhair/'
for fname in [f for f in os.listdir(datapath) if os.path.isfile(datapath+f)]:
    x = pd.read_csv(datapath + fname)
    x['comments'] = x['comments'].str.lower()
    x['subreddit'] = 'r/wavyhair'
    all_data.append(x)
    
datapath='../output/comments/curlyhair/'
for fname in [f for f in os.listdir(datapath) if os.path.isfile(datapath+f)]:
    x = pd.read_csv(datapath + fname)
    x['comments'] = x['comments'].str.lower()
    x['subreddit'] = 'r/curlyhair'
    all_data.append(x)
    
datapath='../output/comments/naturalhair/'
for fname in [f for f in os.listdir(datapath) if os.path.isfile(datapath+f)]:
    x = pd.read_csv(datapath + fname)
    x['comments'] = x['comments'].str.lower()
    x['subreddit'] = 'r/naturalhair'
    all_data.append(x)

In [19]:
all_data = pd.concat(all_data)

In [20]:
all_data

Unnamed: 0,index,id,comments,subreddit
0,Mediumnotch,eh00u5,"bleaching my hair basically killed my waves, t...",r/wavyhair
1,Sharkelberryfinn,eh00u5,you might be able to get away with just using ...,r/wavyhair
2,SleepyOne123,eh00u5,olaplex 3 every week has been a game changer f...,r/wavyhair
3,jewolfin,eh00u5,bleached wavy hair (help!)&&&hello all! so i'm...,r/wavyhair
4,AMillionFreckles,egtv0c,hair type help- wavy or curly? my top layer wa...,r/wavyhair
...,...,...,...,...
82,Imanon12354,oqs7xh,why do i sometimes have bald spots on my head ...,r/naturalhair
83,Sunnymajor,oqs7xh,i'm not doctor but... it could be alopecia. \n...,r/naturalhair
84,my-weird-reality,oqs7xh,do you wear tight braids or slick your hair of...,r/naturalhair
85,maryplant,oqqrql,advice&&&i am mixed with 4a type hair and both...,r/naturalhair


## Retrieve hair stats

In [21]:
all_data['curlpattern'] = all_data['comments'].apply(get_curl_pattern)

In [22]:
hair_stats = pd.DataFrame.from_dict(all_data['comments'].apply(get_stats).values.tolist())
hair_stats

Unnamed: 0,porosity,density,thickness
0,,,
1,,,coarse
2,,,
3,,,
4,,,
...,...,...,...
26269,,,
26270,,,
26271,,,
26272,,,


In [23]:
all_data = pd.concat([all_data.reset_index(drop=True),hair_stats.reset_index(drop=True)], axis=1)
all_data.head()

Unnamed: 0,index,id,comments,subreddit,curlpattern,porosity,density,thickness
0,Mediumnotch,eh00u5,"bleaching my hair basically killed my waves, t...",r/wavyhair,,,,
1,Sharkelberryfinn,eh00u5,you might be able to get away with just using ...,r/wavyhair,,,,coarse
2,SleepyOne123,eh00u5,olaplex 3 every week has been a game changer f...,r/wavyhair,,,,
3,jewolfin,eh00u5,bleached wavy hair (help!)&&&hello all! so i'm...,r/wavyhair,2a 2b,,,
4,AMillionFreckles,egtv0c,hair type help- wavy or curly? my top layer wa...,r/wavyhair,3a,,,


In [24]:
all_data.density.value_counts()

                     26020
medium                  82
high                    66
low                     62
medium high             16
med high                 4
medium low               4
medium medium            4
low medium               4
average                  3
med                      2
med low                  1
medium low medium        1
high medium low          1
low low                  1
low high                 1
low med                  1
high low                 1
Name: density, dtype: int64

In [25]:
all_data.porosity = [x if x in ['low','high', ''] else 'medium' for x in all_data.porosity]
all_data.density = [x if x in ['low','high', ''] else 'medium' for x in all_data.density]
all_data.thickness = [x if x in ['low','fine', 'coarse', 'high', ''] else 'medium' for x in all_data.thickness]
all_data.thickness = ['fine' if x in ['low','fine'] else x for x in all_data.thickness]
all_data.thickness = ['coarse' if x in ['high','coarse'] else x for x in all_data.thickness]

In [26]:
all_data.head()

Unnamed: 0,index,id,comments,subreddit,curlpattern,porosity,density,thickness
0,Mediumnotch,eh00u5,"bleaching my hair basically killed my waves, t...",r/wavyhair,,,,
1,Sharkelberryfinn,eh00u5,you might be able to get away with just using ...,r/wavyhair,,,,coarse
2,SleepyOne123,eh00u5,olaplex 3 every week has been a game changer f...,r/wavyhair,,,,
3,jewolfin,eh00u5,bleached wavy hair (help!)&&&hello all! so i'm...,r/wavyhair,2a 2b,,,
4,AMillionFreckles,egtv0c,hair type help- wavy or curly? my top layer wa...,r/wavyhair,3a,,,


In [27]:
all_data.density.value_counts()

          26020
medium      126
high         66
low          62
Name: density, dtype: int64

## Output clean data

In [28]:
all_data['cantu'] = all_data['comments'].str.contains('cantu')
all_data['cd'] = all_data['comments'].str.contains("carols|carol's")
all_data['garnier'] = all_data['comments'].str.contains('garnier')
all_data['giovanni'] = all_data['comments'].str.contains('giovanni|giovani')
all_data['kc'] = all_data['comments'].str.contains('kinky curly')
all_data['maui'] = all_data['comments'].str.contains('maui')
all_data['mt'] = all_data['comments'].str.contains('moptop|mop top')
all_data['nym'] = all_data['comments'].str.contains('nym|not your mother')
all_data['pantene'] = all_data['comments'].str.contains('pantene')
all_data['sm'] = all_data['comments'].str.contains('shea moisture')
all_data['suave'] = all_data['comments'].str.contains('suave')
all_data['tj'] = all_data['comments'].str.contains('trader joe')
all_data['tresemme'] = all_data['comments'].str.contains('tresemme|tresame|tressame|tresamme')

In [29]:
all_data.to_csv("../output/additionalinfo.csv", index=False)