## free-marketing-watch
Search social media for mentions of brands and collect the comments/tweets/etc.
Count mentions of each and perform sentiment analysis on the strings.

In [1]:
import praw
import pandas as pd
from secrets import *
from pathlib import Path
from brands import fashion
import re

In [2]:
reddit = praw.Reddit(client_id=client_id,
               client_secret=client_secret,
               user_agent=user_agent)

Now to get the comments data, put it in a dataframe, and clean the data to get what we want.

In [3]:
def create_comments_df(subreddit_):
    """Returns a pandas df with the information about comments from this year.

    Inputs
    -----
    str: subreddit to be searched.
    Return
    ------
    Pandas dataframe with all the data from the praw object.m
    """
    subreddit = reddit.subreddit(subreddit_)
    submission_list = subreddit.search(
        'WAYWT',sort='new', time_filter="year", limit=1000
    )  # generator of submissions in the subreddit
    comment_list = []
    for submission in submission_list:
        submission.comments.replace_more(limit=0)
        for comment in submission.comments.list():
            comment_list.append(comment)

    df = pd.DataFrame([vars(comment) for comment in comment_list])
    df2 = df.loc[:,['link_id','id','score','body']]
    df2['Subreddit'] = subreddit_
    return df2


In [4]:
def brand_check(df,brandlist):
    """Checks comment body against a list of brands to see if it mentions any.
       Adds what brand was found if any in the brands column.
       
       Inputs
       ------
       Dataframe you will search over and a list of brands in a separate file. 
       Return
       ------
       Dataframe with column indicating what brand was found in the values.
       """

    df2 = df.body.str.extractall(pat=brandlist, flags=re.VERBOSE)
    df2.dropna(axis=0,how='all',inplace=True)
    return df2

### This takes a long time, probably around 30 minutes per 100,000 comments.

In [11]:
df1 = create_comments_df('femalefashionadvice')
df1

Unnamed: 0,link_id,id,score,body,Subreddit
0,t3_kl62co,gh7lg2l,79,30 / Seattle / WFH / CCW\n\nThis past week has...,femalefashionadvice
1,t3_kl62co,gh73lm9,36,29/FL/Working from home | My IG: [secondhandps...,femalefashionadvice
2,t3_kl62co,gh7rhkd,30,33/Midwest US/stay at home mom of two toddlers...,femalefashionadvice
3,t3_kl62co,gh7tcea,25,27 / kentucky / nonprofit wfh\n\n[in my trusty...,femalefashionadvice
4,t3_kl62co,gh823b6,28,42 / Grad Student on winter break / West Coast...,femalefashionadvice
...,...,...,...,...,...
13370,t3_eif7u5,fcrvhlu,3,thank you! i think i got it at hot topic many ...,femalefashionadvice
13371,t3_eif7u5,fct07im,1,"Hahaha I did not! Class of '09 in NC, but say ...",femalefashionadvice
13372,t3_eif7u5,fct28pu,5,"I will! Your aesthetic reminds me of her, too,...",femalefashionadvice
13373,t3_eif7u5,fct406p,2,a solid aspiration! i wonder if she still carr...,femalefashionadvice


Run to export the df to csv. Careful about overwriting. Use the mode = 'a' line to add to an existing file.


In [12]:
p = Path.cwd() / 'data' / 'waywtdf.csv'
#df1.to_csv(path_or_buf = p)
df1.to_csv(path_or_buf = p, mode = 'a', header=False)

In [5]:
p = Path.cwd() / 'data' / 'waywtdf.csv'
df = pd.read_csv(p)
df

Unnamed: 0.1,Unnamed: 0,link_id,id,score,body,Subreddit
0,0,t3_klxncy,ghc7s6s,12,Really like the crooklyn hat and Santa coat gu...,malefashionadvice
1,1,t3_klxncy,ghbh5m6,4,Thank you to everyone who participated [in yes...,malefashionadvice
2,2,t3_klxncy,ghczbnz,11,Finally a real and unbiased member of the publ...,malefashionadvice
3,3,t3_klxncy,ghdw1as,3,"I believe that's Frank Rossitano. Yeah, probab...",malefashionadvice
4,4,t3_klxncy,ghcr9kf,2,Add funny to that list. What a freaking all star.,malefashionadvice
...,...,...,...,...,...,...
33802,13370,t3_eif7u5,fcrvhlu,3,thank you! i think i got it at hot topic many ...,femalefashionadvice
33803,13371,t3_eif7u5,fct07im,1,"Hahaha I did not! Class of '09 in NC, but say ...",femalefashionadvice
33804,13372,t3_eif7u5,fct28pu,5,"I will! Your aesthetic reminds me of her, too,...",femalefashionadvice
33805,13373,t3_eif7u5,fct406p,2,a solid aspiration! i wonder if she still carr...,femalefashionadvice


If you prefer pickling over csv, use these cells for IO

In [None]:
p = Path.cwd() / 'data' / 'commentdf.pkl'
df.to_pickle(path = p)

In [None]:
df = pd.read_pickle(filepath_or_buffer=p)

If you read from CSV it created an extra column so this cell drops it.

In [6]:
df = df.drop(columns =['Unnamed: 0'])

In [7]:
df2 = brand_check(df,fashion)

In [8]:
pd.set_option("display.max_rows",50, "display.max_columns", None)
df2

Unnamed: 0_level_0,Unnamed: 1_level_0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,64,65,66,67,68,69,70,71,72,73,74,75,76,77
Unnamed: 0_level_1,match,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1,Unnamed: 37_level_1,Unnamed: 38_level_1,Unnamed: 39_level_1,Unnamed: 40_level_1,Unnamed: 41_level_1,Unnamed: 42_level_1,Unnamed: 43_level_1,Unnamed: 44_level_1,Unnamed: 45_level_1,Unnamed: 46_level_1,Unnamed: 47_level_1,Unnamed: 48_level_1,Unnamed: 49_level_1,Unnamed: 50_level_1,Unnamed: 51_level_1,Unnamed: 52_level_1,Unnamed: 53_level_1,Unnamed: 54_level_1,Unnamed: 55_level_1,Unnamed: 56_level_1,Unnamed: 57_level_1,Unnamed: 58_level_1,Unnamed: 59_level_1,Unnamed: 60_level_1,Unnamed: 61_level_1,Unnamed: 62_level_1,Unnamed: 63_level_1,Unnamed: 64_level_1,Unnamed: 65_level_1,Unnamed: 66_level_1,Unnamed: 67_level_1,Unnamed: 68_level_1,Unnamed: 69_level_1,Unnamed: 70_level_1,Unnamed: 71_level_1,Unnamed: 72_level_1,Unnamed: 73_level_1,Unnamed: 74_level_1,Unnamed: 75_level_1,Unnamed: 76_level_1,Unnamed: 77_level_1,Unnamed: 78_level_1,Unnamed: 79_level_1
20,38,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,puma,,,,,,,,,,,,,,,,,,,,,
64,1119,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,Puma,,,,,,,,,,,,,,,,,,,,,
68,310,,,,,,,,,,,,,,,,,,,,,,,,Amazon,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
68,479,,,,,,,,,,,,,,,,,,,,,,,,Amazon,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
70,38,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,Express,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
33772,2055,Uniqlo,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
33773,222,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,LV,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
33777,700,,,H&M,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
33778,4415,,,,,,,,,,,,,,BR,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,


In [11]:
df2[0].value_counts()

 Uniqlo       394
\nUniqlo      113
 Uniqlo        97
Uniqlo         82
 uniqlo        79
uniqlo         72
 Uniqlo\n      60
 uniqlo        52
Uniqlo         35
\nUniqlo       20
 uniqlo\n      15
\nuniqlo       12
\nuniqlo        9
uniqlo          8
\nUniqlo\n      8
\tUniqlo\n      4
\tUniqlo        4
Uniqlo\n        4
\nuniqlo\n      3
\tuniqlo        2
\tuniqlo\n      2
uniqlo\n        2
Name: 0, dtype: int64