# Sentiment Analysis and Consumer Profiling

This script includes cleaning, EDA, feature creation, and some preliminary analysis. 

In [112]:
import pandas as pd
import numpy as np 
from nltk.tokenize import RegexpTokenizer
from nltk.stem import PorterStemmer
from nltk.corpus import stopwords
import nltk
import re
import matplotlib.pyplot as plt
%matplotlib inline
import matplotlib 
import matplotlib.patches as mpatches
import seaborn as sns
from textblob import TextBlob
from collections import Counter

In [113]:
food = pd.read_csv('./fastfood.csv', dtype=object, index_col=0)

In [114]:
food.shape

(146350, 14)

In [115]:
#dropping malformed data (invalid index)
food.drop(list(food.loc[food['unique_code'] == 'Nobody should be too big to fail...'].index),
          axis=0, inplace=True)

In [116]:
#dropping duplicate entries
food = food.drop_duplicates(subset='unique_code')

In [117]:
#eliminated more than 50% of observations
food.shape

(62420, 14)

In [118]:
food.reset_index(inplace=True, drop=True)

In [119]:
#dropping all rows with all null values
food = food.drop(food[food.isnull().all(axis=1)].index[0], axis=0).reset_index(drop=True)

In [120]:
#Dropping all values that are not company related (only 3 observations)
to_drop = []

for row_num, val in enumerate(food['Company']):
    if val[0] != '@':
        to_drop.append(row_num)
        
food = food.drop(to_drop, axis=0).reset_index(drop=True)

In [121]:
#replacing strings with integers
mapper = {'True': 1, 'False': 0}
food['user_is_verified'] = food.user_is_verified.map(mapper)

In [122]:
#filling nulls and converting data types
food['retweet_count'] = food.retweet_count.astype(int)

In [123]:
#Set to run Midnight and 5pm EST everyday, the times are in UTC, making EST
food['time_tweeted'] = pd.to_datetime(food['time_tweeted']) - pd.Timedelta(hours=4)

In [124]:
food['favorite_count'] = food.favorite_count.astype(int)
food['number_of_people_they_follow'] = food.number_of_people_they_follow.astype(int)
food['number_of_user_tweets'] = food.number_of_user_tweets.astype(int)
food['user_followers_count'] = food.user_followers_count.astype(int)

In [125]:
#creating a basic name category that isnt the handle

mapper = {'@DennysDiner': 'Dennys', '@ChipotleTweets': 'Chipotle',
         '@McDonalds': 'McDonalds', '@Wendys': 'Wendys', '@Starbucks':'Starbucks',
         '@dunkindonuts':'Dunkin_Donuts', '@dominos': 'Dominos', '@shakeshack': 'Shake_Shack',
         '@sonicdrivein': 'Sonic', '@wingstop': 'Wingstop', '@CrackerBarrel': 'Cracker_Barrel', 
         '@redrobinburgers': 'Red_Robin', '@Potbelly': 'Potbelly'}

food['name'] = food.Company.map(mapper)

@DennysDiner        6964
@ChipotleTweets     6701
@McDonalds          6695
@Wendys             6648
@Starbucks          6579
@dunkindonuts       6343
@dominos            6206
@shakeshack         4090
@sonicdrivein       3990
@wingstop           3112
@CrackerBarrel      3029
@redrobinburgers    1402
@Potbelly            657
Name: Company, dtype: int64

Examined multiple methods of cleaning for sentiment data, only the final method runs now, but 
have kept my trial methods below (hashed out)

In [126]:
#shouldnt be removing stopwords before sentiment analysis:
#http://www.lrec-conf.org/proceedings/lrec2014/pdf/292_Paper.pdf
#testing different functions for preprocessing text for sentiment analysis


def Text_Cleaner(text):
    """Takes text, eliminates URLS, replaces contractions, tokenizes, 
    removes company names, lower cases, removes calls to twitter handles, 
    returns a string, same as version 1, but only looking at words"""
    text = re.sub(r'(https)[^\s]+', '', text)
    text = re.sub(r'can\'t', 'can not', text, flags=re.IGNORECASE)
    text = re.sub(r'don\'t', 'do not', text, flags=re.IGNORECASE)
    text = re.sub(r'isn\'t', 'is not', text, flags=re.IGNORECASE)
    text = re.sub(r'aren\'t', 'are not', text, flags=re.IGNORECASE)
    text = re.sub(r'wasn\'t', 'was not', text, flags=re.IGNORECASE)
    text = re.sub(r'weren\'t', 'were not', text, flags=re.IGNORECASE)
    text = re.sub(r'haven\'t', 'have not', text, flags=re.IGNORECASE)
    text = re.sub(r'\b(rt|RT)', '', text)
    text = re.sub(r'@[a-zA-Z0-9]+', '', text)
    text = re.sub('#', '', text)
    text = re.sub(r'(wtf)+\b', 'what the fuck', text, flags=re.IGNORECASE)
    tokenizer = RegexpTokenizer(r'\w+')
    words = tokenizer.tokenize(text)
    return ' '.join(words)

# def Text_Cleaner_version_1(text):
#     """Takes text, eliminates URLS, replaces contractions, tokenizes, 
#     removes company names, lower cases, removes calls to twitter handles, 
#     returns a string"""
#     text = re.sub(r'(https)[^\s]+', '', text)
#     text = re.sub(r'\b(rt|RT)', '', text)
#     text = re.sub(r'@[a-zA-Z0-9]+', '', text)
#     text = re.sub('#', '', text)
#     return text

#     LOOKING AT VARIOUS METHODS FOR PREPROCESSING FOR SENTIMENT ANALYSIS
#     tokenizer = RegexpTokenizer(r'\w+')
#     words = tokenizer.tokenize(text)
#     lower = [x.lower() for x in words]
#     words = [word for word in words if word != 'rt']
#     eliminator = [re.sub(r'(mcdon|dunki|denn|redro|sonic|starb|shakesh|domino|crackerb|chipot|wend)[a-z]+','',x)
#                   for x in lower]
#     return ' '.join(eliminator2)

In [83]:
# #Creating a test set of uncleaned data to check the value of the Text_Cleaner functions
# food['text_sentiment_no_clean'] = food['text'].apply(lambda x: TextBlob(x).sentiment.polarity)

In [84]:
# #converting text with version 1
# food['text_sentiment_v1'] = food['text'].apply(Text_Cleaner_version_1)
# #Calculating sentiment with TextBlob
# food['sentiment_score_v1'] = food['text_sentiment_v1'].apply(lambda x: TextBlob(x).sentiment.polarity)

In [127]:
#converting text with version 2
food['text_sentiment'] = food['text'].apply(Text_Cleaner)
#Calculating sentiment with TextBlob
food['sentiment_score'] = food['text_sentiment'].apply(lambda x: TextBlob(x).sentiment.polarity)

In [128]:
#Looking at overall sentiment by company (version 1) (0 neutral, 1 positive, -1 negative)
food.groupby('Company')['sentiment_score'].mean().sort_values()

Company
@dominos            0.067637
@dunkindonuts       0.091108
@wingstop           0.095163
@sonicdrivein       0.103452
@Wendys             0.111101
@McDonalds          0.111456
@DennysDiner        0.119491
@redrobinburgers    0.119933
@ChipotleTweets     0.143496
@CrackerBarrel      0.154101
@Potbelly           0.169525
@Starbucks          0.175698
@shakeshack         0.186101
Name: sentiment_score, dtype: float64

In [130]:
#creating sentiment dummy variables 

def dummy_maker(val):
    """Takes in a float and returns a dummy based on the value
    to be used in pandas.apply"""
    if val == 0:
        return 0
    elif val > 0:
        return 1
    else:
        return -1

food['sentiment_dummies'] = food['sentiment_score'].apply(dummy_maker)

# food['sentiment_dummies_v2'] = food['sentiment_score_v2'].apply(dummy_maker)

# food['sentiment_dummies_uncleaned'] = food['text_sentiment_no_clean'].apply(dummy_maker)

In [131]:
# print(food['sentiment_dummies_v1'].value_counts()) 
# print(food['sentiment_dummies_v2'].value_counts())
print(food['sentiment_dummies'].value_counts())

 1    27184
 0    26025
-1     9207
Name: sentiment_dummies, dtype: int64


In [137]:
#manually testing reliability of the different measures with a random subset
random_numbers = list(np.random.randint(0, 62415, 5))
for num, val in enumerate(food.iloc[random_numbers, :]['text']):
    print(val)
    print('\n')
    print(food.iloc[[num], :]['sentiment_dummies'].values)

Thank you @ChipotleTweets for fixing my bad experience and turning it into a good one with these gift cards ! https://t.co/hSXo9kMlgE


[1]
RT @Ninja: Ended the stream a little early to eat my @ChipotleTweets that my wife @JGhosty got for me today. We weren’t expecting any packa…


[0]
RT @ellaxmaree: Hi @Starbucks I need you to deliver plzzz


[-1]
RT @CrackerBarrel: Happy #FirstDayOfSpring. May your heart bloom with joy this season! 🌷🌸💐🌼🌻 https://t.co/WshGuSSMJ4


[0]
RT @VeraSweeney: Want to see how I completely redid my dining room table after visiting @CrackerBarrel? Even the clothes I am wearing are f…


[1]


In [161]:
def negativity_formatter(company, hourly_rate=False):
    """This function accepts a company and returns either 
    the separate company, positive, negative dataframes (in that order)
    OR the same order plus a dataframe of hourly rates, if hourly_rate = True"""
    df = food.loc[food['name'] == company]
    positive_df = df.loc[df['sentiment_dummies'] == 1]
    negative_df = df.loc[df['sentiment_dummies'] == -1]
    if not hourly_rate:
        return df, positive_df, negative_df
    else:
        rate_df = pd.DataFrame()
        rate_df['pos_count'] = positive_df.groupby(positive_df['time_tweeted'].dt.hour)['Company'].count()
        rate_df['neg_count'] = negative_df.groupby(negative_df['time_tweeted'].dt.hour)['Company'].count()
        rate_df['rate'] = neg / (pos + neg)
        return df, positive_df, negative_df, rate_df
        

In [165]:
#43% of tweets are retweets
food.loc[food['text'].str[:2] == 'RT'].shape[0] / food.shape[0]

0.4308831068956678

In [168]:
#Creating a dummy for whether the tweet is a retweet or not
retweets = []
for val in food['text']:
    if val[:2] == 'RT':
        retweets.append(1)
    else: 
        retweets.append(0)
food['is_a_retweet'] = retweets

# LEFT OFF HERE, BEGIN CREATING CUSTOMER PROFILES

In [26]:
#customer profiles still skewed by large means- looking at medians
without_retweets.groupby('Company')[list(food.select_dtypes(include=np.number).columns)].median()

Unnamed: 0_level_0,favorite_count,number_of_people_they_follow,number_of_user_tweets,retweet_count,user_followers_count,user_is_verified,sentiment_score,sentiment_dummies
Company,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
@ChipotleTweets,0.0,356.0,4801.0,0,318.0,0,0.0,0
@CrackerBarrel,0.0,487.0,5596.0,0,376.0,0,0.0,0
@DennysDiner,0.0,272.5,3419.5,0,195.0,0,0.0,0
@McDonalds,0.0,320.0,3052.0,0,232.5,0,0.0,0
@Potbelly,0.0,483.0,8162.0,0,488.0,0,0.0,0
@Starbucks,0.0,374.5,5558.5,0,361.5,0,0.0,0
@Wendys,0.0,262.0,1225.0,0,146.0,0,0.0,0
@dominos,0.0,317.0,3888.0,0,249.0,0,0.0,0
@dunkindonuts,0.0,434.0,6883.0,0,390.5,0,0.0,0
@redrobinburgers,0.0,449.0,5362.0,0,325.0,0,0.0,0


In [28]:
without_retweets.shape

(27211, 17)

In [29]:
#a large number of the people tweet regularly at these companies 
without_retweets = without_retweets.drop_duplicates(subset=['Company', 'user_name'])
without_retweets.shape

(17212, 17)

In [33]:
#Characteristics of users tweeting at each company
without_retweets.groupby('Company')[list(food.select_dtypes(include=np.number).columns)[1:-3]].median()

Unnamed: 0_level_0,number_of_people_they_follow,number_of_user_tweets,retweet_count,user_followers_count
Company,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
@ChipotleTweets,332.0,4071.5,0,292.0
@CrackerBarrel,397.5,4012.0,0,272.0
@DennysDiner,261.0,2919.0,0,190.0
@McDonalds,305.0,2787.5,0,219.0
@Potbelly,392.5,4565.5,0,330.0
@Starbucks,359.0,5051.0,0,343.5
@Wendys,259.0,1191.0,0,146.0
@dominos,313.0,3389.0,0,246.0
@dunkindonuts,403.0,5686.0,0,333.0
@redrobinburgers,395.0,3412.0,0,202.0


In [27]:
#words that aren't caught that don't add value to function below
drop_words = ['love','don t', 'twitter', 'don', 'tweets', 'opinions', 'your',
             'born', 'never']

In [21]:
def Profile_Noun_Finder(text):
    """Takes text, eliminates non-alpha characters, and returns all nouns"""
    text = re.sub(r'(https)[^\s]+', '', text)
    text = re.sub(r'[^a-zA-Z]+', ' ', text)
    tokenizer = RegexpTokenizer(r'\w+')
    words = tokenizer.tokenize(text)
    lowered = [word.lower() for word in words]
    words = [word for word in lowered if len(word) > 2]
    words = [word for word in words if word not in drop_words]
    new_text = ' '.join(words)
    wiki = TextBlob(new_text)
    return list(wiki.noun_phrases)

## Positive Tweets

In [88]:
#getting rid of empty profiles
without_retweets = without_retweets.dropna(subset=['user_profile_text'])

#running function on profiles
without_retweets['nouns'] = without_retweets['user_profile_text'].apply(Profile_Noun_Finder)

#subsetting for positive reviews
positive_profiles = without_retweets.loc[without_retweets['sentiment_dummies'] == 1]

#flattening the list
flattened_nouns = [word for parts in positive_profiles['nouns'] for word in parts]

#looking at the 50 most common noun phrases
Counter(flattened_nouns).most_common(50)

In [94]:
#Creating word counts based on np_counts feature of text_blob
texters = ' '.join(positive_profiles['user_profile_text'])
dic_values = TextBlob(texters).np_counts
favorible_words = pd.DataFrame()
favorible_words['words'] = dic_values.keys()
favorible_words['count'] = dic_values.values()
favorible_words.sort_values(by='count', ascending=False)

## Negative Tweets

In [109]:
#subsetting for positive reviews
negative_profiles = without_retweets.loc[without_retweets['sentiment_dummies'] == -1]

#flattening the list
flattened_nouns = [word for parts in negative_profiles['nouns'] for word in parts]

#looking at the 50 most common noun phrases
Counter(flattened_nouns).most_common(50)

[('youtube channel', 7),
 ('social media', 4),
 ('new youtube', 3),
 ('personal account', 3),
 ('full time', 3),
 ('free time', 3),
 ('wife mom', 3),
 ('mhs tfa', 3),
 ('video games', 2),
 ('misanthropic attention whore talk', 2),
 ('music vinylpizza podcast writer', 2),
 ('link bio', 2),
 ('black man', 2),
 ('new things life', 2),
 ('chance waste', 2),
 ('everyday life', 2),
 ('proud mother', 2),
 ('parody account', 2),
 ('new products', 2),
 ('nintendo switch content', 2),
 ('animal lover', 2),
 ('gmail com', 2),
 ('proud father', 2),
 ('cool things', 2),
 ('news junkie animal lover', 2),
 ('culture addict theater', 2),
 ('news senior producer newhousesu grad retweets endorsements', 2),
 ('san diego', 2),
 ('stupid people', 2),
 ('union news press blogs jobs', 2),
 ('simple girl', 2),
 ('gon na', 2),
 ('social justice', 2),
 ('husband father', 2),
 ('wonderful children', 2),
 ('irony creator good music', 2),
 ('bad music keeper', 2),
 ('secret potato burrito sauce', 2),
 ('producer b

In [110]:
#Creating word counts based on np_counts feature of text_blob
texters = ' '.join(negative_profiles['user_profile_text'])
dic_values = TextBlob(texters).np_counts
favorible_words = pd.DataFrame()
favorible_words['words'] = dic_values.keys()
favorible_words['count'] = dic_values.values()
favorible_words.sort_values(by='count', ascending=False)

Unnamed: 0,words,count
29,love,37
149,twitter,27
167,ig,25
572,wife,24
176,god,23
772,instagram,21
538,husband,21
828,’ s,20
484,father,19
397,snapchat,18


# Dunkin vs. Starbucks

In [16]:
dunkin = food.loc[food['Company'] == '@dunkindonuts'] 
dunkin['hour'] = dunkin['time_tweeted'].dt.hour

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


In [17]:
dunkin.groupby('hour')['sentiment_score'].mean()

hour
0.0     0.055933
1.0     0.082589
2.0     0.165090
3.0     0.155026
4.0     0.083048
5.0     0.168492
6.0     0.096741
7.0     0.130059
8.0     0.068469
9.0     0.074440
10.0    0.066149
11.0    0.080584
12.0    0.062593
13.0    0.083717
14.0    0.072379
15.0    0.093971
16.0    0.116218
17.0    0.124282
18.0    0.110968
19.0    0.072887
20.0    0.037402
21.0    0.128900
22.0    0.096105
23.0    0.081113
Name: sentiment_score, dtype: float64

In [19]:
positive_dunkin = dunkin.loc[dunkin['sentiment_dummies'] == 1]

In [24]:
dunkin['user_profile_text'] = dunkin['user_profile_text'].fillna(' ')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


In [32]:
dunkin['nouns'] = dunkin['user_profile_text'].apply(Profile_Noun_Finder)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


In [48]:
positive_dunkin = dunkin.loc[dunkin['sentiment_dummies'] == 1]
negative_dunkin = dunkin.loc[dunkin['sentiment_dummies'] == -1]

In [45]:
nouns = []
for val in positive_dunkin['nouns']:
    if val:
        words = val[0]
        words = words.split()
        for word in words:
            nouns.append(word)

In [46]:
most_positive_dunk_words = Counter(nouns).most_common(50)

In [49]:
nouns = []
for val in negative_dunkin['nouns']:
    if val:
        words = val[0]
        words = words.split()
        for word in words:
            nouns.append(word)

In [51]:
negative_dunk_words = Counter(nouns).most_common(50)

In [52]:
dunkin

Unnamed: 0,Company,favorite_count,number_of_people_they_follow,number_of_user_tweets,retweet_count,text,time_tweeted,unique_code,user_coordinates,user_followers_count,user_is_verified,user_location,user_name,user_profile_text,text_sentiment,sentiment_score,sentiment_dummies,hour,nouns
5568,@dunkindonuts,0.0,1522.0,3695.0,0,@dunkindonuts #DDLuckyDozen and #Sweepstakes I...,2018-03-13 15:12:18,973637894563758080,,220.0,0,WEST,JOAN DUCAR,,ddluckydozen and sweepstakes i will be spendi...,0.100000,1,15.0,[]
5569,@dunkindonuts,0.0,151.0,6780.0,0,@dunkindonuts drive-thru\nMe: “I’d like a cara...,2018-03-13 15:12:09,973637860791410690,,475.0,0,,JP,"Cast your burden on the Lord, & He will sustai...",drive thru me i d like a caramel iced coffee ...,0.000000,0,15.0,"[cast burden, airports mountains sweaters sock..."
5570,@dunkindonuts,0.0,842.0,19740.0,1,RT @Tessa_Roy: .@GovRaimondo arrives for storm...,2018-03-13 15:12:09,973637858278944768,,10425.0,1,"Providence, RI",NewsTalk 99.7 & AM 630 WPRO,The Voice of Southern New England,tessa_roy govraimondo arrives for storm press ...,0.000000,0,15.0,"[voice southern, new england]"
5571,@dunkindonuts,0.0,601.0,3347.0,1,.@GovRaimondo arrives for storm press conferen...,2018-03-13 15:11:53,973637790683553793,,695.0,0,"Providence, RI",Tessa Roy,"Reporter @wpro, alum @EmersonCollege. Snarky o...",govraimondo arrives for storm press conference...,0.000000,0,15.0,"[reporter wpro alum emersoncollege snarky, own..."
5572,@dunkindonuts,0.0,327.0,14179.0,17,RT @dunkindonuts: Standard. #Snowstorm https:/...,2018-03-13 15:11:39,973637731216773120,,1516.0,0,"NY, NJ",Molly R. Blackwell,"New England raised me, NY is shaping me. ...",standard snowstorm,0.000000,0,15.0,"[new england, cape cod nyc, full picture]"
5573,@dunkindonuts,0.0,840.0,25660.0,17,RT @dunkindonuts: Standard. #Snowstorm https:/...,2018-03-13 15:10:48,973637519186292736,,1058.0,0,Boston,Kevin,I'm just me. There is nothing to it. #Leadersh...,standard snowstorm,0.000000,0,15.0,[leadership horror movie watcher cookie lover]
5574,@dunkindonuts,0.0,287.0,6996.0,290,RT @dunkindonuts: TRUTH https://t.co/0AgykRoIgb,2018-03-13 15:09:06,973637090771685377,,293.0,0,,Melissa Reabe,I like to use the word 'dude' as a noun or an ...,truth,0.000000,0,15.0,[word dude noun adverb adjective]
5575,@dunkindonuts,0.0,656.0,17290.0,0,@dunkindonuts #DDLuckyDozen #Sweepstakes A gre...,2018-03-13 15:08:47,973637013206458368,,169.0,0,,Jane Y.,Have a wonderful day!,ddluckydozen sweepstakes a great combo dd and...,0.800000,1,15.0,[wonderful day]
5576,@dunkindonuts,0.0,67.0,1728.0,0,@dunkindonuts I spend to much money on u,2018-03-13 15:08:21,973636902573281280,,55.0,0,,Allison :),just a girl supporting her ppl. I have inactiv...,i spend to much money on u,0.200000,1,15.0,[inactive followers lol]
5577,@dunkindonuts,0.0,247.0,313.0,290,RT @dunkindonuts: TRUTH https://t.co/0AgykRoIgb,2018-03-13 15:08:07,973636842020069376,,37.0,0,"Rhode Island, USA",Anna Sroczynski,fun fact: will never hit the lotto! ........... 🦋,truth,0.000000,0,15.0,[fun fact]
