# Sentiment Analysis and Consumer Profiling

This script includes cleaning, EDA, feature creation, and some preliminary analysis. 

In [1]:
import pandas as pd
import numpy as np 
from nltk.tokenize import RegexpTokenizer
from nltk.stem import PorterStemmer
from nltk.corpus import stopwords
import nltk
import re
import matplotlib.pyplot as plt
%matplotlib inline
import matplotlib 
import matplotlib.patches as mpatches
import seaborn as sns
from textblob import TextBlob
from collections import Counter
from tqdm import tqdm

In [2]:
food = pd.read_csv('./fastfood.csv', dtype=object, index_col=0)

In [3]:
food.shape

(152378, 14)

In [4]:
#dropping malformed data (invalid index)
food.drop(list(food.loc[food['unique_code'] == 'Nobody should be too big to fail...'].index),
          axis=0, inplace=True)

In [5]:
#dropping duplicate entries
food = food.drop_duplicates(subset='unique_code')

In [6]:
#eliminated more than 50% of observations
food.shape

(66504, 14)

In [7]:
food.reset_index(inplace=True, drop=True)

In [8]:
#dropping all rows with all null values
food = food.drop(food[food.isnull().all(axis=1)].index[0], axis=0).reset_index(drop=True)

In [9]:
#Dropping all values that are not company related (only 3 observations)
to_drop = []

for row_num, val in enumerate(food['Company']):
    if val[0] != '@':
        to_drop.append(row_num)
        
food = food.drop(to_drop, axis=0).reset_index(drop=True)

In [10]:
#replacing strings with integers
mapper = {'True': 1, 'False': 0}
food['user_is_verified'] = food.user_is_verified.map(mapper)

In [11]:
#filling nulls and converting data types
food['retweet_count'] = food.retweet_count.astype(int)

In [12]:
#Set to run Midnight and 5pm EST everyday, the times are in UTC, making EST
food['time_tweeted'] = pd.to_datetime(food['time_tweeted']) - pd.Timedelta(hours=4)

In [13]:
food['favorite_count'] = food.favorite_count.astype(int)
food['number_of_people_they_follow'] = food.number_of_people_they_follow.astype(int)
food['number_of_user_tweets'] = food.number_of_user_tweets.astype(int)
food['user_followers_count'] = food.user_followers_count.astype(int)

In [14]:
#creating a basic name category that isnt the handle

mapper = {'@DennysDiner': 'Dennys', '@ChipotleTweets': 'Chipotle',
         '@McDonalds': 'McDonalds', '@Wendys': 'Wendys', '@Starbucks':'Starbucks',
         '@dunkindonuts':'Dunkin_Donuts', '@dominos': 'Dominos', '@shakeshack': 'Shake_Shack',
         '@sonicdrivein': 'Sonic', '@wingstop': 'Wingstop', '@CrackerBarrel': 'Cracker_Barrel', 
         '@redrobinburgers': 'Red_Robin', '@Potbelly': 'Potbelly'}

food['name'] = food.Company.map(mapper)

Examined multiple methods of cleaning for sentiment data, only the final method runs now, but 
have kept my trial methods below (hashed out)

In [15]:
#shouldnt be removing stopwords before sentiment analysis:
#http://www.lrec-conf.org/proceedings/lrec2014/pdf/292_Paper.pdf
#testing different functions for preprocessing text for sentiment analysis


def Text_Cleaner(text, tokens=False):
    """Takes text, eliminates URLS, replaces contractions, tokenizes, 
    removes company names, lower cases, removes calls to twitter handles, 
    returns a string, same as version 1, but only looking at words"""
    text = re.sub(r'(https)[^\s]+', '', text)
    text = re.sub(r'can\'t', 'can not', text, flags=re.IGNORECASE)
    text = re.sub(r'don\'t', 'do not', text, flags=re.IGNORECASE)
    text = re.sub(r'isn\'t', 'is not', text, flags=re.IGNORECASE)
    text = re.sub(r'aren\'t', 'are not', text, flags=re.IGNORECASE)
    text = re.sub(r'wasn\'t', 'was not', text, flags=re.IGNORECASE)
    text = re.sub(r'weren\'t', 'were not', text, flags=re.IGNORECASE)
    text = re.sub(r'haven\'t', 'have not', text, flags=re.IGNORECASE)
    text = re.sub(r'\b(rt|RT)', '', text)
    text = re.sub(r'@[a-zA-Z0-9]+', '', text)
    text = re.sub('#', '', text)
    text = re.sub(r'(wtf)+\b', 'what the fuck', text, flags=re.IGNORECASE)
    tokenizer = RegexpTokenizer(r'\w+')
    words = tokenizer.tokenize(text)
    if tokens:
        words = [word.lower() for word in words]
        return words
    return ' '.join(words)

# def Text_Cleaner_version_1(text):
#     """Takes text, eliminates URLS, replaces contractions, tokenizes, 
#     removes company names, lower cases, removes calls to twitter handles, 
#     returns a string"""
#     text = re.sub(r'(https)[^\s]+', '', text)
#     text = re.sub(r'\b(rt|RT)', '', text)
#     text = re.sub(r'@[a-zA-Z0-9]+', '', text)
#     text = re.sub('#', '', text)
#     return text

#     LOOKING AT VARIOUS METHODS FOR PREPROCESSING FOR SENTIMENT ANALYSIS
#     tokenizer = RegexpTokenizer(r'\w+')
#     words = tokenizer.tokenize(text)
#     lower = [x.lower() for x in words]
#     words = [word for word in words if word != 'rt']
#     eliminator = [re.sub(r'(mcdon|dunki|denn|redro|sonic|starb|shakesh|domino|crackerb|chipot|wend)[a-z]+','',x)
#                   for x in lower]
#     return ' '.join(eliminator2)

In [16]:
# #Creating a test set of uncleaned data to check the value of the Text_Cleaner functions
# food['text_sentiment_no_clean'] = food['text'].apply(lambda x: TextBlob(x).sentiment.polarity)

In [17]:
# #converting text with version 1
# food['text_sentiment_v1'] = food['text'].apply(Text_Cleaner_version_1)
# #Calculating sentiment with TextBlob
# food['sentiment_score_v1'] = food['text_sentiment_v1'].apply(lambda x: TextBlob(x).sentiment.polarity)

In [18]:
#converting text with version 2
food['text_sentiment'] = food['text'].apply(Text_Cleaner)
#Calculating sentiment with TextBlob
food['sentiment_score'] = food['text_sentiment'].apply(lambda x: TextBlob(x).sentiment.polarity)

In [19]:
#Looking at overall sentiment by company (version 1) (0 neutral, 1 positive, -1 negative)
food.groupby('Company')['sentiment_score'].mean().sort_values()

Company
@dominos            0.065029
@dunkindonuts       0.094908
@wingstop           0.097293
@sonicdrivein       0.101714
@McDonalds          0.106599
@Wendys             0.108249
@DennysDiner        0.113954
@redrobinburgers    0.121663
@ChipotleTweets     0.147069
@CrackerBarrel      0.153147
@Potbelly           0.162750
@Starbucks          0.173199
@shakeshack         0.184626
Name: sentiment_score, dtype: float64

In [20]:
#creating sentiment dummy variables 

def dummy_maker(val):
    """Takes in a float and returns a dummy based on the value
    to be used in pandas.apply"""
    if val == 0:
        return 0
    elif val > 0:
        return 1
    else:
        return -1

food['sentiment_dummies'] = food['sentiment_score'].apply(dummy_maker)

# food['sentiment_dummies_v2'] = food['sentiment_score_v2'].apply(dummy_maker)

# food['sentiment_dummies_uncleaned'] = food['text_sentiment_no_clean'].apply(dummy_maker)

In [21]:
# print(food['sentiment_dummies_v1'].value_counts()) 
# print(food['sentiment_dummies_v2'].value_counts())
print(food['sentiment_dummies'].value_counts())

 1    28738
 0    27902
-1     9860
Name: sentiment_dummies, dtype: int64


In [22]:
#manually testing reliability of the different measures with a random subset
random_numbers = list(np.random.randint(0, 62415, 5))
for num, val in enumerate(food.iloc[random_numbers, :]['text']):
    print(val)
    print('\n')
    print(food.iloc[[num], :]['sentiment_dummies'].values)

i want choco butternut 😋 @dunkindonuts 🍩


[1]
RT @ChipotleTweets: Yes, guac is extra:
E-nticing
X-quisite
T-empting
R-eally yummy
A-lways a good idea https://t.co/KDbNHCDtG8


[0]
There is nothing like @ChipotleTweets on a rainy day 👌


[-1]
@redrobinburgers Look, I am not sure who your target consumer *is but the vocal fry of your current ad makes your m… https://t.co/CosaqTbwtc


[0]
RT @DennysDiner: pepper is just emo salt


[1]


In [23]:
#function to quickly separate positive/negative tweets by company

def negativity_formatter(company, hourly_rate=False):
    """This function accepts a company and returns either 
    the separate company, positive, negative dataframes (in that order)
    OR the same order plus a dataframe of hourly rates, if hourly_rate = True"""
    df = food.loc[food['name'] == company]
    positive_df = df.loc[df['sentiment_dummies'] == 1]
    negative_df = df.loc[df['sentiment_dummies'] == -1]
    if not hourly_rate:
        return df, positive_df, negative_df
    else:
        rate_df = pd.DataFrame()
        rate_df['pos_count'] = positive_df.groupby(positive_df['time_tweeted'].dt.hour)['Company'].count()
        rate_df['neg_count'] = negative_df.groupby(negative_df['time_tweeted'].dt.hour)['Company'].count()
        rate_df['rate'] = rate_df['neg_count'] / (rate_df['pos_count'] + rate_df['neg_count'])
        return rate_df
        

In [24]:
#43% of tweets are retweets
food.loc[food['text'].str[:2] == 'RT'].shape[0] / food.shape[0]

0.43109774436090226

In [25]:
#Creating a dummy for whether the tweet is a retweet or not
retweets = []
for val in food['text']:
    if val[:2] == 'RT':
        retweets.append(1)
    else: 
        retweets.append(0)
food['is_a_retweet'] = retweets

In [26]:
#getting rid of tweets without a user 
food = food.drop(list(food.loc[(food['user_name'].isnull())].index), axis=0)

In [27]:
#dropping regional affiliates
associated_comps = []
names = []
for num, name in enumerate(food['user_name']):
    test = re.findall(r'\A(mcdon|dunki|redro|starbuc|shakesh|domino|crackerb)[a-z]+'
                      , name, flags=re.IGNORECASE)
    if test:
        if name not in ['Dunkin Fails', 'Dunkin Kitti', "McDonald's employee"]:
            associated_comps.append(num)
            names.append(name)
            
food = food.drop(associated_comps, axis=0)

In [28]:
#favorite count is heavilly skewed, but keep for now
food['favorite_count'].describe()

count    66441.000000
mean         0.508662
std          5.554575
min          0.000000
25%          0.000000
50%          0.000000
75%          0.000000
max        857.000000
Name: favorite_count, dtype: float64

## Beginning to look at user profiles by company

In [29]:
#customer profiles still skewed by large means- looking at medians & means
#creating a subset to examine the numeric characteristics of each company's customers
individual_users = food.drop_duplicates(subset='user_name')

customer_numeric_df = individual_users.groupby('name').agg({'favorite_count': ['mean', 'max'], 
                                      'number_of_people_they_follow': ['median', 'mean'],
                                      'number_of_user_tweets': ['median', 'mean'],
                                      'retweet_count': ['median', 'mean'],
                                      'user_followers_count': ['median', 'mean'],
                                       'user_is_verified': ['mean'],
                                      'sentiment_score': ['mean'], 
                                      'is_a_retweet': 'mean',
                                      'Company': 'count'})

customer_numeric_df.columns = [' '.join(col).strip() for col in customer_numeric_df.columns.values]

In [30]:
#Not enough observations for Potbelly (684), RedRobin (1463), dropping
food = food.drop(list(food.loc[(food['Company']== '@redrobinburgers')|
                   (food['Company']== '@Potbelly')].index), axis=0)
food = food.reset_index(drop=True)

In [31]:
#MISSING VALUES FOR CERTAIN COMPANIES AT CERTAIN HOURS- hourly is flawed
# negativity_by_comp = pd.DataFrame()
# for val in food.name.unique().tolist():
#     if val != 'McDonalds':
#         values = list(negativity_formatter(val, hourly_rate=True)['rate'].values)
#         negativity_by_comp[val] = values

In [32]:
food['day_date'] = food['time_tweeted'].dt.day
food['weekday'] = food['time_tweeted'].dt.weekday

## Beginning look at stock movements and sentiment

In [33]:
#specifying trading days with sufficient information range
stock_analysis = food.loc[(food['day_date'] > 11) & (food['day_date'] < 31)]

In [34]:
stock_analysis.shape

(63463, 21)

In [35]:
stocks = pd.read_csv('./twitter_stocks.csv')

In [36]:
stocks.drop(['High', 'Low', 'Adj Close'], axis=1, inplace=True)

In [37]:
stocks['change'] = stocks['Close'] - stocks['Open']

In [38]:
#creating datetime, and locating common trading dates
stocks['Date'] = pd.to_datetime(stocks.Date)
stocks['day'] = stocks['Date'].dt.day

In [39]:
stocks['day'] = stocks['Date'].dt.day

In [40]:
stocks = stocks.loc[(stocks['day'] > 11) & (stocks['day'] < 31)]

In [41]:
#making sure values match before merging
stocks['Name'] = stocks.Name.str.replace('Shack Shack', 'Shake_Shack')
stocks['Name'] = stocks.Name.str.replace('Cracker Barrel', 'Cracker_Barrel')
stocks['Name'] = stocks.Name.str.replace('Dunkin Donuts', 'Dunkin_Donuts')

In [42]:
#grouping sentiments to merge
stock_analysis = stock_analysis.groupby(['day_date', 'name'], as_index=False).agg(
                                                    {'sentiment_score':'mean',
                                                       'Company':'count'})

stock_analysis.columns =  ['day_date', 'name', 'sentiment_score', 'num_observations']

In [43]:
merged_stock = pd.merge(stock_analysis, stocks, left_on=['day_date', 'name'], right_on=['day', 'Name'])

In [44]:
#dropping redundant columns 
merged_stock.drop(['day', 'Name'], axis=1, inplace=True)

In [45]:
merged_stock = merged_stock.sort_values(['name', 'day_date'])

In [46]:
#correlations between sentiment_score and various other indicators (volume/change/market_cap)
merged_stock.corr()['sentiment_score']

day_date           -0.094365
sentiment_score     1.000000
num_observations   -0.013644
Open               -0.014959
Close              -0.012543
Volume              0.145016
Market Cap          0.089305
change              0.102586
Name: sentiment_score, dtype: float64

## Looking at Profiles of negative/positive sentiments

In [47]:
#set of english vocabulary
english_vocab = set(w.lower() for w in nltk.corpus.words.words())

In [48]:
def lang_composition(company):
    """Takes a company name and returns the total number of words present, 
    the number of unique words present, and the number of those words that are in the English
    language"""
    all_tweets, positive, negative = negativity_formatter(company)
    all_text = ' '
    pos_text = ' '
    neg_text = ' '
    for val in all_tweets.text:
        if val[:2] != 'RT':
            all_text = all_text + ' ' + val
    for val in positive.text:
        if val[:2] != 'RT':
            pos_text = pos_text + ' ' + val
    for val in negative.text:
        if val[:2] != 'RT':
            neg_text = neg_text + ' ' + val
    all_words = Text_Cleaner(all_text, tokens = True)
    pos_words = Text_Cleaner(pos_text, tokens = True)
    neg_words = Text_Cleaner(neg_text, tokens = True)
    info = [company]
    info.extend([len(all_words), len(pos_words), len(neg_words)])
    info.extend([len(set(all_words)), len(set(pos_words)), len(set(neg_words))])
    all_clean_words = [word for word in set(all_words) if word in english_vocab]
    pos_clean_words = [word for word in set(pos_words) if word in english_vocab]
    neg_clean_words = [word for word in set(neg_words) if word in english_vocab]
    info.extend([len(all_clean_words), len(pos_clean_words), len(neg_clean_words)])
    return info

In [49]:
#getting language composition for all companies
all_comps = []
for name in list(food.name.unique()):
    all_comps.append(lang_composition(name))
    
lang_comp_df = pd.DataFrame(all_comps)

In [50]:
#renaming all columns
lang_comp_df.columns = ['name', 'all_words','pos_all_words', 'neg_all_words', 'unique_words', 'pos_unique_words', 'neg_unique_words', 
                        'english_words', 'pos_english_words', 'neg_english_words']

In [51]:
#creating percentages for analysis
lang_comp_df['percent_unique'] = lang_comp_df['unique_words'] / lang_comp_df['all_words'] * 100
lang_comp_df['percent_english'] = lang_comp_df['english_words'] / lang_comp_df['unique_words'] * 100
lang_comp_df['pos_percent_unique'] = lang_comp_df['pos_unique_words'] / lang_comp_df['pos_all_words'] * 100
lang_comp_df['pos_percent_english'] = lang_comp_df['pos_english_words'] / lang_comp_df['pos_unique_words'] * 100
lang_comp_df['neg_percent_unique'] = lang_comp_df['neg_unique_words'] / lang_comp_df['neg_all_words'] * 100
lang_comp_df['neg_percent_english'] = lang_comp_df['neg_english_words'] / lang_comp_df['neg_unique_words'] * 100

In [52]:
#Merging sentiments 
lang_comp_df = pd.merge(lang_comp_df, food.groupby('name', as_index=False)['sentiment_score'].mean(), 
       on='name')

#Merging only negative sentiments 
food_neg = food.loc[food['sentiment_dummies'] == -1].groupby('name',as_index=False)['sentiment_score'].mean()
food_neg.columns = ['name', 'neg_sentiment']
lang_comp_df = pd.merge(lang_comp_df, food_neg, on='name')

#Merging only positive sentiments
food_pos = food.loc[food['sentiment_dummies'] == 1].groupby('name',as_index=False)['sentiment_score'].mean()
food_neg.columns = ['name', 'pos_sentiment']
lang_comp_df = pd.merge(lang_comp_df, food_pos, on='name')

In [53]:
#strongest correlations with overall sentiment- percent unique (.31), percent English (0.29)
lang_comp_df.corr()['sentiment_score_x']

all_words             -0.453531
pos_all_words         -0.344574
neg_all_words         -0.553493
unique_words          -0.280634
pos_unique_words      -0.199107
neg_unique_words      -0.434417
english_words         -0.243947
pos_english_words     -0.180855
neg_english_words     -0.383851
percent_unique         0.311866
percent_english        0.298892
pos_percent_unique     0.193846
pos_percent_english    0.153271
neg_percent_unique     0.532893
neg_percent_english    0.581626
sentiment_score_x      1.000000
neg_sentiment          0.536657
sentiment_score_y      0.512859
Name: sentiment_score_x, dtype: float64

## Looking at company tweets

In [54]:
company_tweets = pd.read_csv('./company_tweets.csv', index_col=0)

In [55]:
company_tweets.shape

(10920, 11)

In [56]:
company_tweets['time_tweeted'] = pd.to_datetime(company_tweets['time_tweeted'])

In [57]:
#there are no duplicates
company_tweets.unique_code.drop_duplicates(inplace=True)

In [58]:
#dropping potbelly & redrobin
company_tweets.drop(list(company_tweets.loc[(company_tweets['name'] == '@redrobinburgers')
                  |(company_tweets['name'] == '@Potbelly')].index), axis=0,
                   inplace=True)

In [59]:
#getting normally spelled names
mapper = {'@DennysDiner': 'Dennys', '@ChipotleTweets': 'Chipotle',
         '@McDonalds': 'McDonalds', '@Wendys': 'Wendys', '@Starbucks':'Starbucks',
         '@dunkindonuts':'Dunkin_Donuts', '@dominos': 'Dominos', '@shakeshack': 'Shake_Shack',
         '@sonicdrivein': 'Sonic', '@wingstop': 'Wingstop', '@CrackerBarrel': 'Cracker_Barrel', 
         '@redrobinburgers': 'Red_Robin', '@Potbelly': 'Potbelly'}

company_tweets['Company'] = company_tweets.name.map(mapper)

In [60]:
food.time_tweeted.max()

Timestamp('2018-03-30 09:48:49')

In [61]:
company_tweets.reset_index(drop=True, inplace=True)

In [62]:
#dropping all company tweets that occurred after consumer tweet collection stopped
company_tweets.drop(list(
    company_tweets.loc[company_tweets['time_tweeted'] > food.time_tweeted.max()].index), axis=0, inplace=True)

In [63]:
print(company_tweets.shape)
company_tweets = company_tweets.reset_index(drop=True)

(8080, 12)


In [64]:
#dropping values that are before consumer observations
company_tweets.drop(list(company_tweets.loc[company_tweets['time_tweeted'] < 
                                       food.time_tweeted.min()].index), inplace=True)

In [65]:
company_tweets = company_tweets.reset_index(drop=True)

In [66]:
#Getting the hashtags for each company
hashtags = {}
callnames = {}
for company in tqdm(list(company_tweets.Company.unique())): 
    df = company_tweets.loc[company_tweets['Company'] == company]
    hash_base = []
    call_base = []
    for val in df['text']:
        hashes = re.findall(r'#[A-Za-z0-9]+\b', val)
        calls = re.findall(r'@[A-Za-z0-9]+\b', val)
        hash_base.extend(hashes)
        call_base.extend(calls)
    hashtags[company] = hash_base
    callnames[company] = call_base

100%|██████████| 11/11 [00:00<00:00, 157.47it/s]


In [67]:
tokenizer = RegexpTokenizer(r'#[A-Za-z0-9]+\b')
#Only interested in customers independently using the hashtags
hash_check = food.loc[food['is_a_retweet'] == 0]
hash_check['text'] = hash_check.text.apply(lambda x: tokenizer.tokenize(x))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  after removing the cwd from sys.path.


In [68]:
#Identifying customer use of company hashtags

company_counts = {}
for company in list(company_tweets.Company.unique()):
    hash_comp = hash_check.loc[hash_check['name'] == company]
    hashes = {}
    for val in hash_comp['text']:
        for word in val:
            if word in hashtags[company]:
                try: 
                    hashes[word] += 1
                except:
                    hashes[word] = 1
    company_counts[company] = hashes

In [69]:
#putting statistics on hashtags/use into new company profile dataframe
new_df = pd.DataFrame(company_counts).T.sum(axis=1).reset_index()
new_df.columns = ['name', 'customer_hash_use']
new_df['customer_unique_hashes'] = pd.DataFrame(company_counts).T.count(axis=1).values

In [70]:
#examining company specific twitter behavior
comp_hash_uses = []
comp_unique_hashes = []
comp_handle_uses = []
comp_unique_handle_uses = []
for company in new_df.name.tolist():
    comp_hash_uses.append(len(hashtags[company]))
    comp_unique_hashes.append(len(set(hashtags[company])))
    comp_handle_uses.append(len(callnames[company]))
    comp_unique_handle_uses.append(len(set(callnames[company])))

new_df['comp_hash_uses'] = comp_hash_uses
new_df['comp_unique_hashes'] = comp_unique_hashes
new_df['comp_handle_uses'] = comp_handle_uses
new_df['comp_unique_handle_uses'] = comp_unique_handle_uses

In [71]:
#grouping company tweet info
comp_grouped = company_tweets.groupby('Company', as_index=False).mean().drop(
                            ['is_a_retweet', 'is_quote_status', 'unique_code'], axis=1)

In [72]:
comp_merged = pd.merge(comp_grouped, new_df, left_on='Company', right_on='name')
comp_merged.drop('name', axis=1, inplace=True)

In [73]:
comp_merged['market_cap'] = merged_stock.groupby('name')['Market Cap'].mean().values

In [74]:
comp_merged['sentiment'] = food.groupby('name')['sentiment_score'].mean().values

In [75]:
print('Sentiment Correlations')
comp_merged.corr()['sentiment'].sort_values()[:-1]

Sentiment Correlations


comp_unique_handle_uses   -0.154493
retweet_count             -0.072436
number_of_tweets_total    -0.070891
comp_hash_uses            -0.041042
customer_unique_hashes     0.046609
comp_unique_hashes         0.091145
market_cap                 0.110416
comp_handle_uses           0.251164
customer_hash_use          0.303470
favorite_count             0.311198
company_followers_count    0.333409
followers_count            0.333409
Name: sentiment, dtype: float64

## Day of week optimizer

In [130]:
#ensuring that every weekday has equal representation (Friday will be slightly short-changed (by ~3 hours)
day_week = food.loc[(food['time_tweeted'] > '2018-03-09')]

In [131]:
#converting day nums to names, grouping by company and day of week and aggregating. 
mapper = {0:'Monday', 1:'Tuesday', 2:'Wednesday', 3:'Thursday', 4:'Friday', 5:'Saturday', 6:'Sunday'}
day_week = day_week.groupby(['name', 'weekday'], as_index=False).agg({'sentiment_score':'mean', 'user_name': 'count',
                                                          'retweet_count':'mean', 'favorite_count':'mean'})

day_week['weekday'] = day_week['weekday'].map(mapper)

In [132]:
#including average sentiment for visualization
merge = day_week.groupby('weekday', as_index=False)['sentiment_score'].mean()
merge.columns = ['weekday', 'avg_sentiment']
day_week = pd.merge(day_week, merge, on='weekday')

In [135]:
print('number of tweets by day')
day_week.groupby('weekday')['user_name'].sum().sort_values()

number of tweets by day


weekday
Sunday        4345
Saturday      7259
Monday        7549
Wednesday     9024
Friday        9840
Thursday     12736
Tuesday      13541
Name: user_name, dtype: int64

In [136]:
print('average sentiment by day')
day_week.groupby('weekday')['sentiment_score'].mean().sort_values()

average sentiment by day


weekday
Saturday     0.089578
Sunday       0.100064
Friday       0.116223
Thursday     0.116878
Monday       0.125332
Wednesday    0.131134
Tuesday      0.156336
Name: sentiment_score, dtype: float64

## Beginning profile analysis

In [228]:
profiles = food.dropna(subset=['user_profile_text']).reset_index(drop=True)

In [229]:
profiles.shape

(54646, 21)

In [230]:
#only interested in unique profiles
profiles = profiles.drop_duplicates(subset='user_name').reset_index(drop=True)

In [231]:
profiles.shape

(40482, 21)

In [232]:
#dropping columns I don't need for this
profiles.drop(['user_coordinates', 'unique_code', 'day_date', 'text_sentiment'], axis=1, inplace=True)

In [233]:
#adding an hour dummy
profiles['hour'] = profiles.time_tweeted.dt.hour

In [234]:
#getting rid of columns I don't need anymore
profiles = profiles.drop(['time_tweeted', 'Company'], axis=1)

In [235]:
#Maybe the number of hashtags a person uses is indicative of their personality/disposition towards the product
def num_hashes(text):
    """Counts the number of hashtags in the profile"""
    text = text.split()
    num_hashes = 0
    for word in text:
        if word[0] == '#':
            num_hashes = num_hashes + 1
    return num_hashes

profiles['number_of_hashes'] = profiles['user_profile_text'].apply(num_hashes)

In [236]:
#8% of users use a hashtag in their profile
np.sum(profiles.number_of_hashes > 1) / profiles.shape[0]

0.08248110271231658

In [237]:
#adding a dummy variable if the user lists a location, possibly indicator
profiles['lists_location'] = profiles.user_location.isnull().astype(int)

#unreliable for now to include
profiles = profiles.drop('user_location', axis=1)

In [238]:
def cleaning_additional(text):
    """standardizing contractions, removing urls, removing retweet indicators, removing handles"""
    text = re.sub(r'https[\S]+', ' ', text)
    text = re.sub(r'can\'t', 'can not', text, flags=re.IGNORECASE)
    text = re.sub(r'don\'t', 'do not', text, flags=re.IGNORECASE)
    text = re.sub(r'isn\'t', 'is not', text, flags=re.IGNORECASE)
    text = re.sub(r'aren\'t', 'are not', text, flags=re.IGNORECASE)
    text = re.sub(r'wasn\'t', 'was not', text, flags=re.IGNORECASE)
    text = re.sub(r'weren\'t', 'were not', text, flags=re.IGNORECASE)
    text = re.sub(r'haven\'t', 'have not', text, flags=re.IGNORECASE)
    text = re.sub(r'\b(rt|RT)', ' ', text)
    text = re.sub(r'@[\S]+', ' ', text)
    text = re.sub(r'[0-9]*[a-zA-Z]+[0-9]+[a-zA-Z]*[0-9]*[a-zA-Z]*', ' ', text)
    return text

profiles['cleaned'] = profiles.user_profile_text.apply(cleaning_additional)

In [239]:
def punctuation_cleaner(text):
    """Removes punctuation and places spaces"""
    text = re.sub(r'\\n', ' ', text)      
    text = re.sub(r'[!|?|.|,|(|)|||[|]|/|\\|-]', ' ', text)
    return text

profiles['cleaned'] = profiles.cleaned.apply(punctuation_cleaner)

In [240]:
def split_hashes(text):
    """Turns camel case hashtags into separate readable words"""
    tokenizer = RegexpTokenizer(r'\w+')
    words = tokenizer.tokenize(text)
    split2 = []
    for val in words:
        if val[0] == '#':
            words = ' '.join(re.findall('[A-Z][^A-Z]*', val))
            split2.append(words)
        else:
            split2.append(val)
    return ' '.join(split2)

profiles['without_hashes'] = profiles.cleaned.apply(split_hashes)

40452    22 Gym Rat EliteGrinders_ pub stomper in ranke...
40453                           what is happiness POG Life
40454    The Remy part of the Mason Remy Alabama Show M...
40455    For we are his workmanship created in Christ J...
40456    Unapologetically honest I do not care if you l...
40457    DB Coach American Heritage DB Coach SouthFlori...
40458                                      Rest Easy Momma
40459    A body and perpetual optimist Inform inspire e...
40460                               Gotta Sparkle to Shine
40461    Hip modern vegan rabbi living and working in N...
40462    retired civil servant politics coach youth bas...
40463                                                 AVHS
40464    HTTR ItTakesEverything TarHeelNation Only The ...
40465                                      Grafton 20 Kate
40466    Foodie MtG Player who loves Counting to 10 or ...
40467    Just a person trying to figure this whole thin...
40468    love to read act like a child most days intere.

In [44]:
#words that aren't caught that don't add value to function below
drop_words = ['love','don t', 'twitter', 'don', 'tweets', 'opinions', 'your',
             'born', 'never']

In [45]:
def Profile_Noun_Finder(text):
    """Takes text, eliminates non-alpha characters, and returns all nouns"""
    text = re.sub(r'(https)[^\s]+', '', text)
    text = re.sub(r'[^a-zA-Z]+', ' ', text)
    tokenizer = RegexpTokenizer(r'\w+')
    words = tokenizer.tokenize(text)
    lowered = [word.lower() for word in words]
    words = [word for word in lowered if len(word) > 2]
    words = [word for word in words if word not in drop_words]
    new_text = ' '.join(words)
    wiki = TextBlob(new_text)
    return list(wiki.noun_phrases)

## Positive Tweets

In [88]:
#getting rid of empty profiles
without_retweets = without_retweets.dropna(subset=['user_profile_text'])

#running function on profiles
without_retweets['nouns'] = without_retweets['user_profile_text'].apply(Profile_Noun_Finder)

#subsetting for positive reviews
positive_profiles = without_retweets.loc[without_retweets['sentiment_dummies'] == 1]

#flattening the list
flattened_nouns = [word for parts in positive_profiles['nouns'] for word in parts]

#looking at the 50 most common noun phrases
Counter(flattened_nouns).most_common(50)

In [94]:
#Creating word counts based on np_counts feature of text_blob
texters = ' '.join(positive_profiles['user_profile_text'])
dic_values = TextBlob(texters).np_counts
favorible_words = pd.DataFrame()
favorible_words['words'] = dic_values.keys()
favorible_words['count'] = dic_values.values()
favorible_words.sort_values(by='count', ascending=False)

## Negative Tweets

In [109]:
#subsetting for positive reviews
negative_profiles = without_retweets.loc[without_retweets['sentiment_dummies'] == -1]

#flattening the list
flattened_nouns = [word for parts in negative_profiles['nouns'] for word in parts]

#looking at the 50 most common noun phrases
Counter(flattened_nouns).most_common(50)

[('youtube channel', 7),
 ('social media', 4),
 ('new youtube', 3),
 ('personal account', 3),
 ('full time', 3),
 ('free time', 3),
 ('wife mom', 3),
 ('mhs tfa', 3),
 ('video games', 2),
 ('misanthropic attention whore talk', 2),
 ('music vinylpizza podcast writer', 2),
 ('link bio', 2),
 ('black man', 2),
 ('new things life', 2),
 ('chance waste', 2),
 ('everyday life', 2),
 ('proud mother', 2),
 ('parody account', 2),
 ('new products', 2),
 ('nintendo switch content', 2),
 ('animal lover', 2),
 ('gmail com', 2),
 ('proud father', 2),
 ('cool things', 2),
 ('news junkie animal lover', 2),
 ('culture addict theater', 2),
 ('news senior producer newhousesu grad retweets endorsements', 2),
 ('san diego', 2),
 ('stupid people', 2),
 ('union news press blogs jobs', 2),
 ('simple girl', 2),
 ('gon na', 2),
 ('social justice', 2),
 ('husband father', 2),
 ('wonderful children', 2),
 ('irony creator good music', 2),
 ('bad music keeper', 2),
 ('secret potato burrito sauce', 2),
 ('producer b

In [110]:
#Creating word counts based on np_counts feature of text_blob
texters = ' '.join(negative_profiles['user_profile_text'])
dic_values = TextBlob(texters).np_counts
favorible_words = pd.DataFrame()
favorible_words['words'] = dic_values.keys()
favorible_words['count'] = dic_values.values()
favorible_words.sort_values(by='count', ascending=False)

Unnamed: 0,words,count
29,love,37
149,twitter,27
167,ig,25
572,wife,24
176,god,23
772,instagram,21
538,husband,21
828,’ s,20
484,father,19
397,snapchat,18


# Dunkin vs. Starbucks

In [16]:
dunkin = food.loc[food['Company'] == '@dunkindonuts'] 
dunkin['hour'] = dunkin['time_tweeted'].dt.hour

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


In [17]:
dunkin.groupby('hour')['sentiment_score'].mean()

hour
0.0     0.055933
1.0     0.082589
2.0     0.165090
3.0     0.155026
4.0     0.083048
5.0     0.168492
6.0     0.096741
7.0     0.130059
8.0     0.068469
9.0     0.074440
10.0    0.066149
11.0    0.080584
12.0    0.062593
13.0    0.083717
14.0    0.072379
15.0    0.093971
16.0    0.116218
17.0    0.124282
18.0    0.110968
19.0    0.072887
20.0    0.037402
21.0    0.128900
22.0    0.096105
23.0    0.081113
Name: sentiment_score, dtype: float64

In [19]:
positive_dunkin = dunkin.loc[dunkin['sentiment_dummies'] == 1]

In [24]:
dunkin['user_profile_text'] = dunkin['user_profile_text'].fillna(' ')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


In [32]:
dunkin['nouns'] = dunkin['user_profile_text'].apply(Profile_Noun_Finder)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


In [48]:
positive_dunkin = dunkin.loc[dunkin['sentiment_dummies'] == 1]
negative_dunkin = dunkin.loc[dunkin['sentiment_dummies'] == -1]

In [45]:
nouns = []
for val in positive_dunkin['nouns']:
    if val:
        words = val[0]
        words = words.split()
        for word in words:
            nouns.append(word)

In [46]:
most_positive_dunk_words = Counter(nouns).most_common(50)

In [49]:
nouns = []
for val in negative_dunkin['nouns']:
    if val:
        words = val[0]
        words = words.split()
        for word in words:
            nouns.append(word)

In [51]:
negative_dunk_words = Counter(nouns).most_common(50)