# Text Analysis - Project
Scarping twitter data to determine if iPhone or Android users are switching phones

In [1]:
import pandas as pd
import numpy as np
import re

df1 = pd.read_csv("tweets.csv")
df2 = pd.read_csv("tweet1.csv")
df3 = pd.read_csv("tweet2.csv")
df4 = pd.read_csv("tweet3.csv")
df5 = pd.read_csv("tweet4.csv")
df6 = pd.read_csv("tweet5.csv")
df7 = pd.read_csv("tweet6.csv")
df8 = pd.read_csv("tweet7.csv")
df9 = pd.read_csv("tweet8.csv")
df10 = pd.read_csv("tweet9.csv")
df11 = pd.read_csv("tweet10.csv")
df12 = pd.read_csv("tweet11.csv")

In [2]:
# Function to make all tweet commetns lowercase
def lowercase(s):
    return s.lower()

# Function to only change the word counts to be 1 per comments
def count_fix(s):
    if s > 1:
        return 1
    else:
        return s

# Function to sum the mention counts of each word
def sum_words(s):
    return sum(s)

# Function to determine if it's a tweet or re-tweet
def cut_rt(s):
    if s[:2] == 'rt':
        return 1
    else:
        return 0

In [3]:
# Dropping any duplicate tweets based on 'Tweet Id' after joining all dataframes
df = pd.concat([df1, df2, df3, df4, df5, df7, df8, df9, df10, df11, df12])
df.drop_duplicates(inplace = True)
df = df.drop(['Unnamed: 0'], axis=1)

df["Tweet Text"] = df["Tweet Text"].apply(lowercase)

# df now only contains tweets coming from an iPhone or an Android
mask = (df["Source"] == "Twitter for iPhone") | (df["Source"] == "Twitter for Android")
df = df[mask]

# Creating dataframes to separate what iPhone users and Android users are talking about
# Used for lift of each phone against top 10 attributes
iphone_mask = df["Source"] == "Twitter for iPhone"
android_mask = df["Source"] == "Twitter for Android"
df_iphone = df[iphone_mask]
df_android = df[android_mask]

# Creates dataframes for original tweets and retweets
df["RT"] = df["Tweet Text"].map(cut_rt)
no_rt_mask = (df["RT"] == 0)
rt_mask = (df["RT"] == 1)
df_no_rt = df[no_rt_mask].copy(deep=True)
df_rt = df[rt_mask].copy(deep=True)

In [4]:
#iPhone
from sklearn.feature_extraction.text import CountVectorizer

CountVec = CountVectorizer(ngram_range=(1,1), # to use bigrams ngram_range=(2,2)
                           stop_words='english')

# transform the vectorized count
Count_data = CountVec.fit_transform(df_iphone["Tweet Text"])
 
#create dataframe of the counts for each word
cv_iphone = pd.DataFrame(Count_data.toarray(), columns = CountVec.get_feature_names())

# changing the counts to just count words once per comment
iphone_count = cv_iphone.applymap(count_fix)

# summing the word frequencies
iphone_freqs = iphone_count.apply(sum_words).sort_values(ascending = False)

# storing frequency counts in csv file
iphone_freqs.to_csv('iphone_frequencies.csv')
iphone_freqs[:10]



iphone         10497
12             10402
rt              8184
https           4825
pro             3607
apple           3516
appleevent      2005
a14             1795
bionic          1784
introducing     1740
dtype: int64

In [5]:
#Android
from sklearn.feature_extraction.text import CountVectorizer

CountVec = CountVectorizer(ngram_range=(1,1), # to use bigrams ngram_range=(2,2)
                           stop_words='english')

# transform the vectorized count
Count_data = CountVec.fit_transform(df_android["Tweet Text"])
 
#create dataframe of the counts for each word
cv_android = pd.DataFrame(Count_data.toarray(), columns = CountVec.get_feature_names())

# changing the counts to just count words once per comment
android_count = cv_android.applymap(count_fix)

# summing the word frequencies
android_freqs = android_count.apply(sum_words).sort_values(ascending = False)

# storing frequency counts in csv file
android_freqs.to_csv('android_frequencies.csv')
android_freqs[:10]



iphone        5552
12            5472
rt            5245
https         2815
apple         1753
pro           1579
appleevent     828
a14            769
bionic         760
chip           733
dtype: int64

## Lift

In [6]:
# We picked these top attributes that all iPhone and Android users are talking about
top_atts = ["new", "5g", "mini", "chip", "powerful", "camera", "design", "charger", "display", "retina"]

In [7]:
# calculate lift function - iPhone
def calc_lift_iphone(a, b):
    total_size = len(df_iphone)
    filter_a = df_iphone[df_iphone["Tweet Text"].str.contains(a)]
    num_a = len(filter_a)
    num_b = len(df_iphone[df_iphone["Tweet Text"].str.contains(b)])
    num_a_b = len(filter_a['Tweet Text'][filter_a['Tweet Text'].str.contains(b)])
    return total_size * float(num_a_b) / float(num_a * num_b)

# calculate lift function - Android
def calc_lift_android(a, b):
    total_size = len(df_android)
    filter_a = df_android[df_android["Tweet Text"].str.contains(a)]
    num_a = len(filter_a)
    num_b = len(df_android[df_android["Tweet Text"].str.contains(b)])
    num_a_b = len(filter_a['Tweet Text'][filter_a['Tweet Text'].str.contains(b)])
    return total_size * float(num_a_b) / float(num_a * num_b)

In [8]:
# Calculating lift between iPhone and Android
def calc_lift(a, b):
    total_size = len(df)
    filter_a = df[df["Tweet Text"].str.contains(a)]
    num_a = len(filter_a)
    num_b = len(df[df["Tweet Text"].str.contains(b)])
    num_a_b = len(filter_a['Tweet Text'][filter_a['Tweet Text'].str.contains(b)])
    return total_size * float(num_a_b) / float(num_a * num_b)

# Halfing functions
def half_lift(df):
    i = 0
    while i < len(df.columns):
        j = 0
        while j < i + 1:
            if i == j:
                df[df.columns[j]][df.index[i]] = 0
            else:
                df[df.columns[j]][df.index[i]] = ' ' 
            j += 1
        i += 1
    return df

In [9]:
# Lift between iPhone and Android
phones = ["iphone", "android"]

# create phone matrix 
phone_matrix = pd.DataFrame(columns = phones)
for p in phones:
    phone_matrix = phone_matrix.append(pd.Series(0, index = phone_matrix.columns), ignore_index=True)
phone_matrix['phone'] = phones
phone_matrix = phone_matrix.set_index('phone')

# calculate lift between phones
import copy

df = df.dropna(how='any')
lift_matrix = copy.deepcopy(phone_matrix)

for phone1, series in list(lift_matrix.iterrows()):
    for phone2 in series.index:
        lift_matrix[phone2].loc[phone1] = calc_lift(phone1, phone2)

print('Lift matrix of phones:')
dissimilarity = copy.deepcopy(lift_matrix)
half_lift(lift_matrix)

Lift matrix of phones:


Unnamed: 0_level_0,iphone,android
phone,Unnamed: 1_level_1,Unnamed: 2_level_1
iphone,0.0,1.125
android,,0.0


In [10]:
# phone-attribute matrix dataframe
phone_type = ["iphone", "android"]
phone_att = pd.DataFrame(columns = top_atts)

for p in phone_type:
    phone_att = phone_att.append(pd.Series(0, index = phone_att.columns), ignore_index = True)

phone_att['phone'] = phone_type
phone_att = phone_att.set_index('phone')

In [11]:
# lifts for phones and attributes
for phone, series in list(phone_att.iterrows()):
    for att in series.index:
        if phone == "iphone":
            phone_att[att].loc[phone] = calc_lift_iphone(phone, att)
        if phone == "android":
            phone_att[att].loc[phone] = calc_lift_android("iphone", att)

print('The Top 10 attributes we chose to look at are:')
for att in range(len(top_atts)):
    print('{}. '.format(att + 1) + top_atts[att])
    
print("")
phone_att

The Top 10 attributes we chose to look at are:
1. new
2. 5g
3. mini
4. chip
5. powerful
6. camera
7. design
8. charger
9. display
10. retina



Unnamed: 0_level_0,new,5g,mini,chip,powerful,camera,design,charger,display,retina
phone,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
iphone,1.02668,1.0321,1.02919,1.03864,1.03994,1.02874,1.03717,1.02894,1.03515,1.03994
android,1.10911,1.12046,1.1133,1.13185,1.13339,1.10337,1.12006,1.11033,1.13339,1.13339


## Sentiment Analysis

In [17]:
def update_list_all(attribute_list):
    counter = 0
    match_list = []
    for i in tweets_all:
        for att in attribute_list:
            match = re.findall(att + '[?!.,]+', re.escape(str(i)))
            counter += 1
            
            if match not in match_list and match != []:
                match_list.append(match)
                
                for x in match_list:
                    
                    for y in x:
                        attribute_list.append(y)
                        
    return set(attribute_list)

def update_list_rt(attribute_list):
    counter = 0
    match_list = []
    for i in tweets_rt:
        for att in attribute_list:
            match = re.findall(att + '[?!.,]+', re.escape(str(i)))
            counter += 1
            
            if match not in match_list and match != []:
                match_list.append(match)
                
                for x in match_list:
                    
                    for y in x:
                        attribute_list.append(y)
                        
    return set(attribute_list)

def update_list_no_rt(attribute_list):
    counter = 0
    match_list = []
    for i in tweets_no_rt:
        for att in attribute_list:
            match = re.findall(att + '[?!.,]+', re.escape(str(i)))
            counter += 1
            
            if match not in match_list and match != []:
                match_list.append(match)
                
                for x in match_list:
                    
                    for y in x:
                        attribute_list.append(y)
                        
    return set(attribute_list)

In [18]:
#install if necessary
#!pip install vaderSentiment

from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer

lex = SentimentIntensityAnalyzer(lexicon_file="vader_lexicon.txt",
                                 emoji_lexicon="emoji_utf8_lexicon.txt")

In [19]:
tweets_all = df["Tweet Text"]
tweets_rt = df_rt["Tweet Text"]
tweets_no_rt = df_no_rt["Tweet Text"]

In [20]:
def sentiment_analyzer_scores(sentence):
    score = lex.polarity_scores(sentence)
    comp = score["compound"]
    return comp

def att_to_dict(attribute_list):
    att_dict = dict.fromkeys(attribute_list, 0)
    return att_dict

def add_to_dict(att, original_atts, att_dict):
    if att in original_atts:
        att_dict[att] += 1
    return att_dict

def a14(s):
    if 'A14' in s:
        return 1

In [21]:
# call list function to find all attributes with punctuation
att_a = update_list_all(top_atts)
att_r = update_list_rt(top_atts)
att_n = update_list_no_rt(top_atts)

#What's this for?
#print(tweets_all.map(a14).sum())
#print(tweets_rt.map(a14).sum())
#print(tweets_no_rt.map(a14).sum())

In [23]:
# All Tweets
# call list function to find all attributes with punctuation
original_atts = ["new", "5g", "mini", "chip", "powerful", "camera", "design", "charger", "display", "retina"]

score_list_all = []

att_scores_df_a = pd.DataFrame(columns = original_atts + ['tweet_sent'])


for phrase in tweets_all:
    #create dictionary from original attribute list to keep track of composite scores for each attribute per review
    att_scores = att_to_dict(original_atts)
    att_scores['tweet_sent'] = 0
    att_scores['tweet_sent'] = sentiment_analyzer_scores(phrase)
    
    score_a = 0
    count_a = 0
    att1_count_a = 0
    att2_count_a = 0
    att3_count_a = 0
    att4_count_a = 0
    att5_count_a = 0
    att6_count_a = 0
    att7_count_a = 0
    att8_count_a = 0
    att9_count_a = 0
    att10_count_a = 0
    
    arr_a = str(phrase).replace("'",'').split()
    its_a = [iter(arr_a), iter(arr_a[1:]), iter(arr_a[2:]), iter(arr_a[3:]), iter(arr_a[4:]), iter(arr_a[5:]), iter(arr_a[6:]), iter(arr_a[7:]), iter(arr_a[8:]), iter(arr_a[9:])]
    parse_a = list(zip(its_a[0], its_a[1], its_a[2], its_a[3], its_a[4], its_a[5], its_a[6], its_a[7], its_a[8], its_a[9]))
    
    for i in range(len(parse_a)):
        if i == 0:
            if parse_a[i][0] in att_a:
                score_a += sentiment_analyzer_scores(' '.join(parse_a[i]))
                count_a += 1  
                
                if original_atts[0] in parse_a[i][0]:
                    att_scores[original_atts[0]] += sentiment_analyzer_scores(' '.join(parse_a[i]))
                    att1_count_a += 1
                if original_atts[1] in parse_a[i][0]:
                    att_scores[original_atts[1]] += sentiment_analyzer_scores(' '.join(parse_a[i]))
                    att2_count_a += 1
                if original_atts[2] in parse_a[i][0]:
                    att_scores[original_atts[2]] += sentiment_analyzer_scores(' '.join(parse_a[i]))
                    att3_count_a += 1
                if original_atts[3] in parse_a[i][0]:
                    att_scores[original_atts[3]] += sentiment_analyzer_scores(' '.join(parse_a[i]))
                    att4_count_a += 1
                if original_atts[4] in parse_a[i][0]:
                    att_scores[original_atts[4]] += sentiment_analyzer_scores(' '.join(parse_a[i]))
                    att5_count_a += 1
                if original_atts[5] in parse_a[i][0]:
                    att_scores[original_atts[5]] += sentiment_analyzer_scores(' '.join(parse_a[i]))
                    att6_count_a += 1
                if original_atts[6] in parse_a[i][0]:
                    att_scores[original_atts[6]] += sentiment_analyzer_scores(' '.join(parse_a[i]))
                    att7_count_a += 1
                if original_atts[7] in parse_a[i][0]:
                    att_scores[original_atts[7]] += sentiment_analyzer_scores(' '.join(parse_a[i]))
                    att8_count_a += 1
                if original_atts[8] in parse_a[i][0]:
                    att_scores[original_atts[8]] += sentiment_analyzer_scores(' '.join(parse_a[i]))
                    att9_count_a += 1
                if original_atts[9] in parse_a[i][0]:
                    att_scores[original_atts[9]] += sentiment_analyzer_scores(' '.join(parse_a[i]))
                    att10_count_a += 1

        if parse_a[i][2] in att_a:
            score_a += sentiment_analyzer_scores(' '.join(parse_a[i]))
            count_a += 1
            
            if original_atts[0] in parse_a[i][0]:
                att_scores[original_atts[0]] += sentiment_analyzer_scores(' '.join(parse_a[i]))
                att1_count_a += 1
            if original_atts[1] in parse_a[i][0]:
                att_scores[original_atts[1]] += sentiment_analyzer_scores(' '.join(parse_a[i]))
                att2_count_a += 1
            if original_atts[2] in parse_a[i][0]:
                att_scores[original_atts[2]] += sentiment_analyzer_scores(' '.join(parse_a[i]))
                att3_count_a += 1
            if original_atts[3] in parse_a[i][0]:
                att_scores[original_atts[3]] += sentiment_analyzer_scores(' '.join(parse_a[i]))
                att4_count_a += 1
            if original_atts[4] in parse_a[i][0]:
                att_scores[original_atts[4]] += sentiment_analyzer_scores(' '.join(parse_a[i]))
                att5_count_a += 1
            if original_atts[5] in parse_a[i][0]:
                att_scores[original_atts[5]] += sentiment_analyzer_scores(' '.join(parse_a[i]))
                att6_count_a += 1
            if original_atts[6] in parse_a[i][0]:
                att_scores[original_atts[6]] += sentiment_analyzer_scores(' '.join(parse_a[i]))
                att7_count_a += 1
            if original_atts[7] in parse_a[i][0]:
                att_scores[original_atts[7]] += sentiment_analyzer_scores(' '.join(parse_a[i]))
                att8_count_a += 1
            if original_atts[8] in parse_a[i][0]:
                att_scores[original_atts[8]] += sentiment_analyzer_scores(' '.join(parse_a[i]))
                att9_count_a += 1
            if original_atts[9] in parse_a[i][0]:
                att_scores[original_atts[9]] += sentiment_analyzer_scores(' '.join(parse_a[i]))
                att10_count_a += 1
                
        if i == len(parse_a)-1:
            if parse_a[i][3] in att_a:
                score_a += sentiment_analyzer_scores(' '.join(parse_a[i]))
                count_a += 1
                
                if original_atts[0] in parse_a[i][0]:
                    att_scores[original_atts[0]] += sentiment_analyzer_scores(' '.join(parse_a[i]))
                    att1_count_a += 1
                if original_atts[1] in parse_a[i][0]:
                    att_scores[original_atts[1]] += sentiment_analyzer_scores(' '.join(parse_a[i]))
                    att2_count_a += 1
                if original_atts[2] in parse_a[i][0]:
                    att_scores[original_atts[2]] += sentiment_analyzer_scores(' '.join(parse_a[i]))
                    att3_count_a += 1
                if original_atts[3] in parse_a[i][0]:
                    att_scores[original_atts[3]] += sentiment_analyzer_scores(' '.join(parse_a[i]))
                    att4_count_a += 1
                if original_atts[4] in parse_a[i][0]:
                    att_scores[original_atts[4]] += sentiment_analyzer_scores(' '.join(parse_a[i]))
                    att5_count_a += 1
                if original_atts[5] in parse_a[i][0]:
                    att_scores[original_atts[5]] += sentiment_analyzer_scores(' '.join(parse_a[i]))
                    att6_count_a += 1
                if original_atts[6] in parse_a[i][0]:
                    att_scores[original_atts[6]] += sentiment_analyzer_scores(' '.join(parse_a[i]))
                    att7_count_a += 1
                if original_atts[7] in parse_a[i][0]:
                    att_scores[original_atts[7]] += sentiment_analyzer_scores(' '.join(parse_a[i]))
                    att8_count_a += 1
                if original_atts[8] in parse_a[i][0]:
                    att_scores[original_atts[8]] += sentiment_analyzer_scores(' '.join(parse_a[i]))
                    att9_count_a += 1
                if original_atts[9] in parse_a[i][0]:
                    att_scores[original_atts[9]] += sentiment_analyzer_scores(' '.join(parse_a[i]))
                    att10_count_a += 1
                    
            if parse_a[i][4] in att_a:
                score_a += sentiment_analyzer_scores(' '.join(parse_a[i]))
                count_a += 1
                
                if original_atts[0] in parse_a[i][0]:
                    att_scores[original_atts[0]] += sentiment_analyzer_scores(' '.join(parse_a[i]))
                    att1_count_a += 1
                if original_atts[1] in parse_a[i][0]:
                    att_scores[original_atts[1]] += sentiment_analyzer_scores(' '.join(parse_a[i]))
                    att2_count_a += 1
                if original_atts[2] in parse_a[i][0]:
                    att_scores[original_atts[2]] += sentiment_analyzer_scores(' '.join(parse_a[i]))
                    att3_count_a += 1
                if original_atts[3] in parse_a[i][0]:
                    att_scores[original_atts[3]] += sentiment_analyzer_scores(' '.join(parse_a[i]))
                    att4_count_a += 1
                if original_atts[4] in parse_a[i][0]:
                    att_scores[original_atts[4]] += sentiment_analyzer_scores(' '.join(parse_a[i]))
                    att5_count_a += 1
                if original_atts[5] in parse_a[i][0]:
                    att_scores[original_atts[5]] += sentiment_analyzer_scores(' '.join(parse_a[i]))
                    att6_count_a += 1
                if original_atts[6] in parse_a[i][0]:
                    att_scores[original_atts[6]] += sentiment_analyzer_scores(' '.join(parse_a[i]))
                    att7_count_a += 1
                if original_atts[7] in parse_a[i][0]:
                    att_scores[original_atts[7]] += sentiment_analyzer_scores(' '.join(parse_a[i]))
                    att8_count_a += 1
                if original_atts[8] in parse_a[i][0]:
                    att_scores[original_atts[8]] += sentiment_analyzer_scores(' '.join(parse_a[i]))
                    att9_count_a += 1
                if original_atts[9] in parse_a[i][0]:
                    att_scores[original_atts[9]] += sentiment_analyzer_scores(' '.join(parse_a[i]))
                    att10_count_a += 1
    
    if att1_count_a == 0:
        att_scores[original_atts[0]] = np.nan
    else:
        att_scores[original_atts[0]] = att_scores[original_atts[0]]/att1_count_a
    if att2_count_a == 0:
        att_scores[original_atts[1]] = np.nan
    else:
        att_scores[original_atts[1]] = att_scores[original_atts[1]]/att2_count_a
    if att3_count_a == 0:
        att_scores[original_atts[2]] = np.nan
    else:
        att_scores[original_atts[2]] = att_scores[original_atts[2]]/att3_count_a
    if att4_count_a == 0:
        att_scores[original_atts[3]] = np.nan
    else:
        att_scores[original_atts[3]] = att_scores[original_atts[3]]/att4_count_a
    if att5_count_a == 0:
        att_scores[original_atts[4]] = np.nan
    else:
        att_scores[original_atts[4]] = att_scores[original_atts[4]]/att5_count_a
    if att6_count_a == 0:
        att_scores[original_atts[5]] = np.nan
    else:
        att_scores[original_atts[5]] = att_scores[original_atts[5]]/att6_count_a
    if att7_count_a == 0:
        att_scores[original_atts[6]] = np.nan
    else:
        att_scores[original_atts[6]] = att_scores[original_atts[6]]/att7_count_a
    if att8_count_a == 0:
        att_scores[original_atts[7]] = np.nan
    else:
        att_scores[original_atts[7]] = att_scores[original_atts[7]]/att8_count_a
    if att9_count_a == 0:
        att_scores[original_atts[8]] = np.nan
    else:
        att_scores[original_atts[8]] = att_scores[original_atts[8]]/att9_count_a
    if att10_count_a == 0:
        att_scores[original_atts[9]] = np.nan
    else:
        att_scores[original_atts[9]] = att_scores[original_atts[9]]/att10_count_a
        
    att_scores_df_a = att_scores_df_a.append(att_scores, ignore_index = True)
    
    if count_a == 0:
        score_list_all.append(np.nan)
        
    else:
        score_list_all.append(score_a/count_a)

In [24]:
# All Re-Tweets
# call list function to find all attributes with punctuation
original_atts = ["new", "5g", "mini", "chip", "powerful", "camera", "design", "charger", "display", "retina"]

score_list_rt = []

att_scores_df_r = pd.DataFrame(columns = original_atts + ['tweet_sent'])


for phrase in tweets_rt:
    #create dictionary from original attribute list to keep track of composite scores for each attribute per review
    att_scores = att_to_dict(original_atts)
    att_scores['tweet_sent'] = 0
    att_scores['tweet_sent'] = sentiment_analyzer_scores(phrase)
    
    score_r = 0
    count_r = 0
    att1_count_r = 0
    att2_count_r = 0
    att3_count_r = 0
    att4_count_r = 0
    att5_count_r = 0
    att6_count_r = 0
    att7_count_r = 0
    att8_count_r = 0
    att9_count_r = 0
    att10_count_r = 0
    
    arr_r = str(phrase).replace("'",'').split()
    its_r = [iter(arr_r), iter(arr_r[1:]), iter(arr_r[2:]), iter(arr_r[3:]), iter(arr_r[4:]), iter(arr_r[5:]), iter(arr_r[6:]), iter(arr_r[7:]), iter(arr_r[8:]), iter(arr_r[9:])]
    parse_r = list(zip(arr_r[0], arr_r[1], arr_r[2], arr_r[3], arr_r[4], arr_r[5], arr_r[6], arr_r[7], arr_r[8], arr_r[9]))
    
    for i in range(len(arr_r)):
        if i == 0:
            if arr_r[i][0] in att_r:
                score_r += sentiment_analyzer_scores(' '.join(parse_r[i]))
                count_r += 1  
                
                if original_atts[0] in parse_r[i][0]:
                    att_scores[original_atts[0]] += sentiment_analyzer_scores(' '.join(parse_r[i]))
                    att1_count_r += 1
                if original_atts[1] in parse_r[i][0]:
                    att_scores[original_atts[1]] += sentiment_analyzer_scores(' '.join(parse_r[i]))
                    att2_count_r += 1
                if original_atts[2] in parse_r[i][0]:
                    att_scores[original_atts[2]] += sentiment_analyzer_scores(' '.join(parse_r[i]))
                    att3_count_r += 1
                if original_atts[3] in parse_r[i][0]:
                    att_scores[original_atts[3]] += sentiment_analyzer_scores(' '.join(parse_r[i]))
                    att4_count_r += 1
                if original_atts[4] in parse_r[i][0]:
                    att_scores[original_atts[4]] += sentiment_analyzer_scores(' '.join(parse_r[i]))
                    att5_count_r += 1
                if original_atts[5] in parse_r[i][0]:
                    att_scores[original_atts[5]] += sentiment_analyzer_scores(' '.join(parse_r[i]))
                    att6_count_r += 1
                if original_atts[6] in parse_r[i][0]:
                    att_scores[original_atts[6]] += sentiment_analyzer_scores(' '.join(parse_r[i]))
                    att7_count_r += 1
                if original_atts[7] in parse_r[i][0]:
                    att_scores[original_atts[7]] += sentiment_analyzer_scores(' '.join(parse_r[i]))
                    att8_count_r += 1
                if original_atts[8] in parse_r[i][0]:
                    att_scores[original_atts[8]] += sentiment_analyzer_scores(' '.join(parse_r[i]))
                    att9_count_r += 1
                if original_atts[9] in parse_r[i][0]:
                    att_scores[original_atts[9]] += sentiment_analyzer_scores(' '.join(parse_r[i]))
                    att10_count_r += 1
                    
# Getting an error here --> parse_r[i][2] in att 'list index is out of range'
        if parse_r[i][2] in att_r:
            score_r += sentiment_analyzer_scores(' '.join(parse_r[i]))
            count_r += 1
            
            if original_atts[0] in parse_r[i][0]:
                att_scores[original_atts[0]] += sentiment_analyzer_scores(' '.join(parse_r[i]))
                att1_count_r += 1
            if original_atts[1] in parse_r[i][0]:
                att_scores[original_atts[1]] += sentiment_analyzer_scores(' '.join(parse_r[i]))
                att2_count_r += 1
            if original_atts[2] in parse_r[i][0]:
                att_scores[original_atts[2]] += sentiment_analyzer_scores(' '.join(parse_r[i]))
                att3_count_r += 1
            if original_atts[3] in parse_r[i][0]:
                att_scores[original_atts[3]] += sentiment_analyzer_scores(' '.join(parse_r[i]))
                att4_count_r += 1
            if original_atts[4] in parse_r[i][0]:
                att_scores[original_atts[4]] += sentiment_analyzer_scores(' '.join(parse_r[i]))
                att5_count_r += 1
            if original_atts[5] in parse_r[i][0]:
                att_scores[original_atts[5]] += sentiment_analyzer_scores(' '.join(parse_r[i]))
                att6_count_r += 1
            if original_atts[6] in parse_r[i][0]:
                att_scores[original_atts[6]] += sentiment_analyzer_scores(' '.join(parse_r[i]))
                att7_count_r += 1
            if original_atts[7] in parse_r[i][0]:
                att_scores[original_atts[7]] += sentiment_analyzer_scores(' '.join(parse_r[i]))
                att8_count_r += 1
            if original_atts[8] in parse_r[i][0]:
                att_scores[original_atts[8]] += sentiment_analyzer_scores(' '.join(parse_r[i]))
                att9_count_r += 1
            if original_atts[9] in parse_r[i][0]:
                att_scores[original_atts[9]] += sentiment_analyzer_scores(' '.join(parse_r[i]))
                att10_count_r += 1
                
        if i == len(parse_r)-1:
            if parse_r[i][3] in att_r:
                score_r += sentiment_analyzer_scores(' '.join(parse_r[i]))
                count_r += 1
                
                if original_atts[0] in parse_r[i][0]:
                    att_scores[original_atts[0]] += sentiment_analyzer_scores(' '.join(parse_r[i]))
                    att1_count_r += 1
                if original_atts[1] in parse_r[i][0]:
                    att_scores[original_atts[1]] += sentiment_analyzer_scores(' '.join(parse_r[i]))
                    att2_count_r += 1
                if original_atts[2] in parse_r[i][0]:
                    att_scores[original_atts[2]] += sentiment_analyzer_scores(' '.join(parse_r[i]))
                    att3_count_r += 1
                if original_atts[3] in parse_r[i][0]:
                    att_scores[original_atts[3]] += sentiment_analyzer_scores(' '.join(parse_r[i]))
                    att4_count_r += 1
                if original_atts[4] in parse_r[i][0]:
                    att_scores[original_atts[4]] += sentiment_analyzer_scores(' '.join(parse_r[i]))
                    att5_count_r += 1
                if original_atts[5] in parse_r[i][0]:
                    att_scores[original_atts[5]] += sentiment_analyzer_scores(' '.join(parse_r[i]))
                    att6_count_r += 1
                if original_atts[6] in parse_r[i][0]:
                    att_scores[original_atts[6]] += sentiment_analyzer_scores(' '.join(parse_r[i]))
                    att7_count_r += 1
                if original_atts[7] in parse_r[i][0]:
                    att_scores[original_atts[7]] += sentiment_analyzer_scores(' '.join(parse_r[i]))
                    att8_count_r += 1
                if original_atts[8] in parse_r[i][0]:
                    att_scores[original_atts[8]] += sentiment_analyzer_scores(' '.join(parse_r[i]))
                    att9_count_r += 1
                if original_atts[9] in parse_r[i][0]:
                    att_scores[original_atts[9]] += sentiment_analyzer_scores(' '.join(parse_r[i]))
                    att10_count_r += 1
                    
            if parse_r[i][4] in att_r:
                score_r += sentiment_analyzer_scores(' '.join(parse_r[i]))
                count_r += 1
                
                if original_atts[0] in parse_r[i][0]:
                    att_scores[original_atts[0]] += sentiment_analyzer_scores(' '.join(parse_r[i]))
                    att1_count_r += 1
                if original_atts[1] in parse_r[i][0]:
                    att_scores[original_atts[1]] += sentiment_analyzer_scores(' '.join(parse_r[i]))
                    att2_count_r += 1
                if original_atts[2] in parse_r[i][0]:
                    att_scores[original_atts[2]] += sentiment_analyzer_scores(' '.join(parse_r[i]))
                    att3_count_r += 1
                if original_atts[3] in parse_r[i][0]:
                    att_scores[original_atts[3]] += sentiment_analyzer_scores(' '.join(parse_r[i]))
                    att4_count_r += 1
                if original_atts[4] in parse_r[i][0]:
                    att_scores[original_atts[4]] += sentiment_analyzer_scores(' '.join(parse_r[i]))
                    att5_count_r += 1
                if original_atts[5] in parse_r[i][0]:
                    att_scores[original_atts[5]] += sentiment_analyzer_scores(' '.join(parse_r[i]))
                    att6_count_r += 1
                if original_atts[6] in parse_r[i][0]:
                    att_scores[original_atts[6]] += sentiment_analyzer_scores(' '.join(parse_r[i]))
                    att7_count_r += 1
                if original_atts[7] in parse_r[i][0]:
                    att_scores[original_atts[7]] += sentiment_analyzer_scores(' '.join(parse_r[i]))
                    att8_count_r += 1
                if original_atts[8] in parse_r[i][0]:
                    att_scores[original_atts[8]] += sentiment_analyzer_scores(' '.join(parse_r[i]))
                    att9_count_r += 1
                if original_atts[9] in parse_r[i][0]:
                    att_scores[original_atts[9]] += sentiment_analyzer_scores(' '.join(parse_r[i]))
                    att10_count_r += 1
    
    if att1_count_r == 0:
        att_scores[original_atts[0]] = np.nan
    else:
        att_scores[original_atts[0]] = att_scores[original_atts[0]]/att1_count_r
    if att2_count_r == 0:
        att_scores[original_atts[1]] = np.nan
    else:
        att_scores[original_atts[1]] = att_scores[original_atts[1]]/att2_count_r
    if att3_count_r == 0:
        att_scores[original_atts[2]] = np.nan
    else:
        att_scores[original_atts[2]] = att_scores[original_atts[2]]/att3_count_r
    if att4_count_r == 0:
        att_scores[original_atts[3]] = np.nan
    else:
        att_scores[original_atts[3]] = att_scores[original_atts[3]]/att4_count_r
    if att5_count_r == 0:
        att_scores[original_atts[4]] = np.nan
    else:
        att_scores[original_atts[4]] = att_scores[original_atts[4]]/att5_count_r
    if att6_count_r == 0:
        att_scores[original_atts[5]] = np.nan
    else:
        att_scores[original_atts[5]] = att_scores[original_atts[5]]/att6_count_r
    if att7_count_r == 0:
        att_scores[original_atts[6]] = np.nan
    else:
        att_scores[original_atts[6]] = att_scores[original_atts[6]]/att7_count_r
    if att8_count_r == 0:
        att_scores[original_atts[7]] = np.nan
    else:
        att_scores[original_atts[7]] = att_scores[original_atts[7]]/att8_count_r
    if att9_count_r == 0:
        att_scores[original_atts[8]] = np.nan
    else:
        att_scores[original_atts[8]] = att_scores[original_atts[8]]/att9_count_r
    if att10_count_r == 0:
        att_scores[original_atts[9]] = np.nan
    else:
        att_scores[original_atts[9]] = att_scores[original_atts[9]]/att10_count_r
        
    att_scores_df_r = att_scores_df_r.append(att_scores, ignore_index = True)
    
    if count_r == 0:
        score_list_rt.append(np.nan)
        
    else:
        score_list_rt.append(score_r/count_r)

IndexError: list index out of range

In [25]:
# No Re-Tweets
# call list function to find all attributes with punctuation
original_atts = ["new", "5g", "mini", "chip", "powerful", "camera", "design", "charger", "display", "retina"]

score_list_no_rt = []

att_scores_df_n = pd.DataFrame(columns = original_atts + ['tweet_sent'])


for phrase in tweets_no_rt:
    #create dictionary from original attribute list to keep track of composite scores for each attribute per review
    att_scores = att_to_dict(original_atts)
    att_scores['tweet_sent'] = 0
    att_scores['tweet_sent'] = sentiment_analyzer_scores(phrase)
    
    score_n = 0
    count_n = 0
    att1_count_n = 0
    att2_count_n = 0
    att3_count_n = 0
    att4_count_n = 0
    att5_count_n = 0
    att6_count_n = 0
    att7_count_n = 0
    att8_count_n = 0
    att9_count_n = 0
    att10_count_n = 0
    
    arr_n = str(phrase).replace("'",'').split()
    its_n = [iter(arr_n), iter(arr_n[1:]), iter(arr_n[2:]), iter(arr_n[3:]), iter(arr_n[4:]), iter(arr_n[5:]), iter(arr_n[6:]), iter(arr_n[7:]), iter(arr_n[8:]), iter(arr_n[9:])]
    parse_n = list(zip(its_n[0], its_n[1], its_n[2], its_n[3], its_n[4], its_n[5], its_n[6], its_n[7], its_n[8], its_n[9]))
    
    for i in range(len(parse_n)):
        if i == 0:
            if parse_n[i][0] in att_n:
                score_n += sentiment_analyzer_scores(' '.join(parse_n[i]))
                count_n += 1  
                
                if original_atts[0] in parse_n[i][0]:
                    att_scores[original_atts[0]] += sentiment_analyzer_scores(' '.join(parse_n[i]))
                    att1_count_n += 1
                if original_atts[1] in parse_n[i][0]:
                    att_scores[original_atts[1]] += sentiment_analyzer_scores(' '.join(parse_n[i]))
                    att2_count_n += 1
                if original_atts[2] in parse_n[i][0]:
                    att_scores[original_atts[2]] += sentiment_analyzer_scores(' '.join(parse_n[i]))
                    att3_count_n += 1
                if original_atts[3] in parse_n[i][0]:
                    att_scores[original_atts[3]] += sentiment_analyzer_scores(' '.join(parse_n[i]))
                    att4_count_n += 1
                if original_atts[4] in parse_n[i][0]:
                    att_scores[original_atts[4]] += sentiment_analyzer_scores(' '.join(parse_n[i]))
                    att5_count_n += 1
                if original_atts[5] in parse_n[i][0]:
                    att_scores[original_atts[5]] += sentiment_analyzer_scores(' '.join(parse_n[i]))
                    att6_count_n += 1
                if original_atts[6] in parse_n[i][0]:
                    att_scores[original_atts[6]] += sentiment_analyzer_scores(' '.join(parse_n[i]))
                    att7_count_n += 1
                if original_atts[7] in parse_n[i][0]:
                    att_scores[original_atts[7]] += sentiment_analyzer_scores(' '.join(parse_n[i]))
                    att8_count_n += 1
                if original_atts[8] in parse_n[i][0]:
                    att_scores[original_atts[8]] += sentiment_analyzer_scores(' '.join(parse_n[i]))
                    att9_count_n += 1
                if original_atts[9] in parse_n[i][0]:
                    att_scores[original_atts[9]] += sentiment_analyzer_scores(' '.join(parse_n[i]))
                    att10_count_n += 1

        if parse_n[i][2] in att_n:
            score_n += sentiment_analyzer_scores(' '.join(parse_n[i]))
            count_n += 1
            
            if original_atts[0] in parse_n[i][0]:
                att_scores[original_atts[0]] += sentiment_analyzer_scores(' '.join(parse_n[i]))
                att1_count_n += 1
            if original_atts[1] in parse_n[i][0]:
                att_scores[original_atts[1]] += sentiment_analyzer_scores(' '.join(parse_n[i]))
                att2_count_n += 1
            if original_atts[2] in parse_n[i][0]:
                att_scores[original_atts[2]] += sentiment_analyzer_scores(' '.join(parse_n[i]))
                att3_count_n += 1
            if original_atts[3] in parse_n[i][0]:
                att_scores[original_atts[3]] += sentiment_analyzer_scores(' '.join(parse_n[i]))
                att4_count_n += 1
            if original_atts[4] in parse_n[i][0]:
                att_scores[original_atts[4]] += sentiment_analyzer_scores(' '.join(parse_n[i]))
                att5_count_n += 1
            if original_atts[5] in parse_n[i][0]:
                att_scores[original_atts[5]] += sentiment_analyzer_scores(' '.join(parse_n[i]))
                att6_count_n += 1
            if original_atts[6] in parse_n[i][0]:
                att_scores[original_atts[6]] += sentiment_analyzer_scores(' '.join(parse_n[i]))
                att7_count_n += 1
            if original_atts[7] in parse_n[i][0]:
                att_scores[original_atts[7]] += sentiment_analyzer_scores(' '.join(parse_n[i]))
                att8_count_n += 1
            if original_atts[8] in parse_n[i][0]:
                att_scores[original_atts[8]] += sentiment_analyzer_scores(' '.join(parse_n[i]))
                att9_count_n += 1
            if original_atts[9] in parse_n[i][0]:
                att_scores[original_atts[9]] += sentiment_analyzer_scores(' '.join(parse_n[i]))
                att10_count_n += 1
                
        if i == len(parse_n)-1:
            if parse_n[i][3] in att_n:
                score_n += sentiment_analyzer_scores(' '.join(parse_n[i]))
                count_n += 1
                
                if original_atts[0] in parse_n[i][0]:
                    att_scores[original_atts[0]] += sentiment_analyzer_scores(' '.join(parse_n[i]))
                    att1_count_n += 1
                if original_atts[1] in parse_n[i][0]:
                    att_scores[original_atts[1]] += sentiment_analyzer_scores(' '.join(parse_n[i]))
                    att2_count_n += 1
                if original_atts[2] in parse_n[i][0]:
                    att_scores[original_atts[2]] += sentiment_analyzer_scores(' '.join(parse_n[i]))
                    att3_count_n += 1
                if original_atts[3] in parse_n[i][0]:
                    att_scores[original_atts[3]] += sentiment_analyzer_scores(' '.join(parse_n[i]))
                    att4_count_n += 1
                if original_atts[4] in parse_n[i][0]:
                    att_scores[original_atts[4]] += sentiment_analyzer_scores(' '.join(parse_n[i]))
                    att5_count_n += 1
                if original_atts[5] in parse_n[i][0]:
                    att_scores[original_atts[5]] += sentiment_analyzer_scores(' '.join(parse_n[i]))
                    att6_count_n += 1
                if original_atts[6] in parse_n[i][0]:
                    att_scores[original_atts[6]] += sentiment_analyzer_scores(' '.join(parse_n[i]))
                    att7_count_n += 1
                if original_atts[7] in parse_n[i][0]:
                    att_scores[original_atts[7]] += sentiment_analyzer_scores(' '.join(parse_n[i]))
                    att8_count_n += 1
                if original_atts[8] in parse_n[i][0]:
                    att_scores[original_atts[8]] += sentiment_analyzer_scores(' '.join(parse_n[i]))
                    att9_count_n += 1
                if original_atts[9] in parse_n[i][0]:
                    att_scores[original_atts[9]] += sentiment_analyzer_scores(' '.join(parse_n[i]))
                    att10_count_n += 1
                    
            if parse_n[i][4] in att_n:
                score_n += sentiment_analyzer_scores(' '.join(parse_n[i]))
                count_n += 1
                
                if original_atts[0] in parse_n[i][0]:
                    att_scores[original_atts[0]] += sentiment_analyzer_scores(' '.join(parse_n[i]))
                    att1_count_n += 1
                if original_atts[1] in parse_n[i][0]:
                    att_scores[original_atts[1]] += sentiment_analyzer_scores(' '.join(parse_n[i]))
                    att2_count_n += 1
                if original_atts[2] in parse_n[i][0]:
                    att_scores[original_atts[2]] += sentiment_analyzer_scores(' '.join(parse_n[i]))
                    att3_count_n += 1
                if original_atts[3] in parse_n[i][0]:
                    att_scores[original_atts[3]] += sentiment_analyzer_scores(' '.join(parse_n[i]))
                    att4_count_n += 1
                if original_atts[4] in parse_n[i][0]:
                    att_scores[original_atts[4]] += sentiment_analyzer_scores(' '.join(parse_n[i]))
                    att5_count_n += 1
                if original_atts[5] in parse_n[i][0]:
                    att_scores[original_atts[5]] += sentiment_analyzer_scores(' '.join(parse_n[i]))
                    att6_count_n += 1
                if original_atts[6] in parse_n[i][0]:
                    att_scores[original_atts[6]] += sentiment_analyzer_scores(' '.join(parse_n[i]))
                    att7_count_n += 1
                if original_atts[7] in parse_n[i][0]:
                    att_scores[original_atts[7]] += sentiment_analyzer_scores(' '.join(parse_n[i]))
                    att8_count_n += 1
                if original_atts[8] in parse_n[i][0]:
                    att_scores[original_atts[8]] += sentiment_analyzer_scores(' '.join(parse_n[i]))
                    att9_count_n += 1
                if original_atts[9] in parse_n[i][0]:
                    att_scores[original_atts[9]] += sentiment_analyzer_scores(' '.join(parse_n[i]))
                    att10_count_n += 1
    
    if att1_count_n == 0:
        att_scores[original_atts[0]] = np.nan
    else:
        att_scores[original_atts[0]] = att_scores[original_atts[0]]/att1_count_n
    if att2_count_n == 0:
        att_scores[original_atts[1]] = np.nan
    else:
        att_scores[original_atts[1]] = att_scores[original_atts[1]]/att2_count_n
    if att3_count_n == 0:
        att_scores[original_atts[2]] = np.nan
    else:
        att_scores[original_atts[2]] = att_scores[original_atts[2]]/att3_count_n
    if att4_count_n == 0:
        att_scores[original_atts[3]] = np.nan
    else:
        att_scores[original_atts[3]] = att_scores[original_atts[3]]/att4_count_n
    if att5_count_n == 0:
        att_scores[original_atts[4]] = np.nan
    else:
        att_scores[original_atts[4]] = att_scores[original_atts[4]]/att5_count_n
    if att6_count_n == 0:
        att_scores[original_atts[5]] = np.nan
    else:
        att_scores[original_atts[5]] = att_scores[original_atts[5]]/att6_count_n
    if att7_count_n == 0:
        att_scores[original_atts[6]] = np.nan
    else:
        att_scores[original_atts[6]] = att_scores[original_atts[6]]/att7_count_n
    if att8_count_n == 0:
        att_scores[original_atts[7]] = np.nan
    else:
        att_scores[original_atts[7]] = att_scores[original_atts[7]]/att8_count_n
    if att9_count_n == 0:
        att_scores[original_atts[8]] = np.nan
    else:
        att_scores[original_atts[8]] = att_scores[original_atts[8]]/att9_count_n
    if att10_count_n == 0:
        att_scores[original_atts[9]] = np.nan
    else:
        att_scores[original_atts[9]] = att_scores[original_atts[9]]/att10_count_n
        
    att_scores_df_n = att_scores_df_n.append(att_scores, ignore_index = True)
    
    if count_n == 0:
        score_list_no_rt.append(np.nan)
        
    else:
        score_list_no_rt.append(score_n/count_n)

In [26]:
# All Tweets
mask1 = (df['Source'] == 'Twitter for iPhone') | (df['Source'] == 'Twitter for Android')
df_all = df[mask1].copy(deep=True)

# turn overall_sent_score into Series to append to beer DF
df_all['avg_att_sent'] = pd.Series(score_list_all)

# merge df with att_scores_df
phone_sent_all = df_all.merge(att_scores_df_a, left_index = True, right_index = True)

In [27]:
# All Re-Tweets
mask2 = (df_rt['Source'] == 'Twitter for iPhone') | (df_rt['Source'] == 'Twitter for Android')
df_rt = df_rt[mask2].copy(deep=True)

# turn overall_sent_score into Series to append to beer DF
df_rt['avg_att_sent'] = pd.Series(score_list_rt)

# merge df with att_scores_df
phone_sent_rt = df_rt.merge(att_scores_df_r, left_index = True, right_index = True)

In [28]:
# No Re-Tweets
mask3 = (df_no_rt['Source'] == 'Twitter for iPhone') | (df_no_rt['Source'] == 'Twitter for Android')
df_no_rt = df_no_rt[mask3].copy(deep=True)

# turn overall_sent_score into Series to append to beer DF
df_no_rt['avg_att_sent'] = pd.Series(score_list_no_rt)

# merge df with att_scores_df
phone_sent_no_rt = df_no_rt.merge(att_scores_df_n, left_index = True, right_index = True)

In [29]:
#phone_sent.avg_att_sent.sort_values(ascending=False)

In [30]:
phone_sent_all[["Source", "tweet_sent", "avg_att_sent", "new", "5g", "mini", "chip", "powerful", "camera", "design", "charger", "display", "retina"]].groupby(['Source']).mean()

Unnamed: 0_level_0,tweet_sent,avg_att_sent,new,5g,mini,chip,powerful,camera,design,charger,display,retina
Source,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
Twitter for Android,0.044663,0.006271,,,,,,,,,,
Twitter for iPhone,0.065792,-0.013207,,,,,,,,,,


In [31]:
phone_sent_rt[["Source", "tweet_sent", "avg_att_sent", "new", "5g", "mini", "chip", "powerful", "camera", "design", "charger", "display", "retina"]].groupby(['Source']).mean()

Unnamed: 0_level_0,avg_att_sent
Source,Unnamed: 1_level_1


In [32]:
phone_sent_no_rt[["Source", "tweet_sent", "avg_att_sent", "new", "5g", "mini", "chip", "powerful", "camera", "design", "charger", "display", "retina"]].groupby(['Source']).mean()

Unnamed: 0_level_0,tweet_sent,avg_att_sent,new,5g,mini,chip,powerful,camera,design,charger,display,retina
Source,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
Twitter for Android,0.142783,0.072038,0.031683,,,,,,,,,
Twitter for iPhone,0.111211,0.056984,0.020011,0.5574,0.0,0.4588,,,,,0.0,0.0
