In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

import nltk

from nltk.probability import FreqDist
from nltk.corpus import stopwords, wordnet
from nltk.tokenize import regexp_tokenize, word_tokenize, RegexpTokenizer, TweetTokenizer
from nltk.stem import WordNetLemmatizer

import re

In [None]:

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import accuracy_score, confusion_matrix, ConfusionMatrixDisplay, silhouette_score

from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.cluster import KMeans

In [47]:
# Load the tweet dataset from a CSV path relative to the notebook's working directory.
df = pd.read_csv("../data/brands_and_product_tweets.csv")
# Bare expression so the notebook renders the loaded frame as this cell's output.
df

Unnamed: 0,tweet_text,emotion_in_tweet_is_directed_at,is_there_an_emotion_directed_at_a_brand_or_product
0,.@wesley83 I have a 3G iPhone. After 3 hrs twe...,iPhone,Negative emotion
1,@jessedee Know about @fludapp ? Awesome iPad/i...,iPad or iPhone App,Positive emotion
2,@swonderlin Can not wait for #iPad 2 also. The...,iPad,Positive emotion
3,@sxsw I hope this year's festival isn't as cra...,iPad or iPhone App,Negative emotion
4,@sxtxstate great stuff on Fri #SXSW: Marissa M...,Google,Positive emotion
...,...,...,...
8716,Ipad everywhere. #SXSW {link},iPad,Positive emotion
8717,"Wave, buzz... RT @mention We interrupt your re...",,No emotion toward brand or product
8718,"Google's Zeiger, a physician never reported po...",,No emotion toward brand or product
8719,Some Verizon iPhone customers complained their...,,No emotion toward brand or product


In [48]:
# Shorten the verbose source headers to the short names the later cells index by.
df = df.rename(
    {
        'tweet_text': 'tweet',
        'emotion_in_tweet_is_directed_at': 'product',
        'is_there_an_emotion_directed_at_a_brand_or_product': 'sentiment',
    },
    axis='columns',
)

In [49]:
# Collapse the long annotation labels into short lowercase class names.
# Values that are not keys of the mapping are left untouched by replace().
df['sentiment'] = df['sentiment'].replace(
    to_replace={
        "No emotion toward brand or product": "neutral",
        "Positive emotion": "positive",
        "Negative emotion": "negative",
        "I can't tell": "unclear",
    }
)

In [50]:
# Class distribution of the relabelled sentiment column.
df['sentiment'].value_counts()

neutral     5156
positive    2869
negative     545
unclear      151
Name: sentiment, dtype: int64

In [51]:
# Number of tweets per product label (value_counts() leaves out missing labels by default).
df['product'].value_counts()

iPad                               910
Apple                              640
iPad or iPhone App                 451
Google                             412
iPhone                             288
Other Google product or service    282
Android App                         78
Android                             74
Other Apple product or service      34
Name: product, dtype: int64

In [62]:
# Count missing values in every column.
# NOTE(review): this cell's execution count (62) is later than the dropna/fillna cells
# further down (54, 56), so its recorded all-zero output reflects the already-cleaned
# frame. Restart the kernel and run all cells to see the counts at this point.
df.isna().sum()

tweet        0
product      0
sentiment    0
dtype: int64

In [53]:
# Show the rows whose tweet text is missing.
df[df['tweet'].isna()]

Unnamed: 0,tweet,product,sentiment
6,,,neutral


In [54]:
# Remove rows that have no tweet text; mutates df in place, so re-running is a no-op.
df.dropna(subset=['tweet'], inplace=True)

In [55]:
# Show the rows that carry no product label.
df[df['product'].isna()]

Unnamed: 0,tweet,product,sentiment
5,@teachntech00 New iPad Apps For #SpeechTherapy...,,neutral
16,Holler Gram for iPad on the iTunes App Store -...,,neutral
32,"Attn: All #SXSW frineds, @mention Register fo...",,neutral
33,Anyone at #sxsw want to sell their old iPad?,,neutral
34,Anyone at #SXSW who bought the new iPad want ...,,neutral
...,...,...,...
8715,"@mention Yup, but I don't have a third app yet...",,neutral
8717,"Wave, buzz... RT @mention We interrupt your re...",,neutral
8718,"Google's Zeiger, a physician never reported po...",,neutral
8719,Some Verizon iPhone customers complained their...,,neutral


In [56]:
# Label tweets without a product as 'undefined'.
# The result is assigned back to the column rather than calling fillna(inplace=True)
# on the df['product'] selection: that chained in-place form triggers a FutureWarning
# in recent pandas and leaves df unchanged once Copy-on-Write is enabled.
df['product'] = df['product'].fillna('undefined')

In [63]:
# Summarise column dtypes and non-null counts of the current frame.
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 8698 entries, 0 to 8720
Data columns (total 3 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   tweet      8698 non-null   object
 1   product    8698 non-null   object
 2   sentiment  8698 non-null   object
dtypes: object(3)
memory usage: 271.8+ KB


In [64]:
# List rows that are exact duplicates of an earlier row (all columns equal).
# NOTE(review): this cell's execution count (64) is later than the drop_duplicates
# cell below (59), so the recorded empty output was produced after de-duplication.
df[df.duplicated()]

Unnamed: 0,tweet,product,sentiment


In [59]:
# Drop exact duplicate rows, keeping the first occurrence; mutates df in place.
df.drop_duplicates(inplace=True)

In [65]:
# Renumber rows 0..n-1 after the drops above; drop=True discards the old index
# instead of keeping it as a column.
df.reset_index(drop=True, inplace=True)
# Bare expression so the notebook renders the cleaned frame.
df

Unnamed: 0,tweet,product,sentiment
0,.@wesley83 I have a 3G iPhone. After 3 hrs twe...,iPhone,negative
1,@jessedee Know about @fludapp ? Awesome iPad/i...,iPad or iPhone App,positive
2,@swonderlin Can not wait for #iPad 2 also. The...,iPad,positive
3,@sxsw I hope this year's festival isn't as cra...,iPad or iPhone App,negative
4,@sxtxstate great stuff on Fri #SXSW: Marissa M...,Google,positive
...,...,...,...
8693,Ipad everywhere. #SXSW {link},iPad,positive
8694,"Wave, buzz... RT @mention We interrupt your re...",undefined,neutral
8695,"Google's Zeiger, a physician never reported po...",undefined,neutral
8696,Some Verizon iPhone customers complained their...,undefined,neutral


In [None]:
# Cache for NLTK's English stop words, filled on first use by _english_stop_words().
_STOP_WORDS_CACHE = None


def _english_stop_words():
    """Return NLTK's English stop words as a frozenset, loading the corpus only once."""
    global _STOP_WORDS_CACHE
    if _STOP_WORDS_CACHE is None:
        _STOP_WORDS_CACHE = frozenset(stopwords.words('english'))
    return _STOP_WORDS_CACHE


def clean_text(text, stop_words=None):
    """Normalise a raw tweet into a lowercase, space-separated string of kept words.

    Steps, in order: strip URLs, strip @user references and '#' characters,
    strip HTML tags, replace every character that is not an ASCII letter or an
    apostrophe with a space, lowercase, and drop stop words.

    Parameters
    ----------
    text : str
        The raw tweet text.
    stop_words : container of str, optional
        Lowercase words to remove. Defaults to NLTK's English stop-word list,
        which is loaded once and reused on later calls. Pass an empty set to
        keep every word.

    Returns
    -------
    str
        The cleaned text, or the literal string 'not a string' when ``text``
        is not a ``str`` (kept for compatibility with existing callers).
    """
    # Non-string input (e.g. NaN from a missing cell) gets a sentinel, not an exception.
    if not isinstance(text, str):
        return 'not a string'

    # Remove URLs; 'http\S+' already covers https links.
    text = re.sub(r'http\S+|www\S+', '', text)

    # Remove @user references entirely; for hashtags drop only the '#' and keep the word.
    text = re.sub(r'@\w+|#', '', text)

    # Remove HTML tags (non-greedy so neighbouring tags are matched separately).
    text = re.sub(r'<.*?>', '', text)

    # Turn digits, punctuation and other symbols into spaces; apostrophes are kept
    # so contractions survive as single tokens.
    text = re.sub(r"[^a-zA-Z']", ' ', text)

    text = text.lower()

    # Explicit None check so an empty container means "remove nothing".
    if stop_words is None:
        stop_words = _english_stop_words()

    return ' '.join(word for word in text.split() if word not in stop_words)


In [None]:
# Store a cleaned copy of every tweet next to the raw text.
df['tweet_clean'] = [clean_text(raw_tweet) for raw_tweet in df['tweet']]

In [None]:
# Create one WordNet lemmatizer at notebook level; lemmatize_text reuses this instance.
lemmatizer = WordNetLemmatizer()

def lemmatize_text(text):
    """Tokenize ``text``, lemmatize each token, and return them joined by single spaces.

    Tokenization uses NLTK's ``word_tokenize``; every token is passed to the
    notebook-level ``lemmatizer`` without a part-of-speech argument.
    """
    return ' '.join(lemmatizer.lemmatize(tok) for tok in word_tokenize(text))

# Short sample sentence used to smoke-test lemmatize_text.
example_text = "Fall Out Boy Rules."

# Run the sample through lemmatize_text and print the re-joined tokens.
lemmatized_text = lemmatize_text(example_text)
print(lemmatized_text)

In [None]:
# Distribution of product labels across tweets.
fig, ax = plt.subplots(figsize=(10,10))
# Draw on the axes created above instead of relying on pyplot's implicit current axes.
sns.histplot(df["product"], ax=ax)
# Title and axis labels so the figure can be read on its own.
ax.set(title="Tweets per product label", xlabel="Product", ylabel="Number of tweets")
# The product labels are long; rotate them so they do not overlap.
ax.tick_params(axis="x", labelrotation=75)
plt.show()

In [None]:
def product_brand(product, tweet):
    """Classify a tweet's brand from its product label, falling back to the tweet text.

    Matching is a case-insensitive substring search. The product label is
    checked first (Google keywords before Apple keywords). Only when it names
    neither brand is the tweet text searched, where both brands may be found.

    Parameters
    ----------
    product : str
        Product label for the tweet (e.g. 'iPad', 'Android App', 'undefined').
        Non-string values are treated as carrying no brand information.
    tweet : str
        Raw tweet text. Non-string values are treated as empty.

    Returns
    -------
    str
        One of 'google', 'apple', 'both' or 'undefined'.
    """
    google_terms = ('google', 'android')
    # Full product names rather than the bare 'ip' fragment, which would also
    # match unrelated words such as "trip" or "ship" in free text.
    apple_terms = ('apple', 'ipad', 'iphone')

    def mentions(text, terms):
        return any(term in text for term in terms)

    # Lowercase once so labels like 'iPad' or 'Google' match the lowercase keywords.
    product_lc = product.lower() if isinstance(product, str) else ''
    tweet_lc = tweet.lower() if isinstance(tweet, str) else ''

    # The product label is the most reliable signal when it names a brand.
    if mentions(product_lc, google_terms):
        return 'google'
    if mentions(product_lc, apple_terms):
        return 'apple'

    # No brand in the label: look for brand keywords in the tweet itself.
    is_google = mentions(tweet_lc, google_terms)
    is_apple = mentions(tweet_lc, apple_terms)

    if is_google and is_apple:
        return 'both'
    if is_google:
        return 'google'
    if is_apple:
        return 'apple'
    return 'undefined'

# Compute a brand label for every row: product_brand receives the row's product label
# and raw tweet text. The lowercase 'product' / 'tweet' keys match the column names
# set by the rename cell earlier in the notebook.
df['Brand'] = df.apply(lambda row: product_brand(row['product'], row['tweet']), axis=1)
# Review how many tweets fall into each brand class.
df['Brand'].value_counts()