# OVERVIEW

In [144]:
import numpy as np 
import pandas as pd 
import seaborn as sns 
import matplotlib.pyplot as plt
plt.style.use('ggplot')
from sklearn.metrics import roc_curve, auc
from sklearn.metrics import classification_report, confusion_matrix
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer 
from sklearn.model_selection import train_test_split
# from mlxtend.plotting import plot_confusion_matrix
import matplotlib.cm as cm
from matplotlib import rcParams
from collections import Counter
from nltk.tokenize import RegexpTokenizer
import re
import string
from tensorflow.keras.layers import LSTM, Activation, Dense, Dropout, Input, Embedding
from tensorflow.keras.models import Model
from tensorflow.keras.optimizers import RMSprop
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing import sequence
%matplotlib inline

import warnings
warnings.filterwarnings("ignore")


In [145]:
#load the dataset
df =pd.read_csv(r'C:\Users\PC\Documents\moringa\Phase 4 project\judge-1377884607_tweet_product_company.csv', encoding='latin1')
df.head()

Unnamed: 0,tweet_text,emotion_in_tweet_is_directed_at,is_there_an_emotion_directed_at_a_brand_or_product
0,.@wesley83 I have a 3G iPhone. After 3 hrs twe...,iPhone,Negative emotion
1,@jessedee Know about @fludapp ? Awesome iPad/i...,iPad or iPhone App,Positive emotion
2,@swonderlin Can not wait for #iPad 2 also. The...,iPad,Positive emotion
3,@sxsw I hope this year's festival isn't as cra...,iPad or iPhone App,Negative emotion
4,@sxtxstate great stuff on Fri #SXSW: Marissa M...,Google,Positive emotion


In [146]:
#check the number of rows and columns
df.shape

(9093, 3)

In [147]:
# Length of data
print('length of data is', len(df))

length of data is 9093


In [148]:
#review column names
column_names = df.columns.tolist()
print("Column names:")
for col in column_names:
    print(f"- {col}")

Column names:
- tweet_text
- emotion_in_tweet_is_directed_at
- is_there_an_emotion_directed_at_a_brand_or_product


In [149]:
# Rename the columns
df.rename(columns={
    'tweet_text': 'tweet',
    'emotion_in_tweet_is_directed_at': 'product',
    'is_there_an_emotion_directed_at_a_brand_or_product': 'sentiment'
}, inplace=True)

# Check the updated df
df.head(10)

Unnamed: 0,tweet,product,sentiment
0,.@wesley83 I have a 3G iPhone. After 3 hrs twe...,iPhone,Negative emotion
1,@jessedee Know about @fludapp ? Awesome iPad/i...,iPad or iPhone App,Positive emotion
2,@swonderlin Can not wait for #iPad 2 also. The...,iPad,Positive emotion
3,@sxsw I hope this year's festival isn't as cra...,iPad or iPhone App,Negative emotion
4,@sxtxstate great stuff on Fri #SXSW: Marissa M...,Google,Positive emotion
5,@teachntech00 New iPad Apps For #SpeechTherapy...,,No emotion toward brand or product
6,,,No emotion toward brand or product
7,"#SXSW is just starting, #CTIA is around the co...",Android,Positive emotion
8,Beautifully smart and simple idea RT @madebyma...,iPad or iPhone App,Positive emotion
9,Counting down the days to #sxsw plus strong Ca...,Apple,Positive emotion


In [150]:
# Keep only rows where the sentiment is either 'Positive emotion' or 'Negative emotion'
df_filtered = df[df['sentiment'].isin(['Positive emotion', 'Negative emotion'])]

# Print the filtered DataFrame
print(df_filtered)


                                                  tweet  \
0     .@wesley83 I have a 3G iPhone. After 3 hrs twe...   
1     @jessedee Know about @fludapp ? Awesome iPad/i...   
2     @swonderlin Can not wait for #iPad 2 also. The...   
3     @sxsw I hope this year's festival isn't as cra...   
4     @sxtxstate great stuff on Fri #SXSW: Marissa M...   
...                                                 ...   
9077  @mention your PR guy just convinced me to swit...   
9079  &quot;papyrus...sort of like the ipad&quot; - ...   
9080  Diller says Google TV &quot;might be run over ...   
9085  I've always used Camera+ for my iPhone b/c it ...   
9088                      Ipad everywhere. #SXSW {link}   

                              product         sentiment  
0                              iPhone  Negative emotion  
1                  iPad or iPhone App  Positive emotion  
2                                iPad  Positive emotion  
3                  iPad or iPhone App  Negative emotion  
4

In [151]:
# Check unique values in the 'sentiment' column
unique_sentiments = df_filtered['sentiment'].unique()

# Display the unique sentiments
print(unique_sentiments)

['Negative emotion' 'Positive emotion']


In [152]:
#Get summary statistics
df_filtered.describe()

Unnamed: 0,tweet,product,sentiment
count,3548,3191,3548
unique,3539,9,2
top,RT @mention Marissa Mayer: Google Will Connect...,iPad,Positive emotion
freq,3,918,2978


In [153]:
product_counts = df['product'].value_counts()
print("\nProduct counts:")
print(product_counts)


Product counts:
product
iPad                               946
Apple                              661
iPad or iPhone App                 470
Google                             430
iPhone                             297
Other Google product or service    293
Android App                         81
Android                             78
Other Apple product or service      35
Name: count, dtype: int64


In [154]:
# check missing values
df_filtered.isnull().sum()  

tweet          0
product      357
sentiment      0
dtype: int64

The dataframe has **9093 rows and 3 columns**
 
All the data is of the object type which normally represents text or categorical data

The columns are named:
**tweet**
    *-contains 9092 non-null entries of text data and has 1 missing value*
    *-The top Tweet is a retweeted 5 times*

**product**
    *- Represents the emotion attached to a specific brand or product*
    *- There are 3291 non-null entries and has 5802 missing values*
    *- The most frequent value is “iPad” (appears 946 times)*

**sentiment**
    *-Indicates whether there is an emotion directed at a brand or product*
    *-Four possible values:*
        *“Positive emotion”, “Negative emotion”,“No emotion toward brand or product” (appears 5389 times),“I can’t tell”*
    *- The most common emotion is "No emotion toward brand or product"*

In [155]:
# Fill missing values in the 'product' column with the mode
mode_value = df_filtered['product'].mode()[0]  # Calculate the mode value

# Create a copy of the DataFrame
df_filtered_copy = df_filtered.copy()

# Fill missing values in the 'product' column of the copied DataFrame
df_filtered_copy['product'].fillna(mode_value, inplace=True)

# Check missing values after handling
print(df_filtered_copy.isnull().sum())

tweet        0
product      0
sentiment    0
dtype: int64


In [None]:
import pandas as pd

# Assuming df is your DataFrame with 'tweet' and 'product' columns
# Replace 'tweet' with the name of your tweet column and 'product' with the name of your product column

# List of keywords related to Apple and Google
apple_keywords = ['apple', 'iphone', 'ipad', 'mac', 'ios','itunes']
google_keywords = ['google', 'android', 'pixel', 'chromebook','android','samsung']

# Function to check if a tweet contains keywords related to Apple or Google
def check_product(tweet):
    if isinstance(tweet, str):
        tweet_lower = tweet.lower()
        if any(keyword in tweet_lower for keyword in apple_keywords):
            return 'Apple'
        elif any(keyword in tweet_lower for keyword in google_keywords):
            return 'Google'
    return None

# Apply the function to each row in the DataFrame
# Apply the function to each row in the DataFrame
df_filtered_copy['product'] = df_filtered_copy['product'].apply(check_product)

# Print the DataFrame to see the updated 'product' column
df_filtered_copy.isna().sum()


In [156]:
df_filtered_copy.info()

<class 'pandas.core.frame.DataFrame'>
Index: 3548 entries, 0 to 9088
Data columns (total 3 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   tweet      3548 non-null   object
 1   product    3548 non-null   object
 2   sentiment  3548 non-null   object
dtypes: object(3)
memory usage: 110.9+ KB


In [157]:
df_filtered_copy.tail()

Unnamed: 0,tweet,product,sentiment
9077,@mention your PR guy just convinced me to swit...,iPhone,Positive emotion
9079,&quot;papyrus...sort of like the ipad&quot; - ...,iPad,Positive emotion
9080,Diller says Google TV &quot;might be run over ...,Other Google product or service,Negative emotion
9085,I've always used Camera+ for my iPhone b/c it ...,iPad or iPhone App,Positive emotion
9088,Ipad everywhere. #SXSW {link},iPad,Positive emotion


In [158]:
# Save the cleaned DataFrame to a CSV file
df.to_csv('cleaned_judge-1377884607_tweet_product_company.csv', index=False)

In [159]:
# Define mapping for sentiment
sentiment_mapping = {'Negative emotion': 0, 'Positive emotion': 1}

# Convert sentiment to binary using mapping
df_filtered_copy['sentiment'] = df_filtered_copy['sentiment'].map(sentiment_mapping)

# Display the DataFrame with the updated sentiment column
print(df_filtered_copy)

                                                  tweet  \
0     .@wesley83 I have a 3G iPhone. After 3 hrs twe...   
1     @jessedee Know about @fludapp ? Awesome iPad/i...   
2     @swonderlin Can not wait for #iPad 2 also. The...   
3     @sxsw I hope this year's festival isn't as cra...   
4     @sxtxstate great stuff on Fri #SXSW: Marissa M...   
...                                                 ...   
9077  @mention your PR guy just convinced me to swit...   
9079  &quot;papyrus...sort of like the ipad&quot; - ...   
9080  Diller says Google TV &quot;might be run over ...   
9085  I've always used Camera+ for my iPhone b/c it ...   
9088                      Ipad everywhere. #SXSW {link}   

                              product  sentiment  
0                              iPhone          0  
1                  iPad or iPhone App          1  
2                                iPad          1  
3                  iPad or iPhone App          0  
4                              Googl

# Data preparation

In [160]:
#### Making statement text in lower case
df_filtered_copy['tweet']=df_filtered_copy['tweet'].str.lower()


In [161]:
df_filtered_copy['tweet'].tail()

9077    @mention your pr guy just convinced me to swit...
9079    &quot;papyrus...sort of like the ipad&quot; - ...
9080    diller says google tv &quot;might be run over ...
9085    i've always used camera+ for my iphone b/c it ...
9088                        ipad everywhere. #sxsw {link}
Name: tweet, dtype: object

In [162]:
#### Cleaning and removing Stop words of english
stopwords_list = stopwords.words('english')

In [163]:
# Get the list of stopwords in English
", ".join(stopwords.words('english'))

"i, me, my, myself, we, our, ours, ourselves, you, you're, you've, you'll, you'd, your, yours, yourself, yourselves, he, him, his, himself, she, she's, her, hers, herself, it, it's, its, itself, they, them, their, theirs, themselves, what, which, who, whom, this, that, that'll, these, those, am, is, are, was, were, be, been, being, have, has, had, having, do, does, did, doing, a, an, the, and, but, if, or, because, as, until, while, of, at, by, for, with, about, against, between, into, through, during, before, after, above, below, to, from, up, down, in, out, on, off, over, under, again, further, then, once, here, there, when, where, why, how, all, any, both, each, few, more, most, other, some, such, no, nor, not, only, own, same, so, than, too, very, s, t, can, will, just, don, don't, should, should've, now, d, ll, m, o, re, ve, y, ain, aren, aren't, couldn, couldn't, didn, didn't, doesn, doesn't, hadn, hadn't, hasn, hasn't, haven, haven't, isn, isn't, ma, mightn, mightn't, mustn, mus

In [164]:
#### Cleaning and removing the above stop words list from the tweet text
STOPWORDS = set(stopwords.words('english'))
def cleaning_stopwords(tweet):
    return " ".join([word for word in str(tweet).split() if word not in STOPWORDS])
df_filtered_copy['tweet'] = df_filtered_copy['tweet'].apply(lambda tweet: cleaning_stopwords(tweet))
df_filtered_copy['tweet'].head()

0    .@wesley83 3g iphone. 3 hrs tweeting #rise_aus...
1    @jessedee know @fludapp ? awesome ipad/iphone ...
2           @swonderlin wait #ipad 2 also. sale #sxsw.
3    @sxsw hope year's festival crashy year's iphon...
4    @sxtxstate great stuff fri #sxsw: marissa maye...
Name: tweet, dtype: object

In [165]:
#### Cleaning and removing punctuations 
english_punctuations = string.punctuation
punctuations_list = english_punctuations
def cleaning_punctuations(tweet):
    translator = str.maketrans('', '', punctuations_list)
    return tweet.translate(translator)

In [166]:
df_filtered_copy['tweet']= df_filtered_copy['tweet'].apply(lambda x: cleaning_punctuations(x))
df_filtered_copy['tweet'].tail()

9077    mention pr guy convinced switch back iphone gr...
9079    quotpapyrussort like ipadquot  nice lol sxsw l...
9080    diller says google tv quotmight run playstatio...
9085    ive always used camera iphone bc image stabili...
9088                            ipad everywhere sxsw link
Name: tweet, dtype: object

In [167]:
#### Cleaning and removing repeating characters
def cleaning_repeating_char(tweet):
    return re.sub(r'(.)\1+', r'\1', tweet)

In [168]:
df_filtered_copy['tweet'] = df_filtered_copy['tweet'].apply(lambda x: cleaning_repeating_char(x))
df_filtered_copy['tweet'].tail()

9077    mention pr guy convinced switch back iphone gr...
9079    quotpapyrusort like ipadquot nice lol sxsw lavele
9080    diler says gogle tv quotmight run playstation ...
9085    ive always used camera iphone bc image stabili...
9088                            ipad everywhere sxsw link
Name: tweet, dtype: object

In [169]:
#### Cleaning and removing email
def cleaning_email(df_filtered_copy):
    return re.sub('@[^\s]+', ' ', df_filtered_copy)

In [171]:
# Apply the cleaning function to the 'tweet' column of df_filtered_copy
df_filtered_copy['tweet'] = df_filtered_copy['tweet'].apply(lambda x: cleaning_email(x))

# Display the tail of the 'tweet' column of df_filtered_copy
print(df_filtered_copy['tweet'].tail())

9077    mention pr guy convinced switch back iphone gr...
9079    quotpapyrusort like ipadquot nice lol sxsw lavele
9080    diler says gogle tv quotmight run playstation ...
9085    ive always used camera iphone bc image stabili...
9088                            ipad everywhere sxsw link
Name: tweet, dtype: object


In [172]:
#### Cleaning and removing URL's
def cleaning_URLs(df_filtered_copy):
    return re.sub('((www\.[^\s]+)|(https?://[^\s]+))',' ',df_filtered_copy)

In [173]:
df_filtered_copy['tweet'] = df_filtered_copy['tweet'].apply(lambda x: cleaning_URLs(x))
df_filtered_copy['tweet'].tail()

9077    mention pr guy convinced switch back iphone gr...
9079    quotpapyrusort like ipadquot nice lol sxsw lavele
9080    diler says gogle tv quotmight run playstation ...
9085    ive always used camera iphone bc image stabili...
9088                            ipad everywhere sxsw link
Name: tweet, dtype: object

In [174]:
#### Cleaning and removing Numeric numbers
def cleaning_numbers(df_filtered_copy):
    return re.sub('[0-9]+', '', df_filtered_copy)

In [176]:
df_filtered_copy['tweet'] = df_filtered_copy['tweet'].apply(lambda x: cleaning_numbers(x))
df_filtered_copy['tweet'].tail()

9077    mention pr guy convinced switch back iphone gr...
9079    quotpapyrusort like ipadquot nice lol sxsw lavele
9080    diler says gogle tv quotmight run playstation ...
9085    ive always used camera iphone bc image stabili...
9088                            ipad everywhere sxsw link
Name: tweet, dtype: object

In [177]:
#### Getting tokenization of tweet text
tokenizer = RegexpTokenizer(r'\w+')
df_filtered_copy['tweet'] = df_filtered_copy['tweet'].apply(tokenizer.tokenize)

In [178]:
df_filtered_copy['tweet'].head()

0    [wesley, g, iphone, hrs, tweting, riseaustin, ...
1    [jesede, know, fludap, awesome, ipadiphone, ap...
2           [swonderlin, wait, ipad, also, sale, sxsw]
3    [sxsw, hope, years, festival, crashy, years, i...
4    [sxtxstate, great, stuf, fri, sxsw, marisa, ma...
Name: tweet, dtype: object

In [179]:
#### Applying Stemming
st = nltk.PorterStemmer()
def stemming_on_tweet(df_filtered_copy):
    tweet = [st.stem(word) for word in df_filtered_copy]
    return df_filtered_copy

df_filtered_copy['tweet']= df_filtered_copy['tweet'].apply(lambda x: stemming_on_tweet(x))

In [180]:
df_filtered_copy['tweet'].head()

0    [wesley, g, iphone, hrs, tweting, riseaustin, ...
1    [jesede, know, fludap, awesome, ipadiphone, ap...
2           [swonderlin, wait, ipad, also, sale, sxsw]
3    [sxsw, hope, years, festival, crashy, years, i...
4    [sxtxstate, great, stuf, fri, sxsw, marisa, ma...
Name: tweet, dtype: object

In [183]:
#### Applying Lemmatizer
lm = nltk.WordNetLemmatizer()
def lemmatizer_on_tweet(df_filtered_copy):
    tweet = [lm.lemmatize(word) for word in df_filtered_copy]
    return df_filtered_copy

df_filtered_copy['tweet'] = df_filtered_copy['tweet'].apply(lambda x: lemmatizer_on_tweet(x))

In [184]:
df_filtered_copy['tweet'].head()

0    [wesley, g, iphone, hrs, tweting, riseaustin, ...
1    [jesede, know, fludap, awesome, ipadiphone, ap...
2           [swonderlin, wait, ipad, also, sale, sxsw]
3    [sxsw, hope, years, festival, crashy, years, i...
4    [sxtxstate, great, stuf, fri, sxsw, marisa, ma...
Name: tweet, dtype: object