In [1]:
# Import required libraries

# Data loading and manipulation
import pandas as pd
import numpy as np

# Text preprocessing and NLP
import nltk
import re 
from nltk.tokenize import RegexpTokenizer
from nltk.stem import WordNetLemmatizer
from nltk import pos_tag
from nltk.corpus import wordnet
import string
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
# Fetch the NLTK resources used for stop-word removal and tokenization;
# quiet=True suppresses the downloader's progress messages.
# NOTE(review): WordNetLemmatizer, wordnet and pos_tag are imported above, but only
# 'stopwords' and 'punkt' are downloaded here — confirm the remaining resources
# are fetched elsewhere before they are used.
for nltk_resource in ('stopwords', 'punkt'):
    nltk.download(nltk_resource, quiet=True)



# Data visualization
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import plotly.figure_factory as ff
import plotly.colors as pc
from IPython.display import Image, display
from PIL import Image, ImageDraw, ImageFont

# Machine learning and preprocessing
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import label_binarize
from sklearn.preprocessing import LabelEncoder
from sklearn.pipeline import Pipeline

# Classification algorithms
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import LinearSVC
from sklearn.naive_bayes import MultinomialNB


# Model evaluation metrics
from sklearn.metrics import classification_report,accuracy_score, f1_score, precision_score, recall_score, confusion_matrix
from sklearn.metrics import roc_curve, auc, roc_auc_score



# Suppress warnings
import warnings
warnings.filterwarnings('ignore')

# Visualization settings
# sns.set is an alias of sns.set_theme, and every call re-applies the style
# argument (default "darkgrid"). Setting the style and the figure size in two
# separate calls therefore discarded "whitegrid"; configure both in one call.
sns.set_theme(style="whitegrid", rc={'figure.figsize': (11.7, 8.27)})

### Data inspection

In [2]:
# Load the labelled tweets. The file contains non-UTF-8 characters (common in
# social-media text), so it is decoded as ISO-8859-1 to avoid a decoding error.
# NOTE(review): the path is absolute and machine-specific — the notebook will
# not run on another machine without editing it.
Movie_Lens = pd.read_csv(
    '/Users/farhiyajarso/Desktop/Sentiment-Analysis-NLP/Data/judge-1377884607_tweet_product_company.csv',
    encoding='ISO-8859-1',
)


In [3]:
# Preview the first five rows of the loaded data
Movie_Lens.head(5)

Unnamed: 0,tweet_text,emotion_in_tweet_is_directed_at,is_there_an_emotion_directed_at_a_brand_or_product
0,.@wesley83 I have a 3G iPhone. After 3 hrs twe...,iPhone,Negative emotion
1,@jessedee Know about @fludapp ? Awesome iPad/i...,iPad or iPhone App,Positive emotion
2,@swonderlin Can not wait for #iPad 2 also. The...,iPad,Positive emotion
3,@sxsw I hope this year's festival isn't as cra...,iPad or iPhone App,Negative emotion
4,@sxtxstate great stuff on Fri #SXSW: Marissa M...,Google,Positive emotion


In [4]:
# Remove fully duplicated rows. The result is rebound explicitly instead of
# mutating with inplace=True, so the step reads as a plain transformation.
Movie_Lens = Movie_Lens.drop_duplicates()

# Sanity check: number of duplicate rows still present (expected to be 0)
Movie_Lens.duplicated().sum()


0

In [5]:
# Drop rows whose `tweet_text` is missing. Only that column is checked, so
# missing values in other columns are kept. Rebinding replaces inplace=True.
Movie_Lens = Movie_Lens.dropna(subset=['tweet_text'])


In [6]:
# Count the missing `tweet_text` values that remain after the drop
Movie_Lens['tweet_text'].isnull().sum()

0

In [7]:
def categorize_emotion(row):
    """Map a row's raw brand-emotion label to a three-class sentiment string.

    Parameters
    ----------
    row : mapping or pandas.Series
        Must provide the key
        'is_there_an_emotion_directed_at_a_brand_or_product'.

    Returns
    -------
    str
        'positive' for 'Positive emotion', 'negative' for 'Negative emotion',
        and 'neutral' for every other value (including
        'No emotion toward brand or product' and any unrecognized label).
    """
    raw_label = row['is_there_an_emotion_directed_at_a_brand_or_product']
    if raw_label == 'Positive emotion':
        return 'positive'
    if raw_label == 'Negative emotion':
        return 'negative'
    # 'No emotion toward brand or product' and anything else fall through here.
    return 'neutral'
    
# Build the `emotion` target column by labelling each row in turn
Movie_Lens['emotion'] = Movie_Lens.apply(categorize_emotion, axis='columns')

In [8]:
# Start the cleaned copy of the text: lowercase it, then trim surrounding whitespace
Movie_Lens['cleaned_text'] = (
    Movie_Lens['tweet_text']
    .str.lower()
    .str.strip()
)
Movie_Lens.head()

Unnamed: 0,tweet_text,emotion_in_tweet_is_directed_at,is_there_an_emotion_directed_at_a_brand_or_product,emotion,cleaned_text
0,.@wesley83 I have a 3G iPhone. After 3 hrs twe...,iPhone,Negative emotion,negative,.@wesley83 i have a 3g iphone. after 3 hrs twe...
1,@jessedee Know about @fludapp ? Awesome iPad/i...,iPad or iPhone App,Positive emotion,positive,@jessedee know about @fludapp ? awesome ipad/i...
2,@swonderlin Can not wait for #iPad 2 also. The...,iPad,Positive emotion,positive,@swonderlin can not wait for #ipad 2 also. the...
3,@sxsw I hope this year's festival isn't as cra...,iPad or iPhone App,Negative emotion,negative,@sxsw i hope this year's festival isn't as cra...
4,@sxtxstate great stuff on Fri #SXSW: Marissa M...,Google,Positive emotion,positive,@sxtxstate great stuff on fri #sxsw: marissa m...


In [9]:
# Delete tokens starting with "http" or "www", @mentions and #hashtags.
# The whole hashtag token is removed, including the word after the '#'.
Movie_Lens['cleaned_text'] = Movie_Lens['cleaned_text'].str.replace(
    r'http\S+|www\S+|@\w+|#\w+',
    '',
    regex=True,
)
Movie_Lens.head()

Unnamed: 0,tweet_text,emotion_in_tweet_is_directed_at,is_there_an_emotion_directed_at_a_brand_or_product,emotion,cleaned_text
0,.@wesley83 I have a 3G iPhone. After 3 hrs twe...,iPhone,Negative emotion,negative,. i have a 3g iphone. after 3 hrs tweeting at ...
1,@jessedee Know about @fludapp ? Awesome iPad/i...,iPad or iPhone App,Positive emotion,positive,know about ? awesome ipad/iphone app that yo...
2,@swonderlin Can not wait for #iPad 2 also. The...,iPad,Positive emotion,positive,can not wait for 2 also. they should sale th...
3,@sxsw I hope this year's festival isn't as cra...,iPad or iPhone App,Negative emotion,negative,i hope this year's festival isn't as crashy a...
4,@sxtxstate great stuff on Fri #SXSW: Marissa M...,Google,Positive emotion,positive,"great stuff on fri : marissa mayer (google), ..."


In [10]:
# Inspect the last five cleaned tweets
Movie_Lens['cleaned_text'].tail(5)

9088                             ipad everywhere.  {link}
9089    wave, buzz... rt  we interrupt your regularly ...
9090    google's zeiger, a physician never reported po...
9091    some verizon iphone customers complained their...
9092    ï¡ïàü_êîò£áââ_£â_ûârt  ...
Name: cleaned_text, dtype: object

EXPLORATORY DATA ANALYSIS
To better understand our text data, we explore the following questions:

1. Product Engagement:
Which products receive the highest number of reviews or customer interactions?

2. Target Class Distribution:
How are sentiment classes distributed across the dataset, and is there any class imbalance?

3. Text Length vs Sentiment:
Does the number of words or the length of a tweet influence whether the sentiment is positive or negative? In other words, are positive or negative tweets more expressive?

4. Sentiment Drivers (Vocabulary):
What distinguishes positive tweets from negative ones in terms of word usage? Which words are most common in each sentiment class?

5. Feature–Target Relationships:
What relationships exist between extracted text features and the target variable, particularly in terms of correlation and linearity?

1. What products have the most reviews / customer interaction?