# Sentiment Analysis Demo
Using unsupervised methods to train a supervised model on a social media dataset

In [1]:
#import libraries
import pandas as pd
import textblob
import nltk
from nltk.sentiment.vader import SentimentIntensityAnalyzer as SIA
import re

In [3]:
#load the sample social media dataset (CSV downloaded from Kaggle)
#the file location lives in one named constant so it is the only line to edit
#when the notebook is run on a different machine
DATA_PATH = r'/Users/amberbenbow/Downloads/sentimentdataset.csv'
sd = pd.read_csv(DATA_PATH)

#bare last expression -> rich display of the loaded frame
sd

Unnamed: 0.2,Unnamed: 0.1,Unnamed: 0,Text,Sentiment,Timestamp,User,Platform,Hashtags,Retweets,Likes,Country,Year,Month,Day,Hour
0,0,0,Enjoying a beautiful day at the park! ...,Positive,2023-01-15 12:30:00,User123,Twitter,#Nature #Park,15.0,30.0,USA,2023,1,15,12
1,1,1,Traffic was terrible this morning. ...,Negative,2023-01-15 08:45:00,CommuterX,Twitter,#Traffic #Morning,5.0,10.0,Canada,2023,1,15,8
2,2,2,Just finished an amazing workout! 💪 ...,Positive,2023-01-15 15:45:00,FitnessFan,Instagram,#Fitness #Workout,20.0,40.0,USA,2023,1,15,15
3,3,3,Excited about the upcoming weekend getaway! ...,Positive,2023-01-15 18:20:00,AdventureX,Facebook,#Travel #Adventure,8.0,15.0,UK,2023,1,15,18
4,4,4,Trying out a new recipe for dinner tonight. ...,Neutral,2023-01-15 19:55:00,ChefCook,Instagram,#Cooking #Food,12.0,25.0,Australia,2023,1,15,19
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
727,728,732,Collaborating on a science project that receiv...,Happy,2017-08-18 18:20:00,ScienceProjectSuccessHighSchool,Facebook,#ScienceFairWinner #HighSchoolScience,20.0,39.0,UK,2017,8,18,18
728,729,733,Attending a surprise birthday party organized ...,Happy,2018-06-22 14:15:00,BirthdayPartyJoyHighSchool,Instagram,#SurpriseCelebration #HighSchoolFriendship,25.0,48.0,USA,2018,6,22,14
729,730,734,Successfully fundraising for a school charity ...,Happy,2019-04-05 17:30:00,CharityFundraisingTriumphHighSchool,Twitter,#CommunityGiving #HighSchoolPhilanthropy,22.0,42.0,Canada,2019,4,5,17
730,731,735,"Participating in a multicultural festival, cel...",Happy,2020-02-29 20:45:00,MulticulturalFestivalJoyHighSchool,Facebook,#CulturalCelebration #HighSchoolUnity,21.0,43.0,UK,2020,2,29,20


In [5]:
#count the missing values in every column
#the index of the result doubles as a listing of all column names
sd.isna().sum()

Unnamed: 0.1    0
Unnamed: 0      0
Text            0
Sentiment       0
Timestamp       0
User            0
Platform        0
Hashtags        0
Retweets        0
Likes           0
Country         0
Year            0
Month           0
Day             0
Hour            0
dtype: int64

In [7]:
#drop unneeded columns (presumably leftover index columns from earlier CSV exports)
#errors='ignore' keeps this cell safe to re-run: sd is overwritten here, so a
#second run would otherwise raise KeyError because the columns are already gone
sd = sd.drop(columns=['Unnamed: 0.1', 'Unnamed: 0'], errors='ignore')
sd.head()

Unnamed: 0,Text,Sentiment,Timestamp,User,Platform,Hashtags,Retweets,Likes,Country,Year,Month,Day,Hour
0,Enjoying a beautiful day at the park! ...,Positive,2023-01-15 12:30:00,User123,Twitter,#Nature #Park,15.0,30.0,USA,2023,1,15,12
1,Traffic was terrible this morning. ...,Negative,2023-01-15 08:45:00,CommuterX,Twitter,#Traffic #Morning,5.0,10.0,Canada,2023,1,15,8
2,Just finished an amazing workout! 💪 ...,Positive,2023-01-15 15:45:00,FitnessFan,Instagram,#Fitness #Workout,20.0,40.0,USA,2023,1,15,15
3,Excited about the upcoming weekend getaway! ...,Positive,2023-01-15 18:20:00,AdventureX,Facebook,#Travel #Adventure,8.0,15.0,UK,2023,1,15,18
4,Trying out a new recipe for dinner tonight. ...,Neutral,2023-01-15 19:55:00,ChefCook,Instagram,#Cooking #Food,12.0,25.0,Australia,2023,1,15,19


## Unsupervised Sentiment Analysis

Subjective and opinionated texts are where people express strong feelings and emotions. 

This makes such text a good candidate for extracting sentiment as a feature.

TextBlob is an open-source library for performing NLP tasks, including sentiment analysis. It also has a sentiment lexicon (in the form of an XML file), which it leverages to give both polarity and subjectivity scores.

- The polarity score is a float within the range [-1.0, 1.0]. 
- The subjectivity is a float within the range [0.0, 1.0] where 0.0 is very objective and 1.0 is very subjective.

This library allows us to look at data which does not have any training labels to determine what might be positive or negative. The sentiment lexicon uses a list of phrases or words that are closely associated with emotions in order to assign a sentiment to the text.

In [9]:
#score two hand-written sentences to show what the lexicon returns
examples = [
    'This is an AMAZING pair of Jeans!',
    'I really hated this UGLY T-shirt!!',
]

#number the examples from 1 so the printed labels read "Example 1", "Example 2"
for number, sentence in enumerate(examples, start=1):
    print(f'Example {number}: ', textblob.TextBlob(sentence).sentiment)

Example 1:  Sentiment(polarity=0.7500000000000001, subjectivity=0.9)
Example 2:  Sentiment(polarity=-0.95, subjectivity=0.85)


In [11]:
def _lexicon_sentiment(post):
    """Return the TextBlob sentiment object (polarity, subjectivity) for one post."""
    return textblob.TextBlob(post).sentiment

#score every post once and keep the sentiment objects for reuse below
sd_snt_obj = sd['Text'].apply(_lexicon_sentiment)

#polarity column taken from the lexicon scores
sd['Polarity'] = [snt.polarity for snt in sd_snt_obj]

#subjectivity column taken from the same lexicon scores
sd['Subjectivity'] = [snt.subjectivity for snt in sd_snt_obj]

In [13]:
#view new columns with additional features
sd.head()

Unnamed: 0,Text,Sentiment,Timestamp,User,Platform,Hashtags,Retweets,Likes,Country,Year,Month,Day,Hour,Polarity,Subjectivity
0,Enjoying a beautiful day at the park! ...,Positive,2023-01-15 12:30:00,User123,Twitter,#Nature #Park,15.0,30.0,USA,2023,1,15,12,0.75,0.8
1,Traffic was terrible this morning. ...,Negative,2023-01-15 08:45:00,CommuterX,Twitter,#Traffic #Morning,5.0,10.0,Canada,2023,1,15,8,-1.0,1.0
2,Just finished an amazing workout! 💪 ...,Positive,2023-01-15 15:45:00,FitnessFan,Instagram,#Fitness #Workout,20.0,40.0,USA,2023,1,15,15,0.75,0.9
3,Excited about the upcoming weekend getaway! ...,Positive,2023-01-15 18:20:00,AdventureX,Facebook,#Travel #Adventure,8.0,15.0,UK,2023,1,15,18,0.46875,0.75
4,Trying out a new recipe for dinner tonight. ...,Neutral,2023-01-15 19:55:00,ChefCook,Instagram,#Cooking #Food,12.0,25.0,Australia,2023,1,15,19,0.136364,0.454545


## Tops and Tails
View most negative and positive posts in the sample dataset

In [16]:
#find the index label of the most positive post (first one if several tie)
pos_indx = sd['Polarity'].idxmax()
print('\nMost positive post is row ', pos_indx,'\n')

#read post
#idxmax returns an index *label*, so look the row up with .loc (label-based)
#instead of .iloc (position-based); the two only coincide while sd keeps its
#default 0..n-1 index and would silently diverge after any filter or sort
post = sd.loc[pos_indx, 'Text']
print('\nPost reads: \n', post,'\n')


Most positive post is row  57 


Post reads: 
  Laughter is the best medicine—enjoying a comedy show.  



In [18]:
#find the index label of the most negative post (first one if several tie)
neg_indx = sd['Polarity'].idxmin()
print('\nMost negative post is row ', neg_indx,'\n')

#read post
#idxmin returns an index *label*, so look the row up with .loc (label-based)
#instead of .iloc (position-based); the two only coincide while sd keeps its
#default 0..n-1 index and would silently diverge after any filter or sort
post = sd.loc[neg_indx, 'Text']
print('\nPost reads: \n', post,'\n')


Most negative post is row  1 


Post reads: 
  Traffic was terrible this morning.                  



## Supervised Learning
Supervised learning is a method that requires labeled training data in order to be effective. Since we used the lexicon to assign polarity to our social media dataset, we will use this polarity as the label to train a supervised model on our social media posts.

In [21]:
#sklearn library imports
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error

#model inputs: the raw post text is the feature,
#the lexicon polarity score is the continuous target
text, label = sd['Text'], sd['Polarity']

In [23]:
#split training and testing sets (70% train / 30% test)
#fix the seed so the split, and therefore the reported error, is the same on
#every Restart & Run All instead of changing with each run
sd_features_train, sd_features_test, sd_class_train, sd_class_test = train_test_split(
    text, label, test_size=0.3, random_state=42
)

In [25]:
#turn the raw text into TF-IDF features the model can ingest
#the vocabulary and weights are learned from the training posts only
vectorizer = TfidfVectorizer()
vectorizer.fit(sd_features_train)

#apply the fitted vectorizer to both splits (the names are reused for the matrices)
sd_features_train = vectorizer.transform(sd_features_train)
sd_features_test = vectorizer.transform(sd_features_test)

In [27]:
#fit a linear regression on the TF-IDF training features and polarity labels
#fit() returns the estimator itself, so construction and training are chained
model = LinearRegression().fit(sd_features_train, sd_class_train)

#predict polarity for the held-out posts
y_pred = model.predict(sd_features_test)

In [None]:
# Score the held-out predictions against the lexicon polarity labels
# NOTE(review): this cell was never executed and the following cell repeats the
# same evaluation with rounding -- it looks redundant; confirm and remove
mse = mean_squared_error(y_true=sd_class_test, y_pred=y_pred)
print("Mean Squared Error:", mse)

In [29]:
#evaluate the model: mean squared error between the lexicon polarity labels
#and the model's predictions on the held-out posts
mse = mean_squared_error(y_true=sd_class_test, y_pred=y_pred)

#the lower the MSE the closer the model is to actual results
#report to three decimal places
print("Mean Squared Error:",round(mse,3))

Mean Squared Error: 0.071
