<a href="https://colab.research.google.com/github/moeenkhurram/Sentiment-analysis-on-Twitter-data/blob/main/Sentiment_analysis.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import matplotlib.pyplot as plt
from sklearn.utils import shuffle
import numpy as np
import os
import re
import string
import pandas as pd
DATASET_ENCODING = "ISO-8859-1"
from textblob import TextBlob
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import nltk
from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer
from wordcloud import WordCloud 


# Fetch the NLTK resources used by the cells below.
for resource in ('movie_reviews', 'wordnet', 'vader_lexicon', 'stopwords', 'punkt'):
    nltk.download(resource)

# Global Parameters
# English stop words as a set, for fast membership tests during preprocessing.
stop_words = set(stopwords.words('english'))

# Show up to 150 characters per column when displaying frames.
pd.set_option('display.max_colwidth', 150)



In [None]:
# Mount Google Drive at /content/gdrive; the dataset archive is read from there below.
from google.colab import drive
drive.mount('/content/gdrive')

# About Dataset


Dataset: [Sentiment140 dataset with 1.6 million tweets](https://www.kaggle.com/kazanova/sentiment140)

This is the sentiment140 dataset.
It contains 1,600,000 tweets extracted using the Twitter API. The tweets have been annotated (0 = negative, 2 = neutral, 4 = positive) and can be used to detect sentiment.
It contains the following 6 fields:

- target: the polarity of the tweet (0 = negative, 2 = neutral, 4 = positive)
- ids: the id of the tweet (2087)
- date: the date of the tweet (Sat May 16 23:58:44 UTC 2009)
- flag: the query (lyx). If there is no query, then this value is NO_QUERY.
- user: the user that tweeted (robotickilldozr)
- text: the text of the tweet (Lyx is cool)

The official link regarding the dataset with resources about how it was generated is here

The official paper detailing the approach is here

According to the creators of the dataset:

"Our approach was unique because our training data was automatically created, as opposed to having humans manual annotate tweets. In our approach, we assume that any tweet with positive emoticons, like :), were positive, and tweets with negative emoticons, like :(, were negative. We used the Twitter Search API to collect these tweets by using keyword search"

citation: Go, A., Bhayani, R. and Huang, L., 2009. Twitter sentiment classification using distant supervision. CS224N Project Report, Stanford, 1(2009), p.12.

In [None]:
# Column names for the header-less Sentiment140 CSV.
colnames = ['target', 'ids', 'date', 'flag', 'user', 'text']
df = pd.read_csv(
    "/content/gdrive/My Drive/archive.zip",
    encoding=DATASET_ENCODING,
    names=colnames,
    header=None,
)
# Keep a random half of the rows. The fixed random_state makes the draw (and
# therefore every downstream result) reproducible across kernel restarts.
df = df.sample(len(df) // 2, random_state=7).reset_index(drop=True)
df.shape

In [None]:
# Keep only the first column (label) and the last column (tweet text).
# .copy() yields an independent frame so the assignments below do not write
# into a slice of the loaded data (which can raise SettingWithCopyWarning).
df = df.iloc[:, [0, -1]].copy()
df.columns = ['sentiment', 'tweet']
# Binary relabel: 0 (negative) -> 0, 4 (positive) -> 1.
# NOTE(review): any other label value maps to NaN; confirm the file holds only 0 and 4.
df['sentiment'] = df['sentiment'].map({0: 0, 4: 1})
df

In [None]:
def preprocess_tweet_text(tweet):
    """Normalise a raw tweet into a space-joined string of cleaned tokens.

    Steps, in order: lowercase, strip URLs, strip @mentions and '#' characters,
    strip the "RT : " residue that remains once a retweet's @mention has been
    removed, strip punctuation, tokenize, drop English stop words, and
    lemmatize each remaining token as an adjective (pos='a').

    Args:
        tweet: Raw tweet text.

    Returns:
        The cleaned tokens joined by single spaces (empty string if none remain).
    """
    # str.lower() returns a new string, so the result has to be rebound;
    # the stop-word membership test further down is case-sensitive.
    tweet = tweet.lower()
    # Remove urls
    tweet = re.sub(r"http\S+|www\S+|https\S+", '', tweet, flags=re.MULTILINE)
    # Remove user @ references and '#' from tweet
    tweet = re.sub(r'\@\w+|\#', '', tweet)
    # Remove retweets (case-insensitive because the text is lowercased above)
    tweet = re.sub(r'RT : ', '', tweet, flags=re.IGNORECASE)
    # Remove punctuations
    tweet = tweet.translate(str.maketrans('', '', string.punctuation))
    # Remove stopwords
    tweet_tokens = word_tokenize(tweet)
    filtered_words = [w for w in tweet_tokens if w not in stop_words]

    lemmatizer = WordNetLemmatizer()
    lemma_words = [lemmatizer.lemmatize(w, pos='a') for w in filtered_words]

    return " ".join(lemma_words)

In [None]:
# Replace every raw tweet with its cleaned form.
cleaned_tweets = df['tweet'].map(preprocess_tweet_text)
df['tweet'] = cleaned_tweets
df

In [None]:
# Build one word cloud from the text of all cleaned tweets.
words = ' '.join(df['tweet'])
cloud_builder = WordCloud(width=1200, height=800)
wordCloud = cloud_builder.generate(words)

plt.imshow(wordCloud)
plt.show()

# TextBlob

In [None]:
from textblob import TextBlob
from textblob import Blobber
from textblob.sentiments import NaiveBayesAnalyzer

# Quick demonstration of TextBlob's Naive Bayes analyzer on one sentence.
demo_sentence = "The movie was good, I do not want to watch it agian"

blobber = Blobber(analyzer=NaiveBayesAnalyzer())
blob = blobber(demo_sentence)
print(blob.sentiment)

In [None]:
# Independent copy of the preprocessed frame; the TextBlob columns are added to this copy.
df1 = df.copy()

In [None]:
def getTextSubjectivity(txt):
    """Return the TextBlob subjectivity score of ``txt``."""
    sentiment = TextBlob(txt).sentiment
    return sentiment.subjectivity

def getTextPolarity(txt):
    """Return the TextBlob polarity score of ``txt``."""
    sentiment = TextBlob(txt).sentiment
    return sentiment.polarity


In [None]:
# Score every cleaned tweet with TextBlob and store both measures.
tweet_text = df1['tweet']
df1['Subjectivity'] = tweet_text.map(getTextSubjectivity)
df1['Polarity'] = tweet_text.map(getTextPolarity)

In [None]:
# negative, neutral, positive analysis
def Sentiments_Score(tweet):
    """Turn a numeric score into a sentiment label.

    Args:
        tweet: Numeric score (despite the name, the caller passes a polarity value).

    Returns:
        "Negative" when the score is below zero, "Neutral" when it equals
        zero, and "Positive" in every other case.
    """
    if tweet == 0:
        return "Neutral"
    return "Negative" if tweet < 0 else "Positive"

# Label each tweet from its TextBlob polarity.
df1['Predicition_Textblob'] = df1['Polarity'].apply(Sentiments_Score)

In [None]:
# Spot-check nine random rows of the TextBlob results.
df1.sample(9)

In [None]:
# Count tweets per TextBlob label once and reuse the result for both the bar
# positions and the bar heights.
textblob_counts = df1.groupby('Predicition_Textblob').size()
plt.bar(textblob_counts.index.values, textblob_counts.values)
plt.title('TextBlob predictions')
plt.xlabel('Predicted sentiment')
plt.ylabel('Number of tweets')
plt.show()

## Vader Sentiment

In [None]:

from nltk.sentiment.vader import SentimentIntensityAnalyzer
# Shared VADER analyser instance, used by Vader_Sentiment below.
analyser = SentimentIntensityAnalyzer()

In [None]:
# Independent copy of the preprocessed frame; the VADER columns are added to this copy.
df2= df.copy()

In [None]:
# Display the frame before VADER scoring.
df2

In [None]:
# Start with an empty label column; Vader_Sentiment fills it from the compound score.
df2['Predicition_Vader']=''

def Vader_Sentiment(df2):
    """Score each tweet with VADER and label it from the compound score.

    Adds the columns 'neg', 'neu', 'pos' and 'compound' (the four entries read
    from ``analyser.polarity_scores``) and writes 'Positive', 'Neutral' or
    'Negative' into 'Predicition_Vader' for compound > 0, == 0 and < 0.

    Args:
        df2: DataFrame with a 'tweet' column of text. It is modified in place.

    Returns:
        The same DataFrame object, with the columns above added.
    """
    # Run the analyser once per tweet and reuse the result for all four
    # columns instead of re-scoring every tweet for each column.
    scores = [analyser.polarity_scores(text) for text in df2["tweet"]]
    for key in ("neg", "neu", "pos", "compound"):
        df2[key] = [score[key] for score in scores]

    # negative, neutral, positive analysis
    df2.loc[df2.compound > 0, 'Predicition_Vader'] = 'Positive'
    df2.loc[df2.compound == 0, 'Predicition_Vader'] = 'Neutral'
    df2.loc[df2.compound < 0, 'Predicition_Vader'] = 'Negative'

    return df2

In [None]:
# Score and label every tweet with VADER.
df2 = Vader_Sentiment(df2)

In [None]:
# Display the frame with the VADER columns.
df2

In [None]:
# Count tweets per VADER label once and reuse the result for both the bar
# positions and the bar heights.
vader_counts = df2.groupby('Predicition_Vader').size()
plt.bar(vader_counts.index.values, vader_counts.values)
plt.title('VADER predictions')
plt.xlabel('Predicted sentiment')
plt.ylabel('Number of tweets')
plt.show()

In [None]:
# Independent copy of the preprocessed frame for the neural-network model.
df3=df.copy()

In [None]:
# Display the frame used for training.
df3

In [None]:
# Label encoding used in the 'sentiment' column:
# 0: Negative
# 1: Positive

# Show one example tweet (index label 100) together with its label.
print(df3.tweet[100])
print(df3.sentiment[100])


In [None]:
# 80/20 split by position. The rows were already shuffled when the data was
# sampled on load, so a positional split is a random split.
training_size = int(len(df3.tweet) * 0.8)

# The first 80% of rows are for training ...
X_train = df3.tweet[:training_size].values
y_train = df3.sentiment[:training_size].values

# ... and the remaining 20% are held out. The test slices must start at
# training_size; slicing up to it would reuse the training rows.
X_test = df3.tweet[training_size:].values
y_test = df3.sentiment[training_size:].values

# Sanity check: the two parts cover the frame exactly once.
assert len(X_train) + len(X_test) == len(df3)
assert len(y_train) + len(y_test) == len(df3)

In [None]:
# Import from the public tensorflow.keras namespace; tensorflow.python.* is
# TensorFlow's private implementation package and is not a stable import path.
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Fit the vocabulary on the training tweets only; words not seen during
# fitting are mapped to the "<OOV>" token.
tokenizer_obj = Tokenizer(oov_token="<OOV>")
tokenizer_obj.fit_on_texts(X_train)

# +1 because index 0 is not in word_index and is used as the padding value.
vocab_size = len(tokenizer_obj.word_index) + 1
max_length = 100

# Convert tweets to integer sequences and pad/truncate them at the end to max_length.
X_train_tokens = tokenizer_obj.texts_to_sequences(X_train)
X_train_pad = pad_sequences(X_train_tokens, maxlen=max_length, padding='post', truncating='post')

X_test_tokens = tokenizer_obj.texts_to_sequences(X_test)
X_test_pad = pad_sequences(X_test_tokens, maxlen=max_length, padding='post', truncating='post')

In [None]:
import tensorflow as tf

# Embedding -> bidirectional LSTM -> small dense head -> single sigmoid unit.
embedding_dim = 16

model = tf.keras.models.Sequential()
model.add(tf.keras.layers.Embedding(vocab_size, embedding_dim, input_length=max_length))
# return_sequences is left at its default (False) so the LSTM emits one vector
# per tweet. With return_sequences=True the network would output one
# prediction per timestep, which does not match the one-label-per-tweet targets.
model.add(tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(embedding_dim)))
model.add(tf.keras.layers.Dense(12, activation='relu'))
model.add(tf.keras.layers.Dense(1, activation='sigmoid'))

model.compile(loss='binary_crossentropy', optimizer=tf.keras.optimizers.Adam(0.01), metrics=['accuracy'])
# summary() prints the table itself and returns None, so it is not wrapped in print().
model.summary()

In [None]:
# Reset Keras' global state.
# NOTE(review): this runs after `model` was built in the previous cell, and it
# imports the standalone `keras` package while the model is built with
# `tf.keras` — confirm both the placement and the package are intended.
from keras import backend as K 
K.clear_session()

In [None]:

num_epochs = 5

# Stop training as soon as the validation loss fails to improve (patience=0).
early_stopping = tf.keras.callbacks.EarlyStopping(monitor='val_loss', patience=0, mode='auto')

history = model.fit(
    X_train_pad, y_train,
    epochs=num_epochs,
    batch_size=256,
    validation_data=(X_test_pad, y_test),
    # Keras documents `callbacks` as a list of callback instances.
    callbacks=[early_stopping],
)

In [None]:
# Early stopping may end training before all requested epochs have run, so
# the curves can be shorter than num_epochs.
plt.plot(history.history['accuracy'], label='training')
plt.plot(history.history['val_accuracy'], label='validation')

plt.xlabel('Epochs')
plt.ylabel('Accuracy')
# Without a legend the two curves cannot be told apart.
plt.legend()
plt.show()



In [None]:
# Training loss per epoch.
training_loss = history.history['loss']
plt.plot(training_loss)

plt.xlabel('Epochs')
plt.ylabel('loss')

## Testing

In [None]:
# Use the model to predict tweets

Test_Tweet = ['I love this phone']

print(Test_Tweet)

# Clean the text the same way the training tweets were cleaned, so the model
# sees input in the form it was trained on.
Test_Tweet_clean = [preprocess_tweet_text(tweet) for tweet in Test_Tweet]

# Create the sequences (from Test_Tweet; no `fake_reviews` variable exists in
# this notebook) and pad/truncate them exactly like the training data.
Test_Tweet_sequences = tokenizer_obj.texts_to_sequences(Test_Tweet_clean)
Test_Tweet_padded = pad_sequences(Test_Tweet_sequences, maxlen=max_length, padding="post", truncating="post")

prediciton = model.predict(Test_Tweet_padded, batch_size=3)

In [None]:
# The network ends in a single sigmoid unit, so its output is the probability
# of the positive class (label 1), not a vector of per-class scores. np.argmax
# over it would return a position in the array rather than a class, so the
# probability is compared against 0.5 instead. np.mean collapses the
# prediction array to one scalar whatever its shape.
positive_probability = float(np.mean(prediciton))
if positive_probability >= 0.5:
    print("positive")
else:
    print("negative")

In [None]:
# Show the predicted positive-class probability to two decimals. Rounding the
# argmax index would be a no-op on an integer and hide the actual probability.
np.round(np.mean(prediciton), decimals=2)