In [19]:
# variables that contain the user credentials to access the twitter api


api_key = "xxxx"                                 #consumer key
api_key_secret = "xxxx" #consumer key secret
bearer_token = "xxxx"
access_token = "xxxx"
access_token_secret = "xxxx"

In [20]:
import tweepy
from tweepy import API 
from tweepy import Cursor
from tweepy.streaming import StreamListener
from tweepy import OAuthHandler
from tweepy import Stream
import pandas as pd
import numpy as np 
import matplotlib.pyplot as plt
from textblob import TextBlob
import re                           
import tensorflow.compat.v2 as tf
import tensorflow_datasets as tfds

In [33]:
# # # # TWITTER CLIENT # # # #
class TwitterClient():
    def __init__(self, twitter_user=None):
        self.auth = TwitterAuthenticator().authenticate_twitter_app()
        self.twitter_client = API(self.auth)

        self.twitter_user = twitter_user
        
        
    def get_twitter_client_API(self):                   # Adding a new function here to allow us to easily return the twitter client
        return self.twitter_client
    

    def get_user_timeline_tweets(self, num_tweets):
        tweets = []
        for tweet in Cursor(self.twitter_client.user_timeline, id=self.twitter_user).items(num_tweets):
            tweets.append(tweet)
        return tweets

    def get_friend_list(self, num_friends):
        friend_list = []
        for friend in Cursor(self.twitter_client.friends, id=self.twitter_user).items(num_friends):
            friend_list.append(friend)
        return friend_list

    def get_home_timeline_tweets(self, num_tweets):
        home_timeline_tweets = []
        for tweet in Cursor(self.twitter_client.home_timeline, id=self.twitter_user).items(num_tweets):
            home_timeline_tweets.append(tweet)
        return home_timeline_tweets


# # # # TWITTER AUTHENTICATER # # # #
class TwitterAuthenticator():

    def authenticate_twitter_app(self):
        auth = OAuthHandler(api_key, api_key_secret)
        auth.set_access_token(access_token, access_token_secret)
        return auth

# # # # TWITTER STREAMER # # # #
class TwitterStreamer():
    """
    Class for streaming and processing live tweets.
    """
    def __init__(self):
        self.twitter_autenticator = TwitterAuthenticator()    

    def stream_tweets(self, fetched_tweets_filename, hash_tag_list):
        # This handles Twitter authetification and the connection to Twitter Streaming API
        listener = TwitterListener(fetched_tweets_filename)
        auth = self.twitter_autenticator.authenticate_twitter_app() 
        stream = Stream(auth, listener)

        # This line filter Twitter Streams to capture data by the keywords: 
        stream.filter(track=hash_tag_list)


# # # # TWITTER STREAM LISTENER # # # #
class TwitterListener(StreamListener):
    """
    This is a basic listener that just prints received tweets to stdout.
    """
    def __init__(self, fetched_tweets_filename):
        self.fetched_tweets_filename = fetched_tweets_filename

    def on_data(self, data):
        try:
            print(data)
            with open(self.fetched_tweets_filename, 'a') as tf:
                tf.write(data)
            return True
        except BaseException as e:
            print("Error on_data %s" % str(e))
        return True
          
    def on_error(self, status):
        if status == 420:
            # Returning False on_data method in case rate limit occurs.
            return False
        print(status)


        

class TweetAnalyzer():
    """
    Functionatlity for analyzing and categorizing content from tweets
    """
    
    def clean_tweet(self, tweet):
        return ' '.join(re.sub("(@[A-Za-z0-9]+)|([^0-9A-Za-z \t])|(\w+:\/\/\S+)", " ", tweet).split())
        # This function we will use to clean a particular tweet passed to it
        # This just removes special characters and hyperlinks etc.
        
        
    def analyze_sentiment(self, tweet):
        analysis = TextBlob(self.clean_tweet(tweet))
        # This function we will use to run a sentiment analysis on a tweet using text_blob
        
        if analysis.sentiment.polarity > 0:                    # function to tell us the polarity of the tweet is positive or negative
            return 1                                           # Positive Tweet
        
        elif analysis.sentiment.polarity == 0:                 # Neutral Tweet
            return 0
            
        else:                                                  # Negative Tweet
            return -1
    
    
    def tweets_to_dataframe(self, tweets):
        
        
        
        df = pd.DataFrame(data=[tweet.created_at for tweet in tweets], columns=['Date'])  # passing tweet text to a Dataframe
        
        # I should really create to dataframes with a common column for Tweet and User here to avoid creating redundant data
        df['Tweet Text'] = np.array([tweet.text for tweet in tweets])
        df['Tweet Length'] = np.array([len(tweet.text) for tweet in tweets])
        df['Tweet ID'] = np.array([tweet.id for tweet in tweets])
        df['Favorite Count'] = np.array([tweet.favorite_count for tweet in tweets])
        df['Retweet Count'] = np.array([tweet.retweet_count for tweet in tweets])
        df['User Name'] = np.array([tweet.user.name for tweet in tweets])
        df['User ID'] = np.array([tweet.user.id for tweet in tweets])
        df['User Location'] = np.array([tweet.user.location for tweet in tweets])
        df['User Follower Count'] = np.array([tweet.user.followers_count for tweet in tweets])
        
        
        
        return df
        
        


In [35]:
if __name__ == '__main__':
 
    twitter_client = TwitterClient()                                                  # specifying a twitter client object
    tweet_analyzer = TweetAnalyzer()                                                  # specifying a tweet analyzer object
    api = twitter_client.get_twitter_client_API()
    
    
    

In [36]:
# Getting friends of lukeclarke21
# Returns a default of 20 and up to a max of 200 per page

screen_name = 'lukeclarke21'
luke_clarke_friends = set()

for friend in api.friends(screen_name, count=200): 
    luke_clarke_friends.add(friend.screen_name)
    

luke_clarke_friends



{'GiGiHadid',
 'KendallJenner',
 'KimKardashian',
 'KylieJenner',
 'khloekardashian',
 'kourtneykardash'}

In [37]:
# # # I set up a qtrt

tweets = []

for friend in luke_clarke_friends:
        tweets.append(api.user_timeline(screen_name=friend, count=8))
        

In [38]:
len(tweets)

6

In [39]:
# # # My tweets list was returning a list with lists of 2 tweets above so I need to flatten it 

flat_tweet_list = []
for sublist in tweets:
    for item in sublist:
        flat_tweet_list.append(item)
        
len(flat_tweet_list)

48

In [40]:

df = tweet_analyzer.tweets_to_dataframe(flat_tweet_list)                                     # Creating a dataframe from the tweet_analyzer object, that we pass to the tweets_to_dataframe function in the tweet analyzer class
df['Sentiment'] = np.array([tweet_analyzer.analyze_sentiment(tweet) for tweet in df['Tweet Text']])
df.head()

Unnamed: 0,Date,Tweet Text,Tweet Length,Tweet ID,Favorite Count,Retweet Count,User Name,User ID,User Location,User Follower Count,Sentiment
0,2021-02-03 04:52:03,That’s all 💋🧚🏼,14,1356827770978848769,8325,244,Khloé,32959253,,29032145,0
1,2021-02-03 04:51:53,🧚🏼🧚🏼🧚🏼🧚🏼🧚🏼🧚🏼🧚🏼🧚🏼🧚🏼,18,1356827726187876354,6169,280,Khloé,32959253,,29032145,0
2,2021-02-02 06:24:01,I love you guys! I am off to dreamland 🧚🏼✨🌙🧚🏼,45,1356488526502658048,8602,225,Khloé,32959253,,29032145,1
3,2021-02-02 06:22:14,@kyliesbirkins @kourtneykardash She won’t,41,1356488075598127106,206,5,Khloé,32959253,,29032145,0
4,2021-02-02 06:20:33,@kourtneykardash Wait you’re here,33,1356487652317425664,1392,16,Khloé,32959253,,29032145,0


# Now Lets Import Spacy To Analyze Tweet Text

In [41]:
import spacy
import string
from spacy.lang.en import English
from collections import Counter
from spacy.lang.en.stop_words import STOP_WORDS

In [45]:
import spacy

nlp = spacy.load("en_core_web_sm")

In [46]:
""" This could also have been used to create a new DataFrame column with spacy nlp applied to the corresponding Tweet Text in each row"""

#df['NLP Text'] = df['Twitter Text'].apply(lambda x: nlp(x))

' This could also have been used to create a new DataFrame column with spacy nlp applied to the corresponding Tweet Text in each row'

**Here, I am putting every all of the tweet text from every tweet in the dataframe into one string so I can analyze all the tweets together with spacy!**

In [47]:
df['Tweet Text Split'] = df['Tweet Text'].apply(lambda x : str(x).split("."))

# get string of all sentences
super_tweet_string = " ".join([" ".join(row) for row in df["Tweet Text Split"]])

super_tweet_string

"That’s all 💋\U0001f9da🏼 \U0001f9da🏼\U0001f9da🏼\U0001f9da🏼\U0001f9da🏼\U0001f9da🏼\U0001f9da🏼\U0001f9da🏼\U0001f9da🏼\U0001f9da🏼 I love you guys! I am off to dreamland \U0001f9da🏼✨🌙\U0001f9da🏼 @kyliesbirkins @kourtneykardash She won’t @kourtneykardash Wait you’re here @offthetabIe I don’t know  Are you tired or are you not @NarbehKardash You’re welcome @traevonceyah Don’t believe it for a second! Not with that face  💣 two pretty best friends https://t co/e9yXsrfmbQ launching today at 9am!! my new limited edition 8 Piece Mini Set features my best-selling Kylie Skin essentials, av… https://t co/MSbOWrbidI my Valentine’s Day shop is officially open! 💗 https://t co/h7emRJXnP8 everything &amp; more https://t co/j8S3Razck5 RT @chipswithpizza: Yay! My @KylieJenner @kyliecosmetics lip oil has arrived! Smells like coconut and the doe foot applicator is amazing \U0001f929… a love story https://t co/QERk28b5GG that’s my best friend https://t co/1T7Hmy2rj5 happy friday ✨ https://t co/89BTVDUFxc https:

In [48]:
fashion_tweets = nlp(super_tweet_string)

In [49]:
# Lets count the 5 most common words to see if it's a product type or brand name 

# Create our list of punctuation marks
punctuations = string.punctuation

char_counter = Counter()

for token in fashion_tweets:
    if token.pos_ == 'NOUN' and token.text not in STOP_WORDS and token.text not in punctuations and token.text != 'co' and token.text != 'https://t' and token.text != 'amp' and token.text != 'http://t' and token.text != '💕':
        char_counter[token.text] += 1

        
char_counter.most_common(35)

[('🏼', 9),
 ('day', 4),
 ('✨', 3),
 ('friends', 2),
 ('love', 2),
 ('collection', 2),
 ('loungewear', 2),
 ('\U0001f90d', 2),
 ('hair', 2),
 ('month', 2),
 ('items', 2),
 ('guys', 1),
 ('@kyliesbirkins', 1),
 ('second', 1),
 ('face', 1),
 ('today', 1),
 ('edition', 1),
 ('Piece', 1),
 ('Mini', 1),
 ('Set', 1),
 ('essentials', 1),
 ('av', 1),
 ('shop', 1),
 ('lip', 1),
 ('oil', 1),
 ('applicator', 1),
 ('\U0001f929', 1),
 ('story', 1),
 ('friend', 1),
 ('@kkwbeauty', 1),
 ('wait', 1),
 ('Announcement', 1),
 ('KKWBEAUTY', 1),
 ('FmIyl2m8BV', 1),
 ('n9egbstAzw', 1)]

The only relevant words I can see above are **'leather'** and **'jacket'**

In [50]:
# Top 20 most common lemmas

lemma_counter = Counter()

for token in fashion_tweets:
  if token.pos_ != "PROPN" and not token.is_punct and not token.is_digit and not token.is_space and token.text not in STOP_WORDS and token.text not in punctuations and token.text != 'co' and token.text != 'https://t' and token.text != 'amp' and token.text != 'http://t' and token.text != '💕':
    lemma_counter[token.lemma_] += 1

lemma_counter.most_common(10)


[('-PRON-', 18),
 ('🏼', 13),
 ('\U0001f9da', 10),
 ('new', 5),
 ('love', 4),
 ('good', 4),
 ('day', 4),
 ('✨', 3),
 ('friend', 3),
 ('favorite', 3)]

# Now, I want to build a machine learning model to analyze the sentiment of tweets

This is a dataset with 162,980 unique tweets where the sentiment of each tweet is labelled -1 (Negative), 0 (Neutral) and 1 (Positive)

Dataset: https://www.kaggle.com/cosmos98/twitter-and-reddit-sentimental-analysis-dataset?select=Twitter_Data.csv

# Step 1: Generating the Dataset (First Part Of ETL: Extract, Transform & Load)

In [53]:
import pandas as pd

# Reading the dataset with no columns titles and with latin encoding 
df_tweets = pd.read_csv("twitter_sentiment.csv")

df_tweets.head()

Unnamed: 0,clean_text,category
0,when modi promised “minimum government maximum...,-1.0
1,talk all the nonsense and continue all the dra...,0.0
2,what did just say vote for modi welcome bjp t...,1.0
3,asking his supporters prefix chowkidar their n...,1.0
4,answer who among these the most powerful world...,1.0


In [55]:
df_tweets = df_tweets.rename(columns={'clean_text':'tweets', 'category':'sentiment'})
df_tweets

Unnamed: 0,tweets,sentiment
0,when modi promised “minimum government maximum...,-1.0
1,talk all the nonsense and continue all the dra...,0.0
2,what did just say vote for modi welcome bjp t...,1.0
3,asking his supporters prefix chowkidar their n...,1.0
4,answer who among these the most powerful world...,1.0
...,...,...
57848,and clarify you ardent karyakarta for trs when...,0.0
57849,during before announced feat scientists today ...,1.0
57850,why would act helps modi,0.0
57851,are with modi and modi with and you are useles...,-1.0


In [56]:
df_tweets.shape

(57853, 2)

In [57]:
# overall has the most non-null values which equate to the shape of our df_review dataframe

df_tweets.info

<bound method DataFrame.info of                                                   tweets  sentiment
0      when modi promised “minimum government maximum...       -1.0
1      talk all the nonsense and continue all the dra...        0.0
2      what did just say vote for modi  welcome bjp t...        1.0
3      asking his supporters prefix chowkidar their n...        1.0
4      answer who among these the most powerful world...        1.0
...                                                  ...        ...
57848  and clarify you ardent karyakarta for trs when...        0.0
57849  during before announced feat scientists today ...        1.0
57850                          why would act helps modi         0.0
57851  are with modi and modi with and you are useles...       -1.0
57852                                       you and your        NaN

[57853 rows x 2 columns]>

# Step 2: Cleaning The Dataset

In [58]:
df_tweets.isnull().sum()

tweets       1
sentiment    1
dtype: int64

In [59]:
df_tweets = df.dropna()

In [60]:
df_tweets.isnull().sum()

Date                   0
Tweet Text             0
Tweet Length           0
Tweet ID               0
Favorite Count         0
Retweet Count          0
User Name              0
User ID                0
User Location          0
User Follower Count    0
Sentiment              0
Tweet Text Split       0
dtype: int64

# Step 3: Tokenize The Data

For text data and especially for sentiment analysis we want tokenize it, in order to draw relationships and similarities and deal with it further, as we have seen we can so with tokens.

Also, Further Cleaning of what we do not need. E.g. STOP_WORDS, Punctuations etc.

In [61]:
import spacy
import string                                          # function where we can access all the puntuations in the english language
from spacy.lang.en.stop_words import STOP_WORDS
from spacy.lang.en import English


# create a list of punctuations
punctuations = string.punctuation


# Load english tokenizer, tagger, parser, NER, and word vectors
# A function that automatically tokenizes what we pass to it
parser = English()


# creating our tokenizer function

def spacy_tokenizer(sentence):
    my_tokens = parser(sentence)              
    
    # now we have all our tokens, but we need to remove all the unnecessary info e.g. stop words, punctuations
    
    
    # Lemmatizing each token and converting each token into lowercase
    my_tokens = [word.lemma_.lower().strip() if word.lemma_ != "-PRON-" else word.lower_ for word in my_tokens]
    
    # Removing stop words and punctuations
    my_tokens = [word for word in my_tokens if word not in STOP_WORDS and word not in punctuations]
    
    return my_tokens

# Step 4: Encode/Standardize dataset

Once we have all of our tokens identified we need to encode and standized the dataset. In other words we need to find ways to give meaning to the words by identifying words and drawing relationships and similarities from one word to others.

This will then allow us to draw conclusions and make classifications later on in this machine learning project.

In [62]:
from sklearn.base import TransformerMixin

# This function will clean any text I pass to it 

def clean_text(text):
     return text.strip().lower()
    
    
#Custom transformer using Python standard library (you could use spacy as well)
class predictors(TransformerMixin):

    def transform(self, X, **transform_params):
        return [clean_text(text) for text in X]

    def fit(self, X, y=None, **fit_params):
        return self

    def get_params(self, deep=True):
        return {}

# TF-IDF

From the lectures we learned the TF-IDF (Term-Frequency - Inverse Document Frequency is a function that calculates a numerical statistic which is a way of calculating how important a word is to a document, a collection of documents of a corpus of text.

The tf–idf value increases proportionally to the number of times a word appears in the document and is offset by the number of documents in the corpus that contain the word, which helps to adjust for the fact that some words appear more frequently in general.

# TF-IDF Vectorizer

Converts a collection of raw documents to a matrix of TF-IDF features. So, in our case, we will be able to pass "my_tokens" from each review to the TF-IDF vectorizer which will extract features from the text, which we can then user to later teach our model how to classify the text as (1, 2, 3, 4, 5) or (Very Bad, Poor, Good, Very Good, Excellent)

In [63]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Using tf_idf
tfvectorizer = TfidfVectorizer(tokenizer = spacy_tokenizer)




# Train-Test split
In machine learning, we always need to split our datasets into train and test. We will use one for training the model and another one to check how the model performs. Luckily, sklearn comes with an in-built function for this.

The split is done randomly, but we can attribute a seed value to make it stable for developing purposes. The usually split is 20% test and 80% train.

In [64]:
df_tweets

Unnamed: 0,Date,Tweet Text,Tweet Length,Tweet ID,Favorite Count,Retweet Count,User Name,User ID,User Location,User Follower Count,Sentiment,Tweet Text Split
0,2021-02-03 04:52:03,That’s all 💋🧚🏼,14,1356827770978848769,8325,244,Khloé,32959253,,29032145,0,[That’s all 💋🧚🏼]
1,2021-02-03 04:51:53,🧚🏼🧚🏼🧚🏼🧚🏼🧚🏼🧚🏼🧚🏼🧚🏼🧚🏼,18,1356827726187876354,6169,280,Khloé,32959253,,29032145,0,[🧚🏼🧚🏼🧚🏼🧚🏼🧚🏼🧚🏼🧚🏼🧚🏼🧚🏼]
2,2021-02-02 06:24:01,I love you guys! I am off to dreamland 🧚🏼✨🌙🧚🏼,45,1356488526502658048,8602,225,Khloé,32959253,,29032145,1,[I love you guys! I am off to dreamland 🧚🏼✨🌙🧚🏼]
3,2021-02-02 06:22:14,@kyliesbirkins @kourtneykardash She won’t,41,1356488075598127106,206,5,Khloé,32959253,,29032145,0,[@kyliesbirkins @kourtneykardash She won’t]
4,2021-02-02 06:20:33,@kourtneykardash Wait you’re here,33,1356487652317425664,1392,16,Khloé,32959253,,29032145,0,[@kourtneykardash Wait you’re here]
5,2021-02-02 06:19:31,@offthetabIe I don’t know. Are you tired or ar...,55,1356487394950668288,32,0,Khloé,32959253,,29032145,-1,"[@offthetabIe I don’t know, Are you tired or ..."
6,2021-02-02 06:19:08,@NarbehKardash You’re welcome,29,1356487295717703682,62,4,Khloé,32959253,,29032145,1,[@NarbehKardash You’re welcome]
7,2021-02-02 06:16:49,@traevonceyah Don’t believe it for a second! N...,66,1356486714240364545,102,2,Khloé,32959253,,29032145,0,[@traevonceyah Don’t believe it for a second! ...
8,2021-02-03 20:33:56,two pretty best friends https://t.co/e9yXsrfmbQ,47,1357064801587847168,201434,16994,Kylie Jenner,236699098,,37299394,1,"[two pretty best friends https://t, co/e9yXsrf..."
9,2021-02-02 16:23:20,launching today at 9am!! my new limited editio...,140,1356639349970882560,8719,272,Kylie Jenner,236699098,,37299394,1,[launching today at 9am!! my new limited editi...


In [67]:
df_tweets.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 48 entries, 0 to 47
Data columns (total 12 columns):
 #   Column               Non-Null Count  Dtype         
---  ------               --------------  -----         
 0   Date                 48 non-null     datetime64[ns]
 1   Tweet Text           48 non-null     object        
 2   Tweet Length         48 non-null     int64         
 3   Tweet ID             48 non-null     int64         
 4   Favorite Count       48 non-null     int64         
 5   Retweet Count        48 non-null     int64         
 6   User Name            48 non-null     object        
 7   User ID              48 non-null     int64         
 8   User Location        48 non-null     object        
 9   User Follower Count  48 non-null     int64         
 10  Sentiment            48 non-null     int64         
 11  Tweet Text Split     48 non-null     object        
dtypes: datetime64[ns](1), int64(7), object(4)
memory usage: 4.9+ KB


In [70]:
# Running with Optimal DataFrame

from sklearn.model_selection import train_test_split

# Specifying our features & labels

features = df_tweets["Tweet Text"]
labels = df_tweets["Sentiment"]

In [71]:
features

0                                        That’s all 💋🧚🏼
1                                    🧚🏼🧚🏼🧚🏼🧚🏼🧚🏼🧚🏼🧚🏼🧚🏼🧚🏼
2         I love you guys! I am off to dreamland 🧚🏼✨🌙🧚🏼
3             @kyliesbirkins @kourtneykardash She won’t
4                     @kourtneykardash Wait you’re here
5     @offthetabIe I don’t know. Are you tired or ar...
6                         @NarbehKardash You’re welcome
7     @traevonceyah Don’t believe it for a second! N...
8       two pretty best friends https://t.co/e9yXsrfmbQ
9     launching today at 9am!! my new limited editio...
10    my Valentine’s Day shop is officially open! 💗 ...
11        everything &amp; more https://t.co/j8S3Razck5
12    RT @chipswithpizza: Yay! My @KylieJenner @kyli...
13                 a love story https://t.co/QERk28b5GG
14        that’s my best friend https://t.co/1T7Hmy2rj5
15    happy friday ✨ https://t.co/89BTVDUFxc https:/...
16    RT @kkwbeauty: The wait is almost over! Announ...
17    I hope you have a great day ✨ https://t.co

In [72]:
labels

0     0
1     0
2     1
3     0
4     0
5    -1
6     1
7     0
8     1
9     1
10    0
11    1
12    1
13    1
14    1
15    1
16    0
17    1
18    0
19    0
20    0
21    1
22    1
23    1
24    0
25    0
26    1
27    1
28    1
29    1
30    0
31   -1
32    1
33   -1
34    0
35    0
36    1
37    0
38   -1
39   -1
40    1
41    1
42    0
43    0
44    0
45    1
46    1
47    0
Name: Sentiment, dtype: int64

In [73]:
features_train, features_test, labels_train, labels_test = train_test_split(features, labels, test_size = 0.2, random_state=42)



In [None]:
print("Features Train Shape ==>", features_train.shape)
print("Features Test Shape ==>", features_test.shape)
print("Labels Train Shape ==>",  labels_train.shape)
print("Labels Test Shape ==>", labels_test.shape)


# The classifier

With choosing a classifier, we are choosing the strategy for our model to learn. Since we are trying to do a classification (bad, neutral, good) we will need to choose algorithms that are classifiers.

We can use Sklearns built in multi-layer perceptron classifier. In our case as we are trying to assign each review to a certain class i.e. (-1, 0, 1) or (bad, neutral, good)

In [75]:
from sklearn.neural_network import MLPClassifier

# # Multi layer perceptron - Neural Network
classifier_MLP = MLPClassifier(max_iter=50, hidden_layer_sizes = (100, 2), verbose = True)



In [76]:
# SVC classifier
from sklearn.svm import LinearSVC

classifier_SVC = LinearSVC(verbose=True)

In [77]:
# Logistic Regression Classifier
from sklearn.linear_model import LogisticRegression

classifier_LG = LogisticRegression(verbose=True)

# Step 5: Build A Pipeline

Now we need to create an sklearn pipeline that:

- Cleans and preprocess the text using our predictors class from above
- Vectorizes the words with TF-IDF to create word matrixes from our text.
- Load the MLP classifier in order to classifier the sentiment of each review.

Pipeline Function:
    
A pipeline of transforms with a final estimator.

Sequentially apply a list of transforms and a final estimator. Intermediate steps of the pipeline must be ‘transforms’, that is, they must implement fit and transform methods. The final estimator only needs to implement fit.

In [78]:
from sklearn.pipeline import Pipeline

# Create the pipeline to clean, tokenize, vectorize and classify


                                                        # The train data passed through the pipeline is: 
pipe = Pipeline([("cleaner", predictors()),             # Cleaned and tokenized as it is passed through the predictors class
                 ("vectorizer", tfvectorizer),          # Then vectorized as it passes throught the tfvectorizer function we previously specified
                 ("classifier", classifier_SVC)])       # Classified according to the MLP Classifier which we also previously specified


                                                        # Note: The data passed to the Classifier must be previously vectorized
                                                        #  and, The data passed to the vectorizer must be previously cleaned and tokenized
                                                        # Thus, this method of this pipeline is very important








In [79]:
# Fitting our data 
# Watch the loss function decrease through each iteration as the neural network learns how to best adjust the weights and biases
pipe.fit(features_train, labels_train)

[LibLinear]

Pipeline(memory=None,
         steps=[('cleaner', <__main__.predictors object at 0x7f911969a160>),
                ('vectorizer',
                 TfidfVectorizer(analyzer='word', binary=False,
                                 decode_error='strict',
                                 dtype=<class 'numpy.float64'>,
                                 encoding='utf-8', input='content',
                                 lowercase=True, max_df=1.0, max_features=None,
                                 min_df=1, ngram_range=(1, 1), norm='l2',
                                 preprocessor=None, smooth_idf=True,
                                 stop_wor...
                                 sublinear_tf=False,
                                 token_pattern='(?u)\\b\\w\\w+\\b',
                                 tokenizer=<function spacy_tokenizer at 0x7f9165c321e0>,
                                 use_idf=True, vocabulary=None)),
                ('classifier',
                 LinearSVC(C=1.0, class_wei

In [80]:
# Now we pass our test_features to the model in order to make a predictions based on what it has learned in training
# in order to predict the test_labels

sample_prediction = pipe.predict(features_test)

In [None]:
for (sample, pred) in zip(features_test, sample_prediction):
    print(sample, "Prediction ==>", pred)

Now we can evaluate the model using different metrics, so that we can look at the three main performance metrics:

**Accuracy:** Refers to the percentage of the total predictions our model makes that are completely correct.

**Precision:** Describes the ratio of true positives to true positives plus false positives in our predictions.

**Recall:** Describes the ratio of true positives to true positives plus false negatives in our predictions.

In [None]:
from sklearn import metrics

# Model Accuracy

# Remember the model never accuracy_score the test data which we hold back.
# It is here where we can now compare the test data held back against the models predictions to measure its accuracy.
# I can't use recall and precision here as I am not using binary classes

print("Accuracy:",metrics.accuracy_score(labels_test, sample_prediction))

# Confusion Matrix

Based on the F1 - Score it appears that the model is good at predicting reviews that are 5-Stars, but then quite poor at predicting 1 star and 3 star reviews.

Probably, because there are a lot more examples of 5 Stars reviews in our dataset and because it is easier to capture the sentiment for five star reviews based on the words people use is a review when making a 5 star review oe a 1, 2 or 3 star review. 

This is skewing the heatmap for the confusion matrix.

In [None]:
from sklearn import metrics

print("Classification report for classifier %s:\n%s\n"
      % (classifier_MLP, metrics.classification_report(labels_test, sample_prediction)))
print("Confusion matrix:\n%s" % metrics.confusion_matrix(labels_test, sample_prediction))

In [None]:

import seaborn as sb
import matplotlib.pyplot as plt

confusion_matrix = metrics.confusion_matrix(labels_test, sample_prediction                                        )
sb.heatmap(confusion_matrix, cmap=plt.cm.inferno)
plt.show()

# Repeat: Using MLP Classifier

I tried running running the above with the MLP classifier, however it was taking forever. So, now I will run the data through the MLP Classifier, however, I will use only a small portion of the initial data.

In [None]:
df_optimal = df_tweets[0:10000]
df_optimal

In [None]:
# Running with Optimal DataFrame

from sklearn.model_selection import train_test_split

# Specifying our features & labels

features = df_optimal["tweets"]
labels = df_optimal["sentiment"]

In [None]:
features

In [None]:
labels

In [None]:
features_train, features_test, labels_train, labels_test = train_test_split(features, labels, test_size = 0.3, random_state=42)

In [None]:
print("Features Train Shape ==>", features_train.shape)
print("Features Test Shape ==>", features_test.shape)
print("Labels Train Shape ==>",  labels_train.shape)
print("Labels Test Shape ==>", labels_test.shape)



In [None]:
from sklearn.pipeline import Pipeline

# Create the pipeline to clean, tokenize, vectorize and classify


                                                        # The train data passed through the pipeline is: 
pipe_2 = Pipeline([("cleaner", predictors()),             # Cleaned and tokenized as it is passed through the predictors class
                 ("vectorizer", tfvectorizer),          # Then vectorized as it passes throught the tfvectorizer function we previously specified
                 ("classifier", classifier_MLP)])       # Classified according to the MLP Classifier which we also previously specified


                                                        # Note: The data passed to the Classifier must be previously vectorized
                                                        #  and, The data passed to the vectorizer must be previously cleaned and tokenized
                                                        # Thus, this method of this pipeline is very important







In [None]:
# Fitting our data 
# Watch the loss function decrease through each iteration as the neural network learns how to best adjust the weights and biases
pipe_2.fit(features_train, labels_train)

In [None]:
# Now we pass our test_features to the model in order to make a predictions based on what it has learned in training
# in order to predict the test_labels

sample_prediction = pipe_2.predict(features_test)

In [None]:
for (sample, pred) in zip(features_test, sample_prediction):
    print(sample, "Prediction ==>", pred)

In [None]:
from sklearn import metrics

# Model Accuracy

print("Accuracy:",metrics.accuracy_score(labels_test, sample_prediction))



In [None]:
# Confusion Matrix

from sklearn import metrics

print("Classification report for classifier %s:\n%s\n"
      % (classifier_MLP, metrics.classification_report(labels_test, sample_prediction)))
print("Confusion matrix:\n%s" % metrics.confusion_matrix(labels_test, sample_prediction))

In [None]:

import seaborn as sb
import matplotlib.pyplot as plt

confusion_matrix = metrics.confusion_matrix(labels_test, sample_prediction                                        )
sb.heatmap(confusion_matrix, cmap=plt.cm.inferno)
plt.show()

**Not As Good.........**

# At Last

I want to push all the the tweets I previously pulled from twitter into a dataframe through the SVC model ( Better Model ) and predict the sentiment.

I will then compare the sentiment scores that the SVC model has predicted with that of the text_blob sentiment predictor I initially ran on the tweets.

In [None]:
df.head()

In [None]:
tweet_prediction = pipe.predict(df['Tweet Text'])

In [None]:
tweet_prediction = list(tweet_prediction)


In [None]:
df['Tweet Prediction'] = tweet_prediction

In [None]:
df_two_preds = df[['Tweet Text', 'Sentiment', 'Tweet Prediction']]
df_two_preds

In [None]:
c = 0

for i in range(df_two_preds.shape[0]):
    if df_two_preds['Sentiment'][i] == df_two_preds['Tweet Prediction'][i]:
        
        c = c + 1
        
print(f"Both Models print the same value {round(c/df_two_preds.shape[0], 2)*100} percent of the time")
        
    