In [None]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
!pip install flair

Collecting flair
[?25l  Downloading https://files.pythonhosted.org/packages/f0/3a/1b46a0220d6176b22bcb9336619d1731301bc2c75fa926a9ef953e6e4d58/flair-0.8.0.post1-py3-none-any.whl (284kB)
[K     |█▏                              | 10kB 22.6MB/s eta 0:00:01[K     |██▎                             | 20kB 29.4MB/s eta 0:00:01[K     |███▌                            | 30kB 22.2MB/s eta 0:00:01[K     |████▋                           | 40kB 25.7MB/s eta 0:00:01[K     |█████▊                          | 51kB 24.4MB/s eta 0:00:01[K     |███████                         | 61kB 26.9MB/s eta 0:00:01[K     |████████                        | 71kB 17.9MB/s eta 0:00:01[K     |█████████▏                      | 81kB 19.1MB/s eta 0:00:01[K     |██████████▍                     | 92kB 17.8MB/s eta 0:00:01[K     |███████████▌                    | 102kB 17.9MB/s eta 0:00:01[K     |████████████▋                   | 112kB 17.9MB/s eta 0:00:01[K     |█████████████▉                  | 122kB 17

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import tensorflow_hub as hub

from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

from textblob import TextBlob
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from flair.models import TextClassifier
from flair.data import Sentence



In [None]:
DATASET_ENCODING = "ISO-8859-1"

In [None]:
colnames = ['target','ids','date','flag','user','text']
df = pd.read_csv('/content/drive/MyDrive/FA PROJECT 2021/kaggle_tweets.csv', names = colnames, header = None, encoding = DATASET_ENCODING)
df.head()

Unnamed: 0,target,ids,date,flag,user,text
0,0,1467810369,Mon Apr 06 22:19:45 PDT 2009,NO_QUERY,_TheSpecialOne_,"@switchfoot http://twitpic.com/2y1zl - Awww, t..."
1,0,1467810672,Mon Apr 06 22:19:49 PDT 2009,NO_QUERY,scotthamilton,is upset that he can't update his Facebook by ...
2,0,1467810917,Mon Apr 06 22:19:53 PDT 2009,NO_QUERY,mattycus,@Kenichan I dived many times for the ball. Man...
3,0,1467811184,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,ElleCTF,my whole body feels itchy and like its on fire
4,0,1467811193,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,Karoli,"@nationwideclass no, it's not behaving at all...."


In [None]:
#drop irrelevant columns
df = df.drop(columns=['ids','date','flag','user'])
df.head()

Unnamed: 0,target,text
0,0,"@switchfoot http://twitpic.com/2y1zl - Awww, t..."
1,0,is upset that he can't update his Facebook by ...
2,0,@Kenichan I dived many times for the ball. Man...
3,0,my whole body feels itchy and like its on fire
4,0,"@nationwideclass no, it's not behaving at all...."


In [None]:
#check no missing data
df.isnull().values.any()

False

In [None]:
#target: the polarity of the tweet (0 = negative, 2 = neutral, 4 = positive)
# remove neutral rows
df = df[df["target"] != 2]

In [None]:
# change target 4 to 1 --> so that 0 means negative, 1 means positive
df['target'] = df['target'].replace(4, 1)

In [None]:
df

Unnamed: 0,target,text
0,0,"@switchfoot http://twitpic.com/2y1zl - Awww, t..."
1,0,is upset that he can't update his Facebook by ...
2,0,@Kenichan I dived many times for the ball. Man...
3,0,my whole body feels itchy and like its on fire
4,0,"@nationwideclass no, it's not behaving at all...."
...,...,...
1599995,1,Just woke up. Having no school is the best fee...
1599996,1,TheWDB.com - Very cool to hear old Walt interv...
1599997,1,Are you ready for your MoJo Makeover? Ask me f...
1599998,1,Happy 38th Birthday to my boo of alll time!!! ...


In [None]:
!pip install emoji

Collecting emoji
[?25l  Downloading https://files.pythonhosted.org/packages/24/fa/b3368f41b95a286f8d300e323449ab4e86b85334c2e0b477e94422b8ed0f/emoji-1.2.0-py3-none-any.whl (131kB)
[K     |██▌                             | 10kB 20.0MB/s eta 0:00:01[K     |█████                           | 20kB 27.0MB/s eta 0:00:01[K     |███████▌                        | 30kB 22.2MB/s eta 0:00:01[K     |██████████                      | 40kB 20.4MB/s eta 0:00:01[K     |████████████▌                   | 51kB 17.3MB/s eta 0:00:01[K     |███████████████                 | 61kB 19.4MB/s eta 0:00:01[K     |█████████████████▌              | 71kB 16.3MB/s eta 0:00:01[K     |████████████████████            | 81kB 16.5MB/s eta 0:00:01[K     |██████████████████████▌         | 92kB 16.1MB/s eta 0:00:01[K     |█████████████████████████       | 102kB 16.3MB/s eta 0:00:01[K     |███████████████████████████▌    | 112kB 16.3MB/s eta 0:00:01[K     |██████████████████████████████  | 122kB 16.3MB/s

In [None]:
import re
import emoji

def cleaner(tweet):
    tweet = tweet.lower()
    if tweet[:2] == 'rt':
        tweet = re.sub(r'^.*?:', ':', tweet)
    tweet = re.sub("@[A-Za-z0-9]+","",tweet) #Remove @ sign
    tweet = re.sub(r"(?:\@|http?\://|https?\://|www)\S+", "", tweet) #Remove http links
    tweet = " ".join(tweet.split())
    tweet = ''.join(c for c in tweet if c not in emoji.UNICODE_EMOJI) #Remove Emojis
    tweet = tweet.replace("#", "").replace("_", " ")
    tweet = tweet.replace("RT ", "")
    tweet = re.sub(r'[^\w\s]','',tweet) #remove punctuation
    tweet = re.sub(r"(^|\W)\d+", "", tweet)# remove digits
    return tweet
df['processed_tweet'] = df['text'].map(lambda x: cleaner(x))

In [None]:
df.head()

Unnamed: 0,target,text,processed_tweet
0,0,"@switchfoot http://twitpic.com/2y1zl - Awww, t...",a thats a bummer you shoulda got david carr o...
1,0,is upset that he can't update his Facebook by ...,is upset that he cant update his facebook by t...
2,0,@Kenichan I dived many times for the ball. Man...,i dived many times for the ball managed to sav...
3,0,my whole body feels itchy and like its on fire,my whole body feels itchy and like its on fire
4,0,"@nationwideclass no, it's not behaving at all....",no its not behaving at all im mad why am i her...


In [None]:
#split into train and test
train_df,test_df = train_test_split(df,test_size=0.2)

In [None]:
train_df

Unnamed: 0,target,text,processed_tweet
162667,0,I hope I get the job so I can have some steady...,i hope i get the job so i can have some steady...
1473936,1,Listenin 2 beat again !!! x,listenin beat again x
1207393,1,i love midnight talks with debbie she's the b...,i love midnight talks with debbie shes the bes...
852787,1,Im gonna see Raphael Saadiq perform live tonight,im gonna see raphael saadiq perform live tonight
1281482,1,@djsamyoung send me some beats !! Also fotos f...,send me some beats also fotos from b8
...,...,...,...
1270522,1,@wanesaleto thanks how are you today?,thanks how are you today
1465876,1,The sun is shining &amp; so am I.. Bout to sta...,the sun is shining amp so am i bout to start m...
903769,1,"like spirits in the night ooooh night, quï¿½ g...",like spirits in the night ooooh night quï½ gra...
715399,0,"bloody key card system took a shit,people cant...",bloody key card system took a shitpeople cant ...


In [None]:
test_df

Unnamed: 0,target,text,processed_tweet
603899,0,Hmm coffee shop girls appear to have gone off ...,hmm coffee shop girls appear to have gone off me
821818,1,I feel better now,i feel better now
354733,0,watching tennis.....I missed so much,watching tennisi missed so much
901417,1,"If anyone needs help with images, let me know ...",if anyone needs help with images let me know a...
917643,1,LOVE ME LOVE ME SAY THAT YOU LOVE ME...,love me love me say that you love me
...,...,...,...
287578,0,"@MelissaLoschy Awesome! Yea, Its going to be r...",awesome yea its going to be really good i was ...
103981,0,I need to get away from here for awhile. Even ...,i need to get away from here for awhile even i...
285181,0,had to cancel the beach trip today. it's supp...,had to cancel the beach trip today its suppose...
1401609,1,@Rocks4Ever bet u say that 2all the girls Wot...,bet u say thatall the girls wots this gas comi...


## Textblob

In [None]:
def text_sentiment(text):
    testimonial = TextBlob(text)
    return int(testimonial.sentiment.polarity>0.5)

predictions = test_df.text.map(lambda x :  text_sentiment(x))

In [None]:
accuracy_score(predictions,test_df.target)

0.562971875

In [None]:
!pip install nltk



## Vader

In [None]:
import nltk
nltk.downloader.download('vader_lexicon')

analyzer = SentimentIntensityAnalyzer()

def text_sentiment_vader(text):
    vs = analyzer.polarity_scores(text)
    return int(vs.get("compound")>0)
 
predictions = test_df.text.map(lambda x : text_sentiment_vader(x))

[nltk_data] Downloading package vader_lexicon to /root/nltk_data...


In [None]:
accuracy_score(predictions,test_df.target)

0.6492625

## Flair

In [None]:
classifier = TextClassifier.load('en-sentiment')

def text_sentiment_flair(text):
    sentence = Sentence(text)
    classifier.predict(sentence)
    return np.round(sentence.labels[0].score)

predictions = test_df.text.map(lambda x : text_sentiment_flair(x))

2021-03-11 03:22:22,840 https://nlp.informatik.hu-berlin.de/resources/models/sentiment-curated-distilbert/sentiment-en-mix-distillbert_4.pt not found in cache, downloading to /tmp/tmpn26ctjel


100%|██████████| 265512723/265512723 [00:09<00:00, 27335798.48B/s]

2021-03-11 03:22:32,868 copying /tmp/tmpn26ctjel to cache at /root/.flair/models/sentiment-en-mix-distillbert_4.pt





2021-03-11 03:22:33,430 removing temp file /tmp/tmpn26ctjel
2021-03-11 03:22:33,458 loading file /root/.flair/models/sentiment-en-mix-distillbert_4.pt


HBox(children=(FloatProgress(value=0.0, description='Downloading', max=231508.0, style=ProgressStyle(descripti…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=466062.0, style=ProgressStyle(descripti…




In [None]:
accuracy_score(predictions,test_df.target)

0.4992