In [1]:
import nltk
from nltk.corpus import twitter_samples
import matplotlib.pyplot as plt
import random

import re                            
import string
import pandas as pd
import numpy as np

import warnings
# NOTE(review): this silences every warning for the rest of the session,
# including library deprecation notices — consider narrowing the filter.
warnings.filterwarnings("ignore")

from nltk.corpus import stopwords
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.stem import PorterStemmer
from nltk.tokenize import TweetTokenizer

### Dataset

In [2]:
# Fetch the NLTK `twitter_samples` corpus that the next cell reads from.
nltk.download('twitter_samples')

[nltk_data] Downloading package twitter_samples to C:\Users\Melis
[nltk_data]     Nur\AppData\Roaming\nltk_data...
[nltk_data]   Package twitter_samples is already up-to-date!


True

In [3]:
# Load the raw tweet texts of each polarity from the corpus files.
positive_tweets = twitter_samples.strings('positive_tweets.json')
negative_tweets = twitter_samples.strings('negative_tweets.json')

In [4]:
# Peek at the first positive tweet to see what the raw text looks like.
positive_tweets[0]

'#FollowFriday @France_Inte @PKuchly57 @Milipol_Paris for being top engaged members in my community this week :)'

In [5]:
# Peek at the first negative tweet to see what the raw text looks like.
negative_tweets[0]

'hopeless for tmr :('

In [6]:
# Report the size of each corpus and the container / element types.
print("Length of positive tweets: ", len(positive_tweets))
print("Length of negative tweets: ", len(negative_tweets))
# The first type() call inspects the whole collection, not a single tweet,
# so its label says so; the second inspects one element.
print("Type of the tweet collection is ", type(positive_tweets))
print("Type of an entry is ",type(negative_tweets[0]))

Length of positive tweets:  5000
Length of negative tweets:  5000
Type of a tweet is  <class 'list'>
Type of an entry is  <class 'str'>


In [7]:
# Show one random tweet of each polarity.
# random.randint(0, 5000) is inclusive on both ends, so it could return 5000
# and raise IndexError on these 5000-element lists; random.choice always
# picks a valid element regardless of the list length.
# '\033[92m' and '\033[91m' are the ANSI escape codes for green and red text.
print('\033[92m' + random.choice(positive_tweets))

print('\033[91m' + random.choice(negative_tweets))

[92mMe + Bed = Best couple ever :)
[91mSuch a stressful and upsetting day yesterday, the UK government sucks :(


# Preprocess raw text for sentiment analysis

In [8]:
# Pick one fixed positive tweet (index 2277) and display it.
# NOTE(review): `tweet` is not referenced again in the cells visible here —
# presumably a running example; confirm it is still needed.
tweet = positive_tweets[2277]
tweet

'My beautiful sunflowers on a sunny Friday morning off :) #sunflowers #favourites #happy #Friday off… https://t.co/3tfYom0N1i'

In [9]:
# Fetch the NLTK stop-word lists; the English list is read further down
# via stopwords.words('english').
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to C:\Users\Melis
[nltk_data]     Nur\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

## Build a labelled DataFrame, then remove hyperlinks, Twitter marks and styles

In [10]:
# Positive tweets as a two-column frame: the raw text plus the label 1.
positive = pd.DataFrame({'tweet': positive_tweets, 'state': 1})
positive

Unnamed: 0,tweet,state
0,#FollowFriday @France_Inte @PKuchly57 @Milipol...,1
1,@Lamb2ja Hey James! How odd :/ Please call our...,1
2,@DespiteOfficial we had a listen last night :)...,1
3,@97sides CONGRATS :),1
4,yeaaaah yippppy!!! my accnt verified rqst has...,1
...,...,...
4995,"@chriswiggin3 Chris, that's great to hear :) D...",1
4996,@RachelLiskeard Thanks for the shout-out :) It...,1
4997,@side556 Hey! :) Long time no talk...,1
4998,@staybubbly69 as Matt would say. WELCOME TO AD...,1


In [11]:
# Negative tweets as a two-column frame: the raw text plus the label 0.
negative = pd.DataFrame({'tweet': negative_tweets, 'state': 0})
negative

Unnamed: 0,tweet,state
0,hopeless for tmr :(,0
1,Everything in the kids section of IKEA is so c...,0
2,@Hegelbon That heart sliding into the waste ba...,0
3,"“@ketchBurning: I hate Japanese call him ""bani...",0
4,"Dang starting next week I have ""work"" :(",0
...,...,...
4995,I wanna change my avi but uSanele :(,0
4996,MY PUPPY BROKE HER FOOT :(,0
4997,where's all the jaebum baby pictures :((,0
4998,But but Mr Ahmad Maslan cooks too :( https://t...,0


In [12]:
# Stack both polarities into one frame and shuffle the rows.
# A fixed random_state makes the shuffle reproducible, so row positions
# (for example tweet_tokens[10] or tweets.head(10) further down) show the
# same tweets on every Restart & Run All.
tweets = (
    pd.concat([positive, negative], axis=0)
    .sample(frac=1, random_state=42)
    .reset_index(drop=True)  # discard the duplicated 0..4999 source indices
)
tweets

Unnamed: 0,tweet,state
0,Math test result :((,0
1,@iamnonexistent oh dear :( that discount though!,0
2,Google has made Narendra Modi really very sad ...,1
3,@IanHallard Wonderful news! Best of luck with ...,0
4,"""Asshole"" 😂 That's an honest tribute :) https:...",1
...,...,...
9995,@CellarDoorExp Fingers crossed for you :),1
9996,"@storpey don't die, you're actually pretty ent...",1
9997,"@zora_db Just texted you, I am dying of lurgy,...",0
9998,@emily_etc I always walk past this place! May ...,1


In [13]:
def remove_hyperlinks(text):
    """Strip URLs and '#' characters from every string in ``text``.

    Parameters
    ----------
    text : iterable of str
        The tweets to clean.

    Returns
    -------
    list of str
        One cleaned string per input, in the same order. Each
        ``scheme://host/path`` link is deleted; every '#' is deleted while
        the hashtag word itself is kept.
    """
    url_pattern = r'\w+:\/{2}[\d\w-]+(\.[\d\w-]+)*(?:(?:\/[^\s/]*))*'
    without_links = (re.sub(url_pattern, '', entry) for entry in text)
    return [re.sub(r'#', '', entry) for entry in without_links]

In [14]:
# Overwrite the raw text column with the link- and '#'-free version.
tweets['tweet'] = remove_hyperlinks(tweets['tweet'])

## Tokenize the string

In [15]:
# Shared tweet tokenizer used by tokenize(); one option per line.
tokenizer = TweetTokenizer(
    preserve_case=False,
    strip_handles=True,
    reduce_len=True,
)

In [16]:
def tokenize(text):
    """Tokenize every tweet with the module-level ``tokenizer``.

    Parameters
    ----------
    text : iterable of str
        The tweets to split into tokens.

    Returns
    -------
    list
        One entry per input tweet, holding whatever
        ``tokenizer.tokenize`` returns for it, in input order.
    """
    return [tokenizer.tokenize(entry) for entry in text]

In [17]:
# Tokenize all cleaned tweets and show the tokens of one of them.
# NOTE(review): `tweet_tokens` is not consumed by the later cells visible
# here (they work on tweets['tweet'] with str.split) — confirm it is needed.
tweet_tokens = tokenize(tweets['tweet'])
tweet_tokens[10]

['great',
 'to',
 'hear',
 '!',
 'we',
 'are',
 'glad',
 'that',
 'we',
 'could',
 'help',
 ':)',
 'localgaragederby']

In [18]:
# Strip ASCII punctuation from every tweet.
# regex=True is passed explicitly: pandas >= 2.0 treats the pattern as a
# literal string by default, which would silently leave the text unchanged.
# re.escape keeps ']', '\\', '^' and '-' literal inside the character class;
# unescaped, the backslash was consumed as an escape for ']' and was therefore
# never removed from the text.
punctuation_pattern = '[{}]'.format(re.escape(string.punctuation))
tweets['tweet'] = tweets['tweet'].str.replace(punctuation_pattern, '', regex=True)

# Drop English stop words. `stop_words` stays a list for any other cell that
# uses it; the set copy only makes the per-word membership test constant-time.
# NOTE(review): matching is case-sensitive, so capitalised forms of the
# lower-case stop words are kept — confirm that this is intended.
stop_words = stopwords.words('english')
stop_word_set = set(stop_words)
tweets['tweet'] = tweets['tweet'].apply(
    lambda x: ' '.join(word for word in x.split() if word not in stop_word_set)
)

In [19]:
# Inspect the first rows after punctuation and stop-word removal.
tweets.head(10)

Unnamed: 0,tweet,state
0,Math test result,0
1,iamnonexistent oh dear discount though,0
2,Google made Narendra Modi really sad Imran Kha...,1
3,IanHallard Wonderful news Best luck x Shame Il...,0
4,Asshole 😂 Thats honest tribute,1
5,building house,1
6,Woke another migraine Need proper rest moment ...,0
7,ARNSUK thank lovely specialist breathing people,1
8,BBCLookEast Hope find new homes,0
9,ZaloraID hi im interested internship program B...,1


## Stemming

def stemm(text):
    """Verbose, debugging variant of the stemmer.

    Prints every tweet, every word and its stem while processing, then
    returns ONE flat list with the stems of all words of all tweets (the
    per-tweet grouping is lost). A later cell in this notebook defines
    ``stemm`` again; once that cell runs it replaces this definition.

    Parameters
    ----------
    text : iterable of str
        Tweets whose whitespace-separated words are stemmed.

    Returns
    -------
    list of str
        All stemmed words, in encounter order.
    """
    porter = PorterStemmer()
    stemmed_words = []

    for entry in text:
        print("Tweet: ",entry)
        for token in entry.split():
            print("Word: ",token)
            stem = porter.stem(token)
            print("Stemmed word:",stem)
            stemmed_words.append(stem)

    print("List stemmed tweets:",stemmed_words)
    return stemmed_words

In [20]:
def stemm(text):
    """Stem every whitespace-separated word of each tweet.

    Parameters
    ----------
    text : iterable of str
        Tweets to process.

    Returns
    -------
    list of str
        One string per input tweet, in input order, made of the
        Porter-stemmed words joined by single spaces. Empty input yields an
        empty list (previously this raised UnboundLocalError because the
        result was only assigned inside the loop).
    """
    stemmer = PorterStemmer()  # one stemmer instance reused for all words

    # Build each output string exactly once; the earlier version re-joined
    # every already-processed tweet on each iteration (quadratic work).
    return [
        ' '.join(stemmer.stem(word) for word in tweet.split())
        for tweet in text
    ]

In [21]:
# NOTE(review): `messages` is not used by any cell visible here — it looks
# like leftover scratch input for trying out stemm(); confirm before removing.
messages = ["Been since like 7am Im still tired body needs.",
            "virginmedia long till Internet go Was late pay..",
            "I need stop hardheaded."]
# Add a 'stemm' column holding the stemmed form of each cleaned tweet.
tweets['stemm'] = stemm(tweets['tweet'])

In [22]:
# Compare the cleaned text with its stemmed counterpart for the first rows.
tweets.head(10)

Unnamed: 0,tweet,state,stemm
0,Math test result,0,math test result
1,iamnonexistent oh dear discount though,0,iamnonexist oh dear discount though
2,Google made Narendra Modi really sad Imran Kha...,1,googl made narendra modi realli sad imran khan...
3,IanHallard Wonderful news Best luck x Shame Il...,0,ianhallard wonder news best luck x shame ill miss
4,Asshole 😂 Thats honest tribute,1,asshol 😂 that honest tribut
5,building house,1,build hous
6,Woke another migraine Need proper rest moment ...,0,woke anoth migrain need proper rest moment i cant
7,ARNSUK thank lovely specialist breathing people,1,arnsuk thank love specialist breath peopl
8,BBCLookEast Hope find new homes,0,bbclookeast hope find new home
9,ZaloraID hi im interested internship program B...,1,zaloraid hi im interest internship program but...
