In [1]:
import nltk                                # Python library for NLP
from nltk.corpus import twitter_samples    # sample Twitter dataset from NLTK

In [4]:
nltk.download('twitter_samples')

[nltk_data] Downloading package twitter_samples to
[nltk_data]     C:\Users\onkar\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\twitter_samples.zip.


True

In [5]:
#  import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\onkar\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\stopwords.zip.


True

In [6]:
all_positive_tweets = twitter_samples.strings('positive_tweets.json')
all_negative_tweets = twitter_samples.strings('negative_tweets.json')

In [7]:
print('Number of positive tweets:', len(all_positive_tweets))
print('Number of negative tweets:', len(all_negative_tweets))

print('\nThe type of all_positive_tweets is:', type(all_positive_tweets))
print('The type of a tweet entry is:', type(all_negative_tweets[0]))

Number of positive tweets: 5000
Number of negative tweets: 5000

The type of all_positive_tweets is: <class 'list'>
The type of a tweet entry is: <class 'str'>


# Splitting Data
#### Train : Test :: 80 : 20

In [8]:
n = int(len(all_positive_tweets)*0.8)
test_pos = all_positive_tweets[n:]
train_pos = all_positive_tweets[:n]
test_neg = all_negative_tweets[n:]
train_neg = all_negative_tweets[:n]

In [9]:
train_x = train_pos + train_neg 
test_x = test_pos + test_neg

In [10]:
import re
import string
import numpy as np

from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.tokenize import TweetTokenizer


def process_statements(statement):
    """Process statement function.
    Input:
        statement: a string containing a statement
    Output:
        statements_clean: a list of words containing the processed statement

    """
    stemmer = PorterStemmer()
    stopwords_english = stopwords.words('english')
    
    # remove stock market tickers like $GE
    statement = re.sub(r'\$\w*', '', statement)
    
    # remove old style retweet text "RT"
    statement = re.sub(r'^RT[\s]+', '', statement)
    
    # remove hyperlinks
    statement = re.sub(r'https?:\/\/.*[\r\n]*', '', statement)
    
    # remove hashtags
    # only removing the hash # sign from the word
    statement = re.sub(r'#', '', statement)
    
    # tokenize statements
    tokenizer = TweetTokenizer(preserve_case=False, strip_handles=True,
                               reduce_len=True)
    statement_tokens = tokenizer.tokenize(statement)

    statements_clean = []
    for word in statement_tokens:
        if (word not in stopwords_english and  # remove stopwords
                word not in string.punctuation):  # remove punctuation
            # statements_clean.append(word)
            stem_word = stemmer.stem(word)  # stemming word
            statements_clean.append(stem_word)

    return statements_clean

In [11]:
def get_freqs(data):
    """Process statement function.
    Input:
        data: an array of strings containing a statement
    Output:
        freq_dict: a dictionary of words representing their frequencies

    """
    freq_dict = {}
    for i in data:
        for word in process_statements(i):
            if word in freq_dict:
                freq_dict[word]+=1
            else:
                freq_dict[word]=1
    return freq_dict

In [12]:
dict_of_positives = get_freqs(train_pos)
dict_of_negatives = get_freqs(train_neg)

In [13]:
pos = dict_of_positives.keys()
neg = dict_of_negatives.keys()

res = pos | neg
len(res)

9085

In [14]:
resultant_dict = {}
for i in res:
    pos_value = dict_of_positives.get(i,0)
    neg_value = dict_of_negatives.get(i,0)
    x = pos_value / (pos_value+neg_value)
    
    resultant_dict[i] = x

In [50]:
def check(s):
    cleaned = process_statements(s)
    
    arr = []
    for i in cleaned:
        x = resultant_dict.get(i, 0.5)
        arr.append(x)
        
    print(cleaned)
    print(np.mean(arr))
    
    return np.mean(arr)

In [53]:
s = 'Wonderful, loved it! :)'
check(s)

['wonder', 'love', ':)']
0.8220976910632083


0.8220976910632083

In [17]:
import pickle

In [18]:
def save_data(file, data):
    with open(file, 'wb') as f:
        pickle.dump(data, f)

In [19]:
def load_data(file):
    with open(file, 'rb') as f:
        return pickle.load(f)

In [22]:
file = 'model_1.pkl'
# save_data(file, resultant_dict)

In [23]:
# x = load_data(file)

In [24]:
# x

{'surrey': 1.0,
 'barefoot': 0.0,
 'adio': 0.0,
 'orphan': 0.0,
 'shaaa': 0.0,
 'tommorow': 1.0,
 'caro': 1.0,
 'monkey': 0.0,
 'gent': 1.0,
 'pausetim': 1.0,
 'outi': 1.0,
 'img': 1.0,
 'jenna': 1.0,
 'retard': 1.0,
 'fresherstofin': 0.0,
 'shek': 0.0,
 'strang': 1.0,
 'kingdom': 0.5,
 'peek': 1.0,
 'katamari': 1.0,
 'coconut': 1.0,
 'ohgod': 0.0,
 'ffvi': 0.0,
 '5pm': 1.0,
 'cyber': 0.5,
 'thanq': 1.0,
 'birthdaymonth': 1.0,
 '☆': 1.0,
 'meant': 0.3333333333333333,
 'kayla': 0.0,
 'an': 0.3333333333333333,
 'lrka': 0.0,
 'controversi': 0.0,
 'away': 0.3333333333333333,
 'najam': 1.0,
 'conflict': 1.0,
 'everyth': 0.4166666666666667,
 'x11': 1.0,
 'straight': 0.0,
 'aaahh': 0.0,
 'rememb': 0.45,
 'usernam': 1.0,
 'smoak': 1.0,
 'book': 0.4666666666666667,
 'skrillah': 0.0,
 'brambhatt': 1.0,
 'mental': 0.2,
 'angel': 0.6666666666666666,
 'ion': 0.0,
 '😬': 0.0,
 'cass': 1.0,
 'freez': 0.0,
 '):': 0.5,
 'lowest': 0.0,
 'caribbean': 0.0,
 'unam': 1.0,
 '__': 0.3333333333333333,
 'l': 0.5

In [54]:
pos_values = []
for i in test_pos:
    val = check(i)
    pos_values.append(val)

['bro', 'u', 'wan', 'cut', 'hair', 'anot', 'ur', 'hair', 'long', 'liao', 'bo', 'sinc', 'ord', 'liao', 'take', 'easi', 'lor', 'treat', 'save', 'leav', 'longer', ':)', 'bro', 'lol', 'sibei', 'xialan']
0.491346383781463
['back', 'thnx', 'god', "i'm", 'happi', ':)']
0.6153863338241786
['thought', 'ear', 'malfunct', 'thank', 'good', 'clear', 'one', 'apolog', ':-)']
0.6680242280845263
['stuck', 'centr', 'right', 'clown', 'right', 'joker', 'left', '...', ':)']
0.42509515743363446
['happi', 'friday', ':-)']
0.9364804469273743
['follow', ':)', 'x']
0.717712043431693
['teenchoic', 'choiceinternationalartist', 'superjunior', 'fight', 'oppa', ':D']
0.673611111111111
['birthday', 'today', 'birthday', 'wish', 'hope', "there'", 'good', 'news', 'ben', 'soon', ':-)']
0.5791146550619642
['good', 'morn', ':-)', 'friday', '\U000fec00', 'plan', 'day', 'current', 'play', 'shop', '...']
0.6519556093022684
['happi', 'friday', ':)']
0.9362464466933741
['3', 'good', 'nigth', ':)', 'estoy', 'escuchando', 'enemi'

['love', ':-)']
0.8620689655172413
['want', 'look', 'good', ':)']
0.6369547671546105
['sure', ':)']
0.7932997932997934
['excit', 'weekend', ':-)']
0.8729674796747968
['b3dk', 'far', '7an', 'ank', 'mi', '15', ':)']
0.5665663808520952
['god', 'give', 'us', 'strength', 'purpos', 'help', 'other', 'struggl', ':-)']
0.5617744999984884
['lol', 'meant', 'lucki', 'eagl', 'thank', 'servic', 'oceana', ':-)']
0.6440198359027836
['woke', 'feel', 'incred', 'sick', 'idk', 'caus', 'drank', 'starbuck', '11', "o'clock", 'last', 'night', 'reaction', 'med', ':)']
0.4722531154848228
['oop', '...', "that'", 'call', 'fridayfauxpa', ':)', "i'll", 'get', 'chang', 'right', 'thank', 'g']
0.5847945282492789
['fulfil', 'fantasi', ':)', '👉']
0.9581578331578332
['call', 'night', 'go', 'sleep', ':)']
0.5808356366104125
['yeah', "they'r", 'give', 'subtl', 'hint', 'also', 'tip', '2', 'use', 'commun', 'last', 'year', ':-)', 'prim', '’', 'algorithm', 'iii']
0.5775139167722287
['funni', 'ye', 'soon', 'alway', 'go', 'soon'

['smile', 'sunnah', ':)', 'غردلي', 'عن']
0.9677167427167428
['hi', 'see', 'u', 'like', 'fourfivesecond', 'think', 'u', 'might', 'like', 'deaf', 'ear']
0.6550670590009693
['share', 'readi', ':)']
0.8485700375101297
['oh', 'white', 'rabbit', 'cutest', 'thing', "i'v", 'ever', 'seen', 'wait', 'ador', ':)']
0.5218412011643646
['welcom', ':)', 'love', 'weekend', 'izzi']
0.7765185213290016
['happi', 'birthday', 'mitch', ':-)', 'wish', 'best', 'good', 'day']
0.6943704473005217
['thank', ':)']
0.9210536819232471
['done', 'ya', 'minn', ':)', 'cjradacomateada']
0.7644573010090252
['welcom', 'fun', ':)']
0.8689533096312757
['recently.websit', 'upgrad', 'might', 'see', 'new', 'pictur', 'coolingtow', 'soon.thank', 'showinginterest', ':)']
0.599749657491593
['flipkartfashionfriday', 'multicolor', 'maxi', 'dress', 'pair', 'wid', 'wedg', 'make', 'ur', 'life', 'bright', 'color', ':)']
0.6929184423560331
['alway', 'motiv', ':-)']
0.5
['nnnnot', 'see', 'upsid', ':-)']
0.775
['haha', 'kyle', "gf'", 'babi',

['inde', ':-)']
1.0
[':)', 'good', 'afternoon', ':D', 'twitterfollowerswhatsup', 'happyfriedday', 'keepsafealway', 'loveyeah', 'emojasp_her']
0.6637951187829533
['okay', ':)']
0.757982332982333
['make', 'two', 'us', ':D']
0.6758868794017912
['hurt', 'read', "people'", 'holiday', 'work', ':D']
0.5426316353947933
['nice', 'hear', 'want', 'switch', 'us', ':)', 'pro', 'kit', 'would', 'best', 'choic', 'follow', 'link']
0.6010995709356447
['beauti', '...', 'winter', 'like', 'summer', 'russia', ':)']
0.571461054331272
['thought', '’', 'like', 'mate', ':)']
0.6216779338730558
["i'm", 'play', 'brain', 'dot', 'braindot']
0.6930006049606776
['stat', 'week', 'arriv', '1', 'new', 'follow', 'unfollow', ':)', 'via']
0.8263445545056768
['dm', ':p']
0.8148148148148149
['also', 'come', 'game', ':)']
0.6418383887133887
['thank', ':)']
0.9210536819232471
[':)', 'hope', 'enjoy']
0.81461485597515
['ok', 'first', 'time', 'chat', 'made', 'joke', 'lol', 'believ', 'wont', 'forget', 'u', ':)', 'name']
0.55026780

['good', 'luck', '...', 'anoth', 'potenti', 'favourit', 'water', 'hole', ':-)']
0.6625210422999557
['bad', 'boy', ':)', 'burger', 'melbourneburg']
0.5540795487087047
['want', 'ft', 'arianna', ':-)']
0.6929133858267716
['everyon', 'follow', ':)']
0.7597288501543821
['hey', 'lesley', 'sorri', 'get', 'shall', 'send', 'post', 'tonight', ':)']
0.6594331570418528
['thank', 'esai', ':-)']
0.7809364548494983
['thank', 'text', 'back', ':)', "i'm", 'text', 'tomorrow', ':)']
0.6400406445767165
['unfollow', 'back', ':)']
0.7661276721513092
['hope', 'enjoy', 'stay', 'rotterdam', 'know', 'cheer', 'jordi', 'clasi', ':)']
0.6448209529343348
['hi', 'hot', 'girl', 'say', 'hot', 'horni', 'darl', 'xx', ':)', '♥']
0.595064602866327
['sure', 'done', ':)']
0.7587515863377933
['salon', 'bleach', 'hair', 'olaplex', 'damag', 'like', ':)']
0.4729172704479477
['teamwork', 'right', ':D', 'zitecofficestori']
0.620253164556962
['ff', 'happyfriday', 'great', 'friday', ':-)']
0.9436839536302625
['sure', ':)']
0.793299

['rt', 'bailona', 'group', 'chat', 'mention', 'approv', 'fanbas', ':)']
0.7089219901719902
['parti', 'cancel', ':p', 'bajrangibhaijaanhighestweek', '1']
0.6361092604930046
['thank', 'follow', ':-)', 'hope', 'great', 'week']
0.7444507743204173
['puff', 'pastri', 'egg', 'tart', 'hot', 'fresh', 'oven', ':)', 'wan', 'chai', 'mtr']
0.5853593792465266
['thank', 'follow', 'us', ':)', 'like', 'cool', 'new', 'product', 'check', 'campaign']
0.7237341325203628
['make', 'alyssa', 'rub', 'tummi', ':)']
0.3866520526897886
['almost', 'done', 'master', 'sword', ':D', 'princess', 'zelda']
0.7461412151067324
['ive', 'email', 'regard', 'cours', 'queri', ':)']
0.6329848517348518
['happi', 'birthday', '🎂', 'jiva', 'ever', ':)', '🍹', '🍸', '🍻']
0.5442970515474767
['awesom', 'news', 'mate', 'well', 'happi', ':)']
0.755421055523036
['jumma', 'mubbarak', ':)', '❤', '❤']
0.7331929331929332
['gorgeou', 'deborah', 'good', 'tast', ':)', 'use', 'coupon', 'code', 'colourdeb', 'red', 'purpl', 'blue']
0.638322529563405

In [55]:
neg_values = []
for i in test_neg:
    val = check(i)
    neg_values.append(val)

['help', '...', 'stop', 'cri', ':(']
0.2688610771158464
['otl', 'nevermind', ':(', 'least', 'got', 'jeon']
0.31775775772058784
['soon', 'tweet', 'plant', 'claw', 'thigh', 'traction', 'zoom', 'away', ':(']
0.4497169347715199
['damnit', ':(']
0.25013646288209607
['use', 'pri', 'pv', '...', 'wish', 'could', 'reliv', 'day', 'becom', 'nyc', 'pv', 'buy', 'way', 'commun', 'nyc', 'usa', 'klm', ':(']
0.5874243583897542
['realli', 'hot', ':-(']
0.36006257226416377
['monday', ':(']
0.25013646288209607
['go', 'stop', 'breakfast', 'earli', 'might', 'want', 'remov', '11am', 'websit', 'even', "mcd'", 'pull', 'trick', ':(']
0.46944233888795195
['mean', 'way', ':(', '3rd', 'load', 'hung']
0.29923008336368523
[':(', 'wtf', 'suppos', 'without']
0.26611597679913823
['headach', 'strike', ':(']
0.1667576419213974
['english', 'weather', 'need', 'fix', ':(']
0.28473122607947937
['live', 'fam', 'bam', 'cough', ':(']
0.5478323629306162
['absolut', 'gut', 'jame', 'bay', 'ticket', 'sold', 'manchest', ':(']
0.2991

['food', 'kitchen', 'money', 'wallet', 'thank', 'barcelona', 'dad', 'home', 'month', 'beet', 'juic', 'either', ':(']
0.41100211967501493
['dci', 'today', 'wish', 'go', ':-(']
0.3416282722426162
['work', 'freez', ':(']
0.1544769401670114
['srsli', 'u', ':(']
0.2136023960832247
['hate', 'see', 'granddad', 'like', ':(']
0.35474293203630725
['sorri', 'pre', ':(']
0.2000909752547307
['want', 'minion', 'bucket', 'pleas', ':(', 'yesterday', ':(']
0.26108138872909004
['fair', 'love', 'war', 'kapan', 'updat', ':(', 'oh', 'ya', 'udah', 'dihapu', 'hilang', 'dari', 'muka', 'bumi', 'want', 'read', 'someon', 'give', 'link', '😢']
0.43188869120536666
['girl', 'quick', ':(']
0.42231319747695295
['mani', 'nasti', 'narrow', 'mind', 'peopl', ':(']
0.5379471284701827
['want', 'room', ':-(']
0.19311528366646477
['friend', 'respect', 'life', ':-(', "i'm", 'sorri']
0.31723927413582587
['sadli', ':(']
0.04559100833664152
['suck', "i'm", 'gona', 'miss', 'chello', 'parti', ':(']
0.30261052560918883
['hello', 'in

['want', 'watch', ':(']
0.2104923436230105
['last', 'day', 'indiana', ':(']
0.2637171793923216
['fav', 'chees', 'give', 'migrin', ':(']
0.5217937155876211
['still', 'sad', 'fix', 'car', 'window', 'bc', "can't", 'drive', 'anywher', 'without', 'scare', "it'll", 'shatter', ':(']
0.3278339464458133
['morn', 'work', 'unfortun', ':(', 'gdce', 'gamescom']
0.3892805946256303
['hay', 'idk', 'babi', ':(', '😭', '😭', '😭']
0.08072682008271527
['oh', ':(', "do't", 'think', 'parcel', 'come', 'yodel', 'hit', 'follow', 'dm', 'track', 'num', 'chelsea']
0.5182395008408118
['wish', 'could', 'friend', 'everybodi', ':(', 'lmfaooo']
0.23597770948636068
['oooouch', 'poor', 'pinki', 'toe', '👣', 'good', 'thing', 'work', 'podiatrist', 'wait', 'morn', 'go', 'hurri', 'morn', ':(']
0.4614444630868059
['morn', 'miss', ':(']
0.27131064100564645
['miss', 'old', 'hous', 'could', 'hear', 'parent', 'door', 'open', 'act', 'like', "i'm", 'asleep', ':(']
0.36935768273112873
['gusto', 'ko', 'ng', "rodic'", ':(', 'someon', 's

['wonder', 'happen', 'earlier', 'realiz', 'leadership', ':-(', 'wakeupgop']
0.4758658008658009
['ah', 'dude', ':(']
0.27323912340287887
['wanna', 'take', 'adam', 'candi', 'eat', ':(']
0.3047509167109039
['pleas', 'watch', "infinite'", 'mv', ':(']
0.2215003012275593
["can't", 'believ', 'ill', 'go', 'work', 'today', 'wish', ':(']
0.32332332557693655
[':(', 'hate', 'school', 'u', 'dont', 'understand', 'miss', 'lot', 'blame', 'school']
0.317712630662283
['like', "can't", 'actual', 'put', 'pressur', 'ankl', 'hop', 'around', 'hous', 'lost', 'balanc', 'fell', ':(']
0.24695219121207565
['bull', 'shark', ':(', 'late', 'trafficcc']
0.2667212518195051
['miss', 'oscar', ':(']
0.35555972525473073
['need', 'translat', ':(']
0.25084172600548144
['last', 'time', 'one', 'work', ':(', 'fb', 'gone', 'check', 'long', 'time', 'lol']
0.42561220834831576
['asian', 'ummm', 'mayb', 'ur', 'film', 'child', 'pornographi', 'slutsham', 'fake', 'suicid', 'ect', ':-(']
0.5091435185185185
['poland', 'faraway', 'german

[':-(', 'pleas', 'notic', 'men']
0.39525283797729616
['watch', 'us', ':(']
0.36456481843462685
['regret', 'regret', ':(']
0.3334243085880641
['even', 'though', 'seen', 'whole', 'peep', 'show', 'multipl', 'time', 'noth', 'could', 'prepar', 'netflix', 'remov', 'seri', '1', '7', '...', ':(']
0.41403017522885355
["that'", 'said', '...', ':(']
0.32573599636147554
['way', 'get', 'bandana', ':(']
0.24967804192042373
['hi', 'oh', ':(', "let'", 'take', 'look', 'pleas', 'chat', 'us']
0.5287818963189211
['sad', 'moment', 'u', 'r', 'leav', 'two', 'day', ':(']
0.3646362938318978
[':-(', 'pleas', 'notic', 'mef']
0.27025283797729616
['rebound', 'run', 'away', 'alreadi', 'pooor', ':(']
0.36234707492895263
['lol', '2am', 'last', 'night', 'face', 'though', 'peopl', 'think', "i'm", 'perpetu', 'piss', 'sad', 'b', 'c', 'rest', 'bitchfac', ':(']
0.4546821297016783
[':-(', 'miss', 'fun']
0.2529046474358974
[':-(', 'pleas', 'notic', 'z']
0.14525283797729618
['peopl', 'wish', 'birthday', ':(']
0.37696217083498

['go', 'home', 'blue', ':-(', 'back', 'monday', 'hiby', 'social', 'action', 'plan', 'shareyoursumm']
0.47190865070482035
['pro', 'soccer', 'play', 'would', 'cool', ':(']
0.48840317314124787
['wish', 'b8', 'strong', ':(']
0.353872579267135
['hey', 'girl', 'must.b', 'dairi', 'produxt', 'want', "can't", 'lactos', 'intoler', ':(']
0.38617774932798443
['midland', 'ye', 'thank', 'depress', 'weather', 'forecast', 'word', 'rain', 'mention', 'sever', 'time', ':-(']
0.473227590143351
['cri', ':(']
0.07791424065987385
['old', 'one', 'day', '...', ':(']
0.40045809760400275
['liter', 'spent', 'day', 'yesterday', 'sleep', 'bed', "i'm", 'still', 'absolut', 'fuck', 'knacker', 'today', ':(']
0.3755106048746291
['new', 'sandra', 'bland', 'footag', 'realli', 'ice', 'cake', '..', 'heartbreak', 'drag', 'lifeless', 'bodi', 'polic', 'car', ':(']
0.2742013059035108
['knowww', ':(']
0.00013646288209606986
["we'r", 'sorri', 'feel', 'way', 'shell', ':(', 'regularli', 'review', 'price', 'offer', 'best']
0.3989937

In [43]:
error_in_pos = 0
for i in pos_values:
    if i>0.5:
        error_in_pos+=1

error_in_pos = 100 - error_in_pos/len(pos_values)*100
error_in_pos

9.0

In [56]:
error_in_neg = 0
for i in neg_values:
    if i<=0.5:
        error_in_neg+=1

error_in_neg = 100 - error_in_neg/len(neg_values)*100
error_in_neg

6.799999999999997