# Preparing Data

In [1]:
import csv


def get_list(file):
    with open(f'{file}.csv', newline='', encoding='utf-8') as f:
        f = csv.reader(f)
        x = list(f)

    ls = []
    for i in x:
        ls.append(''.join(i))
    return ls

In [2]:
all_positives = get_list('pos')
all_negatives = get_list('neg')
stopwords = get_list('stopwords')

In [3]:
print('Number of positive statements:', len(all_positives))
print('Number of negative statements:', len(all_negatives))

print('\nThe type of all_positives is:', type(all_positives))
print('The type of a tweet entry is:', type(all_negatives[0]))

Number of positive tweets: 6000
Number of negative tweets: 6000

The type of all_positive_tweets is: <class 'list'>
The type of a tweet entry is: <class 'str'>


# Splitting Data
#### Train : Test :: 80 : 20

In [4]:
n = int(len(all_positives)*0.8)
test_pos = all_positives[n:]
train_pos = all_positives[:n]
test_neg = all_negatives[n:]
train_neg = all_negatives[:n]

In [5]:
train_x = train_pos + train_neg 
test_x = test_pos + test_neg

In [6]:
import re
import string
import numpy as np

from nltk.stem import PorterStemmer
from nltk.tokenize import TweetTokenizer


def process_statements(statement):
    """Process statement function.
    Input:
        statement: a string containing a statement
    Output:
        statements_clean: a list of words containing the processed statement

    """
    stemmer = PorterStemmer()
    
    # remove stock market tickers like $GE
    statement = re.sub(r'\$\w*', '', statement)
    
    # remove old style retweet text "RT"
    statement = re.sub(r'^RT[\s]+', '', statement)
    
    # remove hyperlinks
    statement = re.sub(r'https?:\/\/.*[\r\n]*', '', statement)
    
    # remove hashtags
    # only removing the hash # sign from the word
    statement = re.sub(r'#', '', statement)
    
    # tokenize statements
    tokenizer = TweetTokenizer(preserve_case=False, strip_handles=True,
                               reduce_len=True)
    statement_tokens = tokenizer.tokenize(statement)

    statements_clean = []
    for word in statement_tokens:
        if (word not in stopwords and  # remove stopwords
                word not in string.punctuation):  # remove punctuation
            # statements_clean.append(word)
            stem_word = stemmer.stem(word)  # stemming word
            statements_clean.append(stem_word)

    return statements_clean

In [7]:
def get_freqs(data):
    """Process statement function.
    Input:
        data: an array of strings containing a statement
    Output:
        freq_dict: a dictionary of words representing their frequencies

    """
    freq_dict = {}
    for i in data:
        for word in process_statements(i):
            if word in freq_dict:
                freq_dict[word]+=1
            else:
                freq_dict[word]=1
    return freq_dict

In [8]:
dict_of_positives = get_freqs(train_pos)
dict_of_negatives = get_freqs(train_neg)

In [9]:
pos = dict_of_positives.keys()
neg = dict_of_negatives.keys()

res = pos | neg
len(res)

39004

In [10]:
resultant_dict = {}
for i in res:
    pos_value = dict_of_positives.get(i,0)
    neg_value = dict_of_negatives.get(i,1.5)
    x = pos_value / (pos_value+neg_value)
    
    resultant_dict[i] = x

In [11]:
import math


def check(s):
    cleaned = process_statements(s)
    
    arr = []
    for i in cleaned:
        x = resultant_dict.get(i, 0.5)
        
        arr.append(x)
        
    # print(cleaned)
    # print(np.mean(arr))
    x = np.mean(arr)
    
    return x if not math.isnan(x) else 0.55

# Saving the data

In [12]:
import pickle

In [13]:
def save_data(file, data):
    with open(file, 'wb') as f:
        pickle.dump(data, f)

In [14]:
def load_data(file):
    with open(file, 'rb') as f:
        return pickle.load(f)

In [15]:
file = 'model_1.pkl'
save_data(file, resultant_dict)

In [16]:
load_data(file)

{"cisco'": 0.0,
 'sawalha': 0.7272727272727273,
 "human'": 0.5714285714285714,
 'review': 0.4967741935483871,
 'pond': 0.6,
 'braga': 0.5714285714285714,
 'heh': 0.5,
 'scrooo': 0.0,
 'round-the-world': 0.0,
 'neon-lit': 0.5,
 'watcher': 0.14285714285714285,
 'atm': 0.6666666666666666,
 'kashi': 0.4,
 'semler': 0.0,
 'reassur': 0.8,
 'pickl': 0.0,
 "smarter-than-you'd": 0.0,
 'truman': 0.9166666666666666,
 'viber': 0.0,
 'rattner': 0.4,
 'pulitzer-pr': 0.4,
 'awww': 0.35294117647058826,
 'monosyllab': 0.0,
 'malko-visit': 0.4,
 'saccharin': 0.2,
 'mark': 0.5423728813559322,
 'mould': 0.4,
 'scof': 0.5714285714285714,
 'creepier': 0.25,
 'dope-d': 0.4,
 'footprint': 0.1111111111111111,
 '_dead_': 0.0,
 "judah'": 0.0,
 'archetyp': 0.7058823529411765,
 'fly-bi': 0.4,
 'moos': 0.4,
 'itong': 0.0,
 'stupid': 0.16776315789473684,
 'tighter': 0.625,
 'soft-ey': 0.4,
 'brad': 0.6065573770491803,
 'judo': 0.4,
 'absent-minded': 0.0,
 'fcking': 0.0,
 'farrel': 0.14285714285714285,
 '27': 0.5,
 '

# Error Analysis

In [17]:
pos_values = []
for i in test_pos:
    val = check(i)
    pos_values.append(val)

  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)


In [18]:
error_in_pos = 0
for i in pos_values:
    if i<=0.5:
        error_in_pos+=1

error_in_pos = error_in_pos/len(pos_values)*100

In [19]:
neg_values = []
for i in test_neg:
    val = check(i)
    neg_values.append(val)

In [20]:
error_in_neg = 0
for i in neg_values:
    if i>0.5:
        error_in_neg+=1

error_in_neg = error_in_neg/len(neg_values)*100

In [21]:
print(f'Error in positive statements is {error_in_pos}%.')
print(f'Error in positive statements is {error_in_neg}%.')

Error in positive statements is 5.25%.
Error in positive statements is 2.75%.


In [22]:
check('this movie has no value addition to the person')

0.4984993645664779

In [23]:
check('''richard gere can be a commanding actor , but he's not always in great films . 
everything comes together here . 
gere is a big time chicago defense attorney who takes on a seemingly unwinable case in hopes of even more publicity . 
it doesn't go exactly as he expects . 
gere's client , aaron ( edward norton ) , is a shy stuttering tennessee boy who is accused of brutally murdering and mutilating a catholic archbishop . 
the evidence is stacked against him . 
he was caught running from the scene covered in the bishop's blood . 
his bloody footprints are all over the murder scene . 
he has a relationship with the priest . 
gere talks to the boy , believes that he is actually innocent and sets about finding the real killer . 
despite the lawyer's proclamations that he doesn't care about the guilt of his clients and that the real thrill is gambling with people's lives , he becomes involved with aaron and is determined to free him . 
lots of complications and twists . 
the prosecuting attorney is gere's former co-worker and lover . 
they both work each other's motives to their legal advantages and it gets messy . 
her boss had major economic dealings with the archbishop that went sour and seems to have crime connections . 
aaron gets weirder and weirder as the trial goes on . 
gere's case is falling apart and he is faced with about a dozen ethical dilemmas . 
gere is exceptional as the well-dressed reserved counselor , but just once , i wanted to see him kick back and come out of his " suit " persona . 
even when he loses it , you don't see very far inside . 
norton's aaron is convincing : he comes across as the backwoods kid misplaced in the big city . 
the supporting cast does a fine job of holding together the story . 
as with most of the effective courtroom dramas , the cinematography is crisp and rich . 
the story will keep you on the edge of your seat . 
nothing is what it seems . 
''')

0.5524442715028056

In [24]:
check(''' " quest for camelot " is warner bros . ' first feature-length , fully-animated attempt to steal clout from disney's cartoon empire , but the mouse has no reason to be worried . 
the only other recent challenger to their throne was last fall's promising , if flawed , 20th century fox production " anastasia , " but disney's " hercules , " with its lively cast and colorful palate , had her beat hands-down when it came time to crown 1997's best piece of animation . 
this year , it's no contest , as " quest for camelot " is pretty much dead on arrival . 
even the magic kingdom at its most mediocre -- that'd be " pocahontas " for those of you keeping score -- isn't nearly as dull as this . 
the story revolves around the adventures of free-spirited kayley ( voiced by jessalyn gilsig ) , the early-teen daughter of a belated knight from king arthur's round table . 
kayley's only dream is to follow in her father's footsteps , and she gets her chance when evil warlord ruber ( gary oldman ) , an ex-round table member-gone-bad , steals arthur's magical sword excalibur and accidentally loses it in a dangerous , booby-trapped forest . 
with the help of hunky , blind timberland-dweller garrett ( carey elwes ) and a two-headed dragon ( eric idle and don rickles ) that's always arguing with itself , kayley just might be able to break the medieval sexist mold and prove her worth as a fighter on arthur's side . 
 " quest for camelot " is missing pure showmanship , an essential element if it's ever expected to climb to the high ranks of disney . 
there's nothing here that differentiates " quest " from something you'd see on any given saturday morning cartoon -- subpar animation , instantly forgettable songs , poorly-integrated computerized footage . 
 ( compare kayley and garrett's run-in with the angry ogre to herc's battle with the hydra . 
i rest my case . ) 
even the characters stink -- none of them are remotely interesting , so much that the film becomes a race to see which one can out-bland the others . 
in the end , it's a tie -- they all win . 
that dragon's comedy shtick is awfully cloying , but at least it shows signs of a pulse . 
at least fans of the early-'90s tgif television line-up will be thrilled to find jaleel " urkel " white and bronson " balki " pinchot sharing the same footage . 
a few scenes are nicely realized ( though i'm at a loss to recall enough to be specific ) , and the actors providing the voice talent are enthusiastic ( though most are paired up with singers who don't sound a thing like them for their big musical moments -- jane seymour and celine dion ? ? ? ) . 
but one must strain through too much of this mess to find the good . 
aside from the fact that children will probably be as bored watching this as adults , " quest for camelot " 's most grievous error is its complete lack of personality . 
and personality , we learn from this mess , goes a very long way . 
''')

0.4699152824073624

In [25]:
check('not')

0.55