# Twitter sentiment presented by @Radras and @sekularacn

## All imports

In [214]:
import csv
import codecs
import numpy as np
import pandas as pd
import nltk
import re
import html
import tensorflow as tf
import matplotlib.pyplot as plt
%matplotlib inline

from nltk.corpus import wordnet as wn

### Dataset loading function

In [215]:
def load_dataset(path_to_file):
    X, y = [], []
    with codecs.open(path_to_file, "r",encoding='utf-8', errors='ignore') as csvfile:
        reader = csv.reader(csvfile, delimiter=',', quotechar='"')
        next(reader, None) # Skip header
        for row in reader:
            y.append(int(row[1]))
            X.append(row[2])
    return X, y

## Data importing

In [216]:
x_data, y_data = load_dataset('data/train.csv')

print('Len of data:', len(x_data), ' Len of labels:', len(y_data))

# Getting the random data from the dataset.
# 
indices = np.random.choice(range(99989), 10000, replace=False)

X_random_data = [x_data[i] for i in indices]
y_random_labels = [y_data[i] for i in indices]

Len of data: 99989  Len of labels: 99989


# Data cleaning

- Deleting mentions
- Deleting links
- Fixing the HTML symbols for unicode chars
- Removing punctation
- Lowercasing all
- Removing all the numeric values
- Removing stopwords
- Stemming

In [217]:
#Deleting links. Tokenizers don't work.
for i in range(len(X_random_data)):
    sentence = X_random_data[i].split(' ')
    new_sentence = []
    for word in sentence:
        if word[:4] != 'http':
            new_sentence.append(word)
    X_random_data[i] = ' '.join(new_sentence)

for i in range(len(X_random_data)):
    # Removing twitter mentions
    X_random_data[i] = re.sub(r'@\w+', "", X_random_data[i])
    
    # Removing HTML escaped symbols
    X_random_data[i] = html.unescape(X_random_data[i])

In [218]:
from nltk.tokenize import sent_tokenize, regexp_tokenize
from nltk.corpus import stopwords
from data.utils import *

# forms list of sentences per every tweet
all_words = [regexp_tokenize(sample, "[\w']+") for sample in X_random_data]

# removes words with more than 2 same chars, and converts to lowercase
for i in range(len(all_words)):
    for j in range(len(all_words[i])):
        all_words[i][j] = remove_more_than_two_duplicate_letters(all_words[i][j].lower())
print("finished")

print(all_words[:5])

finished
[['wait', 'your', 'hubby', 'is', 'at', 'the', 'show', 'and', 'your', 'not', 'why'], ['at', 'the', '1', 'u', 'just', 'mentioned', 'i', 'was', 'just', 'wait', 'till', 'the', 'wedding', 'speech', "i'll", 'b', 'uncontrollable'], ["can't", 'you', 'make', 'it'], ['and', 'what', 'about', 'you', 'lovely', 'lady', 'busy', 'still', 'up', 'so', 'much', 'on', 'your', 'plate', "when's", 'our', 'date'], ['looks', 'like', 'the', 'links', 'are', 'broken']]


### Cleaning numbers and stopwords

In [219]:
# clean the set from the numeric values and remove stopwords
for i in range(len(all_words)):
    all_words[i] = [word for word in all_words[i] if not word.isnumeric()]
    all_words[i] = [word for word in all_words[i] if word not in stopwords.words('english')]


In [220]:
for i in range(10):
    print(all_words[i])

['wait', 'hubby', 'show']
['u', 'mentioned', 'wait', 'till', 'wedding', 'speech', "i'll", 'b', 'uncontrollable']
["can't", 'make']
['lovely', 'lady', 'busy', 'still', 'much', 'plate', "when's", 'date']
['looks', 'like', 'links', 'broken']
['lj', 'sweetjamielee', 'livejournal', 'com', 'sth', 'like']
["that's", 'one', 'cool', 'collection']
['know', "i'm", 'excited', 'think', 'make', 'big', 'ejami', 'sign', 'find', 'lol']
['sorry', 'loss']
['freezing', 'buttocks']


### Stemming
Using PorterStemmer

In [221]:
from nltk.stem import PorterStemmer

porter = PorterStemmer()
for i in range(len(all_words)):
    for j in range(len(all_words[i])):
        all_words[i][j] = porter.stem(all_words[i][j])
        
for i in range(10):
    print(all_words[i])

['wait', 'hubbi', 'show']
['u', 'mention', 'wait', 'till', 'wed', 'speech', "i'll", 'b', 'uncontrol']
["can't", 'make']
['love', 'ladi', 'busi', 'still', 'much', 'plate', "when'", 'date']
['look', 'like', 'link', 'broken']
['lj', 'sweetjamiele', 'livejourn', 'com', 'sth', 'like']
["that'", 'one', 'cool', 'collect']
['know', "i'm", 'excit', 'think', 'make', 'big', 'ejami', 'sign', 'find', 'lol']
['sorri', 'loss']
['freez', 'buttock']


In [222]:
# Repositioning the sentences

clean_sentences = []
for lista in all_words:
    clean_sentences.append(" ".join(lista))

print(clean_sentences[:5])

['wait hubbi show', "u mention wait till wed speech i'll b uncontrol", "can't make", "love ladi busi still much plate when' date", 'look like link broken']


# Using Bag of Words model

In [223]:
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer

# Initialize the "CountVectorizer" object, which is scikit-learn's
# bag of words tool.  
vectorizer = CountVectorizer(analyzer = "word",   \
                             tokenizer = None,    \
                             preprocessor = None, \
                             stop_words = None,   \
                             max_features = 5000) 

# fit_transform() does two functions: First, it fits the model
# and learns the vocabulary; second, it transforms our training data
# into feature vectors. The input to fit_transform should be a list of 
# strings.
train_data_features = vectorizer.fit_transform(clean_sentences)

# Numpy arrays are easy to work with, so convert the result to an 
# array
train_data_features = train_data_features.toarray()

# Training with Logistic Regression

In [224]:
from sklearn.linear_model import LogisticRegression
X_train, X_test, y_train, y_test = train_test_split(train_data_features, y_random_labels, test_size=0.2, random_state=23)

logreg = LogisticRegression()
logreg.fit(X_train, y_train)

y_pred = logreg.predict(X_test)
y_diff = y_pred == y_test

cnt = 0
for b in y_diff:
    if b:
        cnt += 1
print((cnt / len(y_diff)) * 100)

70.7


# Training with K-Nearest Neighbours

In [225]:
from sklearn.neighbors import KNeighborsClassifier

classifier = KNeighborsClassifier(n_neighbors=101)
classifier.fit(X_train, y_train)

y_pred = classifier.predict(X_test)
y_diff = y_pred == y_test

cnt = 0
for qwe in y_diff:
    if qwe:
        cnt += 1
print(cnt/len(y_diff))

0.6505


## Dividing dataset into train, val and test

In [226]:
X_train, X_test, y_train, y_test = train_test_split(train_data_features, np.array(y_random_labels), test_size=0.5, random_state=23)

X_test, X_validate, y_test, y_validate = train_test_split(X_test, y_test, test_size=0.5, random_state=23)

print('X_train shape:', len(X_train))
print('X_validate shape:', len(X_validate))
print('X_test shape:', len(X_test))


X_train shape: 5000
X_validate shape: 2500
X_test shape: 2500


# Logistic regression with TensorFlow

In [234]:
# Defining parameters
LEARNING_RATE = 0.5
TRAINING_EPOCHS = 5000
REGULARIZATION_PARAM = tf.constant(0.01)
# CRITICAL::: X_TRAIN[0] MUST EXIST FOR THIS TO WORK PROPERLY
N_FEATURES = len(X_train[0])

y_train = y_train.reshape(len(y_train), 1)


In [235]:
X = tf.placeholder(tf.float32, [None, N_FEATURES], name="X")
y = tf.placeholder(tf.float32, [None, 1], name="y")

Weights = tf.Variable(tf.random_normal([N_FEATURES, 1]), name="Weights")
bias = tf.Variable(tf.random_normal([1, 1]), name="bias")

In [236]:
# Model
hypotesis = tf.sigmoid(bias + tf.matmul(X, Weights))

## Cost function & L2 regularization

In [237]:
# Cost function
cost = tf.reduce_mean(-tf.multiply(y, tf.log(hypotesis)) - 
                      tf.multiply(tf.subtract(1.0, y), tf.log(tf.subtract(1.0, hypotesis)))
                      + tf.multiply(REGULARIZATION_PARAM, Weights * Weights))

In [238]:
train_op = tf.train.GradientDescentOptimizer(LEARNING_RATE).minimize(cost)

In [239]:
with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    
    for epoch in range(TRAINING_EPOCHS):
        _, err = sess.run([train_op, cost], {X: X_train, y: y_train})
        if epoch % 100 == 0:
            print(epoch, err)
    W_computed, b_computed = sess.run([Weights, bias])
        

0 2.32106
100 0.999278
200 0.926831
300 0.873038
400 0.830877
500 0.796437
600 0.767451
700 0.742514
800 0.720699
900 0.701355
1000 0.68401
1100 0.668315
1200 0.654005
1300 0.64087
1400 0.628747
1500 0.617502
1600 0.607025
1700 0.597228
1800 0.588034
1900 0.579379
2000 0.571207
2100 0.563473
2200 0.556136
2300 0.54916
2400 0.542515
2500 0.536172
2600 0.530109
2700 0.524303
2800 0.518737
2900 0.513393
3000 0.508255
3100 0.503312
3200 0.498549
3300 0.493957
3400 0.489524
3500 0.485243
3600 0.481104
3700 0.477101
3800 0.473225
3900 0.469471
4000 0.465832
4100 0.462303
4200 0.45888
4300 0.455556
4400 0.452326
4500 0.449188
4600 0.446137
4700 0.443169
4800 0.44028
4900 0.437467


## Predicting and testing for accuracy

In [240]:
#hypotesis = tf.sigmoid(bias + tf.matmul(X, Weights))
h = tf.sigmoid(b_computed + tf.matmul(X, W_computed))

cnt = 0

with tf.Session() as sess:
    y_predict = sess.run(h, {X: X_test})
    y_predict_predict = []
    
    for value in y_predict:
        if np.mean(value) >= 0.5:
            y_predict_predict.append(1)
        else:
            y_predict_predict.append(0)
    
    y_diff = y_predict_predict == y_test
    
    for value in y_diff:
        if value:
            cnt += 1
    
    print('Accuracy:', cnt/(len(y_diff)) * 100, '%')

Accuracy: 66.56 %
