In [1]:
import nltk
nltk.download('stopwords')
nltk.download('twitter_samples')
from nltk.corpus import stopwords, twitter_samples

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package twitter_samples to /root/nltk_data...
[nltk_data]   Unzipping corpora/twitter_samples.zip.


In [2]:
pos_tweets = twitter_samples.strings('positive_tweets.json')
neg_tweets = twitter_samples.strings('negative_tweets.json')

In [3]:
import numpy as np

train_pos = pos_tweets[:4000]
test_pos = pos_tweets[4000:]
train_neg = neg_tweets[:4000]
test_neg = neg_tweets[4000:]
train_x = train_pos + train_neg
test_x = test_pos + test_neg
train_y = np.append(np.ones(len(train_pos)), np.zeros(len(train_neg)))
test_y = np.append(np.ones(len(test_pos)), np.zeros(len(test_neg)))

In [4]:
import re
import string

from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.tokenize import TweetTokenizer


def process_tweet(tweet):
    stemmer = PorterStemmer()
    stopwords_english = stopwords.words('english')
    # remove stock market tickers like $GE
    tweet = re.sub(r'\$\w*', '', tweet)
    # remove old style retweet text "RT"
    tweet = re.sub(r'^RT[\s]+', '', tweet)
    # remove hyperlinks
    tweet = re.sub(r'https?:\/\/.*[\r\n]*', '', tweet)
    # remove hashtags
    # only removing the hash # sign from the word
    tweet = re.sub(r'#', '', tweet)
    # tokenize tweets
    tokenizer = TweetTokenizer(preserve_case=False, strip_handles=True,
                               reduce_len=True)
    tweet_tokens = tokenizer.tokenize(tweet)

    tweets_clean = []
    for word in tweet_tokens:
        if (word not in stopwords_english and  # remove stopwords
            word not in string.punctuation):  # remove punctuation
            stem_word = stemmer.stem(word)  # stemming word
            tweets_clean.append(stem_word)

    return tweets_clean

In [5]:
from sklearn.feature_extraction.text import CountVectorizer

vectorizer = CountVectorizer(tokenizer=process_tweet)
train_x_vecs = vectorizer.fit_transform(train_x).toarray()

In [6]:
from sklearn.linear_model import LogisticRegression
model = LogisticRegression()
model.fit(train_x_vecs, train_y)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

In [7]:
test_x_vecs = vectorizer.transform(test_x)
preds = model.predict(test_x_vecs)

In [8]:
from sklearn.metrics import accuracy_score, f1_score, confusion_matrix
print(f'accuracy={accuracy_score(test_y, preds)}')
print(f'f1={f1_score(test_y, preds)}')
print(f'confusion={confusion_matrix(test_y, preds)}')

accuracy=0.995
f1=0.9950248756218906
confusion=[[ 990   10]
 [   0 1000]]


In [10]:
train_x_vecs.shape

(8000, 9084)

In [65]:
import numpy as np
from sklearn.preprocessing import OneHotEncoder

class LogisticRegressor:
    def __init__(self, num_iter=500, learning_rate=0.5):
        self.learning_rate = learning_rate
        self.weights = None
        self.bias = None
        self.num_classes = 0
        self.num_features = 0
        self.num_samples = 0
        self.num_iter = num_iter
    
    def fit(self, X, y):
        encoder = OneHotEncoder(sparse=False)
        labels = encoder.fit_transform(y.reshape(-1, 1))
        self.num_classes = len(encoder.categories_[0])
        self.num_samples, self.num_features = X.shape
        self.weights = np.zeros((self.num_classes, self.num_features))
        self.bias = np.random.rand(self.num_classes, 1)

        for i in range(self.num_iter):
            probs = self.predict_proba(X)
            loss = -(1/self.num_samples) * np.sum(np.log(probs * labels, where=labels.astype(bool)))
            print(f'iter={i}, loss={loss}')
            diff = np.transpose(probs-labels)
            grad_weights = (1/self.num_samples) * np.dot(diff, X)
            grad_bias = (1/self.num_samples) * np.dot(diff, np.ones((self.num_samples, 1)))
            self.weights = self.weights - self.learning_rate * grad_weights
            self.bias = self.bias - self.learning_rate * grad_bias
    
    def predict_proba(self, X):
        z = np.dot(X, self.weights.T) + self.bias.reshape((1, -1))
        z_exp = np.exp(z)
        z_exp_sum = np.sum(z_exp, axis=1, keepdims=True)
        probs = z_exp / z_exp_sum
        return probs

    def predict(self, X):
        probs = self.predict_proba(X)
        return np.argmax(probs, axis=1)

In [70]:
model = LogisticRegressor()
model.fit(train_x_vecs, train_y)

iter=0, loss=0.3536228846399047
iter=1, loss=-0.683467173638389
iter=2, loss=-0.6546891192667154
iter=3, loss=-0.6302479779293043
iter=4, loss=-0.6089808202136238
iter=5, loss=-0.5901602727930392


KeyboardInterrupt: ignored

In [15]:
train_y

array([1., 1., 1., ..., 0., 0., 0.])

In [71]:
test_x_vecs = vectorizer.transform(test_x).toarray()
preds = model.predict(test_x_vecs)

In [72]:
from sklearn.metrics import accuracy_score, f1_score, confusion_matrix
print(f'accuracy={accuracy_score(test_y, preds)}')
print(f'f1={f1_score(test_y, preds)}')
print(f'confusion={confusion_matrix(test_y, preds)}')

accuracy=0.9945
f1=0.9944695827048768
confusion=[[1000    0]
 [  11  989]]
