In [None]:
# get dataset from GitHub
!git clone https://github.com/marciovai/Twitter-Sentiment-10K.git

Cloning into 'Twitter-Sentiment-10K'...
remote: Enumerating objects: 7, done.[K
remote: Counting objects: 100% (7/7), done.[K
remote: Compressing objects: 100% (7/7), done.[K
remote: Total 7 (delta 1), reused 0 (delta 0), pack-reused 0[K
Unpacking objects: 100% (7/7), done.


In [None]:
# import libraries
import pandas as pd
import numpy as np
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
from sklearn.linear_model import LinearRegression
from sklearn.metrics import log_loss, accuracy_score
import joblib
import re

In [None]:
# load data
# read the tweet CSV from the cloned dataset directory into a DataFrame
data = pd.read_csv("Twitter-Sentiment-10K/tweet_sentiment_10K.csv") 

In [None]:
# preview the first rows of the loaded data
data.head()

Unnamed: 0,id,date,text,target
0,0,2009-06-20,My concealer just broke!,0
1,1,2009-06-21,Mommy leaves soon,0
2,2,2009-06-03,@Boogaloo1 Not here it ain't! I've got an epi...,0
3,3,2009-06-16,I want my Blackberry back!!,0
4,4,2009-06-16,@Veganluke I can't even skype still dont have...,0


In [None]:
# load the stop-word list and tokenizer models from NLTK
# 'punkt_tab' is requested as well because recent NLTK releases make
# word_tokenize() load its models from that package instead of 'punkt';
# by default nltk.download() only reports a failed download, it does not raise
for resource in ('stopwords', 'punkt', 'punkt_tab'):
  nltk.download(resource)

# save stopwords in a Python set
stop_words = set(stopwords.words('english'))

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


In [None]:
# separate data into train and test sets
pos = data[data['target']==1]
neg = data[data['target']==0]

# per class, the first 4000 rows go to train and the remainder to test
# (the original note expects 8000 train / 2000 test for the 10K corpus)
train_neg = neg.iloc[:4000]
train_pos = pos.iloc[:4000]

test_neg = neg.iloc[4000:]
test_pos = pos.iloc[4000:]

# DataFrame.append() was removed in pandas 2.0; pd.concat() builds the same
# frames (negatives first, then positives, original index labels kept)
train = pd.concat([train_neg, train_pos])
test = pd.concat([test_neg, test_pos])

# drop the intermediate frames, only `train` and `test` are used afterwards
del pos, neg, train_pos, train_neg, test_pos, test_neg

In [None]:
def process_tweet(tweet, stop_words):
  """Clean a raw tweet and return it as a list of stemmed, lowercase tokens.

  Steps, in order: lowercase, strip URLs, strip punctuation, tokenize,
  drop stop words, stem.

  Args:
    tweet: raw tweet text (str).
    stop_words: container of words to drop, tested with `in`. The notebook
      passes NLTK's English stop-word set, whose entries are lowercase.

  Returns:
    list of str with the stemmed tokens that remain.
  """
  # lowercase first: filtering the original casing against a lowercase
  # stop-word list let capitalised stop words ("That", "For") slip through
  tweet = tweet.lower()

  # remove URLs from the tweet
  tweet = re.sub(r"http\S+", "", tweet)

  # remove punctuation
  tweet = re.sub(r'[^\w\s]','', tweet)

  # tokenize tweet (transform from string into list of words)
  tokens = word_tokenize(tweet)

  # remove stop words using the set() imported from NLTK
  tokens = [word for word in tokens if word not in stop_words]

  # apply stemming on the tweet; the text is already lowercase, so no
  # separate lowercasing pass is needed afterwards
  ps = PorterStemmer()
  return [ps.stem(word) for word in tokens]

In [None]:
# map every train tweet id to its processed token list
# (a repeated id keeps the tokens of its last row, as with dict.update)
train_tweets_dict = {
  tweet_id: process_tweet(text, stop_words)
  for tweet_id, text in zip(train['id'], train['text'])
}

In [None]:
# create dictionary of word frequencies on positive and negative tweets

# get vocabulary of all unique words in the train tweets; updating a set
# avoids re-copying an ever-growing list for every tweet
word_vocab = set()
for tweet in train_tweets_dict.values():
  word_vocab.update(tweet)

# create dict out of vocab in the format of {(word, 0): count, (word, 1): count}
# to get the counts of words appearance in both positive and negative tweets
word_vocab_dict = {}
for word in word_vocab:
  word_vocab_dict[(word, 0)] = 0
  word_vocab_dict[(word, 1)] = 0

# look up each tweet's label once instead of filtering the whole train frame
# per tweet; setdefault keeps the first row seen for an id, like `.values[0]`
train_labels = {}
for tweet_id, tweet_label in zip(train['id'], train['target']):
  train_labels.setdefault(tweet_id, tweet_label)

# compute word counts across all tweets and store in word_vocab_dict
# (loop variable is `tweet_id` so the builtin `id` is not shadowed)
for tweet_id, tweet in train_tweets_dict.items():
  tweet_label = train_labels[tweet_id]
  for word in tweet:
    word_vocab_dict[(word, tweet_label)] += 1

In [None]:
def get_tweet_word_frequencies(word_vocab_dict, tweet_text):
  """Sum the negative and positive counts of every word in a tweet.

  Args:
    word_vocab_dict: dict keyed by (word, label) with label 0 (negative)
      or 1 (positive); a missing key counts as 0.
    tweet_text: iterable of words; a word that repeats is counted each time.

  Returns:
    list [neg_total_freq, pos_total_freq].
  """
  # position 0 accumulates the negative counts, position 1 the positive ones
  totals = [0, 0]

  for word in tweet_text:
    for label in (0, 1):
      totals[label] += word_vocab_dict.get((word, label), 0)

  return totals

In [None]:
# call get_tweet_word_frequencies() for every train tweet and build the
# feature DataFrame in a single step: DataFrame.append() was removed in
# pandas 2.0 and re-copied the whole frame for every added row

# label per tweet id, resolved once; setdefault keeps the first row seen for
# an id, matching the former `train[train['id']==id].target.values[0]`
train_label_by_id = {}
for tweet_id, label in zip(train['id'], train['target']):
  train_label_by_id.setdefault(tweet_id, label)

train_records = []
for tweet_id, tweet in train_tweets_dict.items():
  neg_freq, pos_freq = get_tweet_word_frequencies(word_vocab_dict, tweet)
  train_records.append({'id': tweet_id,
                        'neg_freq': neg_freq,
                        'pos_freq': pos_freq,
                        'target': train_label_by_id[tweet_id]})

# explicit column list keeps the schema even when there are no records;
# values keep their natural dtypes instead of being promoted by append()
train_df = pd.DataFrame(train_records,
                        columns=['id', 'neg_freq', 'pos_freq', 'target'])

In [None]:
# preview the per-tweet features (summed negative/positive word counts) and labels
train_df.head()

Unnamed: 0,id,neg_freq,pos_freq,target
0,0.0,113.0,62.0,0.0
1,1.0,94.0,64.0,0.0
2,2.0,453.0,374.0,0.0
3,3.0,1825.0,1175.0,0.0
4,4.0,2202.0,1334.0,0.0


In [None]:
# get the dataset X and Y and call fit() on LinearRegression
# X holds the two frequency features per tweet, Y the 0/1 target as a column
# NOTE(review): this fits an ordinary linear regression on the 0/1 target, so
# its predictions are not bounded to [0, 1]; the artifact saved later is named
# "logistic" - confirm whether LogisticRegression was intended
train_x = train_df[['neg_freq', 'pos_freq']].values
train_y = train_df[['target']].values
model = LinearRegression()
model.fit(train_x, train_y)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=False)

In [None]:
# prepare test data

# dict to store processed tweets
test_tweets_dict = {}

# iterate over each test row, call process_tweet() and save it on test_tweets_dict
for index, row in test.iterrows():
  tweet_id = row['id']
  tweet = row['text']
  tweet = process_tweet(tweet, stop_words)
  test_tweets_dict.update({tweet_id:tweet})

# label per tweet id, resolved once; setdefault keeps the first row seen for
# an id, matching the former `test[test['id']==id].target.values[0]`
test_label_by_id = {}
for tweet_id, label in zip(test['id'], test['target']):
  test_label_by_id.setdefault(tweet_id, label)

# call get_tweet_word_frequencies() for every test tweet and build the
# DataFrame in a single step: DataFrame.append() was removed in pandas 2.0
# and re-copied the whole frame for every added row
test_records = []
for tweet_id, tweet in test_tweets_dict.items():
  neg_freq, pos_freq = get_tweet_word_frequencies(word_vocab_dict, tweet)
  test_records.append({'id': tweet_id,
                       'neg_freq': neg_freq,
                       'pos_freq': pos_freq,
                       'target': test_label_by_id[tweet_id]})

# explicit column list keeps the schema even when there are no records
test_df = pd.DataFrame(test_records,
                       columns=['id', 'neg_freq', 'pos_freq', 'target'])

In [None]:
# evaluate the fitted model on the held-out test features
test_x = test_df[['neg_freq', 'pos_freq']].values
test_y = test_df[['target']].values

test_pred = model.predict(test_x)

# log_loss() expects probabilities, but a linear model's outputs are not
# confined to [0, 1]; clip explicitly so the metric does not depend on the
# library doing it (newer scikit-learn releases reject out-of-range values,
# older ones clipped internally, so the reported number is unchanged there)
test_prob = np.clip(test_pred, 0.0, 1.0)

print("Log loss: {}".format(log_loss(test_y, test_prob)))

# accuracy compares the labels with the raw predictions rounded to integers
print("Accuracy: {}".format(accuracy_score(test_y, test_pred.round())))

Log loss: 0.687854458360648
Accuracy: 0.649


In [None]:
# save model on a serialized file with joblib
# NOTE(review): the file name says "logistic" although `model` was created
# as LinearRegression in the training cell - confirm the intended name
joblib.dump(model, 'tweet_sentiment_logistic_v1.joblib')

# save vocabulary dictionary on a serialized file with joblib
# (the (word, label) counts are needed again to build features at predict time)
joblib.dump(word_vocab_dict, 'word_vocab_dict_v1.joblib')

['word_vocab_dict_v1.joblib']

In [None]:
### TEST PREDICT ###

# use the first two test tweets as sample input
sample_texts = test.text.values
predict_data = [sample_texts[0], sample_texts[1]]

# list with the processed tokens of each sample tweet
predict_tweets_list = [process_tweet(raw_text, stop_words)
                       for raw_text in predict_data]

# one [neg_freq, pos_freq] pair per tweet
feature_rows = [get_tweet_word_frequencies(word_vocab_dict, tokens)
                for tokens in predict_tweets_list]

# float array with one row per tweet and one column per feature: (tweets, 2)
predict_ar = np.array(feature_rows, dtype=float)
predict_ar = predict_ar.reshape(len(predict_tweets_list), 2)

In [None]:
# score the sample tweets with the trained model
# (this rebinds `test_pred`, which previously held the full test-set predictions)
test_pred = model.predict(predict_ar)

In [None]:
# display the raw model outputs for the sample tweets
test_pred

array([[0.41133504],
       [0.39934264]])

In [None]:
# turn the raw outputs into 0/1 labels with a 0.5 cut-off,
# then flatten to 1-D for display
prediction = (test_pred>= 0.5).astype(int)
prediction.ravel()

array([0, 0])

In [None]:
# display the processed tokens of the sample tweets that were scored
predict_tweets_list

[['ohhhhhhhhhhh', 'go', 'away', 'rain', 'pleas', 'go', 'away'],
 ['that',
  'stuff',
  'strongita',
  'make',
  'feel',
  'sick',
  'for',
  'sure',
  'go',
  'work',
  'tomorrow']]

In [None]:
# download the saved model and vocabulary files through Colab's file helper
# NOTE(review): google.colab is imported here, so this cell presumably only
# runs inside a Colab session - confirm before running elsewhere
from google.colab import files
files.download('tweet_sentiment_logistic_v1.joblib')
files.download('word_vocab_dict_v1.joblib') 

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>