### Implement Logistic Classification for classifying tweets / text
Given a tweet, we have to decide whether its sentiment is positive or negative.

In [1]:
import numpy as np
import pandas as pd
import nltk
from nltk.corpus import twitter_samples

In [2]:
nltk.download('twitter_samples')

[nltk_data] Downloading package twitter_samples to
[nltk_data]     /Users/rishushrivastava/nltk_data...
[nltk_data]   Package twitter_samples is already up-to-date!


True

In [3]:
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/rishushrivastava/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

### Load and Analyse the dataset

In [4]:
# load positive tweets
positive_tweets = twitter_samples.strings('positive_tweets.json')
positive_tweets[:3]

['#FollowFriday @France_Inte @PKuchly57 @Milipol_Paris for being top engaged members in my community this week :)',
 '@Lamb2ja Hey James! How odd :/ Please call our Contact Centre on 02392441234 and we will be able to assist you :) Many thanks!',
 '@DespiteOfficial we had a listen last night :) As You Bleed is an amazing track. When are you in Scotland?!']

In [5]:
# load negative tweets
negative_tweets = twitter_samples.strings('negative_tweets.json')
negative_tweets[:3]

['hopeless for tmr :(',
 "Everything in the kids section of IKEA is so cute. Shame I'm nearly 19 in 2 months :(",
 '@Hegelbon That heart sliding into the waste basket. :(']

In [6]:
## total number of pos and neg tweets

print(f"Total No. of Positive tweets: {len(positive_tweets)}")
print(f'Total No. of Negative tweets: {len(negative_tweets)}')

Total No. of Positive tweets: 5000
Total No. of Negative tweets: 5000


In [7]:
## Build train and test sets with an equal mix of positive and negative tweets.
## Of the 10,000 tweets in total (5,000 per class), the first 4,000 of each class go to
## training (8,000 tweets) and the remaining 1,000 of each class go to testing (2,000 tweets).

TRAIN_PER_CLASS = 4000  # number of tweets of each sentiment used for training

train_pos = positive_tweets[:TRAIN_PER_CLASS]
train_neg = negative_tweets[:TRAIN_PER_CLASS]

test_pos = positive_tweets[TRAIN_PER_CLASS:]
test_neg = negative_tweets[TRAIN_PER_CLASS:]

# combine both classes: positive tweets first, then negative tweets

train_data = train_pos + train_neg
test_data = test_pos + test_neg

print(f'Total number of data count train data: {len(train_data)} and test data : {len(test_data)}')

Total number of data count train data: 8000 and test data : 2000


In [9]:
# Label column vectors: start from all zeros (negative) and mark the leading
# block of rows, which corresponds to the positive tweets, with 1.
train_label = np.zeros((len(train_pos) + len(train_neg), 1))
train_label[:len(train_pos)] = 1

test_label = np.zeros((len(test_pos) + len(test_neg), 1))
test_label[:len(test_pos)] = 1

print(f'Shape of Train and Test labels : {train_label.shape} and {test_label.shape}')

Shape of Train and Test labels : (8000, 1) and (2000, 1)


### Processing of the data to create word frequencies list

In [11]:
from nltk.corpus import stopwords
import re

def clean_tweet(tweet):
    '''
        Clean a tweet: strip '#' symbols, hyperlinks and a leading retweet
        marker, tokenise it and drop English stop words.
        @input: tweet - raw tweet text
        @output: list of tokens that are not English stop words
        Note: no stemming is applied; tokens are kept as produced by the tokenizer.
    '''
    # a set gives constant-time membership tests for every token
    stop_words = set(stopwords.words('english'))

    tweet = re.sub(r'#', '', tweet)  ## remove the # symbol, keep the tag text
    # remove only the hyperlink itself; a greedy '.*' here would also delete
    # every word that follows the link on the same line
    tweet = re.sub(r'https?://\S+', '', tweet)
    tweet = re.sub(r'^RT[\s]+', '', tweet)  ## remove a leading Retweet marker (RT)

    # tokenizer options: do not preserve case, strip @handles, shorten repeated characters
    tokenizer = nltk.tokenize.TweetTokenizer(preserve_case=False, strip_handles=True, reduce_len=True)
    tweet_token = tokenizer.tokenize(tweet)

    return [word for word in tweet_token if word not in stop_words]
    

def build_tweet_frequency(tweets, label):
    '''
        Build a vocab of tweet word frequencies across corpus.
        @input: tweets - list of tweets
                label - array of tweet sentiments, one entry per tweet
                        (any shape, e.g. (n, 1) or (n,))
        @output: a dict of (word, label):frequency
    '''
    # ravel (rather than squeeze) always yields a 1-D sequence, so a corpus
    # with a single tweet still produces a list instead of a bare scalar
    label_list = np.ravel(label).tolist()

    freq = {}

    for t, l in zip(tweets, label_list):
        for word in clean_tweet(t):
            word_pair = (word, l)
            freq[word_pair] = freq.get(word_pair, 0) + 1

    return freq
    

In [12]:
train_data[0] ## 0, 500

'#FollowFriday @France_Inte @PKuchly57 @Milipol_Paris for being top engaged members in my community this week :)'

In [13]:
clean_tweet(train_data[0])

['followfriday', 'top', 'engaged', 'members', 'community', 'week', ':)']

In [14]:
tweet_freq_vocab = build_tweet_frequency(train_data, train_label)

In [15]:
tweet_freq_vocab.get(('sad',0))

100

In [21]:
def extract_features(tweet, vocab):
    '''
        Turn a tweet into a single feature row using the frequency vocab.
        @input:
            tweet - tweet we want to extract features from
            vocab - frequency vocab dictionary keyed by (word, label)
        @output:
            a numpy array of shape (1, 3) holding
            [bias (always 1), total_pos_freq, total_neg_freq]
    '''
    tokens = clean_tweet(tweet)

    # add up how often each token was seen with the positive (1.0) and the
    # negative (0.0) label; unseen (token, label) pairs contribute 0
    pos_total = sum(vocab.get((token, 1.0), 0) for token in tokens)
    neg_total = sum(vocab.get((token, 0.0), 0) for token in tokens)

    features = np.zeros((1, 3))
    features[0, :] = (1, pos_total, neg_total)
    return features

In [22]:
extract_features(train_data[0],tweet_freq_vocab)

array([[1.000e+00, 3.003e+03, 4.900e+01]])

In [23]:
extract_features('Hi How are you? I am doing good', tweet_freq_vocab)

array([[  1., 805., 621.]])

### Model Training

In [24]:
## Generate the word-frequency feature matrix for all of the training and test tweets

def build_feature_matrix(tweets, vocab):
    '''
        Stack the extract_features row of every tweet into one matrix.
        @input:
            tweets - list of raw tweets
            vocab - frequency vocab dictionary
        @output:
            numpy array of shape (len(tweets), 3), one feature row per tweet
    '''
    features = np.zeros((len(tweets), 3))
    for row, tweet in enumerate(tweets):
        features[row, :] = extract_features(tweet, vocab)
    return features

train_X = build_feature_matrix(train_data, tweet_freq_vocab)
train_y = train_label

# test features use the vocab built from the training tweets only
test_X = build_feature_matrix(test_data, tweet_freq_vocab)
test_y = test_label

In [25]:
train_X[0:5]

array([[1.000e+00, 3.003e+03, 4.900e+01],
       [1.000e+00, 6.280e+03, 1.724e+03],
       [1.000e+00, 5.906e+03, 2.168e+03],
       [1.000e+00, 2.862e+03, 4.000e+00],
       [1.000e+00, 7.326e+03, 2.127e+03]])

In [26]:
train_y.shape

(8000, 1)

In [27]:
from sklearn.linear_model import LogisticRegression

model = LogisticRegression(solver='liblinear')
# scikit-learn expects a 1-D target; passing the (n, 1) column vector triggers a
# DataConversionWarning, so flatten it for the fit call (train_y itself is unchanged)
model.fit(train_X, train_y.ravel())

  y = column_or_1d(y, warn=True)


LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='warn', n_jobs=None, penalty='l2',
                   random_state=None, solver='liblinear', tol=0.0001, verbose=0,
                   warm_start=False)

In [28]:
predictions = model.predict(test_X)

In [29]:
from sklearn.metrics import accuracy_score

accuracy_score(test_y, predictions)

0.9875

In [30]:
from sklearn.metrics import classification_report

print(classification_report(test_y,predictions))

              precision    recall  f1-score   support

         0.0       0.99      0.99      0.99      1000
         1.0       0.99      0.99      0.99      1000

    accuracy                           0.99      2000
   macro avg       0.99      0.99      0.99      2000
weighted avg       0.99      0.99      0.99      2000



### Making your own predictions

In [31]:
my_tweet1 = 'i liked my prediction score. happy with the results'
model.predict(extract_features(my_tweet1,tweet_freq_vocab))

array([1.])

In [32]:
my_tweet2 = 'i am sad with the result of the football match'
model.predict(extract_features(my_tweet2,tweet_freq_vocab))

array([0.])

In [33]:
my_tweet3 = 'shame that i couldnt get an entry to the competition'
model.predict(extract_features(my_tweet3,tweet_freq_vocab))

array([0.])

In [34]:
# distinct name so the earlier my_tweet3 example is not overwritten
my_tweet4 = 'this movie should have been great.'
model.predict(extract_features(my_tweet4,tweet_freq_vocab)) ## misclassified example

array([1.])

In [35]:
# distinct name so the earlier my_tweet3 example is not overwritten
# NOTE(review): predicted positive despite the negation - presumably 'not' is
# dropped as a stop word by clean_tweet; confirm against the stop-word list
my_tweet5 = 'i liked my prediction score. not happy with the results'
model.predict(extract_features(my_tweet5,tweet_freq_vocab))

array([1.])