In [1]:
import pandas as pd
import numpy as np
import os
import random
import re

### Path to train and test data

In [2]:
test_path = 'data/balanced_test_set.csv'
train_path = 'data/train_set.csv'

In [3]:
train_data = pd.read_csv(train_path, delimiter=',', quotechar='|', encoding="utf8")
train_data.head()

Unnamed: 0,Tweet,Sentiment
0,<ID> <ID> Då missförstår du mig Hålla låg pro...,0
1,<ID> om drivkraften bara är pengar så förstår jag,0
2,<ID> <ID> Här kan ni köpa in er i mitt andels...,0
3,Föredömligt av @FolksamMedia att inte invester...,1
4,Rutger Arnhults privata bolag emitterar obliga...,0


In [4]:
test_data = pd.read_csv(test_path, delimiter=',', quotechar='|', encoding="utf8")
test_data.head()

Unnamed: 0,Tweet,Sentiment
0,<ID> <ID> jaaa med Calle och Hobbe p,1
1,ÅH VAD JAG ÄLSKAR ATT VARA KVINNA,1
2,Drygt en vecka kvar rösta i kyrkovalet gärna p...,1
3,En lite bra tråd om våld och nazister och sånt...,1
4,På väg hem efter en fantastiskt trevlig AW med...,1


## Sampling
### Do we want to under-sample the majority classes, creating a balanced training set where randomly selected tweets from the majority classes are extracted to match the minority class? Or do we want to under-sample the majority class(es) and over-sample the minority class(es)

#### -1 = negative  |  0 = neutral  |  1 = positive

In [5]:
train_data.groupby('Sentiment').count()

Unnamed: 0_level_0,Tweet
Sentiment,Unnamed: 1_level_1
-1,2551
0,5820
1,2703


In [6]:
# ---- Enter True / False ----
# ---- if False, new varible must be larger than smallest and smaller than largest in table above. -----
only_under_sample = False
no_sampling = False # use ALL training data without any balancing.

new = 3100

if only_under_sample:
    print('Under-sampling, only.')
else:
    print('Under- and over-sampling to create a balanced training set with {} tweets in each class'.format(new))

Under- and over-sampling to create a balanced training set with 3100 tweets in each class


In [7]:
pos = []
neg = []
neu = []
for tweet_tuple in train_data.iterrows():
    tweet = tweet_tuple[1]['Tweet']
    sentiment = int(tweet_tuple[1]['Sentiment'])

    if sentiment == -1:
        neg.append([tweet, sentiment])

    if sentiment == 0:
        neu.append([tweet, sentiment])

    if sentiment == 1:
        pos.append([tweet, sentiment])
            
len_neu = len(neu)
len_neg = len(neg)
len_pos = len(pos)
            
#print(len_neu, ' Neutral')
#print(len_neg, ' Negative')
#print(len_pos, ' Positive')

minimi = min(len_neg, len_neu, len_pos)

if only_under_sample and not no_sampling:
    print('Under-sampling a balanced training data set')
    
    print('Smallest class size: ', minimi)

    np.random.shuffle(pos)
    np.random.shuffle(neg)
    np.random.shuffle(neu)
    
    print('Shuffling new dataset...')
    balanced_data = np.vstack((pos[:minimi], neu[:minimi], neg[:minimi]))

    np.random.shuffle(balanced_data)
    print('Size of training set, with {} in each class:'.format(minimi))
    print(len(balanced_data))
    
elif not only_under_sample and not no_sampling:
    print('Under- and over-sampling a balanced training data set')
    
    remaining_pos = new - len(pos)
    remaining_neg = new - len(neg)

    pos_copy = pos.copy()
    neg_copy = neg.copy()

    upsampled_pos = []
    upsampled_neg = []


    for i in range(remaining_pos):
        rand_id = random.randint(0, len(pos_copy) - 1)
        upsampled_pos.append(pos_copy.pop(rand_id))

    for i in range(remaining_neg):
        rand_id = random.randint(0, len(neg_copy) - 1)
        upsampled_neg.append(neg_copy.pop(rand_id))
        
    pos = np.vstack((pos, upsampled_pos))
    neg = np.vstack((neg, upsampled_neg))

    np.random.shuffle(pos)
    np.random.shuffle(neg)
    np.random.shuffle(neu)
    
    # Note that this line must be changed if the distribution of the training data shifts.
    # It now assumes that pos and neg is over-sampled and that neu is under-sampled!
    balanced_data = np.vstack((pos, neu[:new], neg))
    
    print('Shuffling new dataset...')
    np.random.shuffle(balanced_data)
    print('Size of training set, with {} in each class:'.format(new))
    print(len(balanced_data))
    
elif not only_under_sample and no_sampling:
    balanced_data = np.vstack((pos, neu, neg))
    print('Including ALL available training data <-- unbalanced')
    np.random.shuffle(balanced_data)
    print('size:')
    print(len(balanced_data))

Under- and over-sampling a balanced training data set
Shuffling new dataset...
Size of training set, with 3100 in each class:
9300


### Separate tweets from labels

In [8]:
train_X = [pair[0] for pair in balanced_data]
train_Y = [int(pair[1]) for pair in balanced_data]

test_X = test_data['Tweet'].tolist()
test_Y = test_data['Sentiment'].tolist()

### Helper methods

In [9]:
def reduce_lengthening(text):
    '''
    An ad-hoc spellcheck. No swedish words contains more than two identical characters
    in a sequence. This method finds those and shortens them to two - irregardless of
    whether it's a noun or whatever. ALL occurrences are reduced to only two in order to
    ensure words that are commonly emphazised, i.e såååååå, are included in the vocabulary.
    '''
    pattern = re.compile(r"(.)\1{2,}")
    return pattern.sub(r"\1\1", text)

### Clean and pre-process each tweet. Stemming and stop-word removal is removed from pre-processing stage after empirical findings.

In [10]:
def cleanTweet(tweet):
    
    # Here we have each tweet. Process and add to file.

    # Replaces various URLs with "<URL>"
    filtered = re.sub(r"http\S+", " <URL>", tweet)

    # Replace mentions with anonymous <ID>
    filtered = re.sub(r"@\S+", " <ID>", filtered)

    # Replace haschtags with anonymous <HASCHTAG>
    filtered = re.sub(r"#\S+", " <HASCHTAG>", filtered)

    filtered = reduce_lengthening(filtered)

    # Replace each '-' and '/' with ' - ' and ' / ' because they are common in text
    filtered = filtered.replace("-", " - ")
    filtered = filtered.replace("/", " / ")

    filtered = re.sub('[\'&]+', '', filtered)

    #Remove non-alpha numerical and <> from tweet
    filtered = re.sub('[^0-9a-zA-Z åäöÅÄÖ<>]+', ' ', filtered)

    prev = ''
    sentence = ''
    for word in filtered.split():
        word = word.strip().lower()

        if not (word == prev and (word == '<id>' or word == '<haschtag>' or word == '<url>')) and not word == 'amp':
            sentence = sentence + ' ' + word
        prev = word

    return sentence


train_cleaned_tweets = []
for tweet in train_X:
    tweet = cleanTweet(tweet)
    #print(tweet)
    train_cleaned_tweets.append(tweet)
    
test_cleaned_tweets = []
for tweet in test_X:
    tweet = cleanTweet(tweet)
    #print(tweet)
    test_cleaned_tweets.append(tweet)

### Ensure alll dimensions add up

In [11]:
print(len(train_cleaned_tweets), len(train_Y))
print(len(test_cleaned_tweets), len(test_Y))

9300 9300
1008 1008


In [13]:
def createVocabularyBigram(text):
    bi_dic = {}
    for line in text:
        sentence = line.split()
        for ix, word in enumerate(sentence):
            try:
                if sentence[ix + 1]:
                    word = word.strip() #Removes blanks and '\n' from the end when needed.
                    next_word = sentence[ix + 1].strip()
                    bi_gram = word + ' ' + next_word
                    bi_dic[bi_gram] = bi_dic.get(bi_gram, 0) + 1
            except:
                continue
                
    return bi_dic
bigram_vocabulary = createVocabularyBigram(train_cleaned_tweets)

In [14]:
print('Number of unique bigrams:')
print(len(bigram_vocabulary))

Number of unique bigrams:
71652


### word2index is a dictionary that maps a word to an index, i.e corresponding place in input vector
### index2word is a dictionary that maps an index (from the input vector-space) to a word.

In [15]:
import collections

def createWordMapper(vocabulary, vocabSize):
    '''
    Takes a vocabulary (dictionary) of variable size and a desired vocabulary size as input. 
    Returns the two mappers: word2index & index2word.
    
    NB, when the mappers are created we don't use the original vocabulary anymore (i think).
    '''

    # Order the dictinoary in order to extract the most frequent
    sortedVocabulary = sorted(vocabulary.items(), key=lambda kv: kv[1], reverse=True)

    word2index = collections.OrderedDict()
    index2word = collections.OrderedDict()

    for i in range(vocabSize):
        index = len(word2index)
        word = sortedVocabulary[i][0]
        
        word2index[word] = index
        index2word[index] = word
    word2index['UNK'] = index + 1
    index2word[index + 1] = 'UNK'
            
    return word2index, index2word

In [16]:
# vocabularySize = len(bigram_vocabulary) # Can't use this because memory overflow, but we want to!!!

vocabularySize = 30000

bigram2index, index2bigram = createWordMapper(bigram_vocabulary, vocabularySize - 1)
print('Size of vocabulary: ' +str(len(bigram2index)))
print('Maximum size of vocabulary: ' + str(len(bigram_vocabulary)))

Size of vocabulary: 30000
Maximum size of vocabulary: 71652


### Ilustration

In [17]:
test = bigram2index.get('jag gillar')
print('index: ', test)
print('word: ', index2bigram.get(test))

index:  383
word:  jag gillar


### Create input vectors with BoW approach

In [18]:
def getBigramVector(tweet):
    vec = np.zeros([vocabularySize, 1])
    sentence = tweet.split()
    for ix, word in enumerate(sentence):
        
        try:
            if sentence[ix + 1]:
                word = word.strip() #Removes blanks and '\n' from the end when needed.
                next_word = sentence[ix + 1].strip()
                bi_gram = word + ' ' + next_word
                
                if bigram2index.get(bi_gram) is not None:
                    vec[bigram2index.get(bi_gram)] += 1
                    
                else:
                    vec[bigram2index.get('UNK')] += 1
                
        except:
            continue

    return vec

            
train_x_vectorized = []
for tweet in train_cleaned_tweets:
    train_x_vectorized.append(getBigramVector(tweet))
    
test_x_vectorized = []
for tweet in test_cleaned_tweets:
    test_x_vectorized.append(getBigramVector(tweet))

In [19]:
del train_data
del test_data
del pos
del neg
del neu

In [20]:
# Remove unnecessary 1-dimension which np creates above
train_x_vectorized = np.squeeze(train_x_vectorized)
test_x_vectorized = np.squeeze(test_x_vectorized)

In [21]:
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score



## Linear SVM

In [22]:
from sklearn.svm import LinearSVC
clf_linear = LinearSVC(C=1, max_iter = 10000, verbose=9)

In [23]:
clf_linear.fit(train_x_vectorized, train_Y)

[LibLinear]

LinearSVC(C=1, class_weight=None, dual=True, fit_intercept=True,
     intercept_scaling=1, loss='squared_hinge', max_iter=10000,
     multi_class='ovr', penalty='l2', random_state=None, tol=0.0001,
     verbose=9)

In [24]:
print("Detailed classification report:")
print()
print("The model is trained on the full development set.")
print("The scores are computed on the full evaluation set.")

y_true, y_pred = test_Y, clf_linear.predict(test_x_vectorized)

print(classification_report(y_true, y_pred))
print()
print('Accuracy on test/evaluation set: ', accuracy_score(y_true, y_pred))
print()
print('Confusion matrix:')
print(confusion_matrix(y_true, y_pred))

Detailed classification report:

The model is trained on the full development set.
The scores are computed on the full evaluation set.
              precision    recall  f1-score   support

          -1       0.56      0.37      0.45       336
           0       0.43      0.65      0.52       336
           1       0.58      0.48      0.52       336

   micro avg       0.50      0.50      0.50      1008
   macro avg       0.52      0.50      0.50      1008
weighted avg       0.52      0.50      0.50      1008


Accuracy on test/evaluation set:  0.5

Confusion matrix:
[[124 154  58]
 [ 58 219  59]
 [ 38 137 161]]


### Training a calibrated classifier using platt-scaling, to produce probabilities per class

In [25]:
from sklearn.calibration import CalibratedClassifierCV

clf_calibrated = CalibratedClassifierCV(clf_linear, cv=5) 
clf_calibrated.fit(train_x_vectorized, train_Y)
y_proba = clf_calibrated.predict_proba(test_x_vectorized)

MemoryError: 

### Print tweet, probabilities and prediction per tweet in test set
#### Probabilities: [prob_neg, prob_neu, prob_pos]

In [None]:
for index, i in enumerate(test_X):
    label = y_true[index]
    pred = y_pred[index]

    print()
    print('Original tweet:')
    print(i)
    print()
    print('Preprocessed tweet:')
    print(test_cleaned_tweets[index])
    print()
    print('Probabilities:')
    print('    neg \t neu \t   pos')
    print(y_proba[index])
    print()
    print('Prediction: \t', pred)
    print('label \t\t', label)
    print()
    print('-------------------------')

### Like above, but only the missclassified tweets

In [None]:
for index, i in enumerate(test_X):
    label = y_true[index]
    pred = y_pred[index]
    if not (label == pred):
        count += 1
        print()
        print('Original tweet:')
        print(i)
        print()
        print('Preprocessed tweet:')
        print(test_cleaned_tweets[index])
        print()
        print('Probabilities:')
        print('    neg \t neu \t   pos')
        print(y_proba[index])
        print()
        print('Prediction: \t', pred)
        print('label \t\t', label)
        print()
        print('-------------------------')


## Save model

### The two crucial parts are the model itself and the word2index mapper.

In [None]:
import os
import pickle


def save_model(path, model, prob_model, w2i):
    dir_name = path.split('.')[0]
    if not os.path.exists(dir_name):
        os.mkdir(dir_name)
    else:
        print('Error: \nChoose different model name for saving, as a folder with that name already exists')
        return
    
    model_path = os.path.join(dir_name, path)
    prob_model_path = os.path.join(dir_name, 'prob_'+path)
    w2i_path = os.path.join(dir_name, 'word2index.pkl')
    init_path = os.path.join(dir_name, '__init__.py')
    
    save_pickle(model_path, model)
    save_pickle(prob_model_path, prob_model)
    save_pickle(w2i_path, w2i)
    
    f = open(init_path, 'w')
    f.close()
    
def save_pickle(path, obj):
    with open(path, 'wb') as f:
        pickle.dump(obj, f)
        
    return

### Where to save the model?

In [None]:
model_save_path = 'bigram_linear_svm.pkl' # Must be a .pkl file extension name!


save_model(model_save_path, clf_linear, clf_calibrated, word2index)

## RBF-kernel SVM (Non-linear)

In [None]:
from sklearn.svm import SVC
clf_svc = SVC(C=15, gamma=1e-2, verbose=9)

In [None]:
clf_svc.fit(train_x_vectorized, train_Y)

In [None]:
print()
print("Detailed classification report:")
print()
print("The model is trained on the full development set.")
print("The scores are computed on the full evaluation set.")

y_true, y_pred = test_Y, clf_svc.predict(test_x_vectorized)

print(classification_report(y_true, y_pred))
print()
print('Accuracy on test/evaluation set: ', accuracy_score(y_true, y_pred))
print()
print('Confusion matrix:')
print(confusion_matrix(y_true, y_pred))

In [None]:
print(y_pred)