In [1]:
import sklearn.datasets
import re
import numpy as np
import nltk
import pandas as pd
import pickle
import re
import seaborn as sns
import matplotlib.pyplot as plt
from nltk.util import skipgrams
from nltk.stem.porter import PorterStemmer
from nltk.corpus import stopwords
import itertools
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import StratifiedKFold
from sklearn.svm import LinearSVC
from sklearn.metrics import accuracy_score
from scipy.sparse import hstack

In [2]:
def clearstring(string):
    string = re.sub('[^\'\"A-Za-z0-9 ]+', '', string)
    string = string.split(' ')
    string = filter(None, string)
    string = [y.strip() for y in string]
    string = ' '.join(string)
    return string

def separate_dataset(trainset):
    datastring = []
    datatarget = []
    for i in range(len(trainset.data)):
        data_ = trainset.data[i].split('\n')
        data_ = list(filter(None, data_))
        for n in range(len(data_)):
            data_[n] = clearstring(data_[n])
        datastring += data_
        for n in range(len(data_)):
            datatarget.append(trainset.target[i])
    return datastring, datatarget

In [3]:
trainset = sklearn.datasets.load_files(container_path = 'data', encoding = 'UTF-8')
trainset.data, trainset.target = separate_dataset(trainset)
print (trainset.target_names)
print (len(trainset.data))
print (len(trainset.target))

['anger', 'fear', 'joy', 'love', 'sadness', 'surprise']
416809
416809


In [4]:
cv = StratifiedKFold(n_splits=10, shuffle=True)
stemmer = PorterStemmer()

stopwords= stopwords.words("english")
other_exclusions = ["ff", "rt"]
stopwords.extend(other_exclusions)

def basic_tokenize(tweet):
    """Same as tokenize but without the stemming"""
    #return [token.strip() for token in tweet.split()]
    #tweet = " ".join(re.split("[^a-zA-Z.,!?]*", tweet.lower())).strip()
    #tweet = " ".join(re.split("[^^a-zA-Z.,!?]*", tweet)).strip()
    tweet = " ".join(re.split("[^a-zA-Z#]+", tweet)).strip()
    #tweet = " ".join(re.split("[ ]*", tweet)).strip()
    return tweet.split()

def tokenize(tweet):
    tokens = [stemmer.stem(t) for t in tweet.split()]
    return tokens

def get_metric(vectorizer, X_raw, y_raw, name):
    result={'name':name} 
    y = y_raw
    X = vectorizer.fit_transform(X_raw)
    result['shape'] = X.shape

    aucs = []
    for train, test in cv.split(X, y):
        classifier.fit(X[train], y[train])
        y_preds = classifier.predict(X[test])
        accuracy = accuracy_score(y[test], y_preds)
        aucs.append(accuracy)

    result['accuracies']  = aucs
    result['mean_accuracy'] = np.mean(aucs)
    #result['y_preds'] = y_preds
    return result

In [9]:
classifier = LinearSVC(C=1)
vectorizer_unigrams = TfidfVectorizer(
    ngram_range=(1,1),
    stop_words=other_exclusions,
    tokenizer=basic_tokenize)

result = get_metric(vectorizer_unigrams, np.array(trainset.data), np.array(trainset.target), "unigrams-basic")
result

{'accuracies': [0.886839074944823,
  0.8853277036752711,
  0.8854181661148697,
  0.883906722326184,
  0.886854921906864,
  0.8872867733499676,
  0.8856073510712315,
  0.8853351888286386,
  0.8852632084073132,
  0.8841835020874322],
 'mean_accuracy': 0.8856022612712593,
 'name': 'unigrams-basic',
 'shape': (416809, 75300)}

In [10]:
vectorizer_bigrams = TfidfVectorizer(
    ngram_range=(2,2),
    stop_words=other_exclusions,
    tokenizer=basic_tokenize)

result = get_metric(vectorizer_bigrams, np.array(trainset.data), np.array(trainset.target), "bigrams-basic")
result

{'accuracies': [0.8379234238556761,
  0.8410181364552346,
  0.836284247396958,
  0.8402667818242887,
  0.8405508505074254,
  0.8389194117223675,
  0.8381516758235167,
  0.8382599932818273,
  0.8379480781227506,
  0.83962762128701],
 'mean_accuracy': 0.8388950220277055,
 'name': 'bigrams-basic',
 'shape': (416809, 1078742)}

In [11]:
vectorizer_trigrams = TfidfVectorizer(
    ngram_range=(3,3),
    stop_words=other_exclusions,
    tokenizer=basic_tokenize)

result = get_metric(vectorizer_trigrams, np.array(trainset.data), np.array(trainset.target), "trigrams-basic")
result

{'accuracies': [0.717685442855772,
  0.7179973131177431,
  0.7204308814356317,
  0.721342545943093,
  0.7177131066912982,
  0.7207840502867013,
  0.7194405124637124,
  0.7181726570372858,
  0.7174048658764816,
  0.718028696194635],
 'mean_accuracy': 0.7189000071902354,
 'name': 'trigrams-basic',
 'shape': (416809, 3132203)}

In [13]:
def skipgram_tokenize(tweet, n=None, k=None, include_all=True):
    tokens = [w for w in basic_tokenize(tweet)]
    if include_all:
        result = []
        for i in range(k+1):
            skg = [w for w in skipgrams(tokens, n, i)]
            result = result+skg
    else:
        result = [w for w in skipgrams(tokens, n, k)]
    return result

def make_skip_tokenize(n, k, include_all=True):
    return lambda tweet: skipgram_tokenize(tweet, n=n, k=k, include_all=include_all)

In [14]:
vectorizer_1skipbigram = TfidfVectorizer(stop_words=other_exclusions,
                                       tokenizer=make_skip_tokenize(n=2, k=1))

result = get_metric(vectorizer_1skipbigram, np.array(trainset.data), np.array(trainset.target), "1-skip-bigrams-basic")
result

{'accuracies': [0.8546204778812014,
  0.857211400057576,
  0.8592198071109831,
  0.8580682308910321,
  0.8574890237758211,
  0.8574890237758211,
  0.859672272738178,
  0.8597581457843466,
  0.8564470464033783,
  0.858198569988963],
 'mean_accuracy': 0.8578173998407301,
 'name': '1-skip-bigrams-basic',
 'shape': (416809, 2173548)}

In [15]:
vectorizer_2skipbigram = TfidfVectorizer(stop_words=other_exclusions,
                                       tokenizer=make_skip_tokenize(n=2, k=2))
    
result = get_metric(vectorizer_2skipbigram, np.array(trainset.data), np.array(trainset.target), "2-skip-bigrams-basic")
result

{'accuracies': [0.8657278572114001,
  0.8690384799923232,
  0.8625065975720935,
  0.8649297058682405,
  0.865694201194789,
  0.8641347376502483,
  0.8639667954223748,
  0.8641009645376457,
  0.8640289841163203,
  0.865396612121503],
 'mean_accuracy': 0.8649524935686937,
 'name': '2-skip-bigrams-basic',
 'shape': (416809, 3148371)}

In [16]:
vectorizer_3skipbigram = TfidfVectorizer(stop_words=other_exclusions,
                                       tokenizer=make_skip_tokenize(n=2, k=3))
result = get_metric(vectorizer_3skipbigram, np.array(trainset.data), np.array(trainset.target), "3-skip-bigrams-basic")
result

{'accuracies': [0.8679589290855004,
  0.866735438057768,
  0.8686723285830814,
  0.8700638165155223,
  0.8667498380557088,
  0.87118831122094,
  0.8691490127396175,
  0.8675320312874898,
  0.8699793656125534,
  0.8694275157157253],
 'mean_accuracy': 0.8687456586873907,
 'name': '3-skip-bigrams-basic',
 'shape': (416809, 3970158)}

In [17]:
vectorizer_character_bigram = TfidfVectorizer(stop_words=other_exclusions,
                                       analyzer='char',
                                       ngram_range=(2,2))
result = get_metric(vectorizer_character_bigram, np.array(trainset.data), np.array(trainset.target), "character bigrams")
result

{'accuracies': [0.5996305536896651,
  0.5999424239516361,
  0.5961326231946643,
  0.5985317403195624,
  0.599553753508793,
  0.5976823972553441,
  0.5958350327487344,
  0.5985891837420222,
  0.5962618167858342,
  0.5992849944815011],
 'mean_accuracy': 0.5981444519677757,
 'name': 'character bigrams',
 'shape': (416809, 719)}

In [18]:
vectorizer_character_trigram = TfidfVectorizer(stop_words=other_exclusions,
                                       analyzer='char',
                                       ngram_range=(3,3))
result = get_metric(vectorizer_character_trigram, np.array(trainset.data), np.array(trainset.target), "character trigrams")
result

{'accuracies': [0.8545485078207465,
  0.8570674599366663,
  0.8541336788061993,
  0.8529101290725013,
  0.8554017418008205,
  0.8570331805858784,
  0.8540821957246707,
  0.8535678295503623,
  0.8528240318633332,
  0.8530639666010845],
 'mean_accuracy': 0.8544632721762262,
 'name': 'character trigrams',
 'shape': (416809, 11654)}

In [19]:
vectorizer_character_4gram = TfidfVectorizer(stop_words=other_exclusions,
                                       analyzer='char',
                                       ngram_range=(4,4))
result = get_metric(vectorizer_character_4gram, np.array(trainset.data), np.array(trainset.target), "character 4-grams")
result

{'accuracies': [0.8830246617407158,
  0.8833605220228385,
  0.8822753226812533,
  0.8817954992562737,
  0.8831841846404836,
  0.8833761186151964,
  0.8803771502603105,
  0.8816641873410432,
  0.8830078218724507,
  0.8822400307116465],
 'mean_accuracy': 0.8824305499142213,
 'name': 'character 4-grams',
 'shape': (416809, 77730)}

In [20]:
vectorizer_character_5gram = TfidfVectorizer(stop_words=other_exclusions,
                                       analyzer='char',
                                       ngram_range=(5,5))
result = get_metric(vectorizer_character_5gram, np.array(trainset.data), np.array(trainset.target), "character 5-grams")
result

{'accuracies': [0.8752039151712887,
  0.8788983782746378,
  0.8777889736576939,
  0.8748860419365674,
  0.8767783882344473,
  0.8762505698039874,
  0.876418512031861,
  0.8748020538413551,
  0.8786650031191516,
  0.8798406833341331],
 'mean_accuracy': 0.8769532519405123,
 'name': 'character 5-grams',
 'shape': (416809, 282187)}

In [21]:
vectorizer_character_6gram = TfidfVectorizer(stop_words=other_exclusions,
                                       analyzer='char',
                                       ngram_range=(6,6))
result = get_metric(vectorizer_character_6gram, np.array(trainset.data), np.array(trainset.target), "character 6-grams")
result

{'accuracies': [0.8755157854332598,
  0.8773150369446311,
  0.8739024039153591,
  0.8726308718391632,
  0.873659461145366,
  0.874787073246803,
  0.8748830402341594,
  0.8730025433082201,
  0.8732664715197467,
  0.87477806036758],
 'mean_accuracy': 0.874374074795429,
 'name': 'character 6-grams',
 'shape': (416809, 773532)}

In [22]:
vectorizer_character_7gram = TfidfVectorizer(stop_words=other_exclusions,
                                       analyzer='char',
                                       ngram_range=(7,7))

result = get_metric(vectorizer_character_7gram, np.array(trainset.data), np.array(trainset.target), "character 7-grams")
result

{'accuracies': [0.8736925439017369,
  0.8703579311006622,
  0.8731586776066408,
  0.8717671896741999,
  0.8700367073726638,
  0.8704925505626064,
  0.8684292603344449,
  0.8723067325687412,
  0.8704112481405057,
  0.8724746868851673],
 'mean_accuracy': 0.871312752814737,
 'name': 'character 7-grams',
 'shape': (416809, 1710117)}

In [23]:
vectorizer_character_8gram = TfidfVectorizer(stop_words=other_exclusions,
                                       analyzer='char',
                                       ngram_range=(8,8))
result = get_metric(vectorizer_character_8gram, np.array(trainset.data), np.array(trainset.target), "character 8-grams")
result

{'accuracies': [0.8691584300930812,
  0.8681268592265617,
  0.8658893527181997,
  0.8656014586632119,
  0.8677574914229506,
  0.8659101269163407,
  0.866533912334157,
  0.8647727818033495,
  0.8675080378137147,
  0.8680358942367676],
 'mean_accuracy': 0.8669294345228336,
 'name': 'character 8-grams',
 'shape': (416809, 3179100)}

In [24]:
def get_metric_oracle(X_raw, y_raw, vectorizers):
    results = {"oracle":{}}
    for train, test in cv.split(X_raw, y_raw):
        y_train = y_raw[train]
        X_train = X_raw[train]
        
        y_test = y_raw[test]
        X_test = X_raw[test]
        
        y_pred_oracle = []
        for name in vectorizers:
            vectorizer = vectorizers[name]
            if name in results:
                result = results[name]
            else:
                result = {}
                results[name] = result
                
            X_train_tr = vectorizer.fit_transform(X_train)
            
            if not "shape" in result:
                result["shape"] = []
            result['shape'].append(X_train_tr.shape)
            classifier.fit(X_train_tr, y_train)
            X_test_tr = vectorizer.transform(X_test)
            y_preds = classifier.predict(X_test_tr)
            accuracy = accuracy_score(y_test, y_preds)
            
            if not "accuracies" in result:
                result["accuracies"] = []           
            
            result['accuracies'].append(accuracy)
            
            if not "y_preds" in result:
                result["y_preds"] = []
                
            result['y_preds'].append(y_preds)   
            
            y_pred_oracle.append(y_preds)
            
        y_pred_oracle = np.matrix(y_pred_oracle).T
        oracle_correct_pred = 0
        oracle_incorrect_index = []
        for i, yt in enumerate(y_test):
            if True in  (y_pred_oracle[i,:] == yt):
                 oracle_correct_pred += 1
            else:
                oracle_incorrect_index.append(test[i])
                
        accuracy = oracle_correct_pred/len(y_test)
        print("Oracle classifier accuracy={}".format(accuracy))
        result = results["oracle"]
    
        if not "accuracies" in result:
            result["accuracies"] = []           
            
        result['accuracies'].append(accuracy)
        
        if not "oracle_incorrect_index" in result:
            result["oracle_incorrect_index"] = []  
            
        result['oracle_incorrect_index'] = oracle_incorrect_index
    return results

In [25]:
vectorizers = {"vectorizer_character_8gram":vectorizer_character_8gram,
              "vectorizer_character_7gram":vectorizer_character_7gram,
              "vectorizer_character_6gram":vectorizer_character_6gram,
              "vectorizer_character_5gram":vectorizer_character_5gram,
              "vectorizer_character_4gram":vectorizer_character_4gram,
              "vectorizer_1skipbigram": vectorizer_1skipbigram,
              "vectorizer_2skipbigram": vectorizer_2skipbigram,
              "vectorizer_3skipbigram": vectorizer_3skipbigram,
              "vectorizer_unigrams": vectorizer_unigrams,
              "vectorizer_bigrams": vectorizer_bigrams,
              "vectorizer_trigrams": vectorizer_trigrams}
               
results = get_metric_oracle(np.array(trainset.data), np.array(trainset.target), vectorizers)

Oracle classifier accuracy=0.9312446022454659
Oracle classifier accuracy=0.9294453507340946
Oracle classifier accuracy=0.9295859123842426
Oracle classifier accuracy=0.9295859123842426
Oracle classifier accuracy=0.9305438929008422
Oracle classifier accuracy=0.9279287924953816
Oracle classifier accuracy=0.9291523715841751
Oracle classifier accuracy=0.9278516243581746
Oracle classifier accuracy=0.9277316569892989
Oracle classifier accuracy=0.9281395460434761


In [26]:
incorrect_indexes = sorted(set(results["oracle"]["oracle_incorrect_index"]))
print(len(incorrect_indexes))

2995


In [28]:
X_incorrect = np.array(trainset.data)[incorrect_indexes]
y_incorrect = np.array(trainset.target)[incorrect_indexes]
incorrect_classified = pd.DataFrame()
incorrect_classified["text"] = X_incorrect
incorrect_classified["label"] = y_incorrect
incorrect_classified

Unnamed: 0,text,label
0,i actually did a good job teaching them and or...,5
1,i agreed many months ago and as the time got c...,5
2,i also had a gazillion other things that just ...,5
3,i also kind of stop keeping up with blogs when...,5
4,i also often feel a little overwhelmed by my n...,5
5,i am also noticing that i can only handle so m...,5
6,i am assuming you guys too feel if you think i...,5
7,i am avoiding spending money it definitely fee...,5
8,i am constantly feeling overwhelmed about my f...,5
9,i am enough even when i feel weird,5


In [29]:
incorrect_classified.label.value_counts()

2    699
3    664
1    586
5    374
0    359
4    313
Name: label, dtype: int64

In [30]:
summary = []
for name in results:
    result = results[name]
    accuracies = result["accuracies"]
    summary.append({"name": name, "accuracy":np.mean(accuracies)})
df_summary = pd.DataFrame(summary)
df_summary = df_summary.sort_values(by=['accuracy'],ascending=False)
df_summary = df_summary.reset_index()
df_summary

Unnamed: 0,index,accuracy,name
0,7,0.929121,oracle
1,2,0.885732,vectorizer_unigrams
2,10,0.882339,vectorizer_character_4gram
3,8,0.877203,vectorizer_character_5gram
4,6,0.87446,vectorizer_character_6gram
5,11,0.87185,vectorizer_character_7gram
6,9,0.870603,vectorizer_3skipbigram
7,4,0.867592,vectorizer_character_8gram
8,3,0.866658,vectorizer_2skipbigram
9,5,0.859909,vectorizer_1skipbigram
