In [1]:
from collections import Counter
import json
from pathlib import Path
from nltk.tokenize import WordPunctTokenizer
import pandas as pd
import numpy as np
from scipy import sparse
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, f1_score, recall_score

#### Create vocab from training data

In [41]:
data_train_df = pd.read_csv('data/Twitter/hate_twitter/train_clean.csv')

# using code from https://stackoverflow.com/questions/38250710/how-to-split-data-into-3-sets-train-validation-and-test
train, val, test = np.split(data_train_df.sample(frac=1, random_state=8),\
    [int(0.7*len(data_train_df)),int(0.85*len(data_train_df))])
print(train.shape)
print(val.shape)
print(test.shape)

(22373, 8)
(4794, 8)
(4795, 8)


In [42]:
data_train_df.shape

(31962, 8)

In [43]:
train.to_csv('data/Twitter/hate_twitter/hate_train.csv')
val.to_csv('data/Twitter/hate_twitter/hate_val.csv')
test.to_csv('data/Twitter/hate_twitter/hate_test.csv')

In [2]:
train_df = pd.read_csv('data/Twitter/hate_twitter/hate_train.csv')

In [3]:
data_train = list(train_df[train_df['clean_tweet'].notna()]['clean_tweet'])
data_train[0:5]

['omg omg omg yay found wonderful price segasaturn throwback',
 'payintheusa polar bear climb racing angry polar bear climb racing polar bear living cold place',
 'trainhard polar bear climb racing angry polar bear climb racing polar bear living cold places lo',
 'turn resignation',
 'happy bihday hajime hosogai bihday bihday 30']

In [4]:
# funky behavior with bad tweets - do we want to capture some of these?
train_df[train_df['clean_tweet'].isna()][0:5]

Unnamed: 0.2,Unnamed: 0,Unnamed: 0.1,id,label,tweet,hash_tag,clean_tweet,tokenized_tweet,tokenized_tweet_NLTK
875,21970,21970,21971,0,@user not to me.,[],,<user> not to me.,@user
2122,15494,15494,15495,0,@user @user me to!,[],,<user> <user> me to!,@user @user
3865,16320,16320,16321,0,@user @user ð very,[],,<user> <user> ð very,@user @user
4114,5028,5028,5029,0,@user as you should.,[],,<user> as you should.,@user
5241,15434,15434,15435,0,ðð» that is all....,[],,ðð» that is all. <repeat>,


In [40]:
data_dir = Path('data/Twitter/hate_twitter/')  # Modify the path of `data_dir` as needed.
tokenizer = WordPunctTokenizer()
counter = Counter()
counter.update(['<pad>', '<unk>'])


# create unigram vocab
tweet_lens  = []
for i, line in enumerate(data_train):
    #print(i)
    tokens = tokenizer.tokenize(line.strip())
    tokens = [t.lower() for t in tokens]
    tweet_lens.append(len(tokens))
    counter.update(tokens)
    if i < 3:
        print(f"String of line {i}: {line.strip()}")
        print(f"Tokens of line {i}: {tokens}")
counter = dict(counter)

vocab = {}
# Populate the vocabulary with words that appear at least 3 times.
for word, freq in counter.items():
    if freq < 3 and word not in ['<pad>', '<unk>']:
        continue
    vocab[word] = len(vocab)

output_filepath = data_dir.joinpath('unigram_vocab.json')
json.dump(vocab, open(output_filepath, mode='w'))

String of line 0: omg omg omg yay found wonderful price segasaturn throwback
Tokens of line 0: ['omg', 'omg', 'omg', 'yay', 'found', 'wonderful', 'price', 'segasaturn', 'throwback']
String of line 1: payintheusa polar bear climb racing angry polar bear climb racing polar bear living cold place
Tokens of line 1: ['payintheusa', 'polar', 'bear', 'climb', 'racing', 'angry', 'polar', 'bear', 'climb', 'racing', 'polar', 'bear', 'living', 'cold', 'place']
String of line 2: trainhard polar bear climb racing angry polar bear climb racing polar bear living cold places lo
Tokens of line 2: ['trainhard', 'polar', 'bear', 'climb', 'racing', 'angry', 'polar', 'bear', 'climb', 'racing', 'polar', 'bear', 'living', 'cold', 'places', 'lo']


In [44]:
np.max(tweet_lens)

21

In [9]:
tokenizer = WordPunctTokenizer()
counter = Counter()
counter.update(['<pad>', '<unk>'])

# create bigram vocab
for i, line in enumerate(data_train):
    tokens = tokenizer.tokenize(line.strip())
    # tokens = line.split()
    tokens = [t.lower() for t in tokens]
    tokens = [t if t in vocab else '<unk>' for t in tokens]
    bigram_lst = [tokens[i] + " " + tokens[i + 1] for i in range(len(tokens) - 1)]
    counter.update([tokens[i] + " " + tokens[i + 1] for i in range(len(tokens) - 1)])
    if i < 3:
        print(f"String of line {i}: {line.strip()}")
        print(f"Tokens of line {i}: {bigram_lst}")
counter = dict(counter)
print(f"Vocab size before frequency filtering: {len(counter)}")

vocab = {}
for word, freq in list(counter.items()):
    if freq < 3:
        continue
    vocab[word] = len(vocab)

print(f"Vocab size after frequency filtering: {len(vocab)}")
output_filepath = data_dir.joinpath('bigram_vocab.json')
json.dump(vocab, open(output_filepath, mode='w'))

String of line 0: omg omg omg yay found wonderful price segasaturn throwback
Tokens of line 0: ['omg omg', 'omg omg', 'omg yay', 'yay found', 'found wonderful', 'wonderful price', 'price <unk>', '<unk> throwback']
String of line 1: payintheusa polar bear climb racing angry polar bear climb racing polar bear living cold place
Tokens of line 1: ['payintheusa polar', 'polar bear', 'bear climb', 'climb racing', 'racing angry', 'angry polar', 'polar bear', 'bear climb', 'climb racing', 'racing polar', 'polar bear', 'bear living', 'living cold', 'cold place']
String of line 2: trainhard polar bear climb racing angry polar bear climb racing polar bear living cold places lo
Tokens of line 2: ['trainhard polar', 'polar bear', 'bear climb', 'climb racing', 'racing angry', 'angry polar', 'polar bear', 'bear climb', 'climb racing', 'racing polar', 'polar bear', 'bear living', 'living cold', 'cold places', 'places lo']
Vocab size before frequency filtering: 82684
Vocab size after frequency filterin

#### Extract features from training data

In [48]:
def extract_features(vocab, data_dir, feature_field, tokenizer, feature_name):
    """
    Extract and save different features based on vocab of the features.
    # Parameters
    vocab : `dict[str, int]`, required.
        A map from the word type to the index of the word.
    data_dir : `Path`, required.
        Directory of the dataset
    tokenizer : `Callable`, required.
        Tokenizer with a method `.tokenize` which returns list of tokens.
    feature_name : `str`, required.
        Name of the feature, such as unigram_binary.
    # Returns
        `None`
    """
    # Extract and save the vocab and features.

    data_dir = Path(data_dir)
    splits = ['train','test','val']
    #splits = ['train']

    gram, mode = feature_name.split('_')
    if gram not in ['unigram', 'bigram'] or mode not in ['binary', 'count']:
        raise NotImplementedError

    for split in splits:
        datapath = data_dir.joinpath(f'hate_{split}.csv')
        print('datapath',datapath)
        data_df = pd.read_csv(datapath)
        data_df = data_df[data_df[feature_field].notna()]
        print('data df cols',data_df.columns)
        features = list(data_df[feature_field])
        
        sent_lengths = []
        values, rows, cols = [], [], []
        labels = list(data_df['label'])
        print(f"\nExtract {gram} {mode} features from {datapath}")
        for i, line in enumerate(features):
            if i % 1000 == 1:
                print(f"Processing {i}/{len(features)} row")
            #label = int(line[0])
            tokens = tokenizer.tokenize(line.strip())
            # tokens = line[1:].strip().split(  )  # Tokenizing differently affects the results.
            tokens = [t.lower() for t in tokens]
            tokens = [t if t in vocab else '<unk>' for t in tokens]
            if gram.find('bigram') != -1:
                #print('yes bigram')
                tokens.extend(
                    [tokens[i] + ' ' + tokens[i + 1] for i in range(len(tokens) - 1)])
            feature = {}
            for tk in tokens:
                if tk not in vocab:
                    continue
                elif mode == 'binary':
                    feature[vocab[tk]] = 1
                elif mode == 'count':
                    feature[vocab[tk]] = feature.get(vocab[tk], 0) + 1
                else:
                    raise NotImplementedError
            for j in feature:
                values.append(feature[j])
                rows.append(i)
                cols.append(j)
            sent_lengths.append(len(tokens))
            #labels.append(label)

        features = sparse.csr_matrix((values, (rows, cols)),
                                     shape=(len(features), len(vocab)))
        print(f"{split} feature matrix shape: {features.shape}")
        output_feature_filepath = data_dir.joinpath(f'{split}_{gram}_{mode}_features.npz')
        sparse.save_npz(output_feature_filepath, features)

        np_labels = np.asarray(labels)
        print(f"{split} label array shape: {np_labels.shape}")
        output_label_filepath = data_dir.joinpath(f'{split}_labels.npz')
        np.savez(output_label_filepath, np_labels)


In [13]:
data_dir = "data/Twitter/hate_twitter/"
vocab_filepath = "data/Twitter/hate_twitter/unigram_vocab.json"

extract_features(vocab=json.load(open(vocab_filepath)),
                 tokenizer=tokenizer,
                 feature_field="clean_tweet",
                 data_dir=data_dir,
                 feature_name='unigram_binary')
extract_features(vocab=json.load(open(vocab_filepath)),
                 tokenizer=tokenizer,
                 feature_field="clean_tweet",
                 data_dir=data_dir,
                 feature_name='unigram_count')

datapath data/Twitter/hate_twitter/hate_train.csv
data df cols Index(['Unnamed: 0', 'Unnamed: 0.1', 'id', 'label', 'tweet', 'hash_tag',
       'clean_tweet', 'tokenized_tweet', 'tokenized_tweet_NLTK'],
      dtype='object')

Extract unigram binary features from data/Twitter/hate_twitter/hate_train.csv
Processing 1/22350 row
Processing 1001/22350 row
Processing 2001/22350 row
Processing 3001/22350 row
Processing 4001/22350 row
Processing 5001/22350 row
Processing 6001/22350 row
Processing 7001/22350 row
Processing 8001/22350 row
Processing 9001/22350 row
Processing 10001/22350 row
Processing 11001/22350 row
Processing 12001/22350 row
Processing 13001/22350 row
Processing 14001/22350 row
Processing 15001/22350 row
Processing 16001/22350 row
Processing 17001/22350 row
Processing 18001/22350 row
Processing 19001/22350 row
Processing 20001/22350 row
Processing 21001/22350 row
Processing 22001/22350 row
train feature matrix shape: (22350, 7562)
train label array shape: (22350,)
datapath data

In [14]:
data_dir = "data/Twitter/hate_twitter/"
vocab_filepath = "data/Twitter/hate_twitter/bigram_vocab.json"

extract_features(vocab=json.load(open(vocab_filepath)),
                 tokenizer=tokenizer,
                 feature_field="clean_tweet",
                 data_dir=data_dir,
                 feature_name='bigram_count')
extract_features(vocab=json.load(open(vocab_filepath)),
                 tokenizer=tokenizer,
                 feature_field="clean_tweet",
                 data_dir=data_dir,
                 feature_name='bigram_binary')

datapath data/Twitter/hate_twitter/hate_train.csv
data df cols Index(['Unnamed: 0', 'Unnamed: 0.1', 'id', 'label', 'tweet', 'hash_tag',
       'clean_tweet', 'tokenized_tweet', 'tokenized_tweet_NLTK'],
      dtype='object')

Extract bigram count features from data/Twitter/hate_twitter/hate_train.csv
Processing 1/22350 row
Processing 1001/22350 row
Processing 2001/22350 row
Processing 3001/22350 row
Processing 4001/22350 row
Processing 5001/22350 row
Processing 6001/22350 row
Processing 7001/22350 row
Processing 8001/22350 row
Processing 9001/22350 row
Processing 10001/22350 row
Processing 11001/22350 row
Processing 12001/22350 row
Processing 13001/22350 row
Processing 14001/22350 row
Processing 15001/22350 row
Processing 16001/22350 row
Processing 17001/22350 row
Processing 18001/22350 row
Processing 19001/22350 row
Processing 20001/22350 row
Processing 21001/22350 row
Processing 22001/22350 row
train feature matrix shape: (22350, 7933)
train label array shape: (22350,)
datapath data/T

In [15]:
def fit_and_eval_logistic_regression(data_dir: Path,
                                     feature_name: str,
                                     tune: bool = False) -> LogisticRegression:
    """
    Fit and evaluate the logistic regression model using the scikit-learn library.
    # Parameters
    data_dir : `Path`, required
        The data directory.
    feature_name : `str`, required.
        Name of the feature, such as unigram_binary.
    tune : `bool`, optional
        Whether or not to tune the hyperparameters of the regularization strength
        of the model of the [logistic regression](https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.LogisticRegression.html).
    # Returns
        model_trained: `LogisticRegression`
            The object of `LogisticRegression` after it is trained.
    """
    # Implement logistic regression with scikit-learn.
    # Print out the accuracy scores on dev and test data.

    splits = ['train', 'val','test']
    features, labels = {}, {}

    for split in splits:
        features_path = data_dir.joinpath(f'{split}_{feature_name}_features.npz')
        labels_path = data_dir.joinpath(f'{split}_labels.npz')
        features[split] = sparse.load_npz(features_path)
        labels[split] = np.load(labels_path)['arr_0']
    best_dev, best_model = 0, None
    if tune:
        for c in np.linspace(-5, 5, 11):
            clf = LogisticRegression(random_state=42,
                                     max_iter=100,
                                     fit_intercept=False,
                                     C=np.exp2(c))
            clf.fit(features['train'], labels['train'])
            dev_preds = clf.predict(features['val'])
            dev_accuracy = accuracy_score(labels['val'], dev_preds)
            print(c, dev_accuracy)
            if dev_accuracy > best_dev:
                best_dev, best_model = dev_accuracy, clf
    else:
        best_model = LogisticRegression(random_state=42,
                                        max_iter=100,
                                        fit_intercept=False)
        best_model.fit(features['train'], labels['train'])

    preds = {
        'val': best_model.predict(features['val']),
        'test': best_model.predict(features['test'])
    }
    for splt, splt_preds in preds.items():
        print("{} accuracy: {:.4f}".format(splt, accuracy_score(labels[splt],
                                                                splt_preds))),
        print("{} binary recall: {:.4f}".format(splt, recall_score(labels[splt],
                                                                splt_preds))),
        print("{} macro recall: {:.4f}".format(splt, recall_score(labels[splt],
                                                                splt_preds, 
                                                                average="macro"))),
        print("{} macro f1: {:.4f}".format(
            splt, f1_score(labels[splt], splt_preds, average='macro')))

    return best_model


In [28]:
# confirm bigram binary different from bigram count
split = 'val'
feature_name='bigram_binary'
features = {}
features_path = Path(data_dir).joinpath(f'{split}_{feature_name}_features.npz')
#labels_path = data_dir.joinpath(f'{split}_labels.npz')
features[split] = sparse.load_npz(features_path)
features[split].toarray()[0:5,18:19]

array([[1],
       [1],
       [1],
       [1],
       [1]])

In [27]:
# indices where nonzero
np.nonzero(features[split].toarray())

(array([   0,    1,    2, ..., 4787, 4788, 4789]),
 array([18, 18, 18, ..., 18, 18, 18]))

In [29]:
# confirm bigram binary different from bigram count
split = 'val'
feature_name='bigram_count'
features = {}
features_path = Path(data_dir).joinpath(f'{split}_{feature_name}_features.npz')
#labels_path = data_dir.joinpath(f'{split}_labels.npz')
features[split] = sparse.load_npz(features_path)
features[split].toarray()[0:5,18:19]

array([[3],
       [7],
       [5],
       [1],
       [6]])

In [18]:
fit_and_eval_logistic_regression(feature_name='unigram_binary',
                                 data_dir=Path(data_dir),
                                 tune=False)

val accuracy: 0.9559
val binary recall: 0.5145
val macro recall: 0.7523
val macro f1: 0.8016
test accuracy: 0.9570
test binary recall: 0.5296
test macro recall: 0.7586
test macro f1: 0.8000


LogisticRegression(fit_intercept=False, random_state=42)

In [19]:
fit_and_eval_logistic_regression(feature_name='unigram_count',
                                 data_dir=Path(data_dir),
                                 tune=False)

val accuracy: 0.9547
val binary recall: 0.5116
val macro recall: 0.7503
val macro f1: 0.7973
test accuracy: 0.9558
test binary recall: 0.5265
test macro recall: 0.7565
test macro f1: 0.7955


LogisticRegression(fit_intercept=False, random_state=42)

In [20]:
fit_and_eval_logistic_regression(feature_name='bigram_binary',
                                 data_dir=Path(data_dir),
                                 tune=False)

val accuracy: 0.9282
val binary recall: 0.0000
val macro recall: 0.5000
val macro f1: 0.4814
test accuracy: 0.9330
test binary recall: 0.0000
test macro recall: 0.5000
test macro f1: 0.4827


LogisticRegression(fit_intercept=False, random_state=42)

In [21]:
fit_and_eval_logistic_regression(feature_name='bigram_count',
                                 data_dir=Path(data_dir),
                                 tune=False)

val accuracy: 0.9282
val binary recall: 0.0000
val macro recall: 0.5000
val macro f1: 0.4814
test accuracy: 0.9330
test binary recall: 0.0000
test macro recall: 0.5000
test macro f1: 0.4827


LogisticRegression(fit_intercept=False, random_state=42)

#### Show important weights

Code heavily modeled on code from Chenhao Tan's Winter 2022 NLP Course at the University of Chicago

In [37]:
def print_important_weights(weights, words):
    """
    Print important pairs of weights and words.
    # Parameters
    weights : `Iterable`, required.
        Weights from a learned model.
    words : `Iterable`, required.
        Word types of the vocabulary.  
        It must be true that `len(weights) == len(words)`.
    # Returns
        `None`
    """

    def print_pairs(pairs):
        for weight, word in pairs:
            print("{: .4f} | {}".format(weight, word))

    assert len(weights) == len(words)
    pairs = list(zip(weights, words))
    pairs = sorted(pairs, key=lambda x: x[0], reverse=True)
    print("Most hateful words:")
    print_pairs(pairs[:10])
    print("\nLeast hateful words:")
    print_pairs(reversed(pairs[-10:]))

    pairs = list(zip(abs(weights), words))
    pairs = sorted(pairs, key=lambda x: x[0], reverse=False)
    print("\nMost neutral words:")
    print_pairs(pairs[:10])

In [39]:
data_dir = "data/Twitter/hate_twitter/"
vocab_filepath = "data/Twitter/hate_twitter/unigram_vocab.json"

model_trained: LogisticRegression = fit_and_eval_logistic_regression(
    feature_name='unigram_binary', data_dir=Path(data_dir), tune=False)
weights = model_trained.coef_[0]
vocab = json.load(open(vocab_filepath))
print("")
print_important_weights(weights=weights, words=vocab.keys())

val accuracy: 0.9559
val binary recall: 0.5145
val macro recall: 0.7523
val macro f1: 0.8016
test accuracy: 0.9570
test binary recall: 0.5296
test macro recall: 0.7586
test macro f1: 0.8000

Most hateful words:
 4.2309 | allahsoil
 3.1589 | racism
 2.6945 | misogyny
 2.6012 | bigot
 2.5492 | 2017
 2.5025 | white
 2.2742 | latest
 2.1515 | racist
 2.1493 | treason
 2.1267 | misogynist

Least hateful words:
-3.0948 | bihday
-2.6052 | day
-2.5491 | orlando
-2.3904 | positive
-2.3258 | healthy
-2.2211 | friday
-2.1652 | weekend
-2.1620 | hardcore
-2.1542 | thankful
-2.1484 | days

Most neutral words:
 0.0000 | <pad>
 0.0000 | remains
 0.0001 | grandad
 0.0001 | awasome
 0.0001 | inwoo
 0.0002 | selfrespect
 0.0003 | titanic
 0.0003 | lauren
 0.0004 | ebay
 0.0005 | systems


## Upsampled Twitter Data

In [45]:
upsampled_train_df = pd.read_csv('data/Twitter/hate_twitter/train_upsampled.csv')

# using code from https://stackoverflow.com/questions/38250710/how-to-split-data-into-3-sets-train-validation-and-test
train_up, val_up, test_up = np.split(upsampled_train_df.sample(frac=1, random_state=8),\
    [int(0.7*len(upsampled_train_df)),int(0.85*len(upsampled_train_df))])
print(train_up.shape)
print(val_up.shape)
print(test_up.shape)

(41608, 8)
(8916, 8)
(8916, 8)


In [46]:
train_up.head()

Unnamed: 0.1,Unnamed: 0,id,label,tweet,hash_tag,clean_tweet,tokenized_tweet,tokenized_tweet_NLTK
44049,15405,15406,0,@user we really are. lmaoo we are obviously re...,[],really lmaoo obviously related haha wonder cam...,<user> we really are. lmaoo we are obviously r...,@user really lmaoo obviously related haha wond...
33142,3684,3685,0,i think my #hea need a #bandage again ð #...,"['hea', 'bandage', 'love']",think hea need bandage love,i think my <hashtag> hea need a <hashtag> band...,think hea need bandage love
40964,12093,12094,0,new phone #xperiaz3+ #xperiaz4 #sony #thebeast...,"['xperiaz3', 'xperiaz4', 'sony', 'thebeast', '...",new phone xperiaz3 xperiaz4 sony thebeast copp...,new phone <hashtag> xperiaz<number>+ <hashtag>...,new phone xperiaz3 xperiaz4 sony thebeast copp...
53102,25149,25150,0,#dinner at my friend's #steak #shop never #d...,"['dinner', 'steak', 'shop', 'disappoint', 'yum...",dinner friends steak shop never disappoint yum...,<hashtag> dinner at my friend's <hashtag> st...,dinner friend's steak shop never disappoint yu...
15142,12744,12745,1,@user @user @user and sorry if we are ethnical...,[],sorry ethnically cleansing east jerusalem beth...,<user> <user> <user> and sorry if we are ethni...,@user @user @user sorry ethnically cleansing e...


In [47]:
train_up.to_csv('data/Twitter/hate_twitter/hate_upsampled_train.csv')
val_up.to_csv('data/Twitter/hate_twitter/hate_upsampled_val.csv')
test_up.to_csv('data/Twitter/hate_twitter/hate_upsampled_test.csv')

In [50]:
def extract_features(vocab, data_dir, feature_field, tokenizer, feature_name):
    """
    Extract and save different features based on vocab of the features.
    # Parameters
    vocab : `dict[str, int]`, required.
        A map from the word type to the index of the word.
    data_dir : `Path`, required.
        Directory of the dataset
    tokenizer : `Callable`, required.
        Tokenizer with a method `.tokenize` which returns list of tokens.
    feature_name : `str`, required.
        Name of the feature, such as unigram_binary.
    # Returns
        `None`
    """
    # Extract and save the vocab and features.

    data_dir = Path(data_dir)
    splits = ['train','test','val']
    #splits = ['train']

    gram, mode = feature_name.split('_')
    if gram not in ['unigram', 'bigram'] or mode not in ['binary', 'count']:
        raise NotImplementedError

    for split in splits:
        datapath = data_dir.joinpath(f'hate_upsampled_{split}.csv')
        print('datapath',datapath)
        data_df = pd.read_csv(datapath)
        data_df = data_df[data_df[feature_field].notna()]
        print('data df cols',data_df.columns)
        features = list(data_df[feature_field])
        
        sent_lengths = []
        values, rows, cols = [], [], []
        labels = list(data_df['label'])
        print(f"\nExtract {gram} {mode} features from {datapath}")
        for i, line in enumerate(features):
            if i % 1000 == 1:
                print(f"Processing {i}/{len(features)} row")
            #label = int(line[0])
            tokens = tokenizer.tokenize(line.strip())
            # tokens = line[1:].strip().split(  )  # Tokenizing differently affects the results.
            tokens = [t.lower() for t in tokens]
            tokens = [t if t in vocab else '<unk>' for t in tokens]
            if gram.find('bigram') != -1:
                #print('yes bigram')
                tokens.extend(
                    [tokens[i] + ' ' + tokens[i + 1] for i in range(len(tokens) - 1)])
            feature = {}
            for tk in tokens:
                if tk not in vocab:
                    continue
                elif mode == 'binary':
                    feature[vocab[tk]] = 1
                elif mode == 'count':
                    feature[vocab[tk]] = feature.get(vocab[tk], 0) + 1
                else:
                    raise NotImplementedError
            for j in feature:
                values.append(feature[j])
                rows.append(i)
                cols.append(j)
            sent_lengths.append(len(tokens))
            #labels.append(label)

        features = sparse.csr_matrix((values, (rows, cols)),
                                     shape=(len(features), len(vocab)))
        print(f"{split} feature matrix shape: {features.shape}")
        output_feature_filepath = data_dir.joinpath(f'{split}_{gram}_{mode}_upsamp_features.npz')
        sparse.save_npz(output_feature_filepath, features)

        np_labels = np.asarray(labels)
        print(f"{split} label array shape: {np_labels.shape}")
        output_label_filepath = data_dir.joinpath(f'{split}_upsamp_labels.npz')
        np.savez(output_label_filepath, np_labels)


In [51]:
data_dir = "data/Twitter/hate_twitter/"
vocab_filepath = "data/Twitter/hate_twitter/unigram_vocab.json"

extract_features(vocab=json.load(open(vocab_filepath)),
                 tokenizer=tokenizer,
                 feature_field="clean_tweet",
                 data_dir=data_dir,
                 feature_name='unigram_binary')
extract_features(vocab=json.load(open(vocab_filepath)),
                 tokenizer=tokenizer,
                 feature_field="clean_tweet",
                 data_dir=data_dir,
                 feature_name='unigram_count')

datapath data/Twitter/hate_twitter/hate_upsampled_train.csv
data df cols Index(['Unnamed: 0', 'Unnamed: 0.1', 'id', 'label', 'tweet', 'hash_tag',
       'clean_tweet', 'tokenized_tweet', 'tokenized_tweet_NLTK'],
      dtype='object')

Extract unigram binary features from data/Twitter/hate_twitter/hate_upsampled_train.csv
Processing 1/41560 row
Processing 1001/41560 row
Processing 2001/41560 row
Processing 3001/41560 row
Processing 4001/41560 row
Processing 5001/41560 row
Processing 6001/41560 row
Processing 7001/41560 row
Processing 8001/41560 row
Processing 9001/41560 row
Processing 10001/41560 row
Processing 11001/41560 row
Processing 12001/41560 row
Processing 13001/41560 row
Processing 14001/41560 row
Processing 15001/41560 row
Processing 16001/41560 row
Processing 17001/41560 row
Processing 18001/41560 row
Processing 19001/41560 row
Processing 20001/41560 row
Processing 21001/41560 row
Processing 22001/41560 row
Processing 23001/41560 row
Processing 24001/41560 row
Processing 2500

In [52]:
data_dir = "data/Twitter/hate_twitter/"
vocab_filepath = "data/Twitter/hate_twitter/bigram_vocab.json"

extract_features(vocab=json.load(open(vocab_filepath)),
                 tokenizer=tokenizer,
                 feature_field="clean_tweet",
                 data_dir=data_dir,
                 feature_name='bigram_binary')
extract_features(vocab=json.load(open(vocab_filepath)),
                 tokenizer=tokenizer,
                 feature_field="clean_tweet",
                 data_dir=data_dir,
                 feature_name='bigram_count')

datapath data/Twitter/hate_twitter/hate_upsampled_train.csv
data df cols Index(['Unnamed: 0', 'Unnamed: 0.1', 'id', 'label', 'tweet', 'hash_tag',
       'clean_tweet', 'tokenized_tweet', 'tokenized_tweet_NLTK'],
      dtype='object')

Extract bigram binary features from data/Twitter/hate_twitter/hate_upsampled_train.csv
Processing 1/41560 row
Processing 1001/41560 row
Processing 2001/41560 row
Processing 3001/41560 row
Processing 4001/41560 row
Processing 5001/41560 row
Processing 6001/41560 row
Processing 7001/41560 row
Processing 8001/41560 row
Processing 9001/41560 row
Processing 10001/41560 row
Processing 11001/41560 row
Processing 12001/41560 row
Processing 13001/41560 row
Processing 14001/41560 row
Processing 15001/41560 row
Processing 16001/41560 row
Processing 17001/41560 row
Processing 18001/41560 row
Processing 19001/41560 row
Processing 20001/41560 row
Processing 21001/41560 row
Processing 22001/41560 row
Processing 23001/41560 row
Processing 24001/41560 row
Processing 25001

In [56]:
def fit_and_eval_logistic_regression(data_dir: Path,
                                     feature_name: str,
                                     tune: bool = False) -> LogisticRegression:
    """
    Fit and evaluate the logistic regression model using the scikit-learn library.
    # Parameters
    data_dir : `Path`, required
        The data directory.
    feature_name : `str`, required.
        Name of the feature, such as unigram_binary.
    tune : `bool`, optional
        Whether or not to tune the hyperparameters of the regularization strength
        of the model of the [logistic regression](https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.LogisticRegression.html).
    # Returns
        model_trained: `LogisticRegression`
            The object of `LogisticRegression` after it is trained.
    """
    # Implement logistic regression with scikit-learn.
    # Print out the accuracy scores on dev and test data.

    splits = ['train', 'val','test']
    features, labels = {}, {}

    for split in splits:
        features_path = data_dir.joinpath(f'{split}_{feature_name}_upsamp_features.npz')
        labels_path = data_dir.joinpath(f'{split}_upsamp_labels.npz')
        features[split] = sparse.load_npz(features_path)
        labels[split] = np.load(labels_path)['arr_0']
    best_dev, best_model = 0, None
    if tune:
        for c in np.linspace(-5, 5, 11):
            clf = LogisticRegression(random_state=42,
                                     max_iter=100,
                                     fit_intercept=False,
                                     C=np.exp2(c))
            clf.fit(features['train'], labels['train'])
            dev_preds = clf.predict(features['val'])
            dev_accuracy = accuracy_score(labels['val'], dev_preds)
            print(c, dev_accuracy)
            if dev_accuracy > best_dev:
                best_dev, best_model = dev_accuracy, clf
    else:
        best_model = LogisticRegression(random_state=42,
                                        max_iter=200,
                                        fit_intercept=False)
        best_model.fit(features['train'], labels['train'])

    preds = {
        'val': best_model.predict(features['val']),
        'test': best_model.predict(features['test'])
    }
    for splt, splt_preds in preds.items():
        print("{} accuracy: {:.4f}".format(splt, accuracy_score(labels[splt],
                                                                splt_preds))),
        print("{} binary recall: {:.4f}".format(splt, recall_score(labels[splt],
                                                                splt_preds))),
        print("{} macro recall: {:.4f}".format(splt, recall_score(labels[splt],
                                                                splt_preds, 
                                                                average="macro"))),
        print("{} macro f1: {:.4f}".format(
            splt, f1_score(labels[splt], splt_preds, average='macro')))

    return best_model


In [57]:
fit_and_eval_logistic_regression(feature_name='unigram_binary',
                                 data_dir=Path(data_dir),
                                 tune=False)

val accuracy: 0.9650
val binary recall: 0.9843
val macro recall: 0.9652
val macro f1: 0.9650
test accuracy: 0.9631
test binary recall: 0.9854
test macro recall: 0.9630
test macro f1: 0.9631


LogisticRegression(fit_intercept=False, max_iter=200, random_state=42)

In [58]:
fit_and_eval_logistic_regression(feature_name='unigram_count',
                                 data_dir=Path(data_dir),
                                 tune=False)

val accuracy: 0.9641
val binary recall: 0.9847
val macro recall: 0.9644
val macro f1: 0.9641
test accuracy: 0.9630
test binary recall: 0.9868
test macro recall: 0.9629
test macro f1: 0.9629


LogisticRegression(fit_intercept=False, max_iter=200, random_state=42)

In [59]:
fit_and_eval_logistic_regression(feature_name='bigram_binary',
                                 data_dir=Path(data_dir),
                                 tune=False)

val accuracy: 0.4971
val binary recall: 0.9966
val macro recall: 0.5040
val macro f1: 0.3421
test accuracy: 0.5035
test binary recall: 0.9951
test macro recall: 0.5028
test macro f1: 0.3441


LogisticRegression(fit_intercept=False, max_iter=200, random_state=42)

In [60]:
fit_and_eval_logistic_regression(feature_name='bigram_count',
                                 data_dir=Path(data_dir),
                                 tune=False)

val accuracy: 0.4971
val binary recall: 0.9966
val macro recall: 0.5040
val macro f1: 0.3421
test accuracy: 0.5035
test binary recall: 0.9951
test macro recall: 0.5028
test macro f1: 0.3441


LogisticRegression(fit_intercept=False, max_iter=200, random_state=42)

In [61]:
data_dir = "data/Twitter/hate_twitter/"
vocab_filepath = "data/Twitter/hate_twitter/unigram_vocab.json"

model_trained: LogisticRegression = fit_and_eval_logistic_regression(
    feature_name='unigram_binary', data_dir=Path(data_dir), tune=False)
weights = model_trained.coef_[0]
vocab = json.load(open(vocab_filepath))
print("")
print_important_weights(weights=weights, words=vocab.keys())

val accuracy: 0.9650
val binary recall: 0.9843
val macro recall: 0.9652
val macro f1: 0.9650
test accuracy: 0.9631
test binary recall: 0.9854
test macro recall: 0.9630
test macro f1: 0.9631

Most hateful words:
 5.9076 | allahsoil
 4.2857 | racism
 4.0006 | bigot
 3.4879 | equality
 3.4789 | white
 3.3869 | blacklivesmatter
 3.3394 | neighbors
 3.2137 | mc
 3.2048 | blatantly
 3.2015 | shitty

Least hateful words:
-3.9929 | bihday
-3.2502 | orlando
-3.1340 | hardcore
-2.8810 | thankful
-2.7886 | healthy
-2.7294 | friday
-2.7153 | getting
-2.6452 | day
-2.6402 | tomorrow
-2.5961 | followers

Most neutral words:
 0.0000 | <pad>
 0.0000 | elder
 0.0000 | litter
 0.0000 | vaccines
 0.0000 | devoted
 0.0000 | rooting
 0.0000 | minime
 0.0000 | mornin
 0.0000 | ankara
 0.0000 | pl
