In [1]:
from collections import Counter
import json
from pathlib import Path
from nltk.tokenize import WordPunctTokenizer
import pandas as pd
import numpy as np
from scipy import sparse
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, f1_score

#### Create vocab from training data

In [42]:
data_train_df.shape

(31962, 8)

In [41]:
data_train_df = pd.read_csv('data/Twitter/hate_twitter/train_clean.csv')
train, val, test = np.split(data_train_df.sample(frac=1, random_state=8),\
    [int(0.7*len(data_train_df)),int(0.85*len(data_train_df))])
print(train.shape)
print(val.shape)
print(test.shape)

(22373, 8)
(4794, 8)
(4795, 8)


In [43]:
train.to_csv('data/Twitter/hate_twitter/hate_train.csv')
val.to_csv('data/Twitter/hate_twitter/hate_val.csv')
test.to_csv('data/Twitter/hate_twitter/hate_test.csv')

In [44]:
train_df = pd.read_csv('data/Twitter/hate_twitter/hate_train.csv')

In [45]:
data_train = list(train_df[train_df['clean_tweet'].notna()]['clean_tweet'])
data_train[0:5]

['omg omg omg yay found wonderful price segasaturn throwback',
 'payintheusa polar bear climb racing angry polar bear climb racing polar bear living cold place',
 'trainhard polar bear climb racing angry polar bear climb racing polar bear living cold places lo',
 'turn resignation',
 'happy bihday hajime hosogai bihday bihday 30']

In [48]:
# funky behavior with bad tweets - do we want to capture some of these?
train_df[train_df['clean_tweet'].isna()][0:5]

Unnamed: 0.2,Unnamed: 0,Unnamed: 0.1,id,label,tweet,hash_tag,clean_tweet,tokenized_tweet,tokenized_tweet_NLTK
875,21970,21970,21971,0,@user not to me.,[],,<user> not to me.,@user
2122,15494,15494,15495,0,@user @user me to!,[],,<user> <user> me to!,@user @user
3865,16320,16320,16321,0,@user @user ð very,[],,<user> <user> ð very,@user @user
4114,5028,5028,5029,0,@user as you should.,[],,<user> as you should.,@user
5241,15434,15434,15435,0,ðð» that is all....,[],,ðð» that is all. <repeat>,


In [49]:
data_dir = Path('data/Twitter/hate_twitter/')  # Modify the path of `data_dir` as needed.
tokenizer = WordPunctTokenizer()
counter = Counter()
counter.update(['<pad>', '<unk>'])


# create unigram vocab
for i, line in enumerate(data_train):
    #print(i)
    tokens = tokenizer.tokenize(line)
    tokens = [t.lower() for t in tokens]
    counter.update(tokens)
    if i < 3:
        print(f"String of line {i}: {line.strip()}")
        print(f"Tokens of line {i}: {tokens}")
counter = dict(counter)

vocab = {}
# Populate the vocabulary with words that appear at least 3 times.
for word, freq in counter.items():
    if freq < 3 and word not in ['<pad>', '<unk>']:
        continue
    vocab[word] = len(vocab)

output_filepath = data_dir.joinpath('unigram_vocab.json')
json.dump(vocab, open(output_filepath, mode='w'))

String of line 0: omg omg omg yay found wonderful price segasaturn throwback
Tokens of line 0: ['omg', 'omg', 'omg', 'yay', 'found', 'wonderful', 'price', 'segasaturn', 'throwback']
String of line 1: payintheusa polar bear climb racing angry polar bear climb racing polar bear living cold place
Tokens of line 1: ['payintheusa', 'polar', 'bear', 'climb', 'racing', 'angry', 'polar', 'bear', 'climb', 'racing', 'polar', 'bear', 'living', 'cold', 'place']
String of line 2: trainhard polar bear climb racing angry polar bear climb racing polar bear living cold places lo
Tokens of line 2: ['trainhard', 'polar', 'bear', 'climb', 'racing', 'angry', 'polar', 'bear', 'climb', 'racing', 'polar', 'bear', 'living', 'cold', 'places', 'lo']


In [50]:
tokenizer = WordPunctTokenizer()
counter = Counter()
counter.update(['<pad>', '<unk>'])

# create bigram vocab
for i, line in enumerate(data_train):
    tokens = tokenizer.tokenize(line)
    # tokens = line.split()
    tokens = [t.lower() for t in tokens]
    tokens = [t if t in vocab else '<unk>' for t in tokens]
    counter.update([tokens[i] + " " + tokens[i + 1] for i in range(len(tokens) - 1)])
counter = dict(counter)
print(f"Vocab size before frequency filtering: {len(counter)}")

for word, freq in list(counter.items()):
    if freq < 3:
        continue
    vocab[word] = len(vocab)

print(f"Vocab size after frequency filtering: {len(vocab)}")
output_filepath = data_dir.joinpath('bigram_vocab.json')
json.dump(vocab, open(output_filepath, mode='w'))

Vocab size before frequency filtering: 82684
Vocab size after frequency filtering: 15495


#### Extract features from training data

In [51]:
def extract_features(vocab, data_dir, feature_field, tokenizer, feature_name):
    """
    Extract and save different features based on vocab of the features.
    # Parameters
    vocab : `dict[str, int]`, required.
        A map from the word type to the index of the word.
    data_dir : `Path`, required.
        Directory of the dataset
    tokenizer : `Callable`, required.
        Tokenizer with a method `.tokenize` which returns list of tokens.
    feature_name : `str`, required.
        Name of the feature, such as unigram_binary.
    # Returns
        `None`
    """
    # Extract and save the vocab and features.

    data_dir = Path(data_dir)
    splits = ['train','test','val']
    #splits = ['train']

    gram, mode = feature_name.split('_')
    if gram not in ['unigram', 'bigram'] or mode not in ['binary', 'count']:
        raise NotImplementedError

    for split in splits:
        datapath = data_dir.joinpath(f'hate_{split}.csv')
        print('datapath',datapath)
        data_df = pd.read_csv(datapath)
        data_df = data_df[data_df[feature_field].notna()]
        print('data df cols',data_df.columns)
        features = list(data_df[feature_field])
        
        sent_lengths = []
        values, rows, cols = [], [], []
        labels = list(data_df['label'])
        print(f"\nExtract {gram} {mode} features from {datapath}")
        for i, line in enumerate(features):
            if i % 1000 == 1:
                print(f"Processing {i}/{len(features)} row")
            #label = int(line[0])
            tokens = tokenizer.tokenize(line.strip())
            # tokens = line[1:].strip().split(  )  # Tokenizing differently affects the results.
            tokens = [t.lower() for t in tokens]
            tokens = [t if t in vocab else '<unk>' for t in tokens]
            if gram.find('bigram') != -1:
                tokens.extend(
                    [tokens[i] + ' ' + tokens[i + 1] for i in range(len(tokens) - 1)])
            feature = {}
            for tk in tokens:
                if tk not in vocab:
                    continue
                if mode == 'binary':
                    feature[vocab[tk]] = 1
                elif mode == 'count':
                    feature[vocab[tk]] = feature.get(vocab[tk], 0) + 1
                else:
                    raise NotImplementedError
            for j in feature:
                values.append(feature[j])
                rows.append(i)
                cols.append(j)
            sent_lengths.append(len(tokens))
            #labels.append(label)

        features = sparse.csr_matrix((values, (rows, cols)),
                                     shape=(len(features), len(vocab)))
        print(f"{split} feature matrix shape: {features.shape}")
        output_feature_filepath = data_dir.joinpath(f'{split}_{gram}_{mode}_features.npz')
        sparse.save_npz(output_feature_filepath, features)

        np_labels = np.asarray(labels)
        print(f"{split} label array shape: {np_labels.shape}")
        output_label_filepath = data_dir.joinpath(f'{split}_labels.npz')
        np.savez(output_label_filepath, np_labels)


In [52]:
data_dir = "data/Twitter/hate_twitter/"
vocab_filepath = "data/Twitter/hate_twitter/unigram_vocab.json"

extract_features(vocab=json.load(open(vocab_filepath)),
                 tokenizer=tokenizer,
                 feature_field="clean_tweet",
                 data_dir=data_dir,
                 feature_name='unigram_binary')

datapath data/Twitter/hate_twitter/train_clean.csv
data df cols Index(['Unnamed: 0', 'id', 'label', 'tweet', 'hash_tag', 'clean_tweet',
       'tokenized_tweet', 'tokenized_tweet_NLTK'],
      dtype='object')

Extract unigram binary features from data/Twitter/hate_twitter/train_clean.csv
Processing 1/31932 row
Processing 1001/31932 row
Processing 2001/31932 row
Processing 3001/31932 row
Processing 4001/31932 row
Processing 5001/31932 row
Processing 6001/31932 row
Processing 7001/31932 row
Processing 8001/31932 row
Processing 9001/31932 row
Processing 10001/31932 row
Processing 11001/31932 row
Processing 12001/31932 row
Processing 13001/31932 row
Processing 14001/31932 row
Processing 15001/31932 row
Processing 16001/31932 row
Processing 17001/31932 row
Processing 18001/31932 row
Processing 19001/31932 row
Processing 20001/31932 row
Processing 21001/31932 row
Processing 22001/31932 row
Processing 23001/31932 row
Processing 24001/31932 row
Processing 25001/31932 row
Processing 26001/31932

KeyError: 'label'

In [40]:
extract_features(vocab=json.load(open(vocab_filepath)),
                 tokenizer=tokenizer,
                 feature_field="clean_tweet",
                 data_dir=data_dir,
                 feature_name='unigram_count')
extract_features(vocab=json.load(open(vocab_filepath)),
                 tokenizer=tokenizer,
                 feature_field="clean_tweet",
                 data_dir=data_dir,
                 feature_name='bigram_count')
extract_features(vocab=json.load(open(vocab_filepath)),
                 tokenizer=tokenizer,
                 feature_field="clean_tweet",
                 data_dir=data_dir,
                 feature_name='bigram_binary')

datapath data/Twitter/hate_twitter/train_clean.csv
data df cols Index(['Unnamed: 0', 'id', 'label', 'tweet', 'hash_tag', 'clean_tweet',
       'tokenized_tweet', 'tokenized_tweet_NLTK'],
      dtype='object')

Extract unigram count features from data/Twitter/hate_twitter/train_clean.csv
Processing 1/31932 row
Processing 1001/31932 row
Processing 2001/31932 row
Processing 3001/31932 row
Processing 4001/31932 row
Processing 5001/31932 row
Processing 6001/31932 row
Processing 7001/31932 row
Processing 8001/31932 row
Processing 9001/31932 row
Processing 10001/31932 row
Processing 11001/31932 row
Processing 12001/31932 row
Processing 13001/31932 row
Processing 14001/31932 row
Processing 15001/31932 row
Processing 16001/31932 row
Processing 17001/31932 row
Processing 18001/31932 row
Processing 19001/31932 row
Processing 20001/31932 row
Processing 21001/31932 row
Processing 22001/31932 row
Processing 23001/31932 row
Processing 24001/31932 row
Processing 25001/31932 row
Processing 26001/31932 

In [None]:
def fit_and_eval_logistic_regression(data_dir: Path,
                                     feature_name: str,
                                     tune: bool = False) -> LogisticRegression:
    """
    Fit and evaluate the logistic regression model using the scikit-learn library.
    # Parameters
    data_dir : `Path`, required
        The data directory.
    feature_name : `str`, required.
        Name of the feature, such as unigram_binary.
    tune : `bool`, optional
        Whether or not to tune the hyperparameters of the regularization strength
        of the model of the [logistic regression](https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.LogisticRegression.html).
    # Returns
        model_trained: `LogisticRegression`
            The object of `LogisticRegression` after it is trained.
    """
    # Implement logistic regression with scikit-learn.
    # Print out the accuracy scores on dev and test data.

    #splits = ['train', 'test']
    splits = ['train']
    features, labels = {}, {}

    for split in splits:
        features_path = data_dir.joinpath(f'{split}_{feature_name}_features.npz')
        labels_path = data_dir.joinpath(f'{split}_labels.npz')
        features[split] = sparse.load_npz(features_path)
        labels[split] = np.load(labels_path)['arr_0']
    best_dev, best_model = 0, None
    if tune:
        for c in np.linspace(-5, 5, 11):
            clf = LogisticRegression(random_state=42,
                                     max_iter=100,
                                     fit_intercept=False,
                                     C=np.exp2(c))
            clf.fit(features['train'], labels['train'])
            dev_preds = clf.predict(features['dev'])
            dev_accuracy = accuracy_score(labels['dev'], dev_preds)
            print(c, dev_accuracy)
            if dev_accuracy > best_dev:
                best_dev, best_model = dev_accuracy, clf
    else:
        best_model = LogisticRegression(random_state=42,
                                        max_iter=100,
                                        fit_intercept=False)
        best_model.fit(features['train'], labels['train'])

    preds = {
        'dev': best_model.predict(features['dev']),
        'test': best_model.predict(features['test'])
    }
    for splt, splt_preds in preds.items():
        print("{} accuracy: {:.4f}".format(splt, accuracy_score(labels[splt],
                                                                splt_preds)))
        print("{} macro f1: {:.4f}".format(
            splt, f1_score(labels[splt], splt_preds, average='macro')))

    return best_model
