In [1]:
#!pip install tensorflow_text==2.5.0
import pandas as pd
import numpy as np
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from scipy.stats import mode
import re
import tensorflow as tf
import tensorflow.keras as keras
import tensorflow_hub as hub
import tensorflow_text as text
from sklearn.model_selection import KFold
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression

In [34]:
# Load the labelled tweets and keep only rows that have both a Race label and
# tweet text, excluding rows whose Race code is 5.
# NOTE(review): the meaning of code 5 is not visible here -- confirm why it is excluded.
twitter = pd.read_csv("labeled_tweet_table.csv")
keep_rows = (
    twitter["Race"].notna()
    & twitter["Tweet"].notna()
    & (twitter["Race"] != 5)
)
# .copy() gives an independent frame, so the column assignments below (and in
# later cells) never write into a slice of the raw table.
twitter = twitter.loc[keep_rows].copy()
# Shift the remaining codes down by one so the class ids start at 0.
twitter["Race"] = twitter["Race"] - 1
twitter["Race"].value_counts()

3.0    242886
0.0     28719
1.0     17797
2.0      9885
Name: Race, dtype: int64

In [35]:
# Cleanup rules, applied in insertion order: angle-bracket tags, runs of
# quotes/dashes, @mentions, then http(s) URLs. Every match is replaced by "".
regexMap={r"<[\w'/'\s]*>": "",r"[\'\"\-]+": "",r"@[\w]+":"",r"http\S+" : ""}
def preprocess(datainput):
    """Strip tags, quotes/dashes, @mentions and URLs from one tweet.

    Parameters
    ----------
    datainput : str or any
        Raw tweet text. Non-string values (e.g. NaN) are returned unchanged.

    Returns
    -------
    The cleaned string, or ``datainput`` itself when it is not a ``str``.
    """
    # One explicit type guard instead of catching TypeError once per pattern:
    # same result for non-strings, but unrelated errors are no longer hidden.
    if not isinstance(datainput, str):
        return datainput
    t = datainput
    for regx, replacement in regexMap.items():
        t = re.sub(regx, replacement, t)
    return t
# Run the cleanup on every tweet, overwrite the column, then show the frame.
twitter["Tweet"] = twitter["Tweet"].map(preprocess)
twitter

Unnamed: 0,Tweet,Name,Screen Name,Description,Lang,img_path,Race,Under 21
0,"YKAR, a futuristic sans serif font by #Freeb...",Chad Boyce,djsnipa1,"Multimedia Developer, Graphic Designer, DJ, an...",,profile pics/60147.jpeg,3.0,0
1,Who can I contact about the very rude and poo...,Chad Boyce,djsnipa1,"Multimedia Developer, Graphic Designer, DJ, an...",,profile pics/60147.jpeg,3.0,0
2,I’d like to win!,Chad Boyce,djsnipa1,"Multimedia Developer, Graphic Designer, DJ, an...",,profile pics/60147.jpeg,3.0,0
3,,Chad Boyce,djsnipa1,"Multimedia Developer, Graphic Designer, DJ, an...",,profile pics/60147.jpeg,3.0,0
4,Now Im heading to B1000th Floor! #quickrogue,Chad Boyce,djsnipa1,"Multimedia Developer, Graphic Designer, DJ, an...",,profile pics/60147.jpeg,3.0,0
...,...,...,...,...,...,...,...,...
313398,Im raising money for Help Feed the Homeless. C...,Joseph Aragunde,thomasaragu,General Contractor plus construction worker go...,,profile pics/64221.jpeg,3.0,0
313399,Im raising money for Help Feed the Homeless. C...,Joseph Aragunde,thomasaragu,General Contractor plus construction worker go...,,profile pics/64221.jpeg,3.0,0
313400,Im raising money for Help Feed the Homeless. C...,Joseph Aragunde,thomasaragu,General Contractor plus construction worker go...,,profile pics/64221.jpeg,3.0,0
313401,Im raising money for Help Feed the Homeless. C...,Joseph Aragunde,thomasaragu,General Contractor plus construction worker go...,,profile pics/64221.jpeg,3.0,0


In [36]:
# Map each distinct display name to the list of that user's tweets.
# NOTE(review): users are keyed by "Name" rather than "Screen Name"; two
# accounts sharing a display name would be merged -- confirm this is intended.
names = twitter["Name"].unique()
labels = []
# Single groupby pass instead of re-filtering the whole frame once per name;
# sort=False and list() keep the original row order inside each group.
tweets_by_name = twitter.groupby("Name", sort=False)["Tweet"].apply(list).to_dict()
# Iterate `names` so key order matches the original loop; a name that groupby
# skipped (e.g. a missing name) still gets an empty list, as before.
tweet_dict = {name: tweets_by_name.get(name, []) for name in names}

In [37]:
# Name -> Race lookup. Select the single column before converting so only
# "Race" is turned into a dict instead of every column of the frame.
# With repeated names the last row's label wins, same as before.
label_dict = twitter.set_index("Name")["Race"].to_dict()

In [38]:
# Build one document per user: all of that user's tweets joined by a single
# space (each item coerced with str), paired with the user's label.
concatenated_tweets = [' '.join(map(str, tweet_dict[name])) for name in names]
labels = [label_dict[name] for name in names]

concat_df = pd.DataFrame({"User": np.array(names), "Tweet": concatenated_tweets, "Race": np.array(labels)})
len(names), len(concatenated_tweets), len(labels)

(3056, 3056, 3056)

In [39]:
# Class distribution at user level (one row per user in concat_df).
concat_df["Race"].value_counts()

3.0    2475
0.0     298
1.0     181
2.0     102
Name: Race, dtype: int64

In [40]:
# 5-fold cross-validation of a TF-IDF + class-weighted logistic-regression
# baseline on the per-user concatenated tweets. Per-fold accuracy, report
# dicts and row-normalised confusion matrices are collected for the summary.
# NOTE(review): KFold is neither shuffled nor stratified, so the support of the
# rare classes varies between folds; consider StratifiedKFold(shuffle=True).
accs = []
reports = []
matrices = []
# Fixed class list: keeps the report keys ("0.0" ... "3.0") and the confusion
# matrix shape stable even if a fold happens to miss one of the rare classes.
class_labels = [0.0, 1.0, 2.0, 3.0]
tfidf = TfidfVectorizer(stop_words='english', max_features=5000)
log = LogisticRegression(class_weight = 'balanced', max_iter = 10 ** 6)
kf = KFold(n_splits = 5)
for i, (train, test) in enumerate(kf.split(concat_df), start=1):
    train_df = concat_df.iloc[train]
    test_df = concat_df.iloc[test]
    print("Fold %d:\n" % i)
    y_train = train_df["Race"]
    y_test = test_df["Race"]
    # The vocabulary and IDF weights are learned from the training fold only;
    # the test fold is merely transformed, so no test text leaks into them.
    x_train = tfidf.fit_transform(train_df["Tweet"])
    x_test = tfidf.transform(test_df["Tweet"])

    log.fit(x_train, y_train)
    y_pred = log.predict(x_test)
    # Conventional (y_true, y_pred) order, matching the other metric calls.
    acc = accuracy_score(y_test, y_pred)
    print("Accuracy: %.3f\n" % acc)
    accs.append(acc)

    report = classification_report(y_test, y_pred, labels=class_labels)
    print(report)
    reports.append(classification_report(y_test, y_pred, labels=class_labels, output_dict=True))
    print()

    # normalize='true' divides each row by its true-class count (per-class recall).
    confusion = confusion_matrix(y_test, y_pred, normalize = 'true', labels = class_labels)
    print(confusion)
    print()
    matrices.append(confusion)
    

Fold 1:

Accuracy: 0.750

              precision    recall  f1-score   support

         0.0       0.37      0.35      0.36        52
         1.0       0.11      0.11      0.11        36
         2.0       0.00      0.00      0.00        15
         3.0       0.86      0.86      0.86       509

    accuracy                           0.75       612
   macro avg       0.33      0.33      0.33       612
weighted avg       0.75      0.75      0.75       612


[[0.34615385 0.03846154 0.         0.61538462]
 [0.11111111 0.11111111 0.05555556 0.72222222]
 [0.06666667 0.         0.         0.93333333]
 [0.05108055 0.0589391  0.03143418 0.85854617]]

Fold 2:

Accuracy: 0.758

              precision    recall  f1-score   support

         0.0       0.43      0.64      0.51        67
         1.0       0.31      0.31      0.31        45
         2.0       0.33      0.19      0.24        21
         3.0       0.89      0.84      0.86       478

    accuracy                           0.76       

In [41]:
# Average the per-fold results collected by the cross-validation cell.
print("Overall accuracy: %.3f" % np.average(accs))
print()
# (printed title, key in the classification_report dict)
metric_rows = (
    ("Precision", "precision"),
    ("Recall", "recall"),
    ("F1-score", "f1-score"),
    ("Support", "support"),
)
for category in [0.0, 1.0, 2.0, 3.0]:
    # Report dicts are keyed by the string form of the float label, e.g. "0.0".
    key = str(category)
    print("Class %d:" % int(category))
    for title, metric in metric_rows:
        print("%s: %.3f" % (title, np.average([fold[key][metric] for fold in reports])))
    print()

print("Confusion Matrix:\n")
# Divide by the number of folds actually collected, not a hard-coded 5.
print(sum(matrices) / len(matrices))

Overall accuracy: 0.760

Class 0:
Precision: 0.421
Recall: 0.537
F1-score: 0.470
Support: 59.600

Class 1:
Precision: 0.197
Recall: 0.215
F1-score: 0.204
Support: 36.200

Class 2:
Precision: 0.117
Recall: 0.081
F1-score: 0.094
Support: 20.400

Class 3:
Precision: 0.880
Recall: 0.853
F1-score: 0.866
Support: 495.000

Confusion Matrix:

[[0.53733716 0.05841227 0.01384222 0.39040835]
 [0.16159933 0.21506734 0.05114478 0.57218855]
 [0.07999063 0.11220127 0.08057015 0.72723795]
 [0.07265234 0.05007569 0.02411579 0.85315619]]


## Everything below this point is a set of failed attempts using oversampling, BERT, and Keras. These attempts produced high accuracy but very poor recall.

In [None]:
# Load the per-user label table and show how many users carry each "race" value.
labeled_users = pd.read_csv("labeled_users.csv")
labeled_users["race"].value_counts()

In [None]:
# Features and target for resampling. x is an independent copy, so any later
# change to x (such as dropping the label column) can never alter concat_df,
# which the cross-validation cells still read.
x = concat_df.copy()
y = concat_df['Race']

In [None]:
# Remove the label from the feature frame by rebinding x to a new frame rather
# than dropping in place, so whatever frame x previously pointed at keeps its
# columns. errors='ignore' makes the cell safe to run more than once.
x = x.drop(columns='Race', errors='ignore')
x

In [None]:
#!pip install imblearn
from imblearn.over_sampling import RandomOverSampler
from collections import Counter

# Fixed random_state so the resampling is repeatable between runs.
ros = RandomOverSampler(random_state=42)

# Resample the feature frame x and label series y together.
x_ros, y_ros = ros.fit_resample(x, y)

# Per-class row counts before and after resampling.
print('Original dataset shape', Counter(y))
print('Resample dataset shape', Counter(y_ros))

In [None]:
# Re-attach the resampled labels. assign() returns a new frame, so x_ros is
# left as features only instead of silently gaining a "Race" column.
df = x_ros.assign(Race=y_ros)
df

In [2]:
def constructModel(
    num_classes=4,
    preprocess_url="https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3",
    encoder_url="https://tfhub.dev/tensorflow/bert_en_uncased_L-12_H-768_A-12/4",
):
    """Build and compile a BERT-based text classifier.

    Raw strings go through the TF-Hub BERT preprocessing layer and encoder;
    the encoder's ``pooled_output`` feeds a single softmax Dense layer.

    Parameters
    ----------
    num_classes : int, default 4
        Width of the softmax output layer.
    preprocess_url, encoder_url : str
        TF-Hub handles of the preprocessing model and the BERT encoder.

    Returns
    -------
    keras.Model
        Model compiled with Adam and sparse categorical cross-entropy (integer
        class ids as targets), tracking accuracy. The summary is printed as a
        side effect.
    """
    bert_preprocess = hub.KerasLayer(preprocess_url)
    # NOTE(review): the encoder layer is created without trainable=True, so it
    # is presumably frozen and only the Dense head is trained -- confirm.
    bert_encoder = hub.KerasLayer(encoder_url)
    # shape=() : each example is one scalar string.
    text_input = keras.layers.Input(shape=(), dtype=tf.string)
    preprocessed_text = bert_preprocess(text_input)
    outputs = bert_encoder(preprocessed_text)

    dense = keras.layers.Dense(num_classes, activation='softmax')(outputs["pooled_output"])

    model = keras.Model(inputs=[text_input], outputs=[dense])
    model.compile(optimizer='adam', loss = 'sparse_categorical_crossentropy', metrics = ['accuracy'])
    model.summary()
    return model

model = constructModel()

Model: "model"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_1 (InputLayer)           [(None,)]            0           []                               
                                                                                                  
 keras_layer (KerasLayer)       {'input_type_ids':   0           ['input_1[0][0]']                
                                (None, 128),                                                      
                                 'input_word_ids':                                                
                                (None, 128),                                                      
                                 'input_mask': (Non                                               
                                e, 128)}                                                      

In [None]:
# 5-fold cross-validation of the BERT classifier with oversampling.
# Fixes relative to the earlier version of this cell:
#   * fold indices are produced from, and applied to, the same frame (they
#     used to come from concat_df but index the longer oversampled df, so the
#     oversampled rows were never used);
#   * oversampling is applied to the training fold only, so copies of a user
#     cannot appear in both the training and the test split;
#   * a fresh model is built per fold instead of continuing to train one
#     instance, which had already been fitted on later folds' test users.
accs = []
reports = []
confusion_matrices = []

kf = KFold(n_splits = 5)
for i, (train, test) in enumerate(kf.split(x), start=1):

    print("Fold %d:" % i)
    print()

    # x holds the feature columns and y the labels, row-aligned by position.
    x_train_res, y_train = ros.fit_resample(x.iloc[train], y.iloc[train])
    x_train = x_train_res["Tweet"]
    x_test, y_test = x.iloc[test]["Tweet"], y.iloc[test]

    # Release the previous fold's graph before building a new model.
    keras.backend.clear_session()
    fold_model = constructModel()
    fold_model.fit(x_train, y_train, epochs = 5)
    # Highest softmax probability -> predicted class id.
    y_pred = np.argmax(fold_model.predict(x_test), axis=1)
    acc = accuracy_score(y_test, y_pred)
    accs.append(acc)
    print("Accuracy: ", acc)
    print()
    report = classification_report(y_test, y_pred)
    print(report)
    print()
    reports.append(report)
    # Fixed label list keeps every fold's matrix 4x4 so they can be averaged.
    cm = confusion_matrix(y_test, y_pred, normalize='true', labels=[0, 1, 2, 3])
    print(cm)
    print()
    confusion_matrices.append(cm)


In [None]:
# Mean accuracy over the folds collected in accs.
np.average(accs)

In [None]:
# Element-wise mean of the per-fold confusion matrices; divide by the number
# of folds actually collected rather than a hard-coded 5.
sum(confusion_matrices) / len(confusion_matrices)