Mount Google Drive so we can use our files.

In [None]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


First, we simply read in our corpus.

In [None]:
import pandas as pd

tsk = pd.read_csv('/content/drive/My Drive/gab data/GabHateCorpus_annotations.tsv', sep='\t')

# HD = assaults on human dignity
# CV = calls to violence
# VO = vulgar or offensive language

tsk

Unnamed: 0,ID,Annotator,Text,Hate,HD,CV,VO,REL,RAE,SXO,GEN,IDL,NAT,POL,MPH,EX,IM
0,27044,4,Ah the PSYOPS antifa crew is back. That’s how ...,0,0,0,0,,,,,,,,,,
1,27044,15,Ah the PSYOPS antifa crew is back. That’s how ...,0,0,0,0,,,,,,,,,,
2,27044,10,Ah the PSYOPS antifa crew is back. That’s how ...,0,0,0,0,,,,,,,,,,
3,27044,8,Ah the PSYOPS antifa crew is back. That’s how ...,0,0,0,0,,,,,,,,,,
4,27045,4,Get the new Android app update released today ...,0,0,0,0,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
86524,9188,6,"He thinks only peons own guns, you're supposed...",1,1,0,0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0
86525,9188,3,"He thinks only peons own guns, you're supposed...",0,0,0,0,,,,,,,,,,
86526,9222,11,America must stop funding the United Nations.,0,0,0,0,,,,,,,,,,
86527,9222,3,America must stop funding the United Nations.,0,0,0,0,,,,,,,,,,


Now we'll take 80% of the data and make it our "training set", and the other 20% will be our "test set".

In [None]:
from sklearn.model_selection import train_test_split

x_train, x_test, y_train, y_test = train_test_split(tsk['Text'], tsk['Hate'], test_size=0.20, random_state=240)

y_test.values

array([0, 0, 0, ..., 0, 0, 0])

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from keras.layers.core import Dense, Dropout
from nltk.corpus import stopwords
import nltk
from tensorflow.python.keras import models, layers

nltk.download("stopwords")

features = 5000

vectorizer = TfidfVectorizer(stop_words=stopwords.words('english'), token_pattern=r'(?u)\b[a-zA-Z][a-zA-Z]+\b', max_features=features)

vectorizer.fit(x_train.values)
tfidf = vectorizer.transform(x_train.values)

tfidf = pd.DataFrame(tfidf.toarray(), columns=vectorizer.get_feature_names())
tfidf


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Unnamed: 0,abandoned,abc,abiding,ability,able,aborted,abortion,abortions,absolute,absolutely,abuse,abused,accept,acceptable,accepted,access,accident,accomplish,accomplished,according,account,accountable,accounts,acct,accurate,accusations,accuse,accused,accuses,acosta,across,act,acting,action,actions,active,actively,activist,activists,activity,...,wtf,wwii,www,xd,xi,ya,yahoo,yard,ye,yea,yeah,year,years,yellow,yep,yes,yesterday,yet,yo,york,young,younger,youre,youth,youtu,youtube,yr,yrs,yup,zealand,zero,zerohedge,zimbabwe,zionist,zionists,zog,zombie,zone,zones,zuckerberg
0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.125925,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.293743,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
69218,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
69219,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
69220,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
69221,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.201636,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [None]:
model = models.Sequential()

model.add(layers.Dense(1024, activation='relu', input_shape=(features,)))
model.add(layers.Dense(512, activation='relu'))
model.add(layers.Dense(256, activation='relu'))
model.add(layers.Dense(1, activation='sigmoid'))

model.compile(optimizer="adam", loss='binary_crossentropy', metrics=['accuracy'])

model.fit(tfidf.values, y_train.values, epochs=2, batch_size=350)

results = model.evaluate(vectorizer.transform(x_test.values).toarray(), y_test.values)

results

# for new data, vectorize the data the same way
# model.predict()

Epoch 1/2
Epoch 2/2


[0.3365194499492645, 0.8734542727470398]

In [None]:
# convert the data into something useful (currently json)
import json
import pandas as pd

ohbaby = pd.read_json('/content/drive/My Drive/gab data/maga_after_10_01_2020', lines=True)

ohbaby.drop_duplicates(subset='content')

ohbaby

Unnamed: 0,id,created_at,revised_at,in_reply_to_id,in_reply_to_account_id,sensitive,spoiler_text,visibility,language,uri,url,replies_count,reblogs_count,pinnable,pinnable_by_group,favourites_count,quote_of_id,expires_at,has_quote,bookmark_collection_id,favourited,reblogged,content,rich_content,plain_markdown,reblog,quote,account,group,media_attachments,mentions,tags,emojis,card,poll
0,106027103000843760,2021-04-08 01:26:39.598000+00:00,NaT,,,False,,public,en,/QueueMeInToo/posts/106027103000843767,https://gab.com/QueueMeInToo/posts/10602710300...,0,0,False,False,0,,NaT,False,,False,False,"Daniel Hale<br /><br /><a href=""https://stateo...",,,,,"{'id': '3039669', 'username': 'QueueMeInToo', ...",,"[{'id': '70980926', 'type': 'image', 'url': 'h...",[],"[{'name': 'wwg1wga', 'url': '/tags/wwg1wga'}, ...",[],"{'id': 10884380, 'url': 'https://stateofthenat...",
1,106027089360096544,2021-04-08 01:23:11.464000+00:00,NaT,,,False,,public,en,/FaithSpiritual1/posts/106027089360096537,https://gab.com/FaithSpiritual1/posts/10602708...,0,0,False,False,0,,NaT,False,,False,False,Been cleaning out my store building the last f...,,,,,"{'id': '2145853', 'username': 'FaithSpiritual1...",,[],[],"[{'name': 'resistjoebiden', 'url': '/tags/resi...",[],,
2,106027086542845600,2021-04-08 01:22:28.486000+00:00,NaT,,,False,,public,en,/myerikd/posts/106027086542845606,https://gab.com/myerikd/posts/106027086542845606,0,0,False,True,0,,NaT,False,,False,False,Asa Hutchinson’s Trans-Bill Veto Draws Scrutin...,,,,,"{'id': '185312', 'username': 'myerikd', 'acct'...","{'id': '692', 'title': 'News', 'description': ...",[],"[{'id': '1614939', 'username': 'Aliatra', 'url...","[{'name': 'stopthesteal', 'url': '/tags/stopth...",[],"{'id': 10881349, 'url': 'https://www.breitbart...",
3,106027085727875488,2021-04-08 01:22:16.035000+00:00,NaT,,,False,,public,en,/VIEWZUU/posts/106027085727875480,https://gab.com/VIEWZUU/posts/106027085727875480,1,2,False,True,4,,NaT,False,,False,False,"They tried and failed, Joe.<br />You and your ...",,,,,"{'id': '851590', 'username': 'VIEWZUU', 'acct'...","{'id': '22', 'title': '/g/The_Donald', 'descri...","[{'id': '70980691', 'type': 'image', 'url': 'h...",[],"[{'name': 'joebidenisnotmypresident', 'url': '...",[],,
4,106027081437902496,2021-04-08 01:21:10.575000+00:00,NaT,,,False,,public,,/QueueMeInToo/posts/106027081437902503,https://gab.com/QueueMeInToo/posts/10602708143...,0,1,False,False,1,,NaT,False,,False,False,"<a href=""/tags/MAGA"" class=""mention hashtag"" r...",,,,,"{'id': '3039669', 'username': 'QueueMeInToo', ...",,"[{'id': '70980628', 'type': 'image', 'url': 'h...",[],"[{'name': 'wwg1wga', 'url': '/tags/wwg1wga'}, ...",[],,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
60595,105638959438612416,2021-01-29 12:16:38.847000+00:00,NaT,,,False,,public,en,/OpenQuotes/posts/105638959438612411,https://gab.com/OpenQuotes/posts/1056389594386...,7,4,False,False,7,,NaT,False,,False,False,What's the definition of Racism again?<br /><b...,,,,,"{'id': '98146', 'username': 'OpenQuotes', 'acc...",,"[{'id': '63670901', 'type': 'image', 'url': 'h...",[],"[{'name': 'teamtrump', 'url': '/tags/teamtrump...",[],"{'id': 8359995, 'url': 'https://reddit.com/r/w...",
60596,105638954869564768,2021-01-29 12:15:29.128000+00:00,NaT,,,False,,public,de,/lucrum-sanguinem/posts/105638954869564760,https://gab.com/lucrum-sanguinem/posts/1056389...,0,0,False,False,1,,NaT,False,,False,False,"🎺✨🎺 <a href=""/tags/USA"" class=""mention hashtag...",,,,,"{'id': '110116', 'username': 'lucrum-sanguinem...",,"[{'id': '63670789', 'type': 'image', 'url': 'h...",[],"[{'name': 'trusttheplan', 'url': '/tags/trustt...",[],,
60597,105638954702671504,2021-01-29 12:15:26.954000+00:00,NaT,,,False,,public,en,/follownoone/posts/105638954702671503,https://gab.com/follownoone/posts/105638954702...,0,0,False,False,0,,NaT,False,,False,False,🇺🇸 🇺🇸 RISE AND SHINE PATRIOTS 🇺🇸 🇺🇸<br /><br /...,,,,,"{'id': '865126', 'username': 'follownoone', 'a...",,"[{'id': '63658014', 'type': 'image', 'url': 'h...",[],"[{'name': 'goarmy', 'url': '/tags/goarmy'}, {'...",[],,
60598,105638946236500304,2021-01-29 12:13:17.394000+00:00,NaT,,,False,,public,en,/lucrum-sanguinem/posts/105638946236500297,https://gab.com/lucrum-sanguinem/posts/1056389...,0,0,False,False,1,,NaT,False,,False,False,"🎺✨🎺 <a href=""/tags/USA"" class=""mention hashtag...",,,,,"{'id': '110116', 'username': 'lucrum-sanguinem...",,"[{'id': '63670524', 'type': 'image', 'url': 'h...",[],"[{'name': 'trusttheplan', 'url': '/tags/trustt...",[],,


In [None]:
# make the text all pretty and nice
import re
!pip install emoji
import emoji

def give_emoji_free_text(text):
    allchars = [str for str in text]
    emoji_list = [c for c in allchars if c in emoji.UNICODE_EMOJI]
    clean_text = ' '.join([str for str in text.split() if not any(i in str for i in emoji_list)])
    return clean_text

def cleanhtml(raw_html):
  cleanr = re.compile('<.*?>')
  cleantext = re.sub(cleanr, '', raw_html)
  cleantext = give_emoji_free_text(cleantext)
  return cleantext


clean = [cleanhtml(x) for x in ohbaby['content'].values]

Collecting emoji
[?25l  Downloading https://files.pythonhosted.org/packages/24/fa/b3368f41b95a286f8d300e323449ab4e86b85334c2e0b477e94422b8ed0f/emoji-1.2.0-py3-none-any.whl (131kB)
[K     |██▌                             | 10kB 13.2MB/s eta 0:00:01[K     |█████                           | 20kB 8.1MB/s eta 0:00:01[K     |███████▌                        | 30kB 4.8MB/s eta 0:00:01[K     |██████████                      | 40kB 4.7MB/s eta 0:00:01[K     |████████████▌                   | 51kB 2.9MB/s eta 0:00:01[K     |███████████████                 | 61kB 3.3MB/s eta 0:00:01[K     |█████████████████▌              | 71kB 3.5MB/s eta 0:00:01[K     |████████████████████            | 81kB 3.6MB/s eta 0:00:01[K     |██████████████████████▌         | 92kB 3.6MB/s eta 0:00:01[K     |█████████████████████████       | 102kB 3.7MB/s eta 0:00:01[K     |███████████████████████████▌    | 112kB 3.7MB/s eta 0:00:01[K     |██████████████████████████████  | 122kB 3.7MB/s eta 0:00:0

In [None]:
ridf = vectorizer.transform(clean)

ridf = pd.DataFrame(ridf.toarray(), columns=vectorizer.get_feature_names())

res = model.predict(ridf.values)

res

array([[0.01144329],
       [0.02641803],
       [0.08204904],
       ...,
       [0.00671065],
       [0.28727356],
       [0.14241445]], dtype=float32)

In [None]:
# res now contains our scores.
#NOTE: ohbaby['created_at'] contains the times of each post, ohbaby['content'] has the text and res contains the score.

# import numpy as np 
# import matplotlib.pyplot as plt
# from matplotlib.pyplot import figure

# figure(figsize=(8, 6), dpi=600)

# plt.title("Line graph") 
# plt.xlabel("Date") 
# plt.ylabel("Likelihood of it being hate") 
# plt.plot(ohbaby['created_at'], res, color ="red") 
# plt.show()

# import numpy

# a = np.array([ohbaby['content'].tolist(), ohbaby['created_at'].tolist(), res.tolist()])
# numpy.savetxt("output.log", a, delimiter='\t', header="Text,Date,Score", comments="", fmt="%s")


df = pd.DataFrame({"Text" : ohbaby['content'], "Date" : ohbaby['created_at'], "Score" : res.flatten()})
df.to_csv("submission2.csv", index=False)

# pd.DataFrame(a).to_csv('new_out.tsv', sep='\t')

In [None]:
print("Max:", res.max())
print("Min:", res.min())
print("Mean:", res.mean())
print("Median:", res.median())

Max: 0.6038654
Min: 4.0496573e-07
Mean: 0.090283535


AttributeError: ignored