# Cleaning

Removing some unwanted target words from the allusion database.

In [1]:
import sqlite3
connection = sqlite3.connect("wiki_to_use.db")
cursor = connection.cursor()

**Get rid of everything but `VERB`,`NOUN`, and `ADJ` targets.**

In [2]:
cursor.execute("SELECT target,target_pos FROM allusions")
rows = cursor.fetchall()

word_pos_tups = []

for r in rows:
    word_pos_tups.append(r)

In [3]:
len(word_pos_tups)

2255871

In [4]:
word_pos_tups[:5]

[('child', 'NOUN'),
 ('ease', 'VERB'),
 ('Modest', 'ADJ'),
 ('juvenalian', 'ADJ'),
 ('satirical', 'ADJ')]

In [5]:
all_target_tags = list(set([tag for word,tag in word_pos_tups]))
all_target_tags

['NUM',
 'ADV',
 'ADP',
 'NOUN',
 'AUX',
 'PROPN',
 'PUNCT',
 'SCONJ',
 'VERB',
 'SYM',
 'CCONJ',
 'ADJ',
 'DET',
 'PART',
 'INTJ',
 'PRON',
 'X']

In [6]:
for tag in all_target_tags:
    if tag not in ["ADJ","NOUN","VERB"]:
        cursor.execute("DELETE from allusions WHERE target_pos=?", (tag,))
connection.commit()

**Get rid of everything that isn't in wordnet with the correct part of speech.**

In [7]:
cursor.execute("SELECT target,target_pos FROM allusions")
rows = cursor.fetchall()

word_pos_tups = []

for r in rows:
    word_pos_tups.append(r)

In [8]:
import nltk
nltk.download('omw-1.4')

[nltk_data] Downloading package omw-1.4 to /Users/kyle/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


True

In [9]:
from nltk.corpus import wordnet

In [10]:
#wordnet.synsets('dog','p')

In [11]:
word_pos_tups[:10]

[('child', 'NOUN'),
 ('ease', 'VERB'),
 ('Modest', 'ADJ'),
 ('juvenalian', 'ADJ'),
 ('satirical', 'ADJ'),
 ('british', 'ADJ'),
 ('juvenalian', 'ADJ'),
 ('satirical', 'ADJ'),
 ('essay', 'NOUN'),
 ('papist', 'NOUN')]

In [12]:
%%time

to_delete_based_on_wordnet = []
for word,tag in list(set(word_pos_tups)):
    synsets = wordnet.synsets(word.lower(), pos=tag.lower()[0])
    if len(synsets)==0:
        to_delete_based_on_wordnet.append((word,tag))
#         cursor.execute("DELETE from allusions WHERE (target=? AND target_pos=?)", (word,tag))
#connection.commit()

CPU times: user 3.68 s, sys: 59.1 ms, total: 3.74 s
Wall time: 3.76 s


In [13]:
len(to_delete_based_on_wordnet)

18044

In [14]:
from tqdm import tqdm

In [15]:
## huh...outrageously slow to delete from sqlite
## should not have used sqlite
## should have used a dictionary
## too late now, maybe change later
for i in tqdm(to_delete_based_on_wordnet):
    synsets = wordnet.synsets(word.lower(), pos=tag.lower()[0])
    cursor.execute("DELETE from allusions WHERE (target=? AND target_pos=?)", i)

connection.commit()

100%|█████████████████████████████████████| 18044/18044 [53:48<00:00,  5.59it/s]


**Stopwords+**

In [16]:
from nltk import corpus
from nltk import FreqDist

bc = corpus.brown.words()

from nltk import pos_tag
bc_tagged = pos_tag(bc)

bc_nouns = [w.lower() for w,t in bc_tagged if t=="NN"]
bc_adjectives = [w.lower() for w,t in bc_tagged if t=="JJ"]
bc_verbs = [w.lower() for w,t in bc_tagged if t in ["VB","VBZ"]]

In [17]:
bad_words = [w for w,t in FreqDist([word.lower() for word in bc]).most_common(300)]
print(bad_words)

['the', ',', '.', 'of', 'and', 'to', 'a', 'in', 'that', 'is', 'was', 'he', 'for', '``', "''", 'it', 'with', 'as', 'his', 'on', 'be', ';', 'at', 'by', 'i', 'this', 'had', '?', 'not', 'are', 'but', 'from', 'or', 'have', 'an', 'they', 'which', '--', 'one', 'you', 'were', 'her', 'all', 'she', 'there', 'would', 'their', 'we', 'him', 'been', ')', 'has', '(', 'when', 'who', 'will', 'more', 'if', 'no', 'out', 'so', 'said', 'what', 'up', 'its', 'about', ':', 'into', 'than', 'them', 'can', 'only', 'other', 'new', 'some', 'could', 'time', '!', 'these', 'two', 'may', 'then', 'do', 'first', 'any', 'my', 'now', 'such', 'like', 'our', 'over', 'man', 'me', 'even', 'most', 'made', 'also', 'after', 'did', 'many', 'before', 'must', 'af', 'through', 'back', 'years', 'where', 'much', 'your', 'way', 'well', 'down', 'should', 'because', 'each', 'just', 'those', 'people', 'mr.', 'too', 'how', 'little', 'state', 'good', 'very', 'make', 'world', 'still', 'see', 'own', 'men', 'work', 'long', 'here', 'get', 'both

In [18]:
bad_nouns = [w for w,count in FreqDist(bc_nouns).most_common(50)]
print(bad_nouns)

['time', 'man', 'way', 'world', 'life', 'year', 'day', 'work', 'state', 'home', 'place', 'part', 'number', 'course', 'fact', 'water', 'something', 'hand', 'school', 'head', 'house', 'night', 'nothing', 'system', 'group', 'program', 'order', 'business', 'room', 'side', 'use', 'end', 'case', 'point', 'thing', 'power', 'interest', 'face', 'area', 'country', 'problem', 'family', 'kind', 'development', 'door', 'war', 'sense', 'form', 'matter', 'action']


In [19]:
bad_adjectives = [w for w,count in FreqDist(bc_adjectives).most_common(50)]
print(bad_adjectives)

['other', 'such', 'new', 'first', 'many', 'good', 'little', 'own', 'same', 'last', 'great', 'much', 'few', 'old', 'small', 'high', 'american', 'long', 'possible', 'several', 'important', 'next', 'large', 'young', 'social', 'second', 'big', 'present', 'public', 'general', 'different', 'certain', 'only', 'human', 'local', 'early', 'political', 'real', 'whole', 'white', 'special', 'open', 'available', 'free', 'sure', 'major', 'full', 'necessary', 'economic', 'true']


In [20]:
bad_verbs = [w for w,count in FreqDist(bc_verbs).most_common(100)]
print(bad_verbs)

['is', 'be', 'has', 'have', 'do', 'make', 'see', 'get', 'take', 'does', 'go', 'know', 'say', 'give', 'let', 'find', 'come', 'seems', 'keep', 'think', 'tell', 'look', 'says', 'become', 'help', 'provide', 'makes', 'use', 'put', 'work', "didn't", 'bring', 'comes', 'show', 'leave', 'feel', 'meet', 'pay', 'gives', 'means', 'seem', 'hear', 'turn', 'remember', 'hold', 'try', 'determine', 'want', 'run', 'move', 'consider', 'knows', 'ask', 'live', 'write', 'understand', 'becomes', 'goes', 'mean', 'call', 'takes', 'believe', 'expect', 'develop', 'read', 'talk', 'appears', 'serve', 'provides', 'speak', 'stop', 'continue', 'play', 'build', 'remains', 'start', 'follows', 'stay', 'increase', 'prevent', 'reach', 'faces', 'looks', 'set', 'wants', 'add', 'stand', 'follow', 'like', 'carry', 'appear', 'cut', 'gets', 'shows', "it's", 'receive', 'send', 'reduce', 'allow', 'need']


In [21]:
from nltk import corpus
stops = corpus.stopwords.words('english')
stops[:10]

['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're"]

In [22]:
dont_like = ['have','come','go','same','join','same','year','like'] 
dont_like = dont_like + bad_nouns + bad_adjectives + bad_verbs + stops

for word in dont_like:
    cursor.execute("DELETE from allusions WHERE (target=?)", (word,))
connection.commit()

***
Total remaining allusions?

In [23]:
cursor.execute("SELECT COUNT(*) FROM allusions")

<sqlite3.Cursor at 0x7fc959a53ce0>

In [24]:
cursor.fetchall()

[(1297296,)]

***