# Cleaning

Removing some unwanted target words from the allusion database.

In [1]:
import sqlite3
# Open the SQLite allusion database and create the cursor that the
# following cells use for every query.
DB_FILENAME = "wiki_to_use.db"
connection = sqlite3.connect(DB_FILENAME)
cursor = connection.cursor()

**Get rid of everything but `VERB`, `NOUN`, and `ADJ` targets.**

In [3]:
# Pull every (target, target_pos) pair out of the allusions table.
rows = cursor.execute("SELECT target,target_pos FROM allusions").fetchall()

# Separate list object holding the same row tuples.
word_pos_tups = list(rows)

In [4]:
# Number of (target, target_pos) rows fetched from the allusions table.
len(word_pos_tups)

2255871

In [5]:
# Peek at the first five (target, target_pos) pairs.
word_pos_tups[:5]

[('children', 'NOUN'),
 ('ease', 'VERB'),
 ('modest', 'ADJ'),
 ('juvenalian', 'ADJ'),
 ('satirical', 'ADJ')]

In [6]:
# Distinct part-of-speech tags found among the fetched targets.
# Built from a set, so the order of the resulting list is arbitrary.
all_target_tags = list({pos for _, pos in word_pos_tups})
all_target_tags

['ADV',
 'X',
 'PART',
 'NOUN',
 'PRON',
 'ADP',
 'SCONJ',
 'VERB',
 'SYM',
 'CCONJ',
 'DET',
 'PUNCT',
 'PROPN',
 'ADJ',
 'AUX',
 'INTJ',
 'NUM']

In [7]:
# Keep only ADJ / NOUN / VERB targets.
# A single DELETE ... NOT IN removes every other part of speech in one pass
# over the table, instead of one DELETE (and one table pass) per unwanted tag,
# and it does not depend on a tag list computed in an earlier cell.
# Rows whose target_pos is NULL are left alone, just as they were by the
# per-tag equality deletes (NULL never compares equal or unequal in SQL).
KEPT_POS_TAGS = ("ADJ", "NOUN", "VERB")
cursor.execute(
    "DELETE from allusions WHERE target_pos NOT IN (?, ?, ?)",
    KEPT_POS_TAGS,
)
connection.commit()

**Get rid of everything that isn't in WordNet with the correct part of speech.**

In [8]:
# Re-read the (target, target_pos) pairs currently stored in the table.
cursor.execute("SELECT target,target_pos FROM allusions")
rows = cursor.fetchall()

# Work on a shallow copy so `rows` and `word_pos_tups` stay distinct lists.
word_pos_tups = rows.copy()

In [9]:
import nltk

# The WordNet lookups below need the 'wordnet' corpus itself, not only the
# Open Multilingual WordNet data; downloading both lets this notebook run on a
# machine where neither has been fetched yet. Each call is a no-op when the
# package is already up to date.
nltk.download('wordnet')
nltk.download('omw-1.4')

[nltk_data] Downloading package omw-1.4 to /Users/kyle/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


True

In [10]:
from nltk.corpus import wordnet

In [11]:
#wordnet.synsets('dog','p')

In [12]:
# Peek at the first ten (target, target_pos) pairs; duplicates are expected
# because the list holds one entry per table row.
word_pos_tups[:10]

[('children', 'NOUN'),
 ('ease', 'VERB'),
 ('modest', 'ADJ'),
 ('juvenalian', 'ADJ'),
 ('satirical', 'ADJ'),
 ('british', 'ADJ'),
 ('juvenalian', 'ADJ'),
 ('satirical', 'ADJ'),
 ('essay', 'NOUN'),
 ('papist', 'NOUN')]

In [13]:
%%time

# Collect every distinct (target, POS) pair for which WordNet returns no
# synset. The WordNet POS code is the lowercase first letter of the tag
# ('NOUN' -> 'n', 'VERB' -> 'v', 'ADJ' -> 'a').
# Nothing is deleted here; the DELETE statements are issued in a later cell.
# A plain loop is used so `word` and `tag` stay bound in the notebook
# namespace after this cell finishes.
to_delete_based_on_wordnet = []
for word, tag in set(word_pos_tups):
    wordnet_pos = tag.lower()[0]
    synsets = wordnet.synsets(word.lower(), pos=wordnet_pos)
    if synsets:
        continue
    to_delete_based_on_wordnet.append((word, tag))

CPU times: user 3.81 s, sys: 60.5 ms, total: 3.87 s
Wall time: 3.9 s


In [14]:
# Number of distinct (target, POS) pairs with no WordNet synset.
len(to_delete_based_on_wordnet)

17131

In [15]:
from tqdm import tqdm

In [16]:
## Delete every (target, POS) pair that had no WordNet synset.
## Only the DELETEs are issued here; the WordNet lookup already happened when
## `to_delete_based_on_wordnet` was built, so nothing is recomputed per row.
## NOTE: deleting from sqlite was outrageously slow in the recorded run
## (~57 min for 17,131 pairs) -- presumably each DELETE scans the whole table
## because no index covers (target, target_pos); TODO confirm, and consider an
## index or an in-memory dictionary instead of sqlite.
cursor.executemany(
    "DELETE from allusions WHERE (target=? AND target_pos=?)",
    tqdm(to_delete_based_on_wordnet),  # tqdm only wraps the iterable for progress
)

connection.commit()

100%|█████████████████████████████████████| 17131/17131 [57:29<00:00,  4.97it/s]


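**Stopwords+** — remove NLTK's English stopwords, plus the most frequent Brown-corpus nouns, adjectives, and verbs and a few hand-picked words.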

In [17]:
from nltk import FreqDist
from nltk import corpus
from nltk import pos_tag

# Tag every token of the Brown corpus with nltk.pos_tag.
bc = corpus.brown.words()
bc_tagged = pos_tag(bc)


def _lowercased_tokens(tagged_tokens, wanted_tags):
    """Return the lowercased tokens whose POS tag is one of `wanted_tags`."""
    return [token.lower() for token, pos in tagged_tokens if pos in wanted_tags]


# Tokens tagged NN, JJ, and VB/VBZ respectively, in corpus order.
bc_nouns = _lowercased_tokens(bc_tagged, ("NN",))
bc_adjectives = _lowercased_tokens(bc_tagged, ("JJ",))
bc_verbs = _lowercased_tokens(bc_tagged, ("VB", "VBZ"))

In [18]:
# The 25 most frequent Brown-corpus nouns; these are removed from the
# database further down.
noun_counts = FreqDist(bc_nouns)
bad_nouns = [noun for noun, _ in noun_counts.most_common(25)]
bad_nouns

['time',
 'man',
 'way',
 'world',
 'life',
 'year',
 'day',
 'work',
 'state',
 'home',
 'place',
 'part',
 'number',
 'course',
 'fact',
 'water',
 'something',
 'hand',
 'school',
 'head',
 'house',
 'night',
 'nothing',
 'system',
 'group']

In [19]:
# The 30 most frequent Brown-corpus adjectives; these are removed from the
# database further down.
adjective_counts = FreqDist(bc_adjectives)
bad_adjectives = [entry[0] for entry in adjective_counts.most_common(30)]
bad_adjectives

['other',
 'such',
 'new',
 'first',
 'many',
 'good',
 'little',
 'own',
 'same',
 'last',
 'great',
 'much',
 'few',
 'old',
 'small',
 'high',
 'american',
 'long',
 'possible',
 'several',
 'important',
 'next',
 'large',
 'young',
 'social',
 'second',
 'big',
 'present',
 'public',
 'general']

In [20]:
# The 50 most frequent Brown-corpus verbs (VB/VBZ tokens); these are removed
# from the database further down.
top_verb_pairs = FreqDist(bc_verbs).most_common(50)
bad_verbs = [verb for verb, _freq in top_verb_pairs]
bad_verbs

['is',
 'be',
 'has',
 'have',
 'do',
 'make',
 'see',
 'get',
 'take',
 'does',
 'go',
 'know',
 'say',
 'give',
 'let',
 'find',
 'come',
 'seems',
 'keep',
 'think',
 'tell',
 'look',
 'says',
 'become',
 'help',
 'provide',
 'makes',
 'use',
 'put',
 'work',
 "didn't",
 'bring',
 'comes',
 'show',
 'leave',
 'feel',
 'meet',
 'pay',
 'gives',
 'means',
 'seem',
 'hear',
 'turn',
 'remember',
 'hold',
 'try',
 'determine',
 'want',
 'run',
 'move']

In [21]:
from nltk import corpus
# NLTK's English stopword list; only the first ten entries are displayed.
stops = corpus.stopwords.words('english')
stops[:10]

['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're"]

In [22]:
# Hand-picked words plus the frequent Brown-corpus nouns/adjectives/verbs and
# the NLTK stopwords.
dont_like = ['have','come','go','same','join','same','year']
dont_like = dont_like + bad_nouns + bad_adjectives + bad_verbs + stops

# The combined list repeats many words (e.g. 'same' appears twice in the
# literal, 'work' is both a frequent noun and a frequent verb), and every
# repeat would issue another DELETE that cannot match anything. De-duplicate
# while preserving order so each word is deleted exactly once.
unique_targets = list(dict.fromkeys(dont_like))

# The match is on the target text only, so a word is removed under every
# part of speech it is stored with.
cursor.executemany(
    "DELETE from allusions WHERE (target=?)",
    ((unwanted,) for unwanted in unique_targets),
)
connection.commit()

***
Total remaining allusions?

In [23]:
# Count the rows left in the allusions table; the result is read from the
# cursor in the next cell.
cursor.execute("SELECT COUNT(*) FROM allusions")

<sqlite3.Cursor at 0x7f8a38973d50>

In [24]:
# Fetch the result of the COUNT(*) query: a single one-element row.
cursor.fetchall()

[(1439179,)]

***