In [26]:
import pandas as pd
import sqlite3
from sklearn.base import BaseEstimator, TransformerMixin
import nltk
import string
from nltk import word_tokenize
from nltk.stem import WordNetLemmatizer
import re
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to /home/lwgray/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.


True

In [2]:
with sqlite3.connect('data/pitchfork.sqlite') as con:
    pf_scores = pd.read_sql_query('select reviewid, score from reviews', con)
    pf_content = pd.read_sql_query('select * from content', con)

In [5]:
result = pd.merge(pf_scores, pf_content, on='reviewid')

In [38]:
class TextNormalizer(BaseEstimator, TransformerMixin):
    """ tokenize, remove punctuation, remove stopwords, lemmatize """
    
    def __init__(self):
        self.wnl = WordNetLemmatizer()
        
    def fit(self, x, y=None):
        return self

    def transform(self, reviews):
        WORDS = nltk.corpus.stopwords.words('english')
        PUNCT = list(string.punctuation)
        STOPWORDS = set(WORDS).union(PUNCT)
        reviews = [ re.sub('[^a-z\s]', '', review.lower()) for review in reviews]                 # get rid of noise, lowercase
        words = [self.wnl.lemmatize(word) for review in reviews for word in review.split() if word not in set(STOPWORDS)]  # remove stopwords & lemmatize                                                                 e
        return words

In [43]:
a = TextNormalizer()
b = a.transform(result['content'][:3])
b

['triphop',
 'eventually',
 'became',
 'punchline',
 'musicpress',
 'shorthand',
 'overhyped',
 'hotel',
 'lounge',
 'music',
 'today',
 'muchmaligned',
 'subgenre',
 'almost',
 'feel',
 'like',
 'secret',
 'precedent',
 'listen',
 'canonical',
 'bristolscene',
 'album',
 'midlate',
 'genre',
 'starting',
 'chafe',
 'boundary',
 'youd',
 'think',
 'claustrophobic',
 'anxious',
 'st',
 'century',
 'started',
 'year',
 'ahead',
 'schedule',
 'looked',
 'right',
 'angle',
 'triphop',
 'part',
 'unbroken',
 'chain',
 'run',
 'abrasion',
 'postpunk',
 'ruminative',
 'poprbdance',
 'fusion',
 'moment',
 'best',
 'aged',
 'far',
 'gracefully',
 'forcefully',
 'anything',
 'recorded',
 'waning',
 'day',
 'record',
 'industry',
 'prefilesharing',
 'monomania',
 'right',
 'tricky',
 'rebelled',
 'attached',
 'hip',
 'scene',
 'already',
 'looking',
 'shed',
 'decamped',
 'jamaica',
 'record',
 'aggressive',
 'bristlingenergy',
 'mutation',
 'style',
 'name',
 'premillennium',
 'tension',
 'obvio

In [31]:
words = [ re.sub('[^a-z\s]', '', word.lower()) for word in result.content] 

In [32]:
words[0]

'triphop eventually became a s punchline a musicpress shorthand for overhyped hotel lounge music but today the muchmaligned subgenre almost feels like a secret precedent listen to any of the canonical bristolscene albums of the midlate s when the genre was starting to chafe against its boundaries and youd think the claustrophobic anxious st century started a few years ahead of schedule looked at from the right angle triphop\xa0is part of an unbroken chain that runs from the abrasion of s postpunk to the ruminative poprbdance fusion of the moment\xa0the best of it has aged far more gracefully and forcefully than anything recorded in the waning days of the record industrys prefilesharing monomania has any right to tricky rebelled against being attached at the hip to a scene he was already looking to shed and decamped for jamaica to record a more aggressive bristlingenergy mutation of his style in  the name\xa0premillennium tension is the only obvious thing that tells you its two decades 

In [None]:
user_defined_stop_words = ['st','rd','hong','kong'] 

i = nltk.corpus.stopwords.words('english')
j = list(string.punctuation) + user_defined_stop_words

stopwords = set(i).union(j)

def preprocess(x):
    x = re.sub('[^a-z\s]', '', x.lower())                  # get rid of noise
    x = [w for w in x.split() if w not in set(stopwords)]  # remove stopwords
    return ' '.join(x) 

df['Clean_addr'] = df['Adj_Addr'].apply(preprocess)