In [64]:
import pandas as pd
import numpy as np
import statsmodels.formula.api as smf
import matplotlib.pylab as plt
import math
from scipy import stats
import geopandas as gpd
%matplotlib inline
from functools import reduce

In [65]:
import importlib
import mr_word_count
importlib.reload(mr_word_count)
from mr_word_count import MRWordFrequencyCount
from mrjob.job import MRJob
import mapreduce as mr

In [66]:
import nltk
from bs4 import BeautifulSoup
import string
from nltk.corpus import stopwords
from nltk.tokenize import RegexpTokenizer
from nltk.stem import WordNetLemmatizer
from nltk.stem.porter import PorterStemmer

In [67]:
# Load the Inwood dataset: one text description per row, plus ZIPCODE and Neighborhood columns.
df = pd.read_csv('Inwood.csv')

In [68]:
# Quick look at the first rows. "Unnamed: 0" is an unnamed column read from the CSV
# (presumably a saved row index — confirm against how Inwood.csv was written).
df.head()

Unnamed: 0,ZIPCODE,description,Neighborhood
0,10034,"At the northernmost end of Manhattan Island, i...",Inwood
1,10034,From the beginning. Guadalupe Bar and Grill ha...,Inwood
2,10034,"""We offer a new concept, dedicated to rescue o...",Inwood
3,10040,"The Cloisters museum and gardens, the branch o...",Inwood
4,10034,The Dyckman Farmhouse Museum is a visual treat...,Inwood


In [69]:
# Keep only the first 161 characters of every description.
# NOTE(review): the reason for the 161-character cutoff is not documented here — confirm.
df['description'] = df['description'].str.slice(stop=161)

In [70]:
# The blog post linked below explains what each NLTK step used here does (tokenizing, stopword removal, lemmatizing, etc.).
# The lambda/apply calls and helper functions in the following cells are adapted from that post.

# https://towardsdatascience.com/nlp-for-beginners-cleaning-preprocessing-text-data-ae8e306bef0f


In [71]:
# The tokenizer turns each description string into a list of words by splitting on
# runs of whitespace (gaps=True means the pattern matches the separators, not the tokens).
# The pattern is a raw string because '\s' in a normal string literal is an invalid
# escape sequence and triggers a warning on recent Python versions.

tokenizer = RegexpTokenizer(r'\s+', gaps=True)

In [72]:
def _lowercase_and_tokenize(text):
    """Lower-case one description and split it into whitespace-separated tokens."""
    return tokenizer.tokenize(text.lower())


df['description'] = df['description'].apply(_lowercase_and_tokenize)

In [73]:
# Inspect the token lists; tokens still carry attached punctuation at this point
# (e.g. "beginning." and "concept,").
df['description'].head(10)

0    [at, the, northernmost, end, of, manhattan, is...
1    [from, the, beginning., guadalupe, bar, and, g...
2    ["we, offer, a, new, concept,, dedicated, to, ...
3    [the, cloisters, museum, and, gardens,, the, b...
4    [the, dyckman, farmhouse, museum, is, a, visua...
5    [this, marvelous, experience, came, as, a, res...
6    [we, have, expanded, to, a, 1,500, square, foo...
7    [this, business, was, founded, on, the, premis...
8    [havana, tacos, was, inspired, by, a, diverse,...
9    [in, what, was, once, a, desolate, industrial,...
Name: description, dtype: object

In [74]:
def remove_stopwords(text):
    """Return the tokens of ``text`` that are not English stopwords.

    Parameters
    ----------
    text : iterable of str
        Tokens of one description. The comparison is case-sensitive, so tokens
        should already be lower-cased.

    Returns
    -------
    list of str
        The non-stopword tokens, in their original order.
    """
    # Fetch the stopword list once per call rather than calling
    # stopwords.words('english') again for every token; a set also makes each
    # membership test a hash lookup instead of a list scan.
    english_stopwords = set(stopwords.words('english'))
    return [w for w in text if w not in english_stopwords]

In [75]:
# Drop English stopwords from every tokenized description.
df['description'] = df['description'].apply(remove_stopwords)

In [76]:
# Lemmatization maps a word to its dictionary form; in the output shown in this
# notebook, "cloisters" becomes "cloister" and "tacos" becomes "taco".
# Background: see the Towards Data Science post linked earlier in this notebook.

lemmatizer = WordNetLemmatizer()

def word_lemmatizer(text):
    """Lemmatize every token in ``text`` and return the results as a list."""
    return list(map(lemmatizer.lemmatize, text))

In [77]:
# Store the lemmatized tokens back on the frame. Previously the result was only
# displayed and then discarded, so every later step counted un-lemmatized words.
# Like the earlier preprocessing cells, this overwrites the column in place, so run
# the notebook top to bottom rather than re-running cells out of order.

df['description'] = df['description'].apply(word_lemmatizer)
df['description']

0     [northernmost, end, manhattan, island,, shadow...
1     [beginning., guadalupe, bar, grill, built, rep...
2     ["we, offer, new, concept,, dedicated, rescue,...
3     [cloister, museum, gardens,, branch, metropoli...
4     [dyckman, farmhouse, museum, visual, treat, ev...
5     [marvelous, experience, came, result, influenc...
6     [expanded, 1,500, square, foot, store, include...
7     [business, founded, premise, family., el, nuev...
8     [havana, taco, inspired, diverse, cultural, bl...
9     [desolate, industrial, district,, come, next, ...
10    [experience, essence, latin, fusion, cuisine,,...
11    [lower, level, trie, cloister, cloister, museu...
12    [wahi’s, industrial, open, space, brings, comm...
13    [located, manhattan’s, historic, inwood, neigh...
14    [specialize, running, pool, hall, caters, ages...
15    [come, enjoy, good, family, atmosphere,, find,...
16    [bocaditos, bistro, opened, door, september, 2...
17    [established, 1996,, genesis, first, ecuad

In [78]:
# Normalize dtypes before counting words:
#   ZIPCODE     -> int
#   description -> str, i.e. each token list becomes its printed form,
#                  such as "['northernmost', 'end', ...]"

for column, target_type in (('ZIPCODE', int), ('description', str)):
    df[column] = df[column].astype(target_type)

In [79]:
# Concatenate every row's description text into one long string for word counting.
# sep=' ' matters: without a separator the last word of one row and the first word
# of the next are glued together (e.g. "...'last']['first', ...") and end up being
# counted as a single bogus word after whitespace splitting.

a = df['description'].str.cat(sep=' ')

In [80]:
# words_a: list of every whitespace-separated token in the combined text `a`
# wordCount_a: Counter mapping each token to the number of times it occurs
# (the original notes mention eight per-neighborhood variants; only this one appears in this section)


from collections import Counter

words_a = a.split()
wordCount_a = Counter(words_a)  # NOTE(review): originally labelled "Chelsea", but this notebook loads Inwood.csv

In [81]:
# Turn the token counts into a tidy (word, count) table. Building it from the
# (token, count) pairs also yields the right columns when the Counter is empty.
df1 = pd.DataFrame(list(wordCount_a.items()), columns=['word', 'count'])

# Strip punctuation (list brackets, quotes, commas, periods, ...) from each token.
# regex=True is required: since pandas 2.0, str.replace treats the pattern as a
# literal string by default, which would leave every token untouched.
df1['word'] = df1['word'].str.replace(r'[^\w\s]+', '', regex=True)

# Discard tokens that were pure punctuation, then merge tokens that became
# identical after cleaning (e.g. "gardens," and "gardens") by summing their counts.
df1 = df1[df1.word != '']
df1 = df1.groupby(['word'], as_index=False)['count'].sum()
# Optional filters for rare words, left disabled:
# df1.drop( df1[ df1['count'] == 1 ].index , inplace=True)
# df1.drop( df1[ df1['count'] == 2 ].index , inplace=True)

In [97]:
# "york" is assumed to occur only as part of "new york", so it is removed and its
# count is subtracted from "new", leaving just the standalone uses of "new".
york_rows = df1['word'] == 'york'
new_rows = df1['word'] == 'new'

if york_rows.any():
    york_count = df1.loc[york_rows, 'count'].sum()
    # .sum() yields 0 when "new" is absent, whereas .item() would raise ValueError.
    new_count = df1.loc[new_rows, 'count'].sum()
    standalone_new = new_count - york_count

    if standalone_new > 0:
        df1.loc[new_rows, 'count'] = standalone_new
        df1 = df1.drop(index=df1.index[york_rows])
    else:
        # Every "new" is accounted for by "new york" (or "york" outnumbers "new"):
        # drop both rather than keep a zero or negative count.
        df1 = df1.drop(index=df1.index[york_rows | new_rows])

In [98]:
# Order the words from most to least frequent.
df1 = df1.sort_values('count', ascending=False)

In [None]:
# Keep the 150 most frequent words (df1 is sorted by descending count at this point).
# NOTE: despite its name, df100 holds up to 150 rows, not 100.

df100 = df1.iloc[:150]

In [None]:
# Replace the index left over from sorting with a clean 0..n-1 index, then preview the top rows.
df100 = df100.reset_index(drop=True)
df100.head()