In [53]:
## specify to print all output in a call
## and not just first
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

# Lists

In [1]:
angry_words = ["mad", "annoyed", "whyyyyyy", "seriously"]
formality = ["formal", "formal", "informal", "informal"]

In [2]:
print(angry_words[0])
new_angry_words = angry_words + ["forgot this angry word"]
new_angry_words


mad


['mad', 'annoyed', 'whyyyyyy', 'seriously', 'forgot this angry word']

In [3]:
sentence = "whyyyyyy did you do that"

## split sentence
split_sentence = sentence.split()
print(type(sentence))
print(type(split_sentence))
print(sentence)

## iterate over sentence and filter to 
## only angry words
only_angry = [word for word in sentence.split() if 
              word in angry_words]
only_angry

<class 'str'>
<class 'list'>
whyyyyyy did you do that


['whyyyyyy']

In [4]:
only_angry_sentence = " ".join(only_angry)
only_angry_sentence

'whyyyyyy'

# Dictionaries

Good for storing lists and can then retrieve using key rather than index. Unlike dataframes, don't need to be same size


In [5]:
formality_dictionary = {} 
formality_dictionary['angry_words'] = angry_words
formality_dictionary['formality'] = formality

In [6]:
formality_dictionary

{'angry_words': ['mad', 'annoyed', 'whyyyyyy', 'seriously'],
 'formality': ['formal', 'formal', 'informal', 'informal']}

# Transforming into a dataframe

In [10]:
import pandas as pd
import numpy as np

formality_df = pd.DataFrame(formality_dictionary)
formality_df.head()
formality_df['formality_binary'] = np.where(formality_df.formality == 
                                            "formal", 1, 0)
formality_df['word_and_formality'] = formality_df.angry_words + "_" + \
                                formality_df.formality
formality_df.head()

Unnamed: 0,angry_words,formality,formality_binary,word_and_formality
0,mad,formal,1,mad_formal
1,annoyed,formal,1,annoyed_formal
2,whyyyyyy,informal,0,whyyyyyy_informal
3,seriously,informal,0,seriously_informal


# Re for text wrangling

In [29]:
import re

## Task: find the part of the string with AU
strings = ["AmericanUniversity_datascience", 
           "Americanuniversity_datascience"]

## manually search each item
au_pattern = "(American[u|U]niversity)"
search_result = re.search(au_pattern, 
                          strings[0])
print(type(search_result))
print(search_result.group(1))
search_result = re.search(au_pattern, 
                          strings[1])
print(search_result.group(1))

## do via list iteration
all_results = [re.search(au_pattern, one_string).group(1) 
               for one_string in strings]
all_results


<class '_sre.SRE_Match'>
AmericanUniversity
Americanuniversity


['AmericanUniversity', 'Americanuniversity']

# String operator within pandas for text wrangling

Use airbnb listings data from here: https://www.kaggle.com/dgomonov/new-york-city-airbnb-open-data/data#

In [36]:
import glob

In [160]:
nyc_file = glob.glob("../data_small/*NYC*")
nyc_file

['../data_small/AB_NYC_2019.csv']

In [161]:
airbnb_nyc = pd.read_csv(nyc_file[0])

In [66]:
## view head of data
rel_cols = ['name'] + [col for col in airbnb_nyc.columns if 
                       "neighbourhood" in col]
airbnb_nyc[rel_cols].head()

## convert all words in the listing name to lowercase
airbnb_nyc['listing_lower'] = airbnb_nyc.name.str.lower()

## create a binary indicator for whether the review mentions
## the word "cozy"
airbnb_nyc['describes_cozy'] = np.where(airbnb_nyc.listing_lower.\
                                        str.contains("cozy"), 
                                        1, 0)

Unnamed: 0,name,neighbourhood_group,neighbourhood
0,Clean & quiet apt home by the park,Brooklyn,Kensington
1,Skylit Midtown Castle,Manhattan,Midtown
2,THE VILLAGE OF HARLEM....NEW YORK !,Manhattan,Harlem
3,Cozy Entire Floor of Brownstone,Brooklyn,Clinton Hill
4,Entire Apt: Spacious Studio/Loft by central park,Manhattan,East Harlem


In [67]:
airbnb_nyc[['listing_lower'] + ['describes_cozy']].head()

Unnamed: 0,listing_lower,describes_cozy
0,clean & quiet apt home by the park,0
1,skylit midtown castle,0
2,the village of harlem....new york !,0
3,cozy entire floor of brownstone,1
4,entire apt: spacious studio/loft by central park,0


In [86]:
## summarise proportion by neighborhood
airbnb_nyc.describes_cozy.value_counts()
count_neigh = pd.crosstab(airbnb_nyc.neighbourhood_group, 
                          airbnb_nyc.describes_cozy, 
                          normalize = "index")
count_neigh.columns = ["no_cozy", "cozy"]
count_neigh.sort_values(by = "cozy", ascending = False)

0    43768
1     5127
Name: describes_cozy, dtype: int64

Unnamed: 0_level_0,no_cozy,cozy
neighbourhood_group,Unnamed: 1_level_1,Unnamed: 2_level_1
Queens,0.861631,0.138369
Staten Island,0.873995,0.126005
Bronx,0.88451,0.11549
Brooklyn,0.897632,0.102368
Manhattan,0.902498,0.097502


# Create DTM

In [104]:
from nltk.tokenize import wordpunct_tokenize
from sklearn.feature_extraction.text import CountVectorizer

In [105]:
tokens = [wordpunct_tokenize(str(listing)) for listing in airbnb_nyc.listing_lower[0:5].tolist()]

In [132]:
list_of_strings = airbnb_nyc.listing_lower[0:20]
list_of_strings
vectorizer = CountVectorizer(lowercase = True)
dtm_sparse = vectorizer.fit_transform(list_of_strings)
dtm_dense_named = pd.DataFrame(dtm_sparse.todense(), columns=vectorizer.get_feature_names())
dtm_dense_named.head()

0                   clean & quiet apt home by the park
1                                skylit midtown castle
2                  the village of harlem....new york !
3                      cozy entire floor of brownstone
4     entire apt: spacious studio/loft by central park
5            large cozy 1 br apartment in midtown east
6                                      blissartsspace!
7                     large furnished room near b'way 
8                   cozy clean guest room - family apt
9                   cute & cozy lower east side 1 bdrm
10                    beautiful 1br on upper west side
11                     central manhattan/near broadway
12      lovely room 1, garden, best area, legal rental
13    wonderful guest bedroom in manhattan for singles
14                       west village nest - superhost
15                    only 2 stops to manhattan studio
16                   perfect for your parents + garden
17                                     chelsea perfect
18     hip

Unnamed: 0,1br,apartment,apt,area,backyard,bdrm,beautiful,bedroom,best,blissartsspace,...,the,to,upper,village,way,west,with,wonderful,york,your
0,0,0,1,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,1,0,0,1,0,0,0,0,1,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [133]:
from nltk.corpus import stopwords

In [138]:
list_stopwords = stopwords.words('english')
list_stopwords_new = list_stopwords + ['apartment', 'apt']

In [151]:
## example with one string
one_listing = airbnb_nyc.listing_lower[5]
one_listing

'large cozy 1 br apartment in midtown east'

In [152]:
## tokenize
one_listing_tokenized = wordpunct_tokenize(one_listing)
one_listing_tokenized

['large', 'cozy', '1', 'br', 'apartment', 'in', 'midtown', 'east']

In [153]:
## filter out stopwords
one_listing_nostop = [token for token in 
                    one_listing_tokenized if 
                    token not in list_stopwords_new]
one_listing_nostop


['large', 'cozy', '1', 'br', 'midtown', 'east']

In [157]:
from nltk.stem.porter import PorterStemmer
porter = PorterStemmer()
onelisting_nostop_nopunct_stemmed_onlywords = [porter.stem(token) 
                            for token in one_listing_nostop 
                            if token.isalpha() and 
                            len(token) > 2]
onelisting_nostop_nopunct_stemmed_onlywords 

['larg', 'cozi', 'midtown', 'east']

In [None]:
## recombine into string for further processing
one_listing_nostop_string = " ".join(one_listing_nostop)
one_listing_nostop_string