# Import necessary packages
Test out the environment to make sure you have the packages needed to run this notebook.

In [None]:
# Test importing package/s. 🤞 for no errors!
import pandas
import nltk

# Load Airbnb data
We will be using the same Airbnb listing data from a previous session. Check the Files sidebar on the left of JupyterLab to see if you still have that data available. 

If so, simply <span style="color:red">fill in the name of the CSV file with the listings below</span>. If not, open and run **session3_pandas.ipynb** to download Airbnb listings from a city of your choice.

In [None]:
import pandas as pd

# FILL IN: name of the listings CSV file, including the ".csv" extension.
listings_filename = ''

# Load the CSV file into a pandas DataFrame.
listings = pd.read_csv(filepath_or_buffer=listings_filename)

# Print basic information about the DataFrame.
listings.info()

# Display the first 5 rows as the cell output.
listings.head(n=5)

In [None]:
# Remove the column-width limit so long text is shown in full.
pd.options.display.max_colwidth = None
# Preview the description text column (first 5 rows).
listings[['description']].head(n=5)

# Cleaning with regular expressions
We'll be preprocessing the **description** column to get the text in a nice format for text analysis. First, we'll remove any extraneous text with regular expression pattern matching.
We will use [pandas' built-in functions for processing strings](https://pandas.pydata.org/pandas-docs/stable/user_guide/text.html#string-methods) to do this. These functions apply the string transformation to each element in a column.

<span style="color:red">Fill in a regular expression to match text that should be removed or replaced below.</span>

In [None]:
# FILL IN: regular expression matching the text to remove or replace.
# NOTE: an empty pattern matches at every position, so fill this in first.
pattern_to_replace = r''
# Text substituted for each match (optionally change this).
replace_with = ' '

# Apply the substitution to every element of the description column;
# regex=True makes pandas treat the pattern as a regular expression.
listings['description_processed'] = listings['description'].str.replace(
    pat=pattern_to_replace,
    repl=replace_with,
    regex=True,
)
# Show the original and processed text side by side.
listings[['description', 'description_processed']].head(n=5)

In [None]:
# Replace missing (NaN) values with empty strings.
listings['description_processed'] = listings['description_processed'].fillna(value='')
# Show the original and processed text side by side.
listings[['description', 'description_processed']].head(n=5)

# Prepare to remove stopwords and punctuation
Stopwords are common "function words" that serve to connect other words and don't provide much new information. Examples are "to", "and", and "of".

We will start with a list from the `nltk` (Natural Language Toolkit) package and add punctuation, too.

In [None]:
import nltk
# Download the NLTK stopword lists only when they are not already installed,
# so re-running this cell (e.g. Restart & Run All) does not fetch them again.
try:
    nltk.data.find('corpora/stopwords')
except LookupError:
    nltk.download('stopwords')

In [None]:
import string
import nltk

# Start from NLTK's English stopword list.
stops = nltk.corpus.stopwords.words('english')
print(f'NLTK stopwords: {stops}')
print(len(stops))
print()

# Every punctuation character, as a list of single-character strings.
punctuation = [char for char in string.punctuation]
print(punctuation)
print()

# Treat punctuation as stopwords too, then show the combined count.
stops.extend(punctuation)
len(stops)

# Tokenization
Tokenization is the process of breaking text up into words! Here we will use the `nltk` package to tokenize.

In [None]:
# Download the Punkt tokenizer data only when it is not already installed,
# so re-running this cell (e.g. Restart & Run All) does not fetch it again.
try:
    nltk.data.find('tokenizers/punkt_tab')
except LookupError:
    nltk.download('punkt_tab')

In [None]:
# Progress bar, since tokenizing could take a while
from tqdm import tqdm
# Register tqdm's progress_* methods (such as progress_map) on pandas objects
tqdm.pandas()

def tokenize(text):
    """Tokenize ``text`` with NLTK and drop stopword and punctuation tokens.

    Args:
        text: The string to tokenize. Non-string values (for example NaN for
            a missing description) are treated as empty text.

    Returns:
        The remaining tokens, in their original case, joined by single
        spaces. An empty string is returned when nothing remains.
    """
    # Guard against NaN/None so the function is safe even if missing values
    # were not filled in beforehand.
    if not isinstance(text, str):
        return ''
    # Set membership is O(1) per token, versus scanning the ``stops`` list.
    stop_set = set(stops)
    # NLTK's stopword list is lowercase, so compare the lowercased token;
    # otherwise capitalized stopwords such as "The" would be kept.
    kept_tokens = [
        tok for tok in nltk.word_tokenize(text) if tok.lower() not in stop_set
    ]
    return ' '.join(kept_tokens)
    
# Tokenize every processed description (with a progress bar) and store the
# result back in the same column.
listings['description_processed'] = (
    listings['description_processed'].progress_map(tokenize)
)
# Show the original descriptions next to the tokenized text.
listings[['description', 'description_processed']].head(n=5)