# Import necessary packages
Test out the environment to make sure you have the packages needed to run this notebook.

In [None]:
# Test importing package/s.🤞 for no errors!
import pandas
import nltk

If you get any errors, please run the following cell to display Python's module search path (`sys.path`), which lists the directories Python looks in when importing packages. If you don't have any errors, there is no need to run the next cell.

In [None]:
import sys
# sys.path is the list of directories Python searches when importing modules
# (this is separate from the shell's PATH environment variable)
sys.path

# Download Airbnb data
We will be using data from public Airbnb listings for this class session. This data has been collected by [Inside Airbnb](https://insideairbnb.com/) and was used in an award-winning NLP paper ([Brunila et al. 2023](https://aclanthology.org/2023.emnlp-main.284/), if you're curious).

To download the data:
1. Go to https://insideairbnb.com/get-the-data/
2. Choose a city's data under **Data Downloads**.
3. Right-click `listings.csv.gz` and copy the link (URL).
4. Paste it as `listings_url` in the next cell.

In [None]:
listings_url = '' # FILL IN with your URL
output_filename = '' # FILL IN a name for your output file

# Download and unzip data
! wget {listings_url} -O {output_filename}.csv.gz 
! gunzip -c {output_filename}.csv.gz > {output_filename}.csv # This unzips the file into a regular CSV file

# Load data
Into a pandas dataframe

In [None]:
import pandas as pd

# Read the unzipped CSV produced by the download cell; output_filename must already be set there
listings = pd.read_csv(f'{output_filename}.csv') # reads CSV file into a pandas dataframe
listings.info() # prints basic information about this dataframe (columns, dtypes, non-null counts)
listings.head() # last expression in the cell, so the first 5 rows of the dataframe are displayed

In [None]:
# Expand pandas view (good for seeing more of text)
# None removes the column-width limit, so long descriptions are shown in full instead of being truncated
pd.set_option('display.max_colwidth', None)
listings[['description']].head() # Look at the description text column

# Cleaning with regular expressions
Remove any extraneous text with regular expression pattern matching.
We will use [pandas' built-in functions for processing strings](https://pandas.pydata.org/pandas-docs/stable/user_guide/text.html#string-methods) to do this. These functions apply the string transformation to each element in a column.

In [None]:
pattern_to_replace = r'' # FILL IN regular expression here
replace_with = ' ' # text substituted for each match; change it if you want a different replacement

# regex=True makes pandas treat pattern_to_replace as a regular expression rather than a literal string.
# The result goes into a new column so the original description stays available for comparison.
listings['description_processed'] = listings['description'].str.replace(pattern_to_replace, replace_with, regex=True)
listings[['description', 'description_processed']].head()

In [None]:
# Convert NaN values to empty strings
# so that later per-row steps which call string operations (tokenizing, stemming) receive text for every row
listings['description_processed'] = listings['description_processed'].fillna('')
listings[['description', 'description_processed']].head()

# Lowercasing

In [None]:
# Exercise: as written, this assignment leaves the column unchanged until the lowercasing call is added
listings['description_processed'] = listings['description_processed'] # FILL IN pandas string function to lowercase
listings[['description', 'description_processed']].head()

# Prepare to remove stopwords and punctuation
Stopwords are common "function words" that serve to connect other words and don't provide much new information. Examples are "to", "and", and "of".

We will start with a list from the `nltk` (Natural Language Toolkit) package and add punctuation, too.

In [None]:
import nltk
# Downloads the 'stopwords' resource that the stopword list in the next cell is read from
nltk.download('stopwords') # only need to do once

In [None]:
import string
import nltk

# Start from NLTK's English stopword list and show what it contains
stops = nltk.corpus.stopwords.words('english')
print(f'NLTK stopwords: {stops}')
print(len(stops), end='\n\n')

# Every punctuation character becomes its own single-character entry
punctuation = [char for char in string.punctuation]
print(punctuation, end='\n\n')

# Treat punctuation as stopwords too, then display the combined count
stops.extend(punctuation)
len(stops)

# Tokenization
Tokenization is the process of breaking text up into words! Here we will use the `nltk` package to tokenize.

In [None]:
# Only need to do once
# Downloads the 'punkt_tab' resource ahead of the nltk.word_tokenize call in the next cell
nltk.download('punkt_tab')

In [None]:
# Progress bar since it could take a while
from tqdm.auto import tqdm
tqdm.pandas()

# Set copy of the stopword list: a membership test on a set is constant-time,
# whereas `tok in stops` scans the whole list for every token of every description
stop_set = frozenset(stops)

# Apply tokenizer from nltk to column
# Also remove any tokens that are stopwords
def tokenize(text):
    """Tokenize `text` with NLTK and drop tokens found in the stopword list.

    Args:
        text: the string to tokenize.

    Returns:
        The remaining tokens joined back into one space-separated string.
    """
    tokens_list = nltk.word_tokenize(text)
    tokens_list_no_stops = [tok for tok in tokens_list if tok not in stop_set]
    return ' '.join(tokens_list_no_stops)

listings['description_processed'] = listings['description_processed'].progress_map(tokenize)
listings[['description', 'description_processed']].head()

# Stemming

In [None]:
# Progress bar since it takes a while
from tqdm.auto import tqdm
tqdm.pandas()

stemmer = nltk.PorterStemmer()

def stem(text):
    """Stem every whitespace-separated token in `text` with the Porter stemmer.

    Returns the stemmed tokens re-joined with single spaces.
    """
    return ' '.join(stemmer.stem(word) for word in text.split())

listings['description_processed'] = listings['description_processed'].progress_map(stem)
listings[['description', 'description_processed']].head()
listings[['description', 'description_processed']].head()