# Text preprocessing

## Imports

In [1]:
from src.WeatherSentimentData import WeatherSentimentData
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
import pandas as pd

## Downloads

In [2]:
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('omw-1.4')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\mmakaranka\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\mmakaranka\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\mmakaranka\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\mmakaranka\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

### Data

In [3]:
weather_data = WeatherSentimentData('data', use_generated_data=True)
df = weather_data.full_data
df['tweet_text']

0       Grilling kabobs on the grill last night was am...
1       The slowest day ever !! And the weather makes ...
2       Fire Weather Watch issued May 17 at 4:21PM CDT...
3       Im going to lunch early today.   The weather i...
4       Weekend Weather Causes Delays In I-270 Bridge ...
                              ...                        
1469    Going to the gym with my cousin. We're going t...
1470    Today is a nice day. The air is fresh and the ...
1471       I hate this weather. It's so hot and dry. #hot
1472    @mention That's excellent. I'm glad you liked ...
1473    Going to the market with my grandma. We're goi...
Name: tweet_text, Length: 1474, dtype: object

## Converting to lowercase

In [4]:
processed_text = df['tweet_text'].str.lower()

## Text cleaning

### Removing URLs and mentions

In [5]:
processed_text = processed_text.replace('@mention', '', regex=False)
processed_text = processed_text.replace('{link}', '', regex=False)

### Removing remove non-word and non-whitespace characters

In [6]:
processed_text = processed_text.replace(to_replace=r'[^\w\s]', value='', regex=True)

### Removing digits

In [7]:
processed_text = processed_text.replace(to_replace=r'\d', value='', regex=True)

## Tokenization

In [8]:
processed_text = processed_text.apply(word_tokenize)

## Stopwords removal

In [9]:
def filter_stopwords(tweet, stopwords):
    return [word for word in tweet if word not in stopwords]

stopword_list = stopwords.words('english')
processed_text = processed_text.apply(filter_stopwords, stopwords=stopword_list)

## Stemming

In [10]:
stemmer = PorterStemmer()

def stem_tweet(tweet):
    return [stemmer.stem(word) for word in tweet]

processed_text = processed_text.apply(stem_tweet)
df['tweet_text'] = processed_text

## Word frequencies

In [11]:
counts = df["tweet_text"].apply(lambda x: pd.Series(x).value_counts()).sum(axis=0)

In [12]:
counts.sort_values(ascending=False).head(20)

weather    585.0
mention    486.0
link       378.0
im         172.0
go         171.0
day        165.0
rt         136.0
storm      132.0
today      124.0
hot        116.0
f          102.0
sunni      101.0
feel        99.0
sunshin     83.0
humid       75.0
love        74.0
like        73.0
snow        72.0
rain        71.0
need        70.0
dtype: float64