In [34]:
#import packages
import numpy as np
import csv
import string
import nltk
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
import tweepy
import re 
import sys

In [35]:
# download nltk packages
# punkt    -> tokenizer models used by nltk.word_tokenize
# wordnet  -> lexical database used by WordNetLemmatizer
# stopwords -> corpus read by stopwords.words('english'); without it a fresh
#              environment raises LookupError when stopwords are first loaded
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to /Users/mark/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /Users/mark/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [54]:
# define helper functions

def remove_punctuations(text):
    """
        Function: strips escaped emoji / unicode sequences, escaped-newline
        fragments and punctuation from a line of text. Every match is
        replaced by a single space; the cleaned string is returned.
    """
    # substitutions run in this order:
    #   1. literal backslash + "U00" followed by non-whitespace (escaped emojis)
    #   2. literal backslash + "u" followed by non-whitespace (escaped unicode)
    #   3. literal backslash + "n" followed by non-whitespace (escaped newline debris)
    #   4. any single character that is neither a word character nor whitespace
    patterns = (
        r'\\U00\S+',
        r'\\u\S+',
        r'\\n\S+',
        r'[^\w\s]',
    )

    cleaned = text
    for pattern in patterns:
        cleaned = re.sub(pattern, ' ', cleaned)

    return cleaned

def remove_links(tweet):
    """
        Function: takes a string and removes web links from it

        Removes, in order: anything starting with "http" up to the next
        whitespace, "bit.ly/..." short links, and "bitly/..." fragments.
        Returns the string with each match replaced by the empty string.
    """

    # remove links
    tweet = re.sub(r'http\S+', '', tweet)  # remove http(s) links
    # the dot is escaped so that only a literal "bit.ly/" counts as a link;
    # an unescaped "." would also match any other character in that position
    tweet = re.sub(r'bit\.ly/\S+', '', tweet)  # remove bit.ly links
    tweet = re.sub(r'bitly/\S+', '', tweet)  # remove bitly links

    # return cleaned text
    return tweet

def remove_users(tweet):
    """
        Function: takes a string and removes retweet and @user information

        The retweet marker is matched case-insensitively, so both "RT @user"
        and "rt @user" are removed (this notebook lowercases tweets before
        calling this function). Returns the string with the matches deleted.
    """
    # remove retweet marker plus the retweeted user;
    # \b keeps words that merely end in "rt" (e.g. "support @user") intact
    tweet = re.sub(r'\bRT\s@[A-Za-z]+[A-Za-z0-9-_]+', '', tweet, flags=re.IGNORECASE)

    # remove any remaining @user mentions
    tweet = re.sub(r'@[A-Za-z]+[A-Za-z0-9-_]+', '', tweet)

    return tweet

def word_lemmatizer(tweet):
    """
        Function: tokenizes a string with nltk.word_tokenize, lemmatizes each
        token with WordNetLemmatizer, and returns the lemmas joined by
        single spaces
    """
    wnl = WordNetLemmatizer()

    # split the sentence into word tokens
    tokens = nltk.word_tokenize(tweet)

    # lemmatize token by token, then glue the result back together
    lemmas = map(wnl.lemmatize, tokens)
    return ' '.join(lemmas)

def eliminate_stopwords(tweet, stop_words=None):

    """
        Function: takes a tweet, separates it by whitespace delimiter, then takes out stopwords (e.g., "the", "and")

        tweet:      string to clean
        stop_words: optional collection of words to drop. When omitted, the
                    NLTK English stopword list is loaded on every call, so
                    callers cleaning many tweets can load the set once and
                    pass it in.

        Returns the remaining words joined by single spaces.
    """
    # fall back to the NLTK English stopwords when no list is supplied
    if stop_words is None:
        stop_words = set(stopwords.words('english'))

    # keep only the words that are not stopwords
    kept_words = [w for w in tweet.split() if w not in stop_words]

    # re-join tweet
    return ' '.join(kept_words)

In [37]:
###### import data ######
data_dir = "../data/labeled_tweets_climateChange.csv"

# newline='' hands line-ending handling to the csv module, so quoted fields
# that contain embedded newlines are parsed as a single field
with open(data_dir, newline='') as csv_file:
    readCSV = csv.reader(csv_file, delimiter = ',')

    # skip the header row, whose second column just has "tweet"
    next(readCSV, None)

    # keep tweets in array: the second column of every remaining row
    tweet_text = [row[1] for row in readCSV]

In [38]:
# English stopword list, read through the stopwords corpus imported above
stopwords_list = stopwords.words('english')

In [39]:
# lowercase every raw tweet
lowercase_tweets = list(map(str.lower, tweet_text))

In [40]:
# strip web links from each lowercased tweet
tweet_no_link = list(map(remove_links, lowercase_tweets))

In [41]:
# strip retweet markers and @user mentions from each tweet
tweet_no_users = list(map(remove_users, tweet_no_link))

In [42]:
# strip escaped emoji sequences and punctuation from each tweet
tweet_no_punctuation = list(map(remove_punctuations, tweet_no_users))

In [43]:
# tokenize and lemmatize each tweet
tweets_lemmatized = list(map(word_lemmatizer, tweet_no_punctuation))

In [55]:
# drop English stopwords from each lemmatized tweet
tweets_no_stopwords = list(map(eliminate_stopwords, tweets_lemmatized))

In [87]:
# export as csv
# Each cleaned tweet is written as its own one-column row. Writing all tweets
# as a single record with a newline delimiter turned a tweet that cleaned
# down to an empty string into a blank line, which CSV readers drop or
# return as an empty row; as a one-field row it is written as a quoted
# empty field, so every tweet keeps its own row.
with open("../data/cleaned_tweets.csv", 'w', newline = '') as csv_file:
    wr = csv.writer(csv_file)
    wr.writerows([tweet] for tweet in tweets_no_stopwords)