In [None]:
import pandas as pd

# Read the tweets CSV directly from its GitHub-hosted raw URL.
df = pd.read_csv(
    r'https://raw.githubusercontent.com/mawalz05/PLSC381B_Analyzing_Politics_Python/main/20201130_183128_blm_tweets.csv'
)

# Report the (rows, columns) shape, then preview the first rows.
print(df.shape)
df.head()

In [None]:
# Display the 'text' column, which the cells below tokenize.
df['text']

In [None]:
# Extract first 10 observations to speed up the process.
# Take an explicit copy: the cells below add columns to df_10, and assigning
# into a bare slice of df triggers pandas' SettingWithCopyWarning and leaves it
# ambiguous whether df itself is affected.
df_10 = df[:10].copy()

In [None]:
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
# Download every NLTK resource this notebook uses. word_tokenize depends on the
# Punkt tokenizer models, which are not bundled with the library; without them
# a fresh environment raises LookupError on the tokenizing line below.
nltk.download('wordnet')    # used by the WordNet lemmatizer
nltk.download('stopwords')  # used by the stop-word filter
nltk.download('punkt')      # Punkt models under their original resource name
nltk.download('punkt_tab')  # resource name looked up by newer NLTK releases

# Use the nltk tokenizer to tokenize the words and create a new column
df_10['tokens'] = df_10['text'].apply(word_tokenize)
df_10['tokens']

In [None]:
# Lowercase every token so later comparisons (e.g. against the stop-word list)
# are case-insensitive.
df_10['tokens'] = df_10['tokens'].apply(
    lambda tokens: [token.lower() for token in tokens]
)
df_10['tokens']

In [None]:
# Removing the stop words from the tokens.
# Build the stop-word collection once, as a set: calling
# stopwords.words('english') inside the comprehension re-evaluated it for
# every single token and searched it as a list.
stop_words = set(stopwords.words('english'))
df_10['tokens'] = df_10['tokens'].apply(lambda x: [word for word in x if word not in stop_words])
df_10['tokens']

In [None]:
# Display NLTK's English stop-word list (the words filtered out of the tokens above).
stopwords.words('english')

In [None]:
# Lemmatize each token with NLTK's WordNet lemmatizer and keep the result in a
# separate 'lem_tokens' column, leaving 'tokens' untouched.
lemma = nltk.stem.WordNetLemmatizer()
df_10['lem_tokens'] = df_10['tokens'].map(
    lambda tokens: [lemma.lemmatize(token) for token in tokens]
)
df_10['lem_tokens']

In [None]:
# Stemming the tokens (for automated response use): run every token through
# NLTK's Porter stemmer and store the stems in their own 'stem_tokens' column.
p_stemmer = nltk.stem.PorterStemmer()
df_10['stem_tokens'] = df_10['tokens'].map(
    lambda tokens: [p_stemmer.stem(token) for token in tokens]
)
df_10['stem_tokens']

In [None]:
# Keep only tokens whose length is at least 3 and below 20 characters,
# discarding very short and very long tokens.
df_10['tokens'] = df_10['tokens'].apply(
    lambda tokens: [token for token in tokens if 3 <= len(token) < 20]
)
df_10['tokens']

In [None]:
# Removing punctuation.
# Every entry in `punct` is at most two characters long, so an exact-match
# filter removes nothing once the preceding length filter has dropped tokens
# shorter than three characters, and it never catches longer punctuation-only
# tokens such as '...' or '!!!'. Instead, drop any token made up entirely of
# the punctuation characters listed here.
punct = ["," , "." , ":" , ";" , "'" , '"', '!', '?', '--', '(', ')', '@']
punct_chars = set(''.join(punct))
df_10['tokens'] = df_10['tokens'].apply(
    # `word and ...` leaves an empty string alone, as the exact-match filter did.
    lambda x: [word for word in x if not (word and all(ch in punct_chars for ch in word))]
)
df_10['tokens']

In [None]:
def word_process_counts(df):
    """Add per-row word statistics to ``df`` in place.

    Reads the ``'tokens'`` column, where each value is an iterable of words,
    and writes three new columns:

    * ``'word_counts'``      - dict mapping each word to its number of
      occurrences in that row, keyed in first-seen order.
    * ``'word_proportions'`` - dict mapping each word to its count divided by
      the row's total token count, rounded to two decimals.
    * ``'tweet_length'``     - number of tokens in the row.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a ``'tokens'`` column. It is modified in place.

    Returns
    -------
    None

    Raises
    ------
    KeyError
        If ``df`` has no ``'tokens'`` column.
    """
    # Local import keeps this definition self-contained within its cell.
    from collections import Counter

    def count_freq(tweet):
        """Return {word: occurrences} for one token list."""
        # Counter does a single pass with hashed lookups; converting back to a
        # plain dict keeps the stored values ordinary dictionaries.
        return dict(Counter(tweet))

    def count_prop(tweet):
        """Return {word: share of the row's tokens}, rounded to 2 decimals."""
        total_count = len(tweet)
        # An empty token list yields an empty dict, so no division by zero occurs.
        return {
            word: round(count / total_count, 2)
            for word, count in count_freq(tweet).items()
        }

    df['word_counts'] = df['tokens'].apply(count_freq)  # apply count_freq to the tokens column
    df['word_proportions'] = df['tokens'].apply(count_prop)  # apply count_prop to the tokens column
    df['tweet_length'] = df['tokens'].apply(len)  # number of tokens in each row

In [None]:
# Run word_process_counts on the 10-row sample. It returns nothing; it adds the
# 'word_counts', 'word_proportions', and 'tweet_length' columns to df_10 in place.
word_process_counts(df_10)

In [None]:
# Preview the first rows of df_10, including the columns added by word_process_counts.
df_10.head()

In [None]:
# Find the overall counts for all unique words in the df.
# explode() turns each token into its own row, so a single value_counts() tallies
# every word across all rows. This replaces applying the top-level
# pd.value_counts (deprecated in pandas 2.1) to each row, which built a wide,
# mostly-NaN intermediate frame and produced float counts.
counts = df_10['tokens'].explode().value_counts()
print(counts)

In [None]:
# Display the words ordered from most to least frequent. The sorted result is
# only shown here; it is not stored back into `counts`.
counts.sort_values(ascending = False)