In [153]:
# Utilities
import pandas as pd    
import numpy as np 
import string
import re

In [154]:
# This is where your jsonToCsv file will come in
# A single dataframe represents 1 hour of 1 day and can range from 30-70k tweets
df = pd.read_json('test.jsonl.gz', lines=True, compression='gzip')      

In [155]:
# Filter tweets so that they include Trump
df_filtered = df.loc[df['full_text'].str.contains(r"(trump|donald|donaldtrump)",  re.IGNORECASE)]

In [156]:
# Feature extract hashtags from tweet 
df_filtered['hashtag'] = df_filtered['full_text'].apply(lambda x: re.findall(r"#(\w+)", x))

In [157]:
# https://pypi.org/project/tweet-preprocessor/
# !pip install tweet-preprocessor
# Cleans tweets of URLs, mentions, reserved words (RT, FAV), Emokis, Smileys
import preprocessor as p

In [158]:
# Important libraries for preprocessing using NLTK
# For lemmatization and tokenization
# !pip install nltk
import nltk
from nltk import word_tokenize, FreqDist
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
nltk.download
nltk.download('wordnet')
nltk.download('stopwords')
from nltk.tokenize import TweetTokenizer

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\jorda\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\jorda\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [159]:
# Apply clean function form tweet-preprocessing,
# then remove Digits, lower the text, 
# remove punctuation and remove extra whitespace 
def preprocessing(row):
    text = row['full_text']
    text = p.clean(text)
    nodigit_text = text.replace('\d+', '')
    lower_text = nodigit_text.lower()
    cleaned_text = lower_text.replace('[^\w\s]',' ').replace('\s\s+', ' ')
    remove_punct_text = re.sub(r'[^\w\s]','', cleaned_text) # Need to further test this, not working atm
    return cleaned_text

In [160]:
df_filtered['cleaned_tweet'] = df_filtered.apply(preprocessing, axis=1)

In [161]:
df_filtered['cleaned_tweet']

105      you will get the votes from trumpublican trait...
170      : newspaper endorsements:ia: quad city times i...
190      they still need closing arguments and delibera...
265                                                       
385      we cut back on education funds which lead to b...
                               ...                        
35528                                                     
35574                : mcconnell just called to coordinate
35686    : *its one day after impeachment*me: hey siri,...
35697    options arent facts. wouldnt expect a trump su...
35937        refuses to call votes. talk about do nothing.
Name: cleaned_tweet, Length: 400, dtype: object

In [148]:
def remove_stopwords(word_list):
    return [word for word in word_list if word not in stopwords.words('english')]

In [149]:
 df_filtered['cleaned_tweet'] = df_filtered['cleaned_tweet'].apply(remove_stopwords)

In [162]:
lemmatizer = nltk.stem.WordNetLemmatizer()
w_tokenizer = TweetTokenizer()

def lemmatize_text(text):
    return [(lemmatizer.lemmatize(w)) for w in w_tokenizer.tokenize((text))]

In [163]:
 df_filtered['lemm_tweet'] = df_filtered['cleaned_tweet'].apply(lemmatize_text)