# Twitter Cleaning

In [1]:
import pandas as pd
import re

from string import punctuation
from nltk import word_tokenize
from nltk.tokenize import RegexpTokenizer
from nltk.stem import WordNetLemmatizer, PorterStemmer
from nltk.corpus import stopwords

In [2]:
# Load the raw tweet export into a DataFrame
df = pd.read_csv(filepath_or_buffer='../Data/twitter_states_2014_19.csv')

# Show the first five rows to confirm the columns parsed as expected
df.head(n=5)

Unnamed: 0,tweet,location,user_name,time_stamp,num_retweets,num_likes,tweet_word_count
0,And we could spend the rest of the money on ra...,Alabama,samd9273,2019-10-31 22:11:35,0,0,23
1,We sorry to hear the power is out. Please DM t...,Alabama,alabamapower,2019-10-31 21:53:10,0,0,23
2,@DrX1304 think of how many ducks we could buy ...,Alabama,samd9273,2019-10-31 21:44:15,0,1,13
3,It’s spooky season,Alabama,OMGItsBlackout,2019-10-31 18:17:20,0,0,3
4,Newsom Raked In Big Money From Public Utility ...,Alabama,deenie7940,2019-10-31 13:58:11,0,0,19


In [3]:
# Inspect the full text of the tweet at row label 4 (a tweet that carries a URL)
df.loc[4, 'tweet']

'Newsom Raked In Big Money From Public Utility He Is Now Chiding For Fires And Rolling Blackouts https://townhall.com/tipsheet/bethbaumann/2019/10/30/newsom-raked-in-big-money-from-public-utility-he-is-now-chiding-for-fires-and-rol-n2555643\xa0…'

In [4]:
# Row and column counts of the raw data, before any cleaning
df.shape

(19948, 7)

In [5]:
# Rows that agree on text, location, author and timestamp are treated as the
# same tweet; only the last occurrence of each is kept.
dedupe_keys = ['tweet', 'location', 'user_name', 'time_stamp']
df = (
    df
    .drop_duplicates(subset=dedupe_keys, keep='last')
    .reset_index(drop=True)  # renumber rows 0..n-1 after the drop
)
df.shape

(18990, 7)

___
## remove links

In [6]:
# Remove Twitter-hosted picture links (pic.twitter.com/<id>) from every tweet.
# The dots are escaped so they match only a literal '.', not "any character",
# which keeps the pattern from matching unrelated text.
pic_url_pattern = re.compile(r'pic\.twitter\.com\S+')
df['tweet'] = [pic_url_pattern.sub('', post).strip() for post in df['tweet']]

In [7]:
# Delete every run of non-whitespace characters that starts at "http"
# (this covers both http:// and https:// links), then trim the whitespace
# left at either end of the tweet.
http_url_pattern = re.compile(r'http\S+')
df['tweet'] = [http_url_pattern.sub('', text).strip() for text in df['tweet']]

In [8]:
# First five tweets as a plain list, to check that the links are gone
df['tweet'].iloc[:5].tolist()

['And we could spend the rest of the money on rabbits that we can tend to living off the fatta of the land',
 'We sorry to hear the power is out. Please DM the full service address, so we can get the issue reported. -Dorie j',
 '@DrX1304 think of how many ducks we could buy with this money\xa0…',
 'It’s spooky season',
 'Newsom Raked In Big Money From Public Utility He Is Now Chiding For Fires And Rolling Blackouts \xa0…']

___
## Tokenizing

In [9]:
# Instantiate a tokenizer that keeps only runs of ASCII letters, digits and '&';
# every other character (punctuation, '@', non-ASCII symbols, ...) acts as a separator.
tknr = RegexpTokenizer(r'[a-zA-Z&0-9]+')

# Lower-case each tweet, tokenize it, and re-join the tokens with single spaces
# so the column remains a plain string per tweet.
# NOTE: apostrophes are separators too, so contractions are split ("it's" -> "it s").
tokens = [" ".join(tknr.tokenize(text.lower())) for text in df['tweet']]

# Replace the original text with its tokenized form
df['tweet'] = tokens

df = df.reset_index(drop=True)


In [10]:
# Preview the frame now that 'tweet' holds the lower-cased, tokenized text
df.head()

Unnamed: 0,tweet,location,user_name,time_stamp,num_retweets,num_likes,tweet_word_count
0,and we could spend the rest of the money on ra...,Alabama,samd9273,2019-10-31 22:11:35,0,0,23
1,we sorry to hear the power is out please dm th...,Alabama,alabamapower,2019-10-31 21:53:10,0,0,23
2,drx1304 think of how many ducks we could buy w...,Alabama,samd9273,2019-10-31 21:44:15,0,1,13
3,it s spooky season,Alabama,OMGItsBlackout,2019-10-31 18:17:20,0,0,3
4,newsom raked in big money from public utility ...,Alabama,deenie7940,2019-10-31 13:58:11,0,0,19


In [11]:
# First five tokenized tweets as a plain list
df['tweet'].iloc[:5].tolist()

['and we could spend the rest of the money on rabbits that we can tend to living off the fatta of the land',
 'we sorry to hear the power is out please dm the full service address so we can get the issue reported dorie j',
 'drx1304 think of how many ducks we could buy with this money',
 'it s spooky season',
 'newsom raked in big money from public utility he is now chiding for fires and rolling blackouts']

___
## Stemming (Porter stemmer)

In [12]:
# Instantiate the stemmer (NLTK's PorterStemmer — a stemmer, not a lemmatizer)
port = PorterStemmer()

In [13]:
# Stem every whitespace-separated word of each tweet with the Porter stemmer
# and re-join the stems with single spaces (one stemmed string per tweet).
port_tweet = [
    ' '.join(port.stem(word) for word in text.split())
    for text in df['tweet']
]

# Keep the stemmed text in its own column, next to the tokenized tweet
df['port_tweet'] = port_tweet

In [14]:
# Preview the frame with the new 'port_tweet' column
df.head()

Unnamed: 0,tweet,location,user_name,time_stamp,num_retweets,num_likes,tweet_word_count,port_tweet
0,and we could spend the rest of the money on ra...,Alabama,samd9273,2019-10-31 22:11:35,0,0,23,and we could spend the rest of the money on ra...
1,we sorry to hear the power is out please dm th...,Alabama,alabamapower,2019-10-31 21:53:10,0,0,23,we sorri to hear the power is out pleas dm the...
2,drx1304 think of how many ducks we could buy w...,Alabama,samd9273,2019-10-31 21:44:15,0,1,13,drx1304 think of how mani duck we could buy wi...
3,it s spooky season,Alabama,OMGItsBlackout,2019-10-31 18:17:20,0,0,3,it s spooki season
4,newsom raked in big money from public utility ...,Alabama,deenie7940,2019-10-31 13:58:11,0,0,19,newsom rake in big money from public util he i...


In [15]:
# First five stemmed tweets as a plain list
df['port_tweet'].iloc[:5].tolist()

['and we could spend the rest of the money on rabbit that we can tend to live off the fatta of the land',
 'we sorri to hear the power is out pleas dm the full servic address so we can get the issu report dori j',
 'drx1304 think of how mani duck we could buy with thi money',
 'it s spooki season',
 'newsom rake in big money from public util he is now chide for fire and roll blackout']

___
## Check for nulls

In [16]:
# Number of missing values in each column
df.isna().sum()

tweet               0
location            0
user_name           0
time_stamp          0
num_retweets        0
num_likes           0
tweet_word_count    0
port_tweet          0
dtype: int64

In [17]:
# Discard any row that has a missing value in any column, then renumber the index
df = df.dropna(axis=0, how='any').reset_index(drop=True)
df.shape

(18990, 8)

___
## Formatting date and time

In [18]:
# Inspect the time_stamp column (displayed with object dtype, i.e. not yet parsed as datetimes)
df['time_stamp']

0        2019-10-31 22:11:35
1        2019-10-31 21:53:10
2        2019-10-31 21:44:15
3        2019-10-31 18:17:20
4        2019-10-31 13:58:11
                ...         
18985    2014-01-03 11:33:05
18986    2014-01-03 09:31:31
18987    2014-01-03 08:46:49
18988    2014-01-02 06:14:09
18989    2014-01-01 03:54:18
Name: time_stamp, Length: 18990, dtype: object

In [19]:
# Build an "<date> <hour>:00:00 | <location>" key for each tweet,
# e.g. "2019-10-31 22:00:00 | Alabama" (timestamp truncated to the top of the hour).
# The two columns are paired positionally with zip() instead of indexing
# df['location'] with a counter, so the result does not depend on the index
# being a clean 0..n-1 range and avoids a per-row Series lookup.
df['time_location'] = [
    f"{stamp_parts[0]} {stamp_parts[1].split(':')[0]}:00:00 | {place}"
    for stamp_parts, place in zip(
        (stamp.split() for stamp in df['time_stamp']),  # split each stamp once: [date, clock]
        df['location'],
    )
]

In [20]:
# Preview the frame with the new 'time_location' column
df.head()

Unnamed: 0,tweet,location,user_name,time_stamp,num_retweets,num_likes,tweet_word_count,port_tweet,time_location
0,and we could spend the rest of the money on ra...,Alabama,samd9273,2019-10-31 22:11:35,0,0,23,and we could spend the rest of the money on ra...,2019-10-31 22:00:00 | Alabama
1,we sorry to hear the power is out please dm th...,Alabama,alabamapower,2019-10-31 21:53:10,0,0,23,we sorri to hear the power is out pleas dm the...,2019-10-31 21:00:00 | Alabama
2,drx1304 think of how many ducks we could buy w...,Alabama,samd9273,2019-10-31 21:44:15,0,1,13,drx1304 think of how mani duck we could buy wi...,2019-10-31 21:00:00 | Alabama
3,it s spooky season,Alabama,OMGItsBlackout,2019-10-31 18:17:20,0,0,3,it s spooki season,2019-10-31 18:00:00 | Alabama
4,newsom raked in big money from public utility ...,Alabama,deenie7940,2019-10-31 13:58:11,0,0,19,newsom rake in big money from public util he i...,2019-10-31 13:00:00 | Alabama


In [21]:
# Save our cleaned Twitter data; index=False keeps the row index out of the CSV
df.to_csv('../Data/twitter_states_cleaned_2014_19.csv', index=False)