In [1]:
import pandas as pd
import ast
import nltk
import re
from string import punctuation
import matplotlib.pyplot as plt
import numpy as np
import gensim
import json
from shapely.geometry import shape
from nltk.sentiment.vader import SentimentIntensityAnalyzer

# Data Cleaning
This notebook contains most of the cleaning operations applied to the tweet data pulled from the Twitter API.

Every sample we keep must have a place feature, so any sample where place is null is dropped.

In [2]:
# English stop words from NLTK, stored as a set; intended for cleaning the tweets
en_stop = set(nltk.corpus.stopwords.words('english'))

# Read the raw tweets from csv and keep only the samples that have a place feature.
# .copy() makes df an independent frame rather than a filtered result of tweets_full,
# so adding columns to df later does not trigger pandas' SettingWithCopyWarning.
tweets_full = pd.read_csv('trump-tweets_full3.csv')
df = tweets_full.dropna(subset=['place']).copy()

We need to extract each user's location data from the JSON-like string returned by the Twitter API. Each string is converted to a dictionary, and the place ID, name, full name, state, country, and bounding box are pulled out and put into their own lists. Each list is then added to the dataframe as a new feature.

In [None]:
# lists for new features to dataframe (one entry per row of df, in row order)
ids = []
city_state = []
country = []
coordinates = []
name = []
state = []

# Errors a malformed place payload can raise: ast.literal_eval raises ValueError or
# SyntaxError on bad input; the others cover missing keys and wrongly typed values.
place_errors = (ValueError, SyntaxError, KeyError, TypeError, AttributeError)

# Parse each place string into a dictionary and pull out every feature.
# All fields are extracted BEFORE anything is appended, so a failure part-way
# through a row can never leave the lists with different lengths.
for locations in df['place']:
    try:
        info = ast.literal_eval(locations)
        full_name = info['full_name']
        # "City, ST" -> "ST"; a name with no ", " separator falls back to the whole name
        pieces = full_name.split(', ')
        region = pieces[1] if len(pieces) > 1 else pieces[0]
        parsed = (
            info['id'],
            info['name'].lower(),
            region.lower(),
            full_name,
            info['country'],
            shape(info['bounding_box']),
        )
    except place_errors:
        # Report the offending payload and keep a placeholder row so the lists stay
        # aligned with df; such rows have country None and are removed by any later
        # filter on country.
        print(locations)
        parsed = (None,) * 6

    place_id, city_name, region_name, place_name, country_name, bounding_shape = parsed
    ids.append(place_id)
    name.append(city_name)
    state.append(region_name)
    city_state.append(place_name)
    country.append(country_name)
    coordinates.append(bounding_shape)

# Add new features to dataframe. assign() returns a new frame instead of writing into
# a filtered one, and 'place' is overwritten with the human-readable full name.
df = df.assign(
    id=ids,
    city=name,
    state=state,
    place=city_state,
    coordinates=coordinates,
    country=country,
)

Some of the states entered by users are not valid states, so those samples are dropped. We keep only the tweets located in the United States.

In [5]:
# State values that are not real states and should be discarded
invalid_states = ['united states', 'usa', 'mexican manhattan']

# Keep US tweets only, drop the helper columns, remove rows whose state/city
# is not a real location, and renumber the rows from zero.
tweets = (
    df.loc[df['country'] == 'United States']
    .drop(columns=['location', 'Unnamed: 0', 'country', 'coordinates'])
    .loc[lambda frame: ~frame['state'].isin(invalid_states)
         & (frame['city'] != 'united states')]
    .reset_index(drop=True)
)
tweets.head()

Unnamed: 0,user,tweet,date,source,place,id,city,state
0,"{'id': 4840937793, 'id_str': '4840937793', 'na...",@realDonaldTrump Praise the Lord and God Bless...,Thu May 02 23:59:33 +0000 2019,"<a href=""http://twitter.com/download/android"" ...","Dawson Springs, KY",30122139c70b1ced,dawson springs,ky
1,"{'id': 772599851192164352, 'id_str': '77259985...",@realDonaldTrump That’s right !!! Now it’s tim...,Thu May 02 23:58:01 +0000 2019,"<a href=""http://twitter.com/download/iphone"" r...","Crescent City North, CA",c0f3c245d5046c11,crescent city north,ca
2,"{'id': 717149686176550912, 'id_str': '71714968...",@realDonaldTrump Ok...can you use just a littl...,Thu May 02 23:56:21 +0000 2019,"<a href=""http://twitter.com/download/android"" ...","Hazlet, NJ",00504a961360c1a2,hazlet,nj
3,"{'id': 1036966685947953152, 'id_str': '1036966...",@realDonaldTrump Why ia your administration no...,Thu May 02 23:55:52 +0000 2019,"<a href=""http://twitter.com/download/android"" ...","Odessa, TX",2c0346ba4b733e24,odessa,tx
4,"{'id': 990070195, 'id_str': '990070195', 'name...",@realDonaldTrump Democrats are SERIOUS about t...,Thu May 02 23:55:18 +0000 2019,"<a href=""http://twitter.com/#!/download/ipad"" ...","Franklin, TN",cc631a80adacd459,franklin,tn


# Grabbing zipcodes, income, population data from city and states of each user
Now that the city and states are available for each tweet they can be used to get the zipcodes, income, and population data of each location. For each sample the city and state are used to query a database of zipcodes, median income, and population data. This data is added to the dataframe as well.

Note: This data was not used for the final dataset. The initial idea was to use this information to look for differences in political standing amongst users based on income and population in subpopulations. The approach that was tried did not work as well as I had hoped it would. This is why, if you look at the exploration notebook data, you will see no zipcode, income, or population data.

In [8]:
# NOTE(review): this import lives here rather than in the import cell at the top of the notebook
import uszipcode

# object to query the database
# NOTE(review): simple_zipcode=True presumably selects uszipcode's smaller "simple" database;
# confirm the keyword is still accepted by the installed uszipcode version.
search = uszipcode.SearchEngine(simple_zipcode=True)

In [None]:
# lists for new features (one entry per row of tweets, in row order)
zipcode = []
population = []
median_household_income = []

# positions of the tweets whose city/state returned no match from the database
err_indx = []

# Many tweets share the same city/state, so each distinct pair is queried only once.
# The cached value is the first match, or None when the database returns nothing.
lookup_cache = {}

# for each tweet, get zipcode, population, and median income data
for i, (city_name, state_name) in enumerate(zip(tweets['city'], tweets['state'])):
    key = (city_name, state_name)
    if key not in lookup_cache:
        try:
            lookup_cache[key] = search.by_city_and_state(city_name, state_name, returns=1)[0]
        except IndexError:
            # empty result list: no zipcode record for this city/state
            lookup_cache[key] = None

    results = lookup_cache[key]
    if results is None:
        # Use a real missing value (not the string 'NaN') so isna()/dropna() recognise it
        err_indx.append(i)
        zipcode.append(np.nan)
        population.append(np.nan)
        median_household_income.append(np.nan)
    else:
        zipcode.append(results.zipcode)
        population.append(results.population)
        median_household_income.append(results.median_household_income)

    # coarse progress report
    if i % 100 == 0:
        print("AT USER ", i)

# Add new features to dataframe
tweets['population_size'] = population
tweets['zipcode'] = zipcode
tweets['median_income'] = median_household_income


## More cleaning
The tweets were stripped of Twitter handles (other users' usernames), numbers, and social media keywords (hashtags, slang, etc.). Each tweet's source was identified from its source link and then labeled by platform (android, windows, etc.). This was added as another feature to the dataframe. The sentiment of each tweet was also computed and added as a feature in the same way.

In [12]:
# Patterns used to scrub a tweet. Raw strings keep the regex escapes (\w, \S) intact
# instead of relying on Python passing unknown string escapes through.
#   1. twitter handles (underscores included, so "@some_user" is removed whole)
#   2. any character that is not an ASCII letter, digit, space or tab
#   3. links of the form scheme://...
noise_pattern = re.compile(r"(@[A-Za-z0-9_]+)|([^0-9A-Za-z \t])|(\w+://\S+)")
digit_pattern = re.compile(r"[0-9]")


def _clean_tweet(text):
    """Return ``text`` reduced to ASCII letters separated by single spaces.

    Handles, links and every non-alphanumeric character are replaced by a space,
    digits are then deleted, and finally runs of whitespace are collapsed (and
    leading/trailing whitespace dropped). Collapsing happens last so that
    removing a digit-only token does not leave a double space behind.

    Parameters
    ----------
    text : str
        Raw tweet text.

    Returns
    -------
    str
        The cleaned tweet; may be the empty string.
    """
    text = noise_pattern.sub(" ", text)
    text = digit_pattern.sub("", text)
    return " ".join(text.split())


# remove twitter handles, numbers, etc
cleaned_twts = [_clean_tweet(tweet) for tweet in tweets['tweet']]

tweets['tweet'] = cleaned_twts

# Platform labels written into the 'source' column
android, ipad, iphone = "android", "ipad", "iphone"
twitter, mac, windows = "web", "macos", "windows"

# Ordered rules: the first rule with any marker found in the raw source string wins,
# so the order below matters (e.g. the Android check runs before the iPad one).
source_rules = (
    (android, ("Twitter for Android", "Android")),
    (ipad, ("Twitter for iPad",)),
    (iphone, ("Twitter for iPhone", "iOS")),
    (mac, ("Mac",)),
    (windows, ("Windows",)),
    (twitter, ("Twitter Web App", "Twitter Web Client")),
)

# label each source and add as new feature to dataframe
source = []
for raw_source in tweets['source']:
    matched = next(
        (label for label, markers in source_rules
         if any(marker in raw_source for marker in markers)),
        'Other',  # nothing matched
    )
    source.append(matched)

tweets['source'] = source

In [13]:
# Preview the first rows to check the cleaned tweet text and the relabelled sources
tweets.head()

Unnamed: 0,user,tweet,date,source,place,id,city,state
0,"{'id': 4840937793, 'id_str': '4840937793', 'na...",Praise the Lord and God Bless our President,Thu May 02 23:59:33 +0000 2019,android,"Dawson Springs, KY",30122139c70b1ced,dawson springs,ky
1,"{'id': 772599851192164352, 'id_str': '77259985...",That s right Now it s time for the rule of law...,Thu May 02 23:58:01 +0000 2019,iphone,"Crescent City North, CA",c0f3c245d5046c11,crescent city north,ca
2,"{'id': 717149686176550912, 'id_str': '71714968...",Ok can you use just a little bit more fake pie...,Thu May 02 23:56:21 +0000 2019,android,"Hazlet, NJ",00504a961360c1a2,hazlet,nj
3,"{'id': 1036966685947953152, 'id_str': '1036966...",Why ia your administration not dealing with th...,Thu May 02 23:55:52 +0000 2019,android,"Odessa, TX",2c0346ba4b733e24,odessa,tx
4,"{'id': 990070195, 'id_str': '990070195', 'name...",Democrats are SERIOUS about this BarrHearing a...,Thu May 02 23:55:18 +0000 2019,ipad,"Franklin, TN",cc631a80adacd459,franklin,tn


In [None]:
# Score every tweet with VADER and keep only the 'compound' value of each result;
# the scores are collected in row order so they can be attached to the dataframe.
sid = SentimentIntensityAnalyzer()
sentiment_scores = [
    sid.polarity_scores(text)['compound']
    for text in tweets['tweet']
]

In [15]:
# Attach the per-tweet sentiment scores as a new column
# (sentiment_scores must hold exactly one entry per row of tweets, in row order)
tweets['sentiment'] = sentiment_scores

In [16]:
# Preview the first rows with the new sentiment column
tweets.head()

Unnamed: 0,user,tweet,date,source,place,id,city,state,sentiment
0,"{'id': 4840937793, 'id_str': '4840937793', 'na...",Praise the Lord and God Bless our President,Thu May 02 23:59:33 +0000 2019,android,"Dawson Springs, KY",30122139c70b1ced,dawson springs,ky,0.8176
1,"{'id': 772599851192164352, 'id_str': '77259985...",That s right Now it s time for the rule of law...,Thu May 02 23:58:01 +0000 2019,iphone,"Crescent City North, CA",c0f3c245d5046c11,crescent city north,ca,0.0
2,"{'id': 717149686176550912, 'id_str': '71714968...",Ok can you use just a little bit more fake pie...,Thu May 02 23:56:21 +0000 2019,android,"Hazlet, NJ",00504a961360c1a2,hazlet,nj,-0.6198
3,"{'id': 1036966685947953152, 'id_str': '1036966...",Why ia your administration not dealing with th...,Thu May 02 23:55:52 +0000 2019,android,"Odessa, TX",2c0346ba4b733e24,odessa,tx,-0.6908
4,"{'id': 990070195, 'id_str': '990070195', 'name...",Democrats are SERIOUS about this BarrHearing a...,Thu May 02 23:55:18 +0000 2019,ipad,"Franklin, TN",cc631a80adacd459,franklin,tn,-0.4881


Use the sentiment scores to get a sentiment label (positive, neutral, negative) and add this as a new feature as well.

In [17]:
# Create the label column with an empty-string placeholder for every row
tweets['sentiment_type'] = ''

In [18]:
# Preview the first rows; sentiment_type still holds the empty-string placeholder here
tweets.head()

Unnamed: 0,user,tweet,date,source,place,id,city,state,sentiment,sentiment_type
0,"{'id': 4840937793, 'id_str': '4840937793', 'na...",Praise the Lord and God Bless our President,Thu May 02 23:59:33 +0000 2019,android,"Dawson Springs, KY",30122139c70b1ced,dawson springs,ky,0.8176,
1,"{'id': 772599851192164352, 'id_str': '77259985...",That s right Now it s time for the rule of law...,Thu May 02 23:58:01 +0000 2019,iphone,"Crescent City North, CA",c0f3c245d5046c11,crescent city north,ca,0.0,
2,"{'id': 717149686176550912, 'id_str': '71714968...",Ok can you use just a little bit more fake pie...,Thu May 02 23:56:21 +0000 2019,android,"Hazlet, NJ",00504a961360c1a2,hazlet,nj,-0.6198,
3,"{'id': 1036966685947953152, 'id_str': '1036966...",Why ia your administration not dealing with th...,Thu May 02 23:55:52 +0000 2019,android,"Odessa, TX",2c0346ba4b733e24,odessa,tx,-0.6908,
4,"{'id': 990070195, 'id_str': '990070195', 'name...",Democrats are SERIOUS about this BarrHearing a...,Thu May 02 23:55:18 +0000 2019,ipad,"Franklin, TN",cc631a80adacd459,franklin,tn,-0.4881,


In [20]:
# Translate the sign of the sentiment score into a label. Rows that match none of the
# masks (e.g. a missing score) keep whatever sentiment_type they already had.
for label, mask in (
    ('POSITIVE', tweets['sentiment'] > 0),
    ('NEUTRAL', tweets['sentiment'] == 0),
    ('NEGATIVE', tweets['sentiment'] < 0),
):
    tweets.loc[mask, 'sentiment_type'] = label

In [21]:
# Write the cleaned, sentiment-labelled tweets to disk.
# NOTE(review): to_csv writes the row index by default, so re-reading this file yields an
# extra 'Unnamed: 0' column (like the one dropped earlier in this notebook). Pass
# index=False only after confirming that no reader of this file expects that column.
tweets.to_csv('tweets_at_trump_41.csv')