# PSTAT 135 Final Project: Data Preprocessing

In [1]:
# global imports
import pyspark
from pyspark.sql.session import SparkSession
import pyspark.sql.functions as W

spark = SparkSession.builder.getOrCreate()

# Examine Dataset

In [2]:
# import data
tweets = spark.read.csv('Tweets.csv', header=True, inferSchema=True)
# output dataframe
tweets.toPandas().head(2)

Unnamed: 0,tweet_id,airline_sentiment,airline_sentiment_confidence,negativereason,negativereason_confidence,airline,airline_sentiment_gold,name,negativereason_gold,retweet_count,text,tweet_coord,tweet_created,tweet_location,user_timezone
0,570306133677760513,neutral,1.0,,,Virgin America,,cairdin,,0.0,@VirginAmerica What @dhepburn said.,,2015-02-24 11:35:52 -0800,,Eastern Time (US & Canada)
1,570301130888122368,positive,0.3486,,0.0,Virgin America,,jnardino,,0.0,@VirginAmerica plus you've added commercials t...,,2015-02-24 11:15:59 -0800,,Pacific Time (US & Canada)


In [3]:
# output dimensions of the dataset
print("Number of rows: ",tweets.count())
print("Number of columns: ",len(tweets.columns))

Number of rows:  14837
Number of columns:  15


In [4]:
# output data types of each column
tweets.dtypes

[('tweet_id', 'string'),
 ('airline_sentiment', 'string'),
 ('airline_sentiment_confidence', 'string'),
 ('negativereason', 'string'),
 ('negativereason_confidence', 'string'),
 ('airline', 'string'),
 ('airline_sentiment_gold', 'string'),
 ('name', 'string'),
 ('negativereason_gold', 'string'),
 ('retweet_count', 'int'),
 ('text', 'string'),
 ('tweet_coord', 'string'),
 ('tweet_created', 'string'),
 ('tweet_location', 'string'),
 ('user_timezone', 'string')]

# Duplicates

In [5]:
# number of distinct observations
tweets.distinct().count()

14785

In [6]:
# drop duplicates
tweets = tweets.dropDuplicates()
# confirm they were dropped
tweets.count()

14785

In [7]:
# count of distinct rows excluding id
tweets.select([c for c in tweets.columns if c!='tweet_id']).distinct().count()

14751

In [8]:
# output values that are duplicates for all columns except tweet_id
tweets.groupBy('airline_sentiment','airline_sentiment_confidence','negativereason',
               'negativereason_confidence','airline','airline_sentiment_gold',
               'name','negativereason_gold','retweet_count','text','tweet_coord',
               'tweet_created','tweet_location','user_timezone')\
    .count()\
    .where(W.col('count')>1)\
    .toPandas()

Unnamed: 0,airline_sentiment,airline_sentiment_confidence,negativereason,negativereason_confidence,airline,airline_sentiment_gold,name,negativereason_gold,retweet_count,text,tweet_coord,tweet_created,tweet_location,user_timezone,count
0,,,,,,,,,,,,,,,34
1,negative,1.0,Customer Service Issue,1.0,US Airways,,istackfranklins,,0.0,@USAirways,,,,,2


In [9]:
# drop duplicates for rows excluding id
tweets = tweets.dropDuplicates(subset=[c for c in tweets.columns if c!='tweet_id'])
# confirm duplicates were dropped
tweets.count()

14751

# Missing Values

We will remove all of the rows with missing values for `text` since this column will be used to do sentiment analysis. There are missing values for some of the variables that will be used in exploratory data analysis, and we will ignore these missing values. 

In [10]:
# number of missing values for each column
tweets.select(*(W.sum(W.col(c).isNull().cast('int')).alias(c) for c in tweets.columns)).toPandas()

Unnamed: 0,tweet_id,airline_sentiment,airline_sentiment_confidence,negativereason,negativereason_confidence,airline,airline_sentiment_gold,name,negativereason_gold,retweet_count,text,tweet_coord,tweet_created,tweet_location,user_timezone
0,0,107,21,5508,4164,131,14702,147,14719,156,156,13686,338,4949,5046


In [11]:
# remove rows where text is missing
tweets = tweets.dropna(subset='text')

# Select Desired Columns

In [12]:
# select columns
tweets = tweets.select('airline_sentiment','negativereason','airline',
                       'retweet_count','text','tweet_created',
                       'tweet_location','user_timezone')
# output dataframe
tweets.toPandas().head(2)

Unnamed: 0,airline_sentiment,negativereason,airline,retweet_count,text,tweet_created,tweet_location,user_timezone
0,negative,Flight Booking Problems,US Airways,0,"@USAirways but wait! They are booked, along wi...",2015-02-20 20:54:01 -0800,"Asheville, NC",Eastern Time (US & Canada)
1,negative,Bad Flight,US Airways,0,@USAirways yes. Every one of these on every f...,2015-02-19 13:03:35 -0800,Arizona,Arizona
