# PSTAT 135 Final Project: Data Preprocessing

In [None]:
# global imports
import pyspark
from pyspark.sql.session import SparkSession
import pyspark.sql.functions as W

# start a Spark session, or reuse the one that already exists
spark = (
    SparkSession
    .builder
    .getOrCreate()
)

# Examine Dataset

In [None]:
# import data
# multiLine=True and escape='"' let the parser keep quoted fields intact when
# they contain line breaks or doubled quotes; with the line-based default such
# records are split into bogus partial rows (which then surface as "missing"
# values in later cells).
# NOTE(review): assumes Tweets.csv uses standard RFC 4180 quoting -- confirm
# against the raw file.
tweets = spark.read.csv(
    'Tweets.csv',
    header=True,
    inferSchema=True,
    multiLine=True,
    escape='"',
)
# output the first two rows; limit before toPandas so only two rows are
# collected to the driver instead of the whole dataset
tweets.limit(2).toPandas()

In [None]:
# report the shape of the dataset: rows come from a Spark count job,
# columns from the schema
row_total = tweets.count()
column_total = len(tweets.columns)
print("Number of rows: ", row_total)
print("Number of columns: ", column_total)

In [None]:
# output data types of each column as (column name, type name) pairs;
# the types are the ones Spark inferred at read time (inferSchema=True)
tweets.dtypes

# Duplicates

In [None]:
# how many rows remain once exact duplicate rows are collapsed
unique_rows = tweets.distinct()
unique_rows.count()

In [None]:
# remove rows that are exact copies of another row (all columns equal)
tweets = tweets.distinct()
# re-count to confirm the duplicates are gone
tweets.count()

In [None]:
# number of distinct rows when tweet_id is left out of the comparison
tweets.drop('tweet_id').distinct().count()

In [None]:
# output values that are duplicates for all columns except tweet_id
# The grouping columns are derived from the schema instead of being listed by
# hand, so this cell cannot drift out of sync with the DataFrame's columns.
non_id_columns = [c for c in tweets.columns if c != 'tweet_id']
(
    tweets
    .groupBy(*non_id_columns)
    .count()
    # keep only the combinations that occur more than once
    .where(W.col('count') > 1)
    .toPandas()
)

In [None]:
# keep a single row for each combination of the non-id columns
non_id_columns = [name for name in tweets.columns if name != 'tweet_id']
tweets = tweets.dropDuplicates(subset=non_id_columns)
# re-count to confirm the duplicates are gone
tweets.count()

# Missing Values

We remove every row where `text` is missing, since this column is the input to the sentiment analysis. Some of the variables used only in exploratory data analysis also have missing values; we keep those rows and leave the missing values as they are.

In [None]:
# per-column null counts: flag each null as 1, then sum the flags
null_counts = [
    W.sum(W.col(name).isNull().cast('int')).alias(name)
    for name in tweets.columns
]
tweets.select(*null_counts).toPandas()

In [None]:
# discard rows that have no tweet text
tweets = tweets.na.drop(subset=['text'])

# Select Desired Columns

In [None]:
# select columns
kept_columns = [
    'airline_sentiment', 'negativereason', 'airline',
    'retweet_count', 'text', 'tweet_created',
    'tweet_location', 'user_timezone',
]
tweets = tweets.select(*kept_columns)
# output the first two rows; limit before toPandas so only two rows are
# collected to the driver instead of the whole dataset
tweets.limit(2).toPandas()