In [1]:
import pandas as pd
from pyspark.sql import SparkSession
from pyspark.sql.types import *
from pyspark.ml.feature import Tokenizer, StopWordsRemover

In [2]:
# Build the Spark session for this notebook, reusing one if it already exists.
from pyspark import SparkFiles

spark = (
    SparkSession.builder
    .appName('pandasToSparkDF')
    .getOrCreate()
)

In [3]:
# Read the collected tweets from CSV into a pandas DataFrame.
source_csv = "Output/live_output.csv"
df = pd.read_csv(source_csv)

In [4]:
# Strip every '#' from the tweet text so hashtags become ordinary words.
# The vectorised .str accessor leaves missing tweets (NaN) as missing,
# whereas calling .replace on each value raised AttributeError on them.
# regex=False treats '#' as a literal character rather than a pattern.
df['Clean Tweets'] = df['Tweet'].str.replace('#', '', regex=False)
df.head()

Unnamed: 0,DateTime,Tweet,Handle,Followers,Location,Retweet Count,Sentiment,Clean Tweets
0,2019-07-06 17:23:51,$BCH and $BSV #hodler... smh \n🤣🤣🤣🤣\n\n#crypto...,KoalaCryptos,564,The Moon,0,1,$BCH and $BSV hodler... smh \n🤣🤣🤣🤣\n\ncrypto b...
1,2019-07-06 17:23:43,LAST TRADE: SELL 0.03562599BTC@10287.1EUR\nSEL...,digital_mine_,5750,STEEM BLOCKCHAIN,0,1,LAST TRADE: SELL 0.03562599BTC@10287.1EUR\nSEL...
2,2019-07-06 17:23:05,With this project that consists of a good team...,maruf07388605,734,Bangladesh,0,2,With this project that consists of a good team...
3,2019-07-06 17:23:02,Join the faucet hub for free #bitcoin #cryptoc...,BitcoinTap,270,Tokyo,0,2,Join the faucet hub for free bitcoin cryptocur...
4,2019-07-06 17:22:55,"Roses are red,\nViolets are blue,\nAnd Craig W...",Coin_Brawl,148,,0,1,"Roses are red,\nViolets are blue,\nAnd Craig W..."


In [5]:
# Preview the cleaned tweet text column.
df.loc[:, 'Clean Tweets']

0       $BCH and $BSV hodler... smh \n🤣🤣🤣🤣\n\ncrypto b...
1       LAST TRADE: SELL 0.03562599BTC@10287.1EUR\nSEL...
2       With this project that consists of a good team...
3       Join the faucet hub for free bitcoin cryptocur...
4       Roses are red,\nViolets are blue,\nAnd Craig W...
5       @Citi This comes up in Crypto twitter. Keep tr...
6       Looks like an inverted head &amp; shoulders pa...
7       Bitcoin Price Prediction And Analysis For June...
8       The irony of Bitcoin: a decentralized currency...
9       Update: I've invested all of the savings in bi...
10      Take the Love Compatibility Test! https://t.co...
11      Bitmain moves ahead with employee-options plan...
12      XRPL Labs’ Wietse Wind hints at ‘XRPL-bridged ...
13      My BitcoinCard story! \n@CoinbaseCard @tenxwal...
14      Order your secure and smart BTC/ETH/Altcoin ha...
15      Bitcoin market grew ‘independently’ in q2 2019...
16      crypto cryptocurrency bitcoin Loon's balloon b...
17      Bitcoi

In [6]:
# Convert the pandas DataFrame into a PySpark DataFrame using an explicit schema.
# NOTE: `df` is rebound here from the pandas frame to the Spark frame.

# (column name, Spark type) pairs in column order; every field is nullable.
schema_columns = [
    ("DateTime", StringType),
    ("Tweet", StringType),
    ("Handle", StringType),
    ("Followers", IntegerType),
    ("Location", StringType),
    ("Retweet Count", IntegerType),
    ("Sentiment", IntegerType),
    ("Clean Tweets", StringType),
]
mySchema = StructType(
    [StructField(col_name, col_type(), True) for col_name, col_type in schema_columns]
)

df = spark.createDataFrame(df, schema=mySchema)

In [7]:
# Configure a tokenizer that reads "Clean Tweets" and writes a "Words" column.
review_data = Tokenizer(
    inputCol="Clean Tweets",
    outputCol="Words",
)

In [8]:
# Apply the tokenizer, then preview the result
# (show's defaults spelled out: first 20 rows, long cells truncated).
reviewed = review_data.transform(df)
reviewed.show(n=20, truncate=True)

+-------------------+--------------------+---------------+---------+--------------------+-------------+---------+--------------------+--------------------+
|           DateTime|               Tweet|         Handle|Followers|            Location|Retweet Count|Sentiment|        Clean Tweets|               Words|
+-------------------+--------------------+---------------+---------+--------------------+-------------+---------+--------------------+--------------------+
|2019-07-06 17:23:51|$BCH and $BSV #ho...|   KoalaCryptos|      564|            The Moon|            0|        1|$BCH and $BSV hod...|[$bch, and, $bsv,...|
|2019-07-06 17:23:43|LAST TRADE: SELL ...|  digital_mine_|     5750|    STEEM BLOCKCHAIN|            0|        1|LAST TRADE: SELL ...|[last, trade:, se...|
|2019-07-06 17:23:05|With this project...|  maruf07388605|      734|          Bangladesh|            0|        2|With this project...|[with, this, proj...|
|2019-07-06 17:23:02|Join the faucet h...|     BitcoinTap|      

In [9]:
# Configure a stop-word remover that reads "Words" and writes a "Filtered" column.
remover = StopWordsRemover(
    inputCol="Words",
    outputCol="Filtered",
)

In [10]:
# Apply the stop-word remover to the tokenized frame, then preview the result
# (show's defaults spelled out: first 20 rows, long cells truncated).
newFrame = remover.transform(reviewed)
newFrame.show(n=20, truncate=True)

+-------------------+--------------------+---------------+---------+--------------------+-------------+---------+--------------------+--------------------+--------------------+
|           DateTime|               Tweet|         Handle|Followers|            Location|Retweet Count|Sentiment|        Clean Tweets|               Words|            Filtered|
+-------------------+--------------------+---------------+---------+--------------------+-------------+---------+--------------------+--------------------+--------------------+
|2019-07-06 17:23:51|$BCH and $BSV #ho...|   KoalaCryptos|      564|            The Moon|            0|        1|$BCH and $BSV hod...|[$bch, and, $bsv,...|[$bch, $bsv, hodl...|
|2019-07-06 17:23:43|LAST TRADE: SELL ...|  digital_mine_|     5750|    STEEM BLOCKCHAIN|            0|        1|LAST TRADE: SELL ...|[last, trade:, se...|[last, trade:, se...|
|2019-07-06 17:23:05|With this project...|  maruf07388605|      734|          Bangladesh|            0|        2|Wi

In [11]:
# Display only the stop-word-filtered tokens, without truncating cell text.
filtered_tokens = newFrame.select("Filtered")
filtered_tokens.show(truncate=False)

+----------------------------------------------------------------------------------------------------------------------------------------------------------+
|Filtered                                                                                                                                                  |
+----------------------------------------------------------------------------------------------------------------------------------------------------------+
|[$bch, $bsv, hodler..., smh, , 🤣🤣🤣🤣, , crypto, bitcoin, cash, fakesatoshi, https://t.co/dzjdqx0yka]                                                   |
|[last, trade:, sell, 0.03562599btc@10287.1eur, sell, wall:, 37.004btc@10850.0eur, buy, wall:, 71.238btc@9600.0eur, sum, shorts:…, https://t.co/efcy1zr0vy]|
|[project, consists, good, team, able, give, lot, money, protect, crypto, safe, worl…, https://t.co/b3e6pwh23l]                                            |
|[join, faucet, hub, free, bitcoin, cryptocurrency, sites!, ht

In [12]:
# Collect the Spark result back into a pandas DataFrame and save it as CSV.
clean_set = newFrame.toPandas()

# to_csv returns None when given a path, so its result is not kept.
# index=False is the documented way to leave the row index out of the file.
clean_set.to_csv('Output/Clean_set.csv', index=False, header=True)

In [13]:
# Shut down the Spark session that was created at the top of the notebook.
spark.stop()