In [0]:
# Install Java, Spark, and Findspark
!apt-get install openjdk-8-jdk-headless -qq > /dev/null
!wget -q http://www-us.apache.org/dist/spark/spark-2.4.5/spark-2.4.5-bin-hadoop2.7.tgz
!tar xf spark-2.4.5-bin-hadoop2.7.tgz
!pip install -q findspark

# Set Environment Variables
import os
os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-8-openjdk-amd64"
os.environ["SPARK_HOME"] = "/content/spark-2.4.5-bin-hadoop2.7"

# Start a SparkSession
import findspark
findspark.init()

In [0]:
from pyspark.sql import SparkSession
spark = SparkSession.builder.appName("SentimentakAnalysis").getOrCreate()

In [0]:
from pyspark import SparkFiles
# Load in user_data.csv from S3 into a DataFrame
url = "https://corona-tweets-04-10.s3.us-east-2.amazonaws.com/2020-03-30+Coronavirus+Tweets.CSV"
spark.sparkContext.addFile(url)

df = spark.read.option('header', 'true').csv(SparkFiles.get("2020-03-30+Coronavirus+Tweets.CSV"), inferSchema=True, sep=',', timestampFormat="mm/dd/yy")
df.show(10)

+--------------------+-------------------+--------------------+--------------+--------------------+---------+------------------+----------------+--------------------+--------+----------+----------------+-------------+------------+---------------+--------------------+---------------+-------------+------------+--------------------+--------+----+
|           status_id|            user_id|          created_at|   screen_name|                text|   source|reply_to_status_id|reply_to_user_id|reply_to_screen_name|is_quote|is_retweet|favourites_count|retweet_count|country_code|place_full_name|          place_type|followers_count|friends_count|account_lang|  account_created_at|verified|lang|
+--------------------+-------------------+--------------------+--------------+--------------------+---------+------------------+----------------+--------------------+--------+----------+----------------+-------------+------------+---------------+--------------------+---------------+-------------+-----------

## Transform DataFrame to remove unneccessary chars in tweet

In [0]:
tweet_df = df.select(["text","favourites_count","friends_count"])
tweet_df.show()

+--------------------+----------------+-------------+
|                text|favourites_count|friends_count|
+--------------------+----------------+-------------+
|#Entérate Gobiern...|             362|          991|
|#PorSiNoLoViste |...|            1764|          621|
|Pengurusan kes-ke...|            null|         null|
|                null|            null|         null|
|                null|            null|         null|
|                null|            null|         null|
|                null|            null|         null|
|                null|            null|         null|
|                null|            null|         null|
|                null|            null|           in|
|Médico cirujano, ...|            null|         null|
|                null|            null|         TRUE|
|La @NlSalud imple...|            null|         null|
|                null|            null|         null|
|                null|            null|           es|
|Para convivir bie...|      

In [0]:
from pyspark.sql.types import IntegerType

In [0]:
tweet_text_df=tweet_df.select(["text","friends_count"])

In [0]:
tweet_text_df.withColumn("friends_count", tweet_text_df["friends_count"].cast(IntegerType()))

DataFrame[text: string, friends_count: int]

In [0]:
tweet_text_df.show()

+--------------------+-------------+
|                text|friends_count|
+--------------------+-------------+
|#Entérate Gobiern...|          991|
|#PorSiNoLoViste |...|          621|
|Pengurusan kes-ke...|         null|
|                null|         null|
|                null|         null|
|                null|         null|
|                null|         null|
|                null|         null|
|                null|         null|
|                null|           in|
|Médico cirujano, ...|         null|
|                null|         TRUE|
|La @NlSalud imple...|         null|
|                null|         null|
|                null|           es|
|Para convivir bie...|         null|
|                null|         null|
|                null|         null|
|                null|         null|
|                null|         null|
+--------------------+-------------+
only showing top 20 rows



Pre-process tweets

In [0]:
from pyspark.sql.functions import regexp_replace, length

In [0]:
tweet_df1=tweet_text_df.withColumn("text_new",regexp_replace("text",'((www\.[^\s]+)|(https?://[^\s]+))', 'URL'))

In [0]:
tweet_df1.show()

+--------------------+-------------+--------------------+
|                text|friends_count|            text_new|
+--------------------+-------------+--------------------+
|#Entérate Gobiern...|          991|#Entérate Gobiern...|
|#PorSiNoLoViste |...|          621|#PorSiNoLoViste |...|
|Pengurusan kes-ke...|         null|Pengurusan kes-ke...|
|                null|         null|                null|
|                null|         null|                null|
|                null|         null|                null|
|                null|         null|                null|
|                null|         null|                null|
|                null|         null|                null|
|                null|           in|                null|
|Médico cirujano, ...|         null|Médico cirujano, ...|
|                null|         TRUE|                null|
|La @NlSalud imple...|         null|La @NlSalud imple...|
|                null|         null|                null|
|             

In [0]:
tweet_df2=tweet_df1.withColumn("text_new_1",regexp_replace("text_new",'@[^\s]+', 'AT_USER'))

In [0]:
tweet_df3=tweet_df2.withColumn("text_new_2",regexp_replace("text_new_1",r'#([^\s]+)',r'\1'))

In [0]:
tweet_df3.show()

+--------------------+-------------+--------------------+--------------------+--------------------+
|                text|friends_count|            text_new|          text_new_1|          text_new_2|
+--------------------+-------------+--------------------+--------------------+--------------------+
|#Entérate Gobiern...|          991|#Entérate Gobiern...|#Entérate Gobiern...|1 Gobierno del Es...|
|#PorSiNoLoViste |...|          621|#PorSiNoLoViste |...|#PorSiNoLoViste |...|1 | Si eres un cl...|
|Pengurusan kes-ke...|         null|Pengurusan kes-ke...|Pengurusan kes-ke...|Pengurusan kes-ke...|
|                null|         null|                null|                null|                null|
|                null|         null|                null|                null|                null|
|                null|         null|                null|                null|                null|
|                null|         null|                null|                null|                null|


In [0]:
final_tweet=tweet_df3.select(["friends_count","text_new_2"])

In [0]:
final_tweet =final_tweet.withColumnRenamed("text_new_2", "text") \
              .withColumnRenamed("friends_count", "label")

In [0]:
final_tweet1=final_tweet.filter(final_tweet.text.isNotNull())

Remove non english characters

In [0]:
from pyspark.sql.functions import udf

def ascii_ignore(x):
    return x.encode('ascii', 'ignore').decode('ascii')

ascii_udf = udf(ascii_ignore)

final_tweet2=final_tweet1.withColumn("text_1", ascii_udf('text'))

In [0]:
final_tweet3=final_tweet2.select(["label","text_1"])

In [0]:
final_tweet3=final_tweet3.withColumn('tweet_length', length(final_tweet3['text_1'])).dropna()

In [0]:
final_tweet3.show()

+-----+--------------------+------------+
|label|              text_1|tweet_length|
+-----+--------------------+------------+
|  991|1 Gobierno del Es...|         182|
|  621|1 | Si eres un cl...|         186|
|  547| 330MCO1 1 1 1 1 URL|          19|
|  212|1 1 sobre el 1 1 ...|         103|
|   75|1 | 1 En tus mano...|          67|
| 4915|To treat COVID-19...|         112|
| 1159|To treat COVID-19...|         112|
|   en|         MichiganDOT|          11|
|   22|1 1 1 1 1 1 1 1 1...|          45|
|  237|Contacta amb AT_U...|         156|
| 2287|In this hospital,...|         173|
|  851|1 Desde las 22:00...|         173|
|   53|Los ecuatorianos ...|         151|
|  125|1 1 1 Sabas que u...|         171|
| 6672|Sew a Face Mask f...|          71|
|  356|Small businesses ...|         193|
| 1027|Women make up 70%...|         137|
| 1384|Mantener el hbito...|         109|
|    0|We can all agree ...|         278|
|  747|Para conter 1 1 i...|          82|
+-----+--------------------+------

In [0]:
final_tweet4=final_tweet3.select("text_1")

In [0]:
final_tweet4.show()

+--------------------+
|              text_1|
+--------------------+
|1 Gobierno del Es...|
|1 | Si eres un cl...|
| 330MCO1 1 1 1 1 URL|
|1 1 sobre el 1 1 ...|
|1 | 1 En tus mano...|
|To treat COVID-19...|
|To treat COVID-19...|
|         MichiganDOT|
|1 1 1 1 1 1 1 1 1...|
|Contacta amb AT_U...|
|In this hospital,...|
|1 Desde las 22:00...|
|Los ecuatorianos ...|
|1 1 1 Sabas que u...|
|Sew a Face Mask f...|
|Small businesses ...|
|Women make up 70%...|
|Mantener el hbito...|
|We can all agree ...|
|Para conter 1 1 i...|
+--------------------+
only showing top 20 rows



In [0]:
final_tweet4.write.csv('/Users/kuttu490/Downloads/tweets.csv')

In [0]:
from google.colab import drive

In [0]:
drive.mount('/gdrive')

Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3aietf%3awg%3aoauth%3a2.0%3aoob&response_type=code&scope=email%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdocs.test%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive.photos.readonly%20https%3a%2f%2fwww.googleapis.com%2fauth%2fpeopleapi.readonly

Enter your authorization code:
··········
Mounted at /gdrive


In [0]:
final_tweet4.write.csv('tweet1_new.csv')
!cp -r tweet1_new.csv "drive/My Drive/"

cp: cannot create directory 'drive/My Drive/': No such file or directory


In [0]:
final_tweet3.withColumn("label", final_tweet3["label"].cast(IntegerType()))

DataFrame[label: int, text_1: string, tweet_length: int]