# Libraries

In [1]:
from pyspark.sql import SparkSession
import sparknlp

spark = SparkSession.builder \
    .appName("CassandraIntegration") \
    .config("spark.cassandra.connection.host", "127.0.0.1") \
    .config("spark.cassandra.connection.port", "9042") \
    .config("spark.serializer", "org.apache.spark.serializer.KryoSerializer") \
    .getOrCreate()

In [2]:
import time
start_time = time.time()
df = spark.read \
    .format("org.apache.spark.sql.cassandra") \
    .options(table="tweets", keyspace="tweets") \
    .load()

end_time = time.time()
print(f"Time taken to import dataset: {end_time - start_time} seconds.")


Time taken to import dataset: 11.111948728561401 seconds.


In [3]:
# df = df.sample(False, 0.1)
df.show()

[Stage 0:>                                                          (0 + 1) / 1]

+--------+-------------------+--------+----------+--------------------+---------------+
|sequence|              dates|    flag|       ids|                text|           user|
+--------+-------------------+--------+----------+--------------------+---------------+
|  432373|2009-06-07 08:02:44|NO_QUERY|2064736572|"uh...2morrow vac...|       Nessa128|
|  780948|2009-06-25 01:20:18|NO_QUERY|2323301700|"@toeekneee why??...|    DaniFeldman|
|   83512|2009-05-10 02:16:40|NO_QUERY|1753364328|"twisted my ankle...|     saysjessie|
|   62129|2009-05-03 08:24:01|NO_QUERY|1686978943|"Heading to Magic...| foldinglaundry|
|  760443|2009-06-23 10:35:54|NO_QUERY|2296772072|"@willpug i would...|        Mizannn|
|   73918|2009-05-04 04:36:59|NO_QUERY|1694661852|"@Void_Shanghai i...|        duzkiez|
|  226995|2009-05-30 23:07:22|NO_QUERY|1977969653|"I feel sad. I wa...|          BAMFx|
|  817205|2009-04-18 10:42:00|NO_QUERY|1551682133|"@mya152 Marks he...|       giuli272|
|  104285|2009-05-16 20:34:00|NO

                                                                                

In [4]:
df.printSchema()

root
 |-- sequence: integer (nullable = false)
 |-- dates: timestamp (nullable = true)
 |-- flag: string (nullable = true)
 |-- ids: long (nullable = true)
 |-- text: string (nullable = true)
 |-- user: string (nullable = true)



In [5]:
from pyspark.sql.functions import col
start_time = time.time()

df = df.orderBy(col("sequence"))
df.show()

end_time = time.time()
print(f"Time taken to perform: {end_time - start_time} seconds.")

                                                                                

+--------+-------------------+--------+----------+--------------------+---------------+
|sequence|              dates|    flag|       ids|                text|           user|
+--------+-------------------+--------+----------+--------------------+---------------+
|       0|2009-04-06 23:19:45|NO_QUERY|1467810369|"@switchfoot http...|_TheSpecialOne_|
|       1|2009-04-06 23:19:49|NO_QUERY|1467810672|"is upset that he...|  scotthamilton|
|       2|2009-04-06 23:19:53|NO_QUERY|1467810917|"@Kenichan I dive...|       mattycus|
|       3|2009-04-06 23:19:57|NO_QUERY|1467811184|"my whole body fe...|        ElleCTF|
|       4|2009-04-06 23:19:57|NO_QUERY|1467811193|"@nationwideclass...|         Karoli|
|       5|2009-04-06 23:20:00|NO_QUERY|1467811372|"@Kwesidei not th...|       joy_wolf|
|       6|2009-04-06 23:20:03|NO_QUERY|1467811592|       "Need a hug "|        mybirch|
|       7|2009-04-06 23:20:03|NO_QUERY|1467811594|"@LOLTrish hey  l...|           coZZ|
|       8|2009-04-06 23:20:05|NO

In [6]:
start_time = time.time()

df = df.drop("_id", "flag", "ids", "user", "date", "sequence")
df.show()

end_time = time.time()
print(f"Time taken to perform: {end_time - start_time} seconds.")



+-------------------+--------------------+
|              dates|                text|
+-------------------+--------------------+
|2009-04-06 23:19:45|"@switchfoot http...|
|2009-04-06 23:19:49|"is upset that he...|
|2009-04-06 23:19:53|"@Kenichan I dive...|
|2009-04-06 23:19:57|"my whole body fe...|
|2009-04-06 23:19:57|"@nationwideclass...|
|2009-04-06 23:20:00|"@Kwesidei not th...|
|2009-04-06 23:20:03|       "Need a hug "|
|2009-04-06 23:20:03|"@LOLTrish hey  l...|
|2009-04-06 23:20:05|"@Tatiana_K nope ...|
|2009-04-06 23:20:09|"@twittera que me...|
|2009-04-06 23:20:16|"spring break in ...|
|2009-04-06 23:20:17|"I just re-pierce...|
|2009-04-06 23:20:19|"@caregiving I co...|
|2009-04-06 23:20:19|"@octolinz16 It i...|
|2009-04-06 23:20:20|"@smarrison i wou...|
|2009-04-06 23:20:20|"@iamjazzyfizzle ...|
|2009-04-06 23:20:22|"Hollis' death sc...|
|2009-04-06 23:20:25|"about to file ta...|
|2009-04-06 23:20:31|"@LettyA ahh ive ...|
|2009-04-06 23:20:34|"@FakerPattyPattz...|
+----------

                                                                                

In [7]:
from pyspark.sql import functions as F
start_time = time.time()
# Counting missing values for each column
missing_counts = df.select([F.count(F.when(F.isnull(c), c)).alias(c) for c in df.columns])

missing_counts.show()

end_time = time.time()
print(f"Time taken to perform: {end_time - start_time} seconds.")



+-----+----+
|dates|text|
+-----+----+
|    0|   0|
+-----+----+

Time taken to perform: 38.47193002700806 seconds.


                                                                                

In [8]:
start_time = time.time()

# Count entries with URLs
url_count = df.filter(F.col("text").rlike("http(s)?://([\\w-]+\\.)+[\\w-]+(/[\\w- ./?%&=]*)?")).count()

# Count entries with HTML tags
html_tags_count = df.filter(F.col("text").rlike("<[^>]+>")).count()

# Count entries with mentions (@username)
mentions_count = df.filter(F.col("text").rlike("@\\w+")).count()

print(f"Number of entries with URLs: {url_count}")
print(f"Number of entries with HTML tags: {html_tags_count}")
print(f"Number of entries with mentions: {mentions_count}")

end_time = time.time()
print(f"Time taken to perform: {end_time - start_time} seconds.")



Number of entries with URLs: 70068
Number of entries with HTML tags: 0
Number of entries with mentions: 738491
Time taken to perform: 110.67055130004883 seconds.


                                                                                

In [9]:
from pyspark.sql import functions as F
from pyspark.sql.functions import regexp_replace
from pyspark.sql.functions import lower
start_time = time.time()

# Remove URLs
df = df.withColumn("text", F.regexp_replace(F.col("text"), "http(s)?://[^\\s]+", ""))

# Remove HTML tags
df = df.withColumn("text", F.regexp_replace(F.col("text"), "<[^>]+>", ""))

# Remove mentions (i.e., @username)
df = df.withColumn("text", F.regexp_replace(F.col("text"), "@\\w+", ""))

# Convert to lowercase
df = df.withColumn('text', lower(df['text']))

# Remove numbers from the "text" column
df = df.withColumn('text', regexp_replace(df['text'], r'\d+', ''))

# Reduce excessive characters (more than two of the same in a row)
df = df.withColumn('text', regexp_replace('text', r'(.)\1{2,}', r'\1\1'))

df.select("text").show(truncate=False)
end_time = time.time()
print(f"Time taken to perform: {end_time - start_time} seconds.")



+----------------------------------------------------------------------------------------------------------------+
|text                                                                                                            |
+----------------------------------------------------------------------------------------------------------------+
|"  - a11, that's a bummer.  you shoulda got david carr of third day to do it. ;d"                               |
|"is upset that he can't update his facebook by texting it11 and might cry as a result  school today also. blah!"|
|" i dived many times for the ball. managed to save %  the rest go out of bounds"                                |
|"my whole body feels itchy and like its on fire "                                                               |
|" no, it's not behaving at all. i'm mad. why am i here? because i can't see you all over there. "               |
|" not the whole crew "                                                         

                                                                                

- The remaining characters will be erased by removing punctuations in the following steps.

In [10]:
import contractions
import json

# Extract contractions and save to a JSON file
with open("contractions.json", "w") as f:
    json.dump(contractions.contractions_dict, f)


In [11]:
from pyspark.sql.functions import col, regexp_replace
import re
start_time = time.time()
# Load contractions from the JSON file
with open("contractions.json", "r") as f:
    contractions_dict = json.load(f)

for contraction, expansion in contractions_dict.items():
    # Ensure regex special characters are escaped
    contraction_regex = re.escape(contraction)
    
    # Use word boundaries and case-insensitive match
    pattern = r"(?i)\b{}\b".format(contraction_regex)
    
    df = df.withColumn("text", regexp_replace(col("text"), pattern, expansion))
    
df.select("text").show(truncate=False)
end_time = time.time()
print(f"Time taken to perform: {end_time - start_time} seconds.")

                                                                                

+-----------------------------------------------------------------------------------------------------------------+
|text                                                                                                             |
+-----------------------------------------------------------------------------------------------------------------+
|"  - a11, that is a bummer.  you shoulda got david carr of third day to do it. ;d"                               |
|"is upset that he cannot update his facebook by texting it11 and might cry as a result  school today also. blah!"|
|" i dived many times for the ball. managed to save %  the rest go out of bounds"                                 |
|"my whole body feels itchy and like its on fire "                                                                |
|" no, it is not behaving at all. I am mad. why am i here? because i cannot see you all over there. "             |
|" not the whole crew "                                                 

In [12]:
# Remove punctuation
df = df.withColumn('text', regexp_replace(df['text'], r"[^\w\s]", ""))
df.select("text").show(truncate=False)

                                                                                

+-------------------------------------------------------------------------------------------------------------+
|text                                                                                                         |
+-------------------------------------------------------------------------------------------------------------+
|   a11 that is a bummer  you shoulda got david carr of third day to do it d                                  |
|is upset that he cannot update his facebook by texting it11 and might cry as a result  school today also blah|
| i dived many times for the ball managed to save   the rest go out of bounds                                 |
|my whole body feels itchy and like its on fire                                                               |
| no it is not behaving at all I am mad why am i here because i cannot see you all over there                 |
| not the whole crew                                                                                    

In [13]:
from sparknlp.annotator import *
from sparknlp.base import *

document_assembler = DocumentAssembler() \
    .setInputCol("text") \
    .setOutputCol("document")

sentence_detector = SentenceDetector() \
    .setInputCols(["document"]) \
    .setOutputCol("sentence")

tokenizer = Tokenizer() \
    .setInputCols(["sentence"]) \
    .setOutputCol("token")

# 5. Context-aware Spell Checker
spellChecker = NorvigSweetingModel.pretrained() \
    .setInputCols(["token"]) \
    .setOutputCol("corrected")

# Use GloVe embeddings (or any other word embeddings that you prefer)
embeddings = WordEmbeddingsModel.pretrained('glove_100d') \
    .setInputCols(["sentence", "corrected"]) \
    .setOutputCol("embeddings")

# Convert word embeddings to sentence embeddings
sentence_embeddings = SentenceEmbeddings() \
    .setInputCols(["sentence", "embeddings"]) \
    .setOutputCol("sentence_embeddings") \
    .setPoolingStrategy("AVERAGE")

# Use the SentimentDLModel compatible with GloVe embeddings
sentiment_detector = SentimentDLModel.pretrained('sentimentdl_glove_imdb', 'en') \
    .setInputCols(["sentence_embeddings"]) \
    .setOutputCol("sentiment") \
    .setThreshold(0.7) \

# Finisher to convert annotations to DataFrame columns
finisher = Finisher() \
    .setInputCols(["sentiment"]) \
    .setIncludeMetadata(True)  # Set to True to include metadata (which should include scores)

pipeline = Pipeline(stages=[
    document_assembler,
    sentence_detector,
    tokenizer,
    spellChecker,
    embeddings,
    sentence_embeddings,
    sentiment_detector,
    finisher
])

spellcheck_norvig download started this may take some time.
Approximate size to download 4.2 MB
[ / ]spellcheck_norvig download started this may take some time.
Approximate size to download 4.2 MB
[ | ]Download done! Loading the resource.
[ / ]



[ — ]



[ \ ]

                                                                                

[OK!]
glove_100d download started this may take some time.
Approximate size to download 145.3 MB
[ / ]glove_100d download started this may take some time.
Approximate size to download 145.3 MB
[ / ]Download done! Loading the resource.
[OK!]
sentimentdl_glove_imdb download started this may take some time.
Approximate size to download 8.7 MB
[ — ]sentimentdl_glove_imdb download started this may take some time.
Approximate size to download 8.7 MB
[ | ]Download done! Loading the resource.
[ / ]

                                                                                

[ \ ]

2023-11-03 23:04:17.513512: I external/org_tensorflow/tensorflow/core/platform/cpu_feature_guard.cc:151] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.


[OK!]


from sparknlp.annotator import *
from sparknlp.base import *
document_assembler = DocumentAssembler() \
    .setInputCol("text") \
    .setOutputCol("document")

sentence_detector = SentenceDetector() \
    .setInputCols(["document"]) \
    .setOutputCol("sentence")

tokenizer = Tokenizer() \
    .setInputCols(["document"]) \
    .setOutputCol("token")

# Load TensorFlow Hub Universal Sentence Encoder embeddings
use_embeddings = UniversalSentenceEncoder.pretrained(name="tfhub_use", lang="en") \
    .setInputCols(["document"]) \
    .setOutputCol("sentence_embeddings")

# Assuming 'sentimentdl_use_twitter' expects USE embeddings, which should be compatible with the lite version
sentiment_detector = SentimentDLModel.pretrained('sentimentdl_use_twitter', 'en') \
    .setInputCols(["sentence_embeddings"]) \
    .setOutputCol("sentiment") \
    .setThreshold(0.7)

finisher = Finisher() \
    .setInputCols(["sentiment"]) \
    .setIncludeMetadata(True)  # Set to True to include metadata (which should include scores)

pipeline = Pipeline(stages=[
    document_assembler,
    sentence_detector,
    tokenizer,
    use_embeddings,
    sentiment_detector,
    finisher
])


In [14]:
# Apply the pipeline to the DataFrame
pipeline_model = pipeline.fit(df)
df = pipeline_model.transform(df)

# Display the results
df.select("text","finished_sentiment", "finished_sentiment_metadata").show(truncate=False)

                                                                                

+-------------------------------------------------------------------------------------------------------------+------------------+-------------------------------------------------------+
|text                                                                                                         |finished_sentiment|finished_sentiment_metadata                            |
+-------------------------------------------------------------------------------------------------------------+------------------+-------------------------------------------------------+
|   a11 that is a bummer  you shoulda got david carr of third day to do it d                                  |[pos]             |[{sentence, 0}, {pos, 0.97958183}, {neg, 0.020418175}] |
|is upset that he cannot update his facebook by texting it11 and might cry as a result  school today also blah|[neg]             |[{sentence, 0}, {pos, 0.18526705}, {neg, 0.81473297}]  |
| i dived many times for the ball managed to save   the rest go o

In [15]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, expr

# Extract positive and negative scores
df = df.withColumn("positive_score", expr("filter(finished_sentiment_metadata, x -> x._1 == 'pos')[0]._2"))
df = df.withColumn("negative_score", expr("filter(finished_sentiment_metadata, x -> x._1 == 'neg')[0]._2"))

# Calculate sentiment score by subtracting the negative score from the positive score
df = df.withColumn("sentiment_score", col("positive_score") - col("negative_score"))

# Show the resulting DataFrame with sentiment score
df.select("dates", "text", "finished_sentiment", "sentiment_score").show()


                                                                                

+-------------------+--------------------+------------------+-------------------+
|              dates|                text|finished_sentiment|    sentiment_score|
+-------------------+--------------------+------------------+-------------------+
|2009-04-06 23:19:45|   a11 that is a ...|             [pos]|        0.959163655|
|2009-04-06 23:19:49|is upset that he ...|             [neg]|        -0.62946592|
|2009-04-06 23:19:53| i dived many tim...|             [neg]|     -0.99927599196|
|2009-04-06 23:19:57|my whole body fee...|             [pos]|       0.9980273314|
|2009-04-06 23:19:57| no it is not beh...|             [neg]|-0.9951457286000001|
|2009-04-06 23:20:00| not the whole crew |             [neg]|      -0.9998317417|
|2009-04-06 23:20:03|         need a hug |             [pos]|     0.999971855195|
|2009-04-06 23:20:03| hey  long time n...|             [neg]|        -0.80194189|
|2009-04-06 23:20:05| nope they did no...|             [neg]|    -0.999997791358|
|2009-04-06 23:2

In [16]:
df.printSchema()

root
 |-- dates: timestamp (nullable = true)
 |-- text: string (nullable = true)
 |-- finished_sentiment: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- finished_sentiment_metadata: array (nullable = true)
 |    |-- element: struct (containsNull = true)
 |    |    |-- _1: string (nullable = true)
 |    |    |-- _2: string (nullable = true)
 |-- positive_score: string (nullable = true)
 |-- negative_score: string (nullable = true)
 |-- sentiment_score: double (nullable = true)



In [17]:
num_rows = df.count() # Counting the number of rows.
num_columns =len(df.columns) # Length of columns.
print(f"Shape: ({num_rows}, {num_columns})") # Prints the shape of the dataset.



Shape: (1600000, 7)


                                                                                