In [1]:
from pyspark.sql import SparkSession
spark = SparkSession.builder \
    .appName("CassandraIntegration") \
    .config("spark.cassandra.connection.host", "127.0.0.1") \
    .config("spark.cassandra.connection.port", "9042") \
    .getOrCreate()

In [2]:
import time
start_time = time.time()

df = spark.read \
    .format("org.apache.spark.sql.cassandra") \
    .options(table="tweets", keyspace="tweets") \
    .load()

end_time = time.time()

print(f"Time taken to import dataset: {end_time - start_time} seconds.")


Time taken to import dataset: 12.620269775390625 seconds.


In [3]:
sampled_df = df.sample(withReplacement=False, fraction=0.01)

In [4]:
sampled_df.show()

                                                                                

+--------+-------------------+--------+----------+--------------------+---------------+
|sequence|              dates|    flag|       ids|                text|           user|
+--------+-------------------+--------+----------+--------------------+---------------+
| 1318682|2009-06-03 02:24:16|NO_QUERY|2014399896|"baby c ulater. =...|     Mc_Rhi_Rhi|
| 1259344|2009-06-01 18:58:26|NO_QUERY|1998088014|"@savannahflower ...|  amazingbianca|
| 1260717|2009-06-01 19:32:19|NO_QUERY|1998432163|"@LisaGemini nigh...|    SashaHalima|
|  391292|2009-06-06 09:15:26|NO_QUERY|2054875139|"revising for my ...|      melisaaxx|
| 1151159|2009-05-31 00:39:42|NO_QUERY|1978534140|"Having an amazin...|  hillarymiller|
|  378678|2009-06-06 00:51:02|NO_QUERY|2052162114|"@Dave_Chappelle ...|Luciddreamrc123|
| 1383545|2009-06-06 02:13:09|NO_QUERY|2052578229|"Right off to LGB...|     NUS_Elaine|
| 1494376|2009-06-07 17:19:20|NO_QUERY|2069677585|"@TheWilson77 wel...|   pixeltickler|
|  643258|2009-06-19 02:55:07|NO

In [5]:
sampled_df.printSchema()

root
 |-- sequence: integer (nullable = false)
 |-- dates: timestamp (nullable = true)
 |-- flag: string (nullable = true)
 |-- ids: long (nullable = true)
 |-- text: string (nullable = true)
 |-- user: string (nullable = true)



In [6]:
from pyspark.sql.functions import col

sampled_df = sampled_df.orderBy(col("sequence"))
sampled_df.show()



+--------+-------------------+--------+----------+--------------------+---------------+
|sequence|              dates|    flag|       ids|                text|           user|
+--------+-------------------+--------+----------+--------------------+---------------+
|      21|2009-04-06 23:20:38|NO_QUERY|1467813992|"one of my friend...|     swinspeedx|
|      32|2009-04-06 23:21:09|NO_QUERY|1467815988|"thought sleeping...|       merisssa|
|      66|2009-04-06 23:23:23|NO_QUERY|1467824664|"Damm back to sch...|     a_mariepyt|
|      90|2009-04-06 23:26:10|NO_QUERY|1467835345|"@Hollywoodheat I...|     RU_it_girl|
|     117|2009-04-06 23:27:16|NO_QUERY|1467839450|"ugh. cant sleep....|  BreannaBonana|
|     160|2009-04-06 23:30:54|NO_QUERY|1467853356|"Picked Mich St t...|       dbmendel|
|     574|2009-04-06 23:58:33|NO_QUERY|1467953500|"@ballinbitch hah...|     candilaria|
|     661|2009-04-07 00:06:08|NO_QUERY|1467980858|"sad day: manu ou...|    Casey_Szulc|
|     732|2009-04-07 00:10:03|NO

                                                                                

In [7]:
sampled_df = sampled_df.drop("_id", "flag", "ids", "user", "date", "sequence")
sampled_df.show()



+-------------------+--------------------+
|              dates|                text|
+-------------------+--------------------+
|2009-04-06 23:28:23|"Was intending to...|
|2009-04-06 23:32:16|"Just woke up an ...|
|2009-04-06 23:40:26|"@ashleyskyy but ...|
|2009-04-06 23:42:49|"I swear no matte...|
|2009-04-06 23:45:08|"Any chance  Soft...|
|2009-04-07 00:11:55|"Just picked up s...|
|2009-04-07 00:12:31|"i miss being at ...|
|2009-04-07 00:25:54|"Gah, Comcast dou...|
|2009-04-07 00:38:45|"@doriantaylor I ...|
|2009-04-07 00:42:15|"i forgot how to ...|
|2009-04-07 00:43:02|"Not to self: lic...|
|2009-04-07 00:45:09|"Today was a less...|
|2009-04-07 00:45:42|"@Kimboinlimbo no...|
|2009-04-07 00:48:41|"@missoliviaa noo...|
|2009-04-07 00:51:13|"@LarrissaR pleas...|
|2009-04-07 00:59:45|"@LRon_Jaii LMAOO...|
|2009-04-07 01:19:35|"@trash_kitten  s...|
|2009-04-07 01:23:36|"@gabysslave than...|
|2009-04-07 01:27:15|"@valonthecoast L...|
|2009-04-07 01:52:36|"Blazing row with...|
+----------

                                                                                

In [8]:
from pyspark.sql.functions import isnull, count, when

# Counting missing values for each column
missing_counts = sampled_df.select([count(when(isnull(c), c)).alias(c) for c in sampled_df.columns])

missing_counts.show()



+-----+----+
|dates|text|
+-----+----+
|    0|   0|
+-----+----+



                                                                                

In [9]:
from pyspark.sql import functions as F

# Count entries with URLs
url_count = df.filter(F.col("text").rlike("http(s)?://([\\w-]+\\.)+[\\w-]+(/[\\w- ./?%&=]*)?")).count()

# Count entries with HTML tags
html_tags_count = df.filter(F.col("text").rlike("<[^>]+>")).count()

# Count entries with mentions (@username)
mentions_count = df.filter(F.col("text").rlike("@\\w+")).count()

print(f"Number of entries with URLs: {url_count}")
print(f"Number of entries with HTML tags: {html_tags_count}")
print(f"Number of entries with mentions: {mentions_count}")



Number of entries with URLs: 70068
Number of entries with HTML tags: 0
Number of entries with mentions: 738491


                                                                                

In [10]:
from pyspark.sql import functions as F
# Remove URLs
sampled_df = sampled_df.withColumn("text", F.regexp_replace(F.col("text"), "http(s)?://([\\w-]+\\.)+[\\w-]+(/[\\w- ./?%&=]*)?", ""))

# Remove HTML tags
sampled_df = sampled_df.withColumn("text", F.regexp_replace(F.col("text"), "<[^>]+>", ""))

# Remove mentions (i.e., @username)
sampled_df = sampled_df.withColumn("text", F.regexp_replace(F.col("text"), "@\\w+", ""))

In [11]:
# Count entries with URLs
url_count = sampled_df.filter(F.col("text").rlike("http(s)?://([\\w-]+\\.)+[\\w-]+(/[\\w- ./?%&=]*)?")).count()

# Count entries with HTML tags
html_tags_count = sampled_df.filter(F.col("text").rlike("<[^>]+>")).count()

# Count entries with mentions (@username)
mentions_count = sampled_df.filter(F.col("text").rlike("@\\w+")).count()

print(f"Number of entries with URLs: {url_count}")
print(f"Number of entries with HTML tags: {html_tags_count}")
print(f"Number of entries with mentions: {mentions_count}")



Number of entries with URLs: 0
Number of entries with HTML tags: 0
Number of entries with mentions: 0


                                                                                

In [12]:
# If there is any special character left.
# df.filter(F.col("text").rlike("http(s)?://([\\w-]+\\.)+[\\w-]+(/[\\w- ./?%&=]*)?")).select("text").show(truncate=False)

In [13]:
# Check for duplicate rows based on all columns
duplicate_count = sampled_df.count() - sampled_df.dropDuplicates().count()

if duplicate_count > 0:
    print(f"Number of duplicate rows: {duplicate_count}")
    # Remove duplicates and retain only the first occurrence
    sampled_df = sampled_df.dropDuplicates()
    print("Duplicates removed.")
else:
    print("No duplicates found.")



Number of duplicate rows: 58
Duplicates removed.


                                                                                

In [14]:
sampled_df.select("text").show(20, truncate=False)



+------------------------------------------------------------------------------------------------------------------------------+
|text                                                                                                                          |
+------------------------------------------------------------------------------------------------------------------------------+
|" just stop it with the menu.  I did NOT get an invite. *nose in air* I don't spoil barbecues!!"                              |
|"Why are CDs in HMV so overpriced? ï¿½12.99 for ? I hate having no money "                                                    |
|"around, hills finale tonight "                                                                                               |
|" Thanks Alex! Hey everyone, check out the official 'New Moon' trailer  "                                                     |
|"Somehow glad Mine That Bird didn't win Belmont - I'd have to be pissed at Rachel Alexandra as a

                                                                                

In [15]:
from sparknlp.base import DocumentAssembler
from sparknlp.annotator import LanguageDetectorDL

documentAssembler = DocumentAssembler().setInputCol("text").setOutputCol("document")
langDetector = LanguageDetectorDL.pretrained("ld_wiki_tatoeba_cnn_21", "xx").setInputCols(["document"]).setOutputCol("lang")


ld_wiki_tatoeba_cnn_21 download started this may take some time.
Approximate size to download 7.1 MB
[ / ]ld_wiki_tatoeba_cnn_21 download started this may take some time.
Approximate size to download 7.1 MB
[ — ]Download done! Loading the resource.
[ / ]

2023-10-29 22:07:07.304935: I external/org_tensorflow/tensorflow/core/platform/cpu_feature_guard.cc:151] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.


[OK!]


In [16]:
from pyspark.ml import Pipeline

pipeline = Pipeline(stages=[documentAssembler, langDetector])
model = pipeline.fit(sampled_df)
sampled_df = model.transform(sampled_df)

sampled_df.show()  # This will display the DataFrame with detected languages

[Stage 45:>                                                         (0 + 1) / 1]

+-------------------+--------------------+--------------------+--------------------+
|              dates|                text|            document|                lang|
+-------------------+--------------------+--------------------+--------------------+
|2009-04-07 01:30:32|"i want a guy lik...|[{document, 0, 37...|[{language, 0, 37...|
|2009-04-19 08:29:42|"Twitter API dead...|[{document, 0, 64...|[{language, 0, 64...|
|2009-04-19 09:54:02|"trying to get my...|[{document, 0, 13...|[{language, 0, 13...|
|2009-05-03 02:49:30|" He has and we a...|[{document, 0, 10...|[{language, 0, 10...|
|2009-05-17 12:11:13|"CAYSA canceled t...|[{document, 0, 47...|[{language, 0, 47...|
|2009-05-18 04:07:58|"Worried bout my ...|[{document, 0, 72...|[{language, 0, 72...|
|2009-05-18 06:09:44|"on flight from M...|[{document, 0, 76...|[{language, 0, 76...|
|2009-05-22 03:31:21|",, noo Russia?  ...|[{document, 0, 38...|[{language, 0, 38...|
|2009-05-22 04:52:27|"woke up late cos...|[{document, 0, 47...|[{

                                                                                

In [17]:
from pyspark.sql.functions import col

sampled_df = sampled_df.withColumn("detected_language", col("lang.result").getItem(0))

In [18]:
sampled_df = sampled_df.drop("document", "lang")
sampled_df.show()

[Stage 49:>                                                         (0 + 1) / 1]

+-------------------+--------------------+-----------------+
|              dates|                text|detected_language|
+-------------------+--------------------+-----------------+
|2009-04-19 08:29:42|"Twitter API dead...|               en|
|2009-05-02 01:14:25|" My loved ones a...|               en|
|2009-05-18 07:41:38|"Left my coffee a...|               en|
|2009-05-30 10:15:33|"Wishing I could ...|               en|
|2009-05-31 07:37:07|"to day is so unl...|               en|
|2009-05-31 15:16:51|"Exam week     Ca...|               en|
|2009-05-31 17:55:07|"o my! iÂ´m lossi...|               en|
|2009-06-01 06:55:39|"Kind of a rude a...|               en|
|2009-06-01 12:18:13|"In stupid busine...|               en|
|2009-06-01 18:28:41|"just finished ca...|               en|
|2009-06-02 02:43:48|" lol is it sad t...|               en|
|2009-06-02 09:50:11|"Shall I write in...|               en|
|2009-06-03 08:14:03|      "octoglob    "|               sk|
|2009-06-05 00:20:27|"Is

                                                                                

In [19]:
# Drop rows where detected_language is not 'en' from the original result DataFrame
sampled_df = sampled_df.filter(F.col("detected_language") == "en")

In [20]:

# Count the total number of observations in the filtered df_eng.
en_tweets = sampled_df.count()

print(f"Filtered count (English tweets only): {en_tweets}")



Filtered count (English tweets only): 15445


                                                                                

In [21]:
from pyspark.sql.functions import regexp_replace
from pyspark.ml.feature import Tokenizer

# Remove punctuation
sampled_df = sampled_df.withColumn('text', regexp_replace(sampled_df['text'], r"[^\w\s]", ""))

In [22]:
from pyspark.sql.functions import lower
# Convert to lowercase
sampled_df = sampled_df.withColumn('text', lower(sampled_df['text']))

In [23]:
sampled_df.select("text").show(20, truncate=False)



+---------------------------------------------------------------------------------------------------------------------------+
|text                                                                                                                       |
+---------------------------------------------------------------------------------------------------------------------------+
|  you should come to the mall                                                                                              |
| nope never i sooo feel left out now  x                                                                                    |
| sure  congrats maybe your luck will rub off on me if i go with you or my bad luck will rub off on you haha                |
| moo  i messaged you then tried calling you both numbers my next step is messenger pigeons save me the cost of bread crumbs|
| oooh nicei see yr dm but i cant respond on my phone  can i email u real quick                                       

                                                                                

In [24]:
sampled_df.show(10)



+-------------------+--------------------+-----------------+
|              dates|                text|detected_language|
+-------------------+--------------------+-----------------+
|2009-05-01 21:50:48|going to bedi hav...|               en|
|2009-05-04 02:57:10| 158  hadnt train...|               en|
|2009-05-14 00:35:57|      both and more |               en|
|2009-05-18 06:00:20| the problem weve...|               en|
|2009-05-29 23:30:13|i love music vide...|               en|
|2009-05-30 02:23:01|rarebreeds farmer...|               en|
|2009-05-30 07:45:28|its freakin hott ...|               en|
|2009-05-31 14:28:18|i wish i can move...|               en|
|2009-05-31 16:39:19|my parents are ba...|               en|
|2009-06-01 23:38:16|just watched the ...|               en|
+-------------------+--------------------+-----------------+
only showing top 10 rows



                                                                                

In [None]:
from pyspark.ml.feature import RegexTokenizer

# Initialize a regex tokenizer
regex_tokenizer = RegexTokenizer(inputCol="text", outputCol="tokens", pattern="\\W+")

# Transform the dataset
tokens = regex_tokenizer.transform(sampled_df)
tokens.select("tokens").show(truncate=False)

In [None]:
from sparknlp.annotator import NorvigSweetingModel
spell_model = NorvigSweetingModel.pretrained().setInputCols(["tokens"]).setOutputCol("checked")

In [26]:
from sparknlp.base import DocumentAssembler

document_assembler = DocumentAssembler().setInputCol("text").setOutputCol("document")
sampled_df = document_assembler.transform(sampled_df)

In [27]:
from sparknlp.base import DocumentAssembler
from sparknlp.annotator import Tokenizer
from pyspark.ml import Pipeline

# Initialize DocumentAssembler and Tokenizer
document_assembler = DocumentAssembler().setInputCol("text").setOutputCol("document")
tokenizer = Tokenizer().setInputCols(["document"]).setOutputCol("token")

# Create the pipeline with both stages
pipeline = Pipeline().setStages([document_assembler, tokenizer])

# Fit the pipeline model
pipeline_model = pipeline.fit(sampled_df)

# Transform the sampled data using the pipeline model
sampled_df = pipeline_model.transform(sampled_df)

# Show the result
sampled_df.show(10)




+-------------------+--------------------+-----------------+--------------------+--------------------+
|              dates|                text|detected_language|            document|               token|
+-------------------+--------------------+-----------------+--------------------+--------------------+
|2009-04-07 01:30:32|i want a guy like...|               en|[{document, 0, 35...|[{token, 0, 0, i,...|
|2009-04-17 22:06:05|when i saw the tw...|               en|[{document, 0, 12...|[{token, 0, 3, wh...|
|2009-05-02 01:14:25| my loved ones ar...|               en|[{document, 0, 30...|[{token, 1, 2, my...|
|2009-05-17 12:11:13|caysa canceled th...|               en|[{document, 0, 44...|[{token, 0, 4, ca...|
|2009-05-18 06:09:44|on flight from mk...|               en|[{document, 0, 72...|[{token, 0, 1, on...|
|2009-05-18 07:41:38|left my coffee at...|               en|[{document, 0, 22...|[{token, 0, 3, le...|
|2009-05-28 22:07:30|aaffrriiccaa   i ...|               en|[{document, 0

                                                                                

In [28]:
sampled_df.select("token").show(20, truncate=False)



+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------

                                                                                

In [None]:
spell_model = NorvigSweetingModel.pretrained().setInputCols(["tokens"]).setOutputCol("checked")

# Transform the tokenized dataset
sampled_df = spell_model.transform(tokens)
sampled_df.select("checked").show(truncate=False)

In [None]:
from pyspark.ml.feature import StopWordsRemover

# Initialize a stopwords remover
remover = StopWordsRemover(inputCol="tokens", outputCol="filtered_tokens")

# Transform the tokenized data
sampled_df = remover.transform(tokens)
sampled_df.select("filtered_tokens").show(truncate=False)