In [2]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col
from pyspark.ml.feature import Tokenizer, StopWordsRemover, HashingTF
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.classification import NaiveBayes 
from pyspark.ml import Pipeline
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.mllib.evaluation import MulticlassMetrics

In [4]:
# Create the Spark session for this notebook, or reuse one that already exists.
spark = (
    SparkSession.builder
    .appName("BinaryStringClassification")
    .getOrCreate()
)

Using Spark's default log4j profile: org/apache/spark/log4j2-defaults.properties
25/08/16 18:56:08 WARN Utils: Your hostname, Omars-MacBook-Pro.local, resolves to a loopback address: 127.0.0.1; using 192.168.1.45 instead (on interface en0)
25/08/16 18:56:08 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
Using Spark's default log4j profile: org/apache/spark/log4j2-defaults.properties
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
25/08/16 18:56:39 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [6]:
# Load the tweet dataset; the first CSV row supplies the column names.
csv_reader = spark.read
df = csv_reader.csv("sentiment_analysis.csv", header=True)

In [8]:
# Replace the label column with the same values cast to double.
label_as_double = df["label"].cast("double")
df = df.withColumn("label", label_as_double)

In [10]:
# Print df's schema (column names, types, nullability) to check the label cast.
df.printSchema()

root
 |-- id: string (nullable = true)
 |-- label: double (nullable = true)
 |-- tweet: string (nullable = true)



In [12]:
# Class balance: number of rows for each label value.
label_counts = df.groupBy("label").count()
label_counts.show()

+-----+-----+
|label|count|
+-----+-----+
|  0.0| 5894|
|  1.0| 2026|
+-----+-----+



In [14]:
from pyspark.sql.functions import col, when, count

# Count rows whose tweet is null or the empty string.
# `when` yields a non-null value only for matching rows, and `count` ignores nulls.
tweet_missing = col("tweet").isNull() | (col("tweet") == "")
missing_tweet_count = count(when(tweet_missing, "tweet")).alias("null_or_empty_tweets")
df.select(missing_tweet_count).show()


+--------------------+
|null_or_empty_tweets|
+--------------------+
|                   0|
+--------------------+



In [16]:
from pyspark.sql.functions import col, lower, regexp_replace, trim

# Clean the tweet text in place. Steps, in order:
#   1. lowercase
#   2. delete URLs (http...), @mentions, #hashtags and every character that is
#      neither a letter nor whitespace
#   3. delete "pictwitter..." tokens (presumably what is left of
#      pic.twitter.com links once step 2 has stripped their punctuation)
#   4. collapse every whitespace run to a single space, then trim
#
# Step 4 runs collapse-then-trim on purpose. `trim` strips only space
# characters, so trimming first and collapsing only runs of two or more
# whitespace characters could leave a leading space (e.g. "\n foo" -> " foo")
# or a stray tab/newline at either end of the tweet.
lowered = lower(col("tweet"))
letters_only = regexp_replace(lowered, r"http\S+|@\w+|#\w+|[^a-zA-Z\s]", "")
without_pic_links = regexp_replace(letters_only, r"\bpictwitter\w+\b", "")
single_spaced = regexp_replace(without_pic_links, r"\s+", " ")

df = df.withColumn("tweet", trim(single_spaced))


In [18]:
df.select('tweet').show()

[Stage 7:>                                                          (0 + 1) / 1]

+--------------------+
|               tweet|
+--------------------+
|                test|
|finally a transpa...|
|we love this woul...|
|im wired i know i...|
|what amazing serv...|
|iphone software u...|
|        happy for us|
|new type c charge...|
|bout to go shoppi...|
|               photo|
|hey when you make...|
|ha not heavy mach...|
|contemplating giv...|
|i just made anoth...|
|the battery is so...|
|        from towards|
|like and share if...|
|            go crazy|
|the reason i dont...|
|how is the apple ...|
+--------------------+
only showing top 20 rows


                                                                                

In [20]:
from pyspark.sql.functions import length
# Add the character count of the cleaned tweet as a "length" column.
tweet_length = length(col("tweet"))
df = df.withColumn("length", tweet_length)

In [22]:
# Preview the first rows of df, which now carries the "length" column.
df.show()

+---+-----+--------------------+------+
| id|label|               tweet|length|
+---+-----+--------------------+------+
|  1|  0.0|                test|     4|
|  2|  0.0|finally a transpa...|    53|
|  3|  0.0|we love this woul...|    25|
|  4|  0.0|im wired i know i...|    45|
|  5|  1.0|what amazing serv...|   114|
|  6|  1.0|iphone software u...|    65|
|  7|  0.0|        happy for us|    12|
|  8|  0.0|new type c charge...|    45|
|  9|  0.0|bout to go shoppi...|    44|
| 10|  0.0|               photo|     5|
| 11|  1.0|hey when you make...|   114|
| 12|  1.0|ha not heavy mach...|    93|
| 13|  1.0|contemplating giv...|    82|
| 14|  0.0|i just made anoth...|    75|
| 15|  1.0|the battery is so...|    80|
| 16|  0.0|        from towards|    12|
| 17|  0.0|like and share if...|    55|
| 18|  0.0|            go crazy|     8|
| 19|  1.0|the reason i dont...|    46|
| 20|  1.0|how is the apple ...|    90|
+---+-----+--------------------+------+
only showing top 20 rows


In [26]:
# Same preview as the previous cell; no visible cell changes df in between.
df.show()

+---+-----+--------------------+------+
| id|label|               tweet|length|
+---+-----+--------------------+------+
|  1|  0.0|                test|     4|
|  2|  0.0|finally a transpa...|    53|
|  3|  0.0|we love this woul...|    25|
|  4|  0.0|im wired i know i...|    45|
|  5|  1.0|what amazing serv...|   114|
|  6|  1.0|iphone software u...|    65|
|  7|  0.0|        happy for us|    12|
|  8|  0.0|new type c charge...|    45|
|  9|  0.0|bout to go shoppi...|    44|
| 10|  0.0|               photo|     5|
| 11|  1.0|hey when you make...|   114|
| 12|  1.0|ha not heavy mach...|    93|
| 13|  1.0|contemplating giv...|    82|
| 14|  0.0|i just made anoth...|    75|
| 15|  1.0|the battery is so...|    80|
| 16|  0.0|        from towards|    12|
| 17|  0.0|like and share if...|    55|
| 18|  0.0|            go crazy|     8|
| 19|  1.0|the reason i dont...|    46|
| 20|  1.0|how is the apple ...|    90|
+---+-----+--------------------+------+
only showing top 20 rows


In [28]:
# Split each tweet into word tokens.
# The DataFrame has no "text" column (its columns are id, label, tweet), and
# the stop-word remover defined in the next cell reads a column named "words",
# so this stage must read "tweet" and write "words" for the stages to line up.
tokenizer = Tokenizer(inputCol="tweet", outputCol="words")


In [30]:
# Stop-word filter: reads the "words" token column, writes "filtered_words".
remover = StopWordsRemover(
    outputCol="filtered_words",
    inputCol="words",
)

25/08/16 18:58:43 WARN StopWordsRemover: Default locale set was [en_EG]; however, it was not found in available locales in JVM, falling back to en_US locale. Set param `locale` in order to respect another locale.


In [52]:
# Hash the filtered tokens into a 1000-dimensional term-frequency vector.
hashingTF = HashingTF(
    numFeatures=1000,
    inputCol="filtered_words",
    outputCol="features",
)

In [72]:
# Two candidate classifiers, both reading "features" and predicting "label".
nv = NaiveBayes(labelCol="label", featuresCol="features")
lr = LogisticRegression(labelCol="label", featuresCol="features")

In [78]:
# Create the seeded 70/30 split before previewing it. As saved, this cell sits
# above the cell that first assigns train_df, so a top-to-bottom run would
# raise NameError here. The seed is fixed, so the identical randomSplit call
# later in the notebook should reproduce the same partitions.
train_df, test_df = df.randomSplit([0.7, 0.3], seed=42)
train_df.show()

+----+-----+--------------------+------+
|  id|label|               tweet|length|
+----+-----+--------------------+------+
|   1|  0.0|                test|     4|
|  10|  0.0|               photo|     5|
|1000|  1.0|ive gone thru fou...|    42|
|1001|  0.0|            canteras|     8|
|1002|  0.0|                cake|     4|
|1004|  0.0|thanks for my del...|    95|
|1007|  0.0|rt this if you th...|    27|
|1008|  1.0|had my iphone day...|    57|
|1009|  0.0|thanks for follow...|    38|
|1012|  1.0|anyone know why m...|    73|
|1013|  0.0|     iphone birthday|    15|
|1014|  0.0|users reached tha...|    37|
|1016|  0.0|  easter john clarke|    18|
|1018|  1.0|really upset that...|    66|
|1020|  1.0|whyyyy you cant d...|    73|
|1021|  1.0|the keyboard in t...|   103|
|1022|  0.0|      my lovely girl|    14|
|1026|  0.0|two more cases to...|    39|
|1028|  0.0|soooooo whos a sp...|    59|
|1030|  0.0|one of my faves f...|    39|
+----+-----+--------------------+------+
only showing top

In [76]:
# Seeded random split: 70% of rows for training, 30% for testing.
split_weights = [0.7, 0.3]
train_df, test_df = df.randomSplit(split_weights, seed=42)

In [106]:
# Keep only training rows whose tweet is neither null nor empty.
has_text = col("tweet").isNotNull() & (col("tweet") != "")
train_df = train_df.filter(has_text)

# Pipeline stages: tokenize -> drop stop words -> hash to term frequencies
# (numFeatures is not set here) -> logistic regression.
tokenizer = Tokenizer(outputCol="words", inputCol="tweet")
remover = StopWordsRemover(outputCol="filtered", inputCol="words")
hashing = HashingTF(outputCol="features", inputCol="filtered")
lr = LogisticRegression(labelCol="label", featuresCol="features")

stages = [tokenizer, remover, hashing, lr]
pipeline = Pipeline(stages=stages)

# Fit every stage on the filtered training rows.
model = pipeline.fit(train_df)

25/08/16 19:28:40 WARN StopWordsRemover: Default locale set was [en_EG]; however, it was not found in available locales in JVM, falling back to en_US locale. Set param `locale` in order to respect another locale.


In [107]:
# Score the held-out rows (train_df could be passed instead) and preview
# ten tweets with their true label and predicted label.
predictions = model.transform(test_df)
preview = predictions.select("tweet", "label", "prediction")
preview.show(10)


+--------------------+-----+----------+
|               tweet|label|prediction|
+--------------------+-----+----------+
|   bado con with and|  0.0|       0.0|
|overrated waste o...|  1.0|       1.0|
|       like a geisha|  0.0|       0.0|
|so i can log in t...|  1.0|       1.0|
|effect created st...|  0.0|       0.0|
|my apple was so h...|  0.0|       0.0|
|johnson will he s...|  0.0|       0.0|
|i might have to v...|  1.0|       0.0|
|get off when you ...|  0.0|       0.0|
|get off apple itu...|  0.0|       0.0|
+--------------------+-----+----------+
only showing top 10 rows


In [108]:
# Overall accuracy of the predictions.
evaluator = MulticlassClassificationEvaluator(
    metricName="accuracy",
    labelCol="label",
    predictionCol="prediction",
)
accuracy = evaluator.evaluate(predictions)
print(f"Accuracy: {accuracy:.4f}")

# Confusion matrix: MulticlassMetrics takes an RDD of (prediction, label) float pairs.
prediction_and_label = predictions.select("prediction", "label")
pred_rdd = prediction_and_label.rdd.map(
    lambda row: (float(row.prediction), float(row.label))
)
metrics = MulticlassMetrics(pred_rdd)
print("Confusion Matrix:\n", metrics.confusionMatrix().toArray())


Accuracy: 0.8221
Confusion Matrix:
 [[1502.  215.]
 [ 189.  365.]]


In [120]:
# Show the prediction frame with every column the pipeline added
# (words, filtered, features, rawPrediction, probability, prediction).
predictions.show()

+----+-----+--------------------+------+--------------------+--------------------+--------------------+--------------------+--------------------+----------+
|  id|label|               tweet|length|               words|            filtered|            features|       rawPrediction|         probability|prediction|
+----+-----+--------------------+------+--------------------+--------------------+--------------------+--------------------+--------------------+----------+
| 100|    0|   bado con with and|    17|[bado, con, with,...|         [bado, con]|(262144,[115902,1...|[-24.706156705129...|[0.72867774066642...|       0.0|
|1003|    1|overrated waste o...|    58|[overrated, waste...|[overrated, waste...|(262144,[41809,83...|[-74.649254747980...|[0.77430082891837...|       0.0|
|1005|    0|       like a geisha|    13|   [like, a, geisha]|      [like, geisha]|(262144,[133592,2...|[-20.464829952558...|[0.85170175244842...|       0.0|
|1006|    1|so i can log in t...|    97|[so, i, can, log,.

25/08/14 15:23:11 WARN DAGScheduler: Broadcasting large task binary with size 4.1 MiB


In [122]:
# Keep only the columns needed to report a prediction for each tweet.
output_columns = ["id", "tweet", "label", "prediction", "probability"]
output_df = predictions.select(*output_columns)


In [68]:
from pyspark.ml.feature import Tokenizer, StopWordsRemover, HashingTF, IDF
from pyspark.ml import Pipeline

# TF-IDF feature pipeline over the whole DataFrame (no classifier stage).
# Note: this cell rebinds `tokenizer`, `remover`, `hashingTF`, `pipeline`
# and `model`, replacing whatever those names held before it ran.

# tweet -> tokens -> tokens without stop words
tokenizer = Tokenizer(outputCol="words", inputCol="tweet")
remover = StopWordsRemover(outputCol="filtered_words", inputCol="words")

# tokens -> 1000-bucket term-frequency vector -> IDF-weighted vector
hashingTF = HashingTF(
    numFeatures=1000,
    inputCol="filtered_words",
    outputCol="raw_features",
)
idf = IDF(outputCol="features", inputCol="raw_features")

pipeline = Pipeline(stages=[tokenizer, remover, hashingTF, idf])

# Fitting learns the IDF weights; transforming applies every stage.
model = pipeline.fit(df)
new = model.transform(df)

# Preview each tweet with its filtered tokens and weighted feature vector.
feature_preview = new.select("tweet", "filtered_words", "features")
feature_preview.show()


25/08/16 19:07:08 WARN StopWordsRemover: Default locale set was [en_EG]; however, it was not found in available locales in JVM, falling back to en_US locale. Set param `locale` in order to respect another locale.


+--------------------+--------------------+--------------------+
|               tweet|      filtered_words|            features|
+--------------------+--------------------+--------------------+
|                test|              [test]|(1000,[586],[5.57...|
|finally a transpa...|[finally, transpa...|(1000,[161,229,34...|
|we love this woul...|          [love, go]|(1000,[240,451],[...|
|im wired i know i...|[im, wired, know,...|(1000,[344,349,39...|
|what amazing serv...|[amazing, service...|(1000,[0,44,217,2...|
|iphone software u...|[iphone, software...|(1000,[0,48,66,15...|
|        happy for us|         [happy, us]|(1000,[347,660],[...|
|new type c charge...|[new, type, c, ch...|(1000,[34,357,526...|
|bout to go shoppi...|[bout, go, shoppi...|(1000,[370,409,45...|
|               photo|             [photo]|(1000,[367],[2.84...|
|hey when you make...|[hey, make, new, ...|(1000,[54,64,92,2...|
|ha not heavy mach...|[ha, heavy, machi...|(1000,[112,492,53...|
|contemplating giv...|[co

In [112]:
# Collect the prediction columns into a pandas DataFrame.
export_columns = ["tweet", "label", "prediction", "probability"]
pdf = predictions.select(*export_columns).toPandas()


In [114]:
# Write the pandas predictions to a local CSV file without the index column.
csv_path = "predictions.csv"
pdf.to_csv(path_or_buf=csv_path, index=False)


In [130]:

# Connectivity check: list the buckets visible to `s3` and print their names.
# NOTE(review): `s3` is not defined anywhere in the visible notebook;
# presumably an S3 client created in a cell that is not shown. Confirm.
try:
    response = s3.list_buckets()
    print("✅ Connected to S3!")
    print("Your Buckets:")
    for name in (entry["Name"] for entry in response["Buckets"]):
        print(f" - {name}")
except Exception as err:
    # Any failure is reported as "not connected" instead of being raised.
    print("❌ Not connected to S3:", err)


✅ Connected to S3!
Your Buckets:
 - omar93tweetdata


In [132]:
# Upload the local predictions CSV to the bucket under the key "predictions.csv".
bucket_name = "omar93tweetdata"
object_key = "predictions.csv"
s3.upload_file(csv_path, bucket_name, object_key)

print("✅ File uploaded to S3 successfully!")


✅ File uploaded to S3 successfully!
