In [None]:
sc

### CONNECTING TO MONGODB

In [None]:
!mongo --version

In [None]:
!mongosh

In [None]:
from pyspark.sql import SparkSession

# Create Spark Session for MongoDB
spark = SparkSession.builder \
        .appName("DFToMongoDB") \
        .getOrCreate()

# Configure MongoDB Database Connection
df.write.format("mongodb") \
    .option("uri","mongodb://127.0.0.1:27017/") \
    .option("database","sample_db") \
    .option("collection","scb") \
    .mode("append").save()

df.show(3)

### CONNECTING TO MYSQL

In [None]:
!mysql --version

In [None]:
from pyspark.sql import SparkSession

# Create Spark Session for MySQL
spark = SparkSession.builder \
    .appName("DFToMySQL") \
    .getOrCreate()

# Configure MySQL Database Connection
mysql_options = {
    "url": "jdbc:mysql://localhost:3306/sample",  # MySQL bağlantı URL'si
    "driver": "com.mysql.cj.jdbc.Driver",  # MySQL JDBC sürücüsü
    "dbtable": "yourtable",  # Hedef MySQL tablo adı
    "user": "root",  # MySQL kullanıcı adı
    "password": "password"  # MySQL parola
}

# DataFrame'i MySQL veritabanına yükleyin
df.write.format("jdbc").options(**mysql_options).mode("overwrite").save()

In [None]:
!pwd

In [None]:
!cat zahid.txt

# ================================
# READ TO CSV FROM HDFS VIA SPARK
# ================================

In [1]:
from pyspark.sql import SparkSession

# Create Spark Session
spark = SparkSession.builder \
        .appName("HDFSToCSV") \
        .getOrCreate()

# Specify CSV file path throught HDFS
hdfs_file_path = "/ProjectTweets.csv"

# Read CSV file with Spark DataFrame
df = spark.read.csv(hdfs_file_path, header=False, inferSchema=True)

                                                                                

In [None]:
# Show DataFrame First 5 Rows
df.show(5)

In [None]:
df.printSchema()

In [None]:
# The first method for renamed the column names
df1 = df.withColumnRenamed("_c0", "id").withColumnRenamed("_c1", "stamp").withColumnRenamed("_c2", "date").withColumnRenamed("_c3", "flag").withColumnRenamed("_c4", "user").withColumnRenamed("_c5", "text")
df1.show(5)

In [2]:
# The second method for renamed the column names
df = df.selectExpr("_c0 as ID", "_c1 as STAMP", "_c2 as DATE", "_c3 as FLAG", "_c4 as USER", "_c5 as TEXT")
df.show(5)

+---+----------+--------------------+--------+---------------+--------------------+
| ID|     STAMP|                DATE|    FLAG|           USER|                TEXT|
+---+----------+--------------------+--------+---------------+--------------------+
|  0|1467810369|Mon Apr 06 22:19:...|NO_QUERY|_TheSpecialOne_|@switchfoot http:...|
|  1|1467810672|Mon Apr 06 22:19:...|NO_QUERY|  scotthamilton|is upset that he ...|
|  2|1467810917|Mon Apr 06 22:19:...|NO_QUERY|       mattycus|@Kenichan I dived...|
|  3|1467811184|Mon Apr 06 22:19:...|NO_QUERY|        ElleCTF|my whole body fee...|
|  4|1467811193|Mon Apr 06 22:19:...|NO_QUERY|         Karoli|@nationwideclass ...|
+---+----------+--------------------+--------+---------------+--------------------+
only showing top 5 rows



In [None]:
# How many rows does the dataframe 
row_count = df.count()
# Print row_count
print("DataFrame has {} rows.".format(row_count))

In [None]:
from pyspark.sql.functions import col

columns = ["ID", "STAMP", "DATE", "FLAG", "USER", "TEXT"]

Columns = df.columns

# Check out the each column and Count unique values
for column in Columns:
    unique_values = df.select(column).distinct()
    unique_count = unique_values.count()
    
    if unique_count > 0:
        print(f"{column} has {unique_count} unique values:")
    else:
        print(f"{column} has no unique value.")

In [None]:
from pyspark.sql.functions import col

columns = ["ID", "STAMP", "DATE", "FLAG", "USER", "TEXT"]

Columns = df.columns

# Check out the each column and Count duplicate values
for column in Columns:
    count_df = df.groupBy(column).count()
    duplicate_values = count_df.filter(col("count") > 1).count()
    
    if duplicate_values > 0:
        print(f"{column} has {duplicate_values} duplicate values.")
    else:
        print(f"{column} has no duplicate value.")

In [3]:
# Drop the selected columns
df = df.drop("STAMP", "FLAG", "USER")
df.show(5)

+---+--------------------+--------------------+
| ID|                DATE|                TEXT|
+---+--------------------+--------------------+
|  0|Mon Apr 06 22:19:...|@switchfoot http:...|
|  1|Mon Apr 06 22:19:...|is upset that he ...|
|  2|Mon Apr 06 22:19:...|@Kenichan I dived...|
|  3|Mon Apr 06 22:19:...|my whole body fee...|
|  4|Mon Apr 06 22:19:...|@nationwideclass ...|
+---+--------------------+--------------------+
only showing top 5 rows



In [None]:
df.describe().show()

In [None]:
df.summary().show()

In [None]:
from pyspark.sql.functions import col

# Do a grouping and counting operation to find duplicate values in the "TEXT" column
count_df = df.groupBy("TEXT").count()

# Filter rows containing duplicate values
duplicate_values = count_df.filter(col("count") > 1)

# If there are duplicate values, show them
if duplicate_values.count() > 0:
    print("Duplicate values:")
    duplicate_values.show(truncate=False)  # Display column values in full length
else:
    print("No duplicate values found.")

In [None]:
# How many rows does the dataframe 
row_count = df.count()
# Print row_count
print("DataFrame has {} rows.".format(row_count))

In [None]:
df.printSchema()

# =====================
# TEXT PRE-PROCESSING
# =====================

In [4]:
text_df = df.select("TEXT")
text_df.show(3, truncate=False)

+-------------------------------------------------------------------------------------------------------------------+
|TEXT                                                                                                               |
+-------------------------------------------------------------------------------------------------------------------+
|@switchfoot http://twitpic.com/2y1zl - Awww, that's a bummer.  You shoulda got David Carr of Third Day to do it. ;D|
|is upset that he can't update his Facebook by texting it... and might cry as a result  School today also. Blah!    |
|@Kenichan I dived many times for the ball. Managed to save 50%  The rest go out of bounds                          |
+-------------------------------------------------------------------------------------------------------------------+
only showing top 3 rows



In [5]:
from pyspark.sql.functions import col, udf, lower, regexp_replace
from pyspark.sql.types import ArrayType, StringType, FloatType

import string
import nltk
import re
import contractions
import torch

from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.stem import SnowballStemmer

from textblob import TextBlob

#### TEXT CLEANING

In [6]:
import re

# Cleaning Function
def clean_text(text):
    text = re.sub(r"http\S+|www\S+|https\S+", '', text, flags=re.MULTILINE)
    text = re.sub(r'\@\w+|\#','', text)
    text = text.lower()
    return text

# Save as UDF
clean_text_udf = udf(clean_text, StringType())

# Create new column
text_df = text_df.withColumn("TEXT_C1", clean_text_udf(col("text")))
text_df.show(3, truncate=False)

[Stage 5:>                                                          (0 + 1) / 1]

+-------------------------------------------------------------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------+
|TEXT                                                                                                               |TEXT_C1                                                                                                        |
+-------------------------------------------------------------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------+
|@switchfoot http://twitpic.com/2y1zl - Awww, that's a bummer.  You shoulda got David Carr of Third Day to do it. ;D|  - a that's a bummer.  you shoulda got david carr of third day to do it. ;d                                   |
|is upset that he can't update his Facebook by texting it... and might cry as a 

                                                                                

#### EXPAND CONTRACTIONS

In [7]:
import contractions

# Function to expand contractions
def expand_contractions(text):
    expanded_text = contractions.fix(text)
    return expanded_text

# Save as UDF
expand_contractions_udf = udf(expand_contractions, StringType())

# Use the UDF and Create new column
text_df = text_df.withColumn("TEXT_C2", expand_contractions_udf(col("TEXT_C1")))

# Show the dataframe
text_df.show(3, truncate=False)

+-------------------------------------------------------------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------+----------------------------------------------------------------------------------------------------------------+
|TEXT                                                                                                               |TEXT_C1                                                                                                        |TEXT_C2                                                                                                         |
+-------------------------------------------------------------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------+-------------------------------------------------------------------------------------

[Stage 6:>                                                          (0 + 1) / 1]                                                                                

#### CLEAN THE PUNCTUATION CHARACTERS

In [8]:
# Define punctuation characters
punctuation_characters = r'[!\"#\$%&\'\(\)\*\+,\-./:;<=>\?@[\\]\^_`{|}~]'

# Remove punctuation characters
text_df = text_df.withColumn("TEXT_C3", regexp_replace(col("TEXT_C2"), punctuation_characters, ""))

# Show the dataframe
text_df.show(3, truncate=False)

+-------------------------------------------------------------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------+----------------------------------------------------------------------------------------------------------------+-----------------------------------------------------------------------------------------------------------+
|TEXT                                                                                                               |TEXT_C1                                                                                                        |TEXT_C2                                                                                                         |TEXT_C3                                                                                                    |
+-------------------------------------------------------------------------------------------------

#### CLEAN THE STOPWORDS

In [9]:
# Download "stopwords" from nltk dictionary
nltk.download("stopwords")

# Configure the language as english
stop_words = set(stopwords.words("english"))

# Define the udf 
remove_stopwords_udf = udf(lambda text: " ".join([word for word in text.split() if word not in stop_words]), StringType())

# Use the UDF in order to remove stopwords and Create new column
text_df = text_df.withColumn("TEXT_C4", remove_stopwords_udf(col("TEXT_C3")))

# Show the dataframe
text_df.show(3, truncate=False)

[nltk_data] Downloading package stopwords to /home/hduser/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[Stage 8:>                                                          (0 + 1) / 1]

+-------------------------------------------------------------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------+----------------------------------------------------------------------------------------------------------------+-----------------------------------------------------------------------------------------------------------+----------------------------------------------------------------------------+
|TEXT                                                                                                               |TEXT_C1                                                                                                        |TEXT_C2                                                                                                         |TEXT_C3                                                                                                    |TEXT_C4               

                                                                                Traceback (most recent call last):
  File "/usr/local/spark/python/lib/pyspark.zip/pyspark/daemon.py", line 186, in manager
  File "/usr/local/spark/python/lib/pyspark.zip/pyspark/daemon.py", line 74, in worker
  File "/usr/local/spark/python/lib/pyspark.zip/pyspark/worker.py", line 643, in main
    if read_int(infile) == SpecialLengths.END_OF_STREAM:
  File "/usr/local/spark/python/lib/pyspark.zip/pyspark/serializers.py", line 564, in read_int
    raise EOFError
EOFError


#### IMPLEMENT LEMMATIZATION

In [10]:
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

# Download 'punkt','averaged_perceptron_tagger','wordnet' from nltk dictionary
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
nltk.download('wordnet')

# Defining the function that implements the Lemmatization operation as a UDF
def lemmatize_text(text):
    lemmatizer = WordNetLemmatizer()
    lemmatized_sentence = []
    words = word_tokenize(text)
    for w in words:
        lemma = lemmatizer.lemmatize(w)
        lemmatized_sentence.append(lemma)
    lemmatized_text = " ".join(lemmatized_sentence)
    return lemmatized_text

# Define the UDF
lemmatize_text_udf = udf(lemmatize_text, StringType())

# Use the UDF and Create new column
text_df = text_df.withColumn("TEXT_C5", lemmatize_text_udf(text_df["TEXT_C4"]))

# Show the dataframe
text_df.show(3, truncate=False)

[nltk_data] Downloading package punkt to /home/hduser/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /home/hduser/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package wordnet to /home/hduser/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[Stage 9:>                                                          (0 + 1) / 1]

+-------------------------------------------------------------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------+----------------------------------------------------------------------------------------------------------------+-----------------------------------------------------------------------------------------------------------+----------------------------------------------------------------------------+-----------------------------------------------------------------------------+
|TEXT                                                                                                               |TEXT_C1                                                                                                        |TEXT_C2                                                                                                         |TEXT_C3                                             

                                                                                

#### IMPLEMENT STEMMING

In [11]:
from nltk.stem import SnowballStemmer

# Defining the function that finds word roots as UDF 
def stem_text(text):
    snow = SnowballStemmer('english')
    stemmed_sentence = []
    words = word_tokenize(text)
    for w in words:
        stemmed_sentence.append(snow.stem(w))
    stemmed_text = " ".join(stemmed_sentence)
    return stemmed_text

# Define the UDF
stem_text_udf = udf(stem_text, StringType())

# Use the UDF and Create new column
text_df = text_df.withColumn("TEXT_C6", stem_text_udf(text_df["TEXT_C5"]))

# Show the dataframe
text_df.show(3, truncate=False)

[Stage 10:>                                                         (0 + 1) / 1]

+-------------------------------------------------------------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------+----------------------------------------------------------------------------------------------------------------+-----------------------------------------------------------------------------------------------------------+----------------------------------------------------------------------------+-----------------------------------------------------------------------------+-------------------------------------------------------------------------+
|TEXT                                                                                                               |TEXT_C1                                                                                                        |TEXT_C2                                                                                    

                                                                                

#### IMPLEMENT TOKENIZATION

In [12]:
from pyspark.sql.types import ArrayType, StringType
from nltk.tokenize import word_tokenize

# Function that splits text into tokens using NLTK
def tokenize_text(text):
    words = word_tokenize(text)
    return words

# Define the UDF
tokenize_text_udf = udf(tokenize_text, ArrayType(StringType()))

# Use the UDF and Create new column
text_df = text_df.withColumn("tokens", tokenize_text_udf(text_df["TEXT_C6"]))

# Show the selected dataframe
text_df.select("TEXT_C6", "tokens").show(3, truncate=False)

[Stage 11:>                                                         (0 + 1) / 1]

+-------------------------------------------------------------------------+---------------------------------------------------------------------------------------+
|TEXT_C6                                                                  |tokens                                                                                 |
+-------------------------------------------------------------------------+---------------------------------------------------------------------------------------+
|bummer shoulda got david carr third day                                  |[bummer, shoulda, got, david, carr, third, day]                                        |
|upset can not updat facebook text might cri result school today also blah|[upset, can, not, updat, facebook, text, might, cri, result, school, today, also, blah]|
|dive mani time ball manag save 50 rest go bound                          |[dive, mani, time, ball, manag, save, 50, rest, go, bound]                             |
+---------------

                                                                                

#### IMPLEMENT TOKENIZATION AND SPLIT WORDS TO ROWS

In [None]:
from pyspark.ml.feature import Tokenizer
from pyspark.sql.functions import explode, col

# Create The Tokenizer
tokenizer = Tokenizer(inputCol="TEXT_C6", outputCol="words")
tokenizer_df = tokenizer.transform(text_df)

# Separate words into individual lines
tokenizer_df = tokenizer_df.select(explode(col("words")).alias("word"))

# Show the dataframe
tokenizer_df.show(10, truncate=False)

#### COUNT THE TOKENIZER WORDS

In [None]:
# Count the tokenizer words
tokenizer_df_count = tokenizer_df.groupBy("word").count().orderBy("count", ascending=False)

# Show the dataframe
tokenizer_df_count.show(50)

#### SENTIMENT LABEL ( POSITIVE - NEGATIVE - NEUTRAL )

In [13]:
%%time

from textblob import TextBlob

# Sentimental Analysis Function
def get_sentiment(text):
    analysis = TextBlob(text)
    return analysis.sentiment.polarity

# Sentiment Label Function
def label_sentiment(score):
    if score > 0:
        return 'positive'
    elif score < 0:
        return 'negative'
    else:
        return 'neutral'

#=========================================================================================#

from pyspark.sql.types import FloatType, StringType

# Define the UDF with Functions
sentiment_udf = udf(get_sentiment, FloatType())
label_udf = udf(label_sentiment, StringType())

# Use the UDF and Create new columns
text_df = text_df.withColumn('sentiment_score', sentiment_udf(text_df['TEXT_C6']))
text_df = text_df.withColumn('sentiment_label', label_udf(text_df['sentiment_score']))

# Count and Show the 'sentiment_label' column
text_df.groupBy('sentiment_label').count().show()

                                                                                

+---------------+------+
|sentiment_label| count|
+---------------+------+
|       positive|582066|
|        neutral|782520|
|       negative|235414|
+---------------+------+

CPU times: user 112 ms, sys: 22.3 ms, total: 134 ms
Wall time: 23min 35s




#### The Positive, Neutral and Negative Rate

In [14]:
# Count the whole text 
total_count = text_df.count()
# Count the positive sentiment label
positive_count = text_df.filter(text_df.sentiment_label == "positive").count()
# Calculate the positive rate
positive_rate = (positive_count / total_count) * 100
# Print the positive rate
print(f"Positive rate: {positive_rate}%")



Positive rate: 36.379125%


                                                                                

In [None]:
# Count the whole text 
total_count = text_df.count()
# Count the neutral sentiment label
neutral_count = text_df.filter(text_df.sentiment_label == "neutral").count()
# Calculate the neutral rate
neutral_rate = (neutral_count / total_count) * 100
# Print the neutral rate
print(f"Neutral rate: {neutral_rate}%")

In [None]:
# Count the whole text 
total_count = text_df.count()
# Count the negative sentiment label
negative_count = text_df.filter(text_df.sentiment_label == "negative").count()
# Calculate the negative rate
negative_rate = (negative_count / total_count) * 100
# Print the negative rate
print(f"Negative rate: {negative_rate}%")

In [None]:
# Print the positive, neutral and negative rate
print(f"Positive rate: {positive_rate}%")
print(f"Neutral rate: {neutral_rate}%")
print(f"Negative rate: {negative_rate}%")

In [None]:
from pyspark.sql.functions import when, col

# Duygu etiketlerini yeni bir sütunda saklama
text_df = text_df.withColumn("sentiment_label_column", 
     when(text_df["sentiment_label"] == "positive", "positive")
    .when(text_df["sentiment_label"] == "neutral", "neutral")
    .when(text_df["sentiment_label"] == "negative", "negative")
    .otherwise("unknown")
)

# Show the selected columns
text_df.select("TEXT_C6", "sentiment_label_column").show(truncate=False)

In [15]:
%%time

# Function that measures polarity
def get_polarity(text):
    analysis = TextBlob(text)
    return analysis.sentiment.polarity 

# Define UDF
polarity_udf = udf(get_polarity, FloatType())

# Calculate polarity score for each text in column 'TEXT_C6' and add to a new column
text_df = text_df.withColumn("polarity_score", polarity_udf(text_df['TEXT_C6']))

# Show the new column
text_df.select("TEXT_C6", "polarity_score").show(truncate=False)

[Stage 26:>                                                         (0 + 1) / 1]

+-------------------------------------------------------------------------+--------------+
|TEXT_C6                                                                  |polarity_score|
+-------------------------------------------------------------------------+--------------+
|bummer shoulda got david carr third day                                  |0.0           |
|upset can not updat facebook text might cri result school today also blah|0.0           |
|dive mani time ball manag save 50 rest go bound                          |0.0           |
|whole bodi feel itchi like fire                                          |0.2           |
|behav mad can not see                                                    |-0.625        |
|whole crew                                                               |0.2           |
|need hug                                                                 |0.0           |
|hey long time see yes rain bit bit lol fine thank                        |0.3888889     |

                                                                                

In [19]:
# Show the new column
text_df = text_df.select('TEXT','TEXT_C6', 'sentiment_label', 'sentiment_score').show(5)

[Stage 30:>                                                         (0 + 1) / 1]

+--------------------+--------------------+---------------+---------------+
|                TEXT|             TEXT_C6|sentiment_label|sentiment_score|
+--------------------+--------------------+---------------+---------------+
|@switchfoot http:...|bummer shoulda go...|        neutral|            0.0|
|is upset that he ...|upset can not upd...|        neutral|            0.0|
|@Kenichan I dived...|dive mani time ba...|        neutral|            0.0|
|my whole body fee...|whole bodi feel i...|       positive|            0.2|
|@nationwideclass ...|behav mad can not...|       negative|         -0.625|
+--------------------+--------------------+---------------+---------------+
only showing top 5 rows



                                                                                

In [77]:
text_df.show(3)

AttributeError: 'NoneType' object has no attribute 'show'

In [31]:
df.show(3)

+---+--------------------+--------------------+
| ID|                DATE|                TEXT|
+---+--------------------+--------------------+
|  0|Mon Apr 06 22:19:...|@switchfoot http:...|
|  1|Mon Apr 06 22:19:...|is upset that he ...|
|  2|Mon Apr 06 22:19:...|@Kenichan I dived...|
+---+--------------------+--------------------+
only showing top 3 rows



# ===============================================
# TIMESTAMP PREPARATION FOR TIME SERIES ANALYSIS
# ================================================

In [73]:
# Select the DATE column
date_df = df.select("DATE", "TEXT")

# Show the DATE column
date_df.show(truncate=False)

[Stage 72:>                                                         (0 + 1) / 1]

+----------------------------+---------------------------------------------------------------------------------------------------------------------+
|DATE                        |TEXT                                                                                                                 |
+----------------------------+---------------------------------------------------------------------------------------------------------------------+
|Mon Apr 06 22:19:45 PDT 2009|@switchfoot http://twitpic.com/2y1zl - Awww, that's a bummer.  You shoulda got David Carr of Third Day to do it. ;D  |
|Mon Apr 06 22:19:49 PDT 2009|is upset that he can't update his Facebook by texting it... and might cry as a result  School today also. Blah!      |
|Mon Apr 06 22:19:53 PDT 2009|@Kenichan I dived many times for the ball. Managed to save 50%  The rest go out of bounds                            |
|Mon Apr 06 22:19:57 PDT 2009|my whole body feels itchy and like its on fire                              

                                                                                

In [74]:
# Show the schema of the dataframe
date_df.printSchema()

root
 |-- DATE: string (nullable = true)
 |-- TEXT: string (nullable = true)



In [75]:
from pyspark.sql.functions import col, to_timestamp

# Convert to 'yyyy-MM-dd HH:mm:ss' format
date_df = date_df.withColumn("TIMESTAMP", to_timestamp(col("DATE"), "EEE MMM dd HH:mm:ss zzz yyyy"))

# Show the selected columns
date_df.select("DATE", "TIMESTAMP").show(truncate=False)

[Stage 73:>                                                         (0 + 1) / 1]

+----------------------------+-------------------+
|DATE                        |TIMESTAMP          |
+----------------------------+-------------------+
|Mon Apr 06 22:19:45 PDT 2009|2009-04-07 06:19:45|
|Mon Apr 06 22:19:49 PDT 2009|2009-04-07 06:19:49|
|Mon Apr 06 22:19:53 PDT 2009|2009-04-07 06:19:53|
|Mon Apr 06 22:19:57 PDT 2009|2009-04-07 06:19:57|
|Mon Apr 06 22:19:57 PDT 2009|2009-04-07 06:19:57|
|Mon Apr 06 22:20:00 PDT 2009|2009-04-07 06:20:00|
|Mon Apr 06 22:20:03 PDT 2009|2009-04-07 06:20:03|
|Mon Apr 06 22:20:03 PDT 2009|2009-04-07 06:20:03|
|Mon Apr 06 22:20:05 PDT 2009|2009-04-07 06:20:05|
|Mon Apr 06 22:20:09 PDT 2009|2009-04-07 06:20:09|
|Mon Apr 06 22:20:16 PDT 2009|2009-04-07 06:20:16|
|Mon Apr 06 22:20:17 PDT 2009|2009-04-07 06:20:17|
|Mon Apr 06 22:20:19 PDT 2009|2009-04-07 06:20:19|
|Mon Apr 06 22:20:19 PDT 2009|2009-04-07 06:20:19|
|Mon Apr 06 22:20:20 PDT 2009|2009-04-07 06:20:20|
|Mon Apr 06 22:20:20 PDT 2009|2009-04-07 06:20:20|
|Mon Apr 06 22:20:22 PDT 2009|2

                                                                                

In [76]:
# COnvert to 'yyyy-MM-dd' format
date_df = date_df.withColumn("YearMonthDate", col("TIMESTAMP").substr(1, 10))

# Show the selected columns
date_df.select("TIMESTAMP", "YearMonthDate").show(truncate=False)

[Stage 74:>                                                         (0 + 1) / 1]

+-------------------+-------------+
|TIMESTAMP          |YearMonthDate|
+-------------------+-------------+
|2009-04-07 06:19:45|2009-04-07   |
|2009-04-07 06:19:49|2009-04-07   |
|2009-04-07 06:19:53|2009-04-07   |
|2009-04-07 06:19:57|2009-04-07   |
|2009-04-07 06:19:57|2009-04-07   |
|2009-04-07 06:20:00|2009-04-07   |
|2009-04-07 06:20:03|2009-04-07   |
|2009-04-07 06:20:03|2009-04-07   |
|2009-04-07 06:20:05|2009-04-07   |
|2009-04-07 06:20:09|2009-04-07   |
|2009-04-07 06:20:16|2009-04-07   |
|2009-04-07 06:20:17|2009-04-07   |
|2009-04-07 06:20:19|2009-04-07   |
|2009-04-07 06:20:19|2009-04-07   |
|2009-04-07 06:20:20|2009-04-07   |
|2009-04-07 06:20:20|2009-04-07   |
|2009-04-07 06:20:22|2009-04-07   |
|2009-04-07 06:20:25|2009-04-07   |
|2009-04-07 06:20:31|2009-04-07   |
|2009-04-07 06:20:34|2009-04-07   |
+-------------------+-------------+
only showing top 20 rows



                                                                                

# B U N A     -     B I      -    B A K
### EKSIK GUNLERI NASIL BULACAGIM

In [36]:
# Sort ascending via TIMESTAMP column
date_df = date_df.orderBy("TIMESTAMP", ascending=True)

# Show sorted DataFrame
date_df.show(truncate=False)



+----------------------------+---------------------------------------------------------------------------------------------------------------------+-------------------+
|DATE                        |TEXT                                                                                                                 |TIMESTAMP          |
+----------------------------+---------------------------------------------------------------------------------------------------------------------+-------------------+
|Mon Apr 06 22:19:45 PDT 2009|@switchfoot http://twitpic.com/2y1zl - Awww, that's a bummer.  You shoulda got David Carr of Third Day to do it. ;D  |2009-04-07 06:19:45|
|Mon Apr 06 22:19:49 PDT 2009|is upset that he can't update his Facebook by texting it... and might cry as a result  School today also. Blah!      |2009-04-07 06:19:49|
|Mon Apr 06 22:19:53 PDT 2009|@Kenichan I dived many times for the ball. Managed to save 50%  The rest go out of bounds                            |2009-04

                                                                                

In [None]:
# Select THE NEWEST DATE
newest_date = date_df.select("TIMESTAMP").first()[0]

# Print THE NEWEST DATE
print("THE NEWEST DATE:", newest_date)

In [None]:
# Select THE OLDEST DATE
oldest_date = date_df.select("TIMESTAMP").collect()[-1][0]

# Print THE OLDEST DATE
print("THE OLDEST DATE:", oldest_date)

# ================================
# MYSQL CONNECTION AND YCSB TEST
# ================================

In [39]:
from pyspark.sql import SparkSession

# Create Spark Session for MySQL
spark = SparkSession.builder \
    .appName("DFToMySQL") \
    .getOrCreate()

# Configure MySQL Database Connection
mysql_options = {
    "url": "jdbc:mysql://localhost:3306/sample",  # MySQL bağlantı URL'si
    "driver": "com.mysql.cj.jdbc.Driver",  # MySQL JDBC sürücüsü
    "dbtable": "yourtable",  # Hedef MySQL tablo adı
    "user": "root",  # MySQL kullanıcı adı
    "password": "password"  # MySQL parola
}

# DataFrame'i MySQL veritabanına yükleyin
df.write.format("jdbc").options(**mysql_options).mode("overwrite").save()

                                                                                

In [43]:
df = spark.read.jdbc(url="jdbc:mysql://localhost:3306/sample", table="yourtable", properties=mysql_options)

In [44]:
df.show()

[Stage 42:>                                                         (0 + 1) / 1]

+---+--------------------+--------------------+
| ID|                DATE|                TEXT|
+---+--------------------+--------------------+
|  0|Mon Apr 06 22:19:...|@switchfoot http:...|
|  1|Mon Apr 06 22:19:...|is upset that he ...|
|  2|Mon Apr 06 22:19:...|@Kenichan I dived...|
|  3|Mon Apr 06 22:19:...|my whole body fee...|
|  4|Mon Apr 06 22:19:...|@nationwideclass ...|
|  5|Mon Apr 06 22:20:...|@Kwesidei not the...|
|  6|Mon Apr 06 22:20:...|         Need a hug |
|  7|Mon Apr 06 22:20:...|@LOLTrish hey  lo...|
|  8|Mon Apr 06 22:20:...|@Tatiana_K nope t...|
|  9|Mon Apr 06 22:20:...|@twittera que me ...|
| 10|Mon Apr 06 22:20:...|spring break in p...|
| 11|Mon Apr 06 22:20:...|I just re-pierced...|
| 12|Mon Apr 06 22:20:...|@caregiving I cou...|
| 13|Mon Apr 06 22:20:...|@octolinz16 It it...|
| 14|Mon Apr 06 22:20:...|@smarrison i woul...|
| 15|Mon Apr 06 22:20:...|@iamjazzyfizzle I...|
| 16|Mon Apr 06 22:20:...|Hollis' death sce...|
| 17|Mon Apr 06 22:20:...|about to file 

                                                                                

In [53]:
# YCSB TEST FOR MYSQL
!/home/hduser/ycsb-0.17.0/bin/ycsb.sh run jdbc -P /home/hduser/ycsb-0.17.0/workloads/workloada -p db.url=jdbc:mysql://localhost:3306/tweet_mysql -p db.user=root -p db.passwd=kalem -p db.driver=com.mysql.cj.jdbc.Driver

/usr/bin/java  -classpath /home/hduser/ycsb-0.17.0/conf:/home/hduser/ycsb-0.17.0/lib/HdrHistogram-2.1.4.jar:/home/hduser/ycsb-0.17.0/lib/core-0.17.0.jar:/home/hduser/ycsb-0.17.0/lib/htrace-core4-4.1.0-incubating.jar:/home/hduser/ycsb-0.17.0/lib/jackson-core-asl-1.9.4.jar:/home/hduser/ycsb-0.17.0/lib/jackson-mapper-asl-1.9.4.jar:/home/hduser/ycsb-0.17.0/jdbc-binding/conf:/home/hduser/ycsb-0.17.0/jdbc-binding/lib/commons-collections-3.2.1.jar:/home/hduser/ycsb-0.17.0/jdbc-binding/lib/commons-lang-2.4.jar:/home/hduser/ycsb-0.17.0/jdbc-binding/lib/commons-pool-1.5.4.jar:/home/hduser/ycsb-0.17.0/jdbc-binding/lib/geronimo-jms_1.1_spec-1.1.1.jar:/home/hduser/ycsb-0.17.0/jdbc-binding/lib/geronimo-jta_1.1_spec-1.1.1.jar:/home/hduser/ycsb-0.17.0/jdbc-binding/lib/jdbc-binding-0.17.0.jar:/home/hduser/ycsb-0.17.0/jdbc-binding/lib/mysql-connector-java-8.0.30.jar:/home/hduser/ycsb-0.17.0/jdbc-binding/lib/openjpa-jdbc-2.1.1.jar:/home/hduser/ycsb-0.17.0/jdbc-binding/lib/openjpa-kernel-2.1.1.jar:/home/h

# ===================================
# MONGODB CONNECTION AND YCSB TEST
# ===================================

In [None]:
# YCSB TEST FOR MONGODB
!/home/hduser/ycsb-0.17.0/bin/ycsb.sh run mongodb -P /home/hduser/ycsb-0.17.0/workloads/workloada -p mongodb.url=mongodb://localhost:27017 -p mongodb.database=tweet_mongo 

# ==========================
# DEEP LEARNING - RNN MODEL
# ==========================

# ======
## GRAPHS
# ======

In [None]:
import matplotlib.pyplot as plt
from pyspark.sql.functions import date_format

# 1-Week Analysis
weekly_data = date_df.groupBy(date_format("TIMESTAMP", "yyyy-ww")).count()
weekly_data = weekly_data.withColumnRenamed("date_format(TIMESTAMP, yyyy-ww)", "week")
weekly_data = weekly_data.orderBy("week", ascending=True)
weekly_data.show()

# Get the result and visualize it
weekly_data_pd = weekly_data.toPandas()

# Plot for 1-Week Time Series
plt.figure(figsize=(12, 6))
plt.bar(weekly_data_pd["week"], weekly_data_pd["count"], width=0.5)
plt.title("Weekly Tweet Count")
plt.xlabel("Week")
plt.ylabel("Tweet Count")
plt.xticks(rotation=45)
plt.show()

In [None]:
import matplotlib.pyplot as plt
from pyspark.sql.functions import date_format

# 1-Month Analysis
monthly_data = date_df.groupBy(date_format("TIMESTAMP", "yyyy-MM")).count()
monthly_data = monthly_data.withColumnRenamed("date_format(TIMESTAMP, yyyy-MM)", "month")
monthly_data.show()

# Get the results and visualize it
monthly_data_pd = monthly_data.toPandas()

# Plot for 1-Month Time Series
plt.figure(figsize=(12, 6))
plt.bar(monthly_data_pd["month"], monthly_data_pd["count"], width=0.5)
plt.title("Monthly Tweet Count")
plt.xlabel("Month")
plt.ylabel("Tweet Count")
plt.xticks(rotation=45)
plt.show()

In [None]:
import matplotlib.pyplot as plt
from pyspark.sql.functions import date_format

# 3-Month Analysis
quarterly_data = date_df.groupBy(date_format("TIMESTAMP", "yyyy-MM")).count()
quarterly_data = quarterly_data.withColumnRenamed("date_format(TIMESTAMP, yyyy-MM)", "quarter")
quarterly_data.show()

# Get the results and visualize it
quarterly_data_pd = quarterly_data.toPandas()

# Plot for 3-Month Time Series
plt.figure(figsize=(12, 6))
plt.bar(quarterly_data_pd["quarter"], quarterly_data_pd["count"], width=0.5)
plt.title("3-Month Tweet Count")
plt.xlabel("Quarter")
plt.ylabel("Tweet Count")
plt.xticks(rotation=45)
plt.show()

In [None]:
# Boş değerleri kontrol etmek için isNull() kullanın
nan_value = date_df.filter(F.col("TIMESTAMP").isNull())

# Hangi satırlarda boş değerler olduğunu gösterin
print("Boş değerlerin olduğu satırlar:")
nan_value.show()

# Toplam boş değer sayısını alın
nan_value_count = nan_value.count()
print("Toplam boş değer sayısı:", nan_value_count)