In [None]:
# Display the SparkContext handle; assumes the kernel pre-defines `sc` — TODO confirm
sc

# ===================================
# MONGODB CONNECTION AND YCSB TEST
# ===================================

In [None]:
# Shell out to print the version of the installed `mongo` client
!mongo --version

In [None]:
%%time
from pyspark.sql import SparkSession

# Get (or create) the Spark session, labelled "MongoDB"
spark = (
    SparkSession.builder
    .appName("MongoDB")
    .getOrCreate()
)

# Load the header-less tweets CSV and let Spark infer the column types
mongodb_df = (
    spark.read
    .options(header=False, inferSchema=True)
    .csv("/ProjectTweets.csv")
)

In [None]:
# INITIALIZE MONGODB IN A TERMINAL
# !mongod

In [None]:
%%time
# Write the tweets DataFrame to MongoDB.
# The "mongodb" data source (MongoDB Spark Connector 10.x) takes the server
# address from the "connection.uri" option; the previous "uri" key is the
# older connector's spelling. NOTE(review): confirm the installed connector
# version.
# mode("append") adds to the collection, so re-running this cell inserts the
# documents again.
mongodb_df.write.format("mongodb") \
                .option("connection.uri", "mongodb://127.0.0.1:27017/") \
                .option("database", "TweetDatabaseMongoDB") \
                .option("collection", "TweetCollectionMongoDB") \
                .mode("append").save()

In [None]:
# Preview the DataFrame that was written to MongoDB.
# Note: `mongodb_df` is the frame loaded from the CSV file; it is not read back from MongoDB here.
mongodb_df.show()

In [None]:
%%time
# YCSB TEST FOR MONGODB
# Invokes the YCSB "run" phase with the workloada property file against the
# MongoDB instance at localhost:27017, database TweetDatabaseMongoDB.
!/home/hduser/ycsb-0.17.0/bin/ycsb.sh run mongodb -P /home/hduser/ycsb-0.17.0/workloads/workloada -p mongodb.url=mongodb://localhost:27017 -p mongodb.database=TweetDatabaseMongoDB 

# ================================
# MYSQL CONNECTION AND YCSB TEST
# ================================

In [None]:
# Shell out to print the version of the installed `mysql` client
!mysql --version

In [None]:
%%time
from pyspark.sql import SparkSession

# Get (or create) the Spark session, labelled "MySQL"
spark = (
    SparkSession.builder
    .appName("MySQL")
    .getOrCreate()
)

# Load the header-less tweets CSV and let Spark infer the column types
mysql_df = (
    spark.read
    .options(header=False, inferSchema=True)
    .csv("/ProjectTweets.csv")
)

In [None]:
# INITIALIZE MYSQL IN A TERMINAL
# !mysql -u root -p

In [None]:
%%time
import os

# Configure MySQL Database Connection.
# The user name and password are taken from the MYSQL_USER / MYSQL_PASSWORD
# environment variables so real credentials do not have to be stored in the
# notebook; when they are unset the original local-development values are used.
MySQL_Configuration = {
    "url": "jdbc:mysql://localhost:3306/sample",
    "driver": "com.mysql.cj.jdbc.Driver",
    "dbtable": "yourtable",
    "user": os.environ.get("MYSQL_USER", "root"),
    "password": os.environ.get("MYSQL_PASSWORD", "password"),
}

# Load DataFrame into MySQL database (mode "overwrite" is requested for the target table)
mysql_df.write.format("jdbc").options(**MySQL_Configuration).mode("overwrite").save()

In [None]:
%%time
# Read the table back from MySQL, taking the JDBC URL and table name from the
# connection configuration instead of repeating the literals
mysql_df = spark.read.jdbc(
    url=MySQL_Configuration["url"],
    table=MySQL_Configuration["dbtable"],
    properties=MySQL_Configuration,
)

In [None]:
%%time
# Show DataFrame via MySQL
# (`mysql_df` was reassigned by the JDBC read above, so this preview comes from the database table)
mysql_df.show()

In [None]:
%%time
# YCSB TEST FOR MYSQL
# Invokes the YCSB "run" phase with the workloada property file through the
# JDBC binding against the `sample` database on localhost:3306.
# NOTE(review): the database password is passed in clear text on the command line.
!/home/hduser/ycsb-0.17.0/bin/ycsb.sh run jdbc -P /home/hduser/ycsb-0.17.0/workloads/workloada -p db.url=jdbc:mysql://localhost:3306/sample -p db.user=root -p db.passwd=password -p db.driver=com.mysql.cj.jdbc.Driver

# ================================
# READ TO CSV FROM HDFS VIA SPARK
# ================================

In [None]:
%%time
from pyspark.sql import SparkSession

# Get (or create) the Spark session, labelled "HDFSToCSV"
spark = (
    SparkSession.builder
    .appName("HDFSToCSV")
    .getOrCreate()
)

# Load the header-less tweets CSV and let Spark infer the column types
df = (
    spark.read
    .options(header=False, inferSchema=True)
    .csv("/ProjectTweets.csv")
)

In [None]:
# Show the first 5 rows of the DataFrame (columns still carry the default _c0.._c5 names here)
df.show(5)

In [None]:
# Print the column names and the types inferred while reading the CSV
df.printSchema()

In [None]:
# Give the positional CSV columns meaningful names.
# (A chain of withColumnRenamed calls would be an alternative way to do this.)
column_aliases = {
    "_c0": "ID",
    "_c1": "STAMP",
    "_c2": "DATE",
    "_c3": "FLAG",
    "_c4": "USER",
    "_c5": "TEXT",
}
df = df.selectExpr(*[f"{source} as {target}" for source, target in column_aliases.items()])
df.show(5)

In [None]:
# Count the rows of the DataFrame (triggers a Spark action)
row_count = df.count()
# Print row_count
print("DataFrame has {} rows.".format(row_count))

In [None]:
from pyspark.sql.functions import col

# Expected column names after the rename step.
# NOTE(review): this list is not referenced below — the loop iterates `Columns` instead.
columns = ["ID", "STAMP", "DATE", "FLAG", "USER", "TEXT"]

# Column names taken from the DataFrame itself
Columns = df.columns

# For each column, count its distinct values (one distinct + count job per column)
for column in Columns:
    unique_values = df.select(column).distinct()
    unique_count = unique_values.count()
    
    if unique_count > 0:
        print(f"{column} has {unique_count} unique values:")
    else:
        print(f"{column} has no unique value.")

In [None]:
from pyspark.sql.functions import col

# Expected column names after the rename step.
# NOTE(review): this list is not referenced below — the loop iterates `Columns` instead.
columns = ["ID", "STAMP", "DATE", "FLAG", "USER", "TEXT"]

# Column names taken from the DataFrame itself
Columns = df.columns

# For each column, count how many distinct values occur more than once
for column in Columns:
    # Occurrences per value of this column
    count_df = df.groupBy(column).count()
    # Number of values whose occurrence count exceeds 1
    duplicate_values = count_df.filter(col("count") > 1).count()
    
    if duplicate_values > 0:
        print(f"{column} has {duplicate_values} duplicate values.")
    else:
        print(f"{column} has no duplicate value.")

In [None]:
# Drop the columns that are not needed for the analysis.
# Every column name must be its own string argument. The previous
# `"ID, ""STAMP"` was implicit string concatenation yielding the single name
# "ID, STAMP", which matches no column, so ID and STAMP were silently kept
# (DataFrame.drop ignores names that are not in the schema).
df = df.drop("ID", "STAMP", "FLAG", "USER")
df.show(5)

In [None]:
# Display the statistics table produced by DataFrame.describe() for the remaining columns
df.describe().show()

In [None]:
# Display the statistics table produced by DataFrame.summary() for the remaining columns
df.summary().show()

In [None]:
from pyspark.sql.functions import col

# Do a grouping and counting operation to find duplicate values in the "TEXT" column
count_df = df.groupBy("TEXT").count()

# Keep only the TEXT values that occur more than once
duplicate_values = count_df.filter(col("count") > 1)

# If there are duplicate values, show them
# (count() and show() each trigger their own Spark job on `duplicate_values`)
if duplicate_values.count() > 0:
    print("Duplicate values:")
    duplicate_values.show(truncate=False)  # Display column values in full length
else:
    print("No duplicate values found.")

In [None]:
# Re-count the rows of the DataFrame (triggers a Spark action)
row_count = df.count()
# Print row_count
print("DataFrame has {} rows.".format(row_count))

In [None]:
# Print the schema again to check which columns remain after the drop step
df.printSchema()

# =====================
# TEXT PRE-PROCESSING
# =====================

In [None]:
from pyspark.sql.functions import col, udf, lower, regexp_replace
from pyspark.sql.types import ArrayType, StringType, FloatType

import string
import nltk
import re
import contractions
import torch

from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer, SnowballStemmer 

from textblob import TextBlob
from nltk.sentiment import SentimentIntensityAnalyzer
from nltk import word_tokenize

In [None]:
# Preview three raw tweet texts at full width before any cleaning is applied
df.select("TEXT").show(3, truncate=False)

#### TEXT CLEANING

In [None]:
import re

# Cleaning Function
def clean_text(text):
    """Normalise a raw tweet string for later text processing.

    Steps, in order: trim surrounding whitespace, remove URLs, remove
    @mentions and '#' signs, remove digits, collapse runs of whitespace into
    a single space, remove a fixed set of special characters, lower-case.

    Args:
        text: The raw tweet text, or None for a missing value.

    Returns:
        The cleaned, lower-cased text. A None input yields an empty string.
    """
    # A null cell reaches a Python UDF as None; without this guard
    # `None.strip()` would raise and abort the whole Spark job.
    if text is None:
        return ""
    text = text.strip()
    # URLs (http..., https..., www...)
    text = re.sub(r"http\S+|www\S+|https\S+", '', text, flags=re.MULTILINE)
    # @mentions are removed entirely; for hashtags only the '#' sign is removed
    text = re.sub(r'\@\w+|\#','', text)
    # Digits
    text = re.sub(r'\d+', '', text)
    # Two or more consecutive whitespace characters become one space
    text = re.sub(r'\s\s+', ' ', text)
    # Remaining special characters
    text = re.sub(r'[@_!#$%^&*()<>?/\|}{~:]', '', text)
    text = text.lower()
    return text

# Wrap the cleaning function as a Spark UDF that returns a string
clean_text_udf = udf(clean_text, StringType())

# Create the cleaned column TEXT_C1.
# The source column is referenced by its actual name "TEXT" (the lowercase
# "text" used before only resolves while Spark's column matching is
# case-insensitive).
df = df.withColumn("TEXT_C1", clean_text_udf(col("TEXT")))
df.select("TEXT", "TEXT_C1").show(3, truncate=False)

#### EXPAND CONTRACTIONS

In [None]:
import contractions

# Function to expand contractions
def expand_contractions(text):
    """Return `text` after passing it through `contractions.fix`."""
    return contractions.fix(text)

# Wrap the contraction-expanding function as a Spark UDF that returns a string
expand_contractions_udf = udf(expand_contractions, StringType())

# Apply the UDF to TEXT_C1 and store the result in the new column TEXT_C2
df = df.withColumn("TEXT_C2", expand_contractions_udf(col("TEXT_C1")))

# Show the original text next to both processing stages
df.select("TEXT", "TEXT_C1", "TEXT_C2").show(3, truncate=False)

#### CLEAN THE PUNCTUATION CHARACTERS

In [None]:
# Define punctuation characters.
# regexp_replace evaluates the pattern with Java regular expressions, where
# the POSIX class \p{Punct} is exactly  !"#$%&'()*+,-./:;<=>?@[\]^_`{|}~ .
# The previous hand-written class contained an unescaped '[', which Java
# parses as the start of a nested character class, so '[' and ']' themselves
# were never removed.
punctuation_characters = r'\p{Punct}'

# Remove punctuation characters
df = df.withColumn("TEXT_C3", regexp_replace(col("TEXT_C2"), punctuation_characters, ""))

# Show the dataframe
df.select("TEXT", "TEXT_C1", "TEXT_C2", "TEXT_C3").show(3, truncate=False)

#### CLEAN THE STOPWORDS

In [None]:
# Download the "stopwords" corpus from NLTK
nltk.download("stopwords")

# Build the set of English stop words (a set gives fast membership tests)
stop_words = set(stopwords.words("english"))

# UDF: split the text on whitespace, keep the words that are not in
# `stop_words`, and join them back together with single spaces
remove_stopwords_udf = udf(lambda text: " ".join([word for word in text.split() if word not in stop_words]), StringType())

# Apply the UDF to TEXT_C3 and store the result in the new column TEXT_C4
df = df.withColumn("TEXT_C4", remove_stopwords_udf(col("TEXT_C3")))

# Show the original text next to every processing stage so far
df.select("TEXT", "TEXT_C1", "TEXT_C2", "TEXT_C3", "TEXT_C4").show(3, truncate=False)

#### IMPLEMENT LEMMATIZATION

In [None]:
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

# Download the 'punkt', 'averaged_perceptron_tagger' and 'wordnet' NLTK resources.
# NOTE(review): lemmatize_text below calls only word_tokenize and
# WordNetLemmatizer; no tagger is invoked explicitly in this notebook.
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
nltk.download('wordnet')

# Defining the function that implements the Lemmatization operation as a UDF
def lemmatize_text(text):
    """Tokenize `text`, lemmatize each token and re-join them with spaces.

    Each token is passed to `WordNetLemmatizer.lemmatize` with its default
    arguments.
    """
    wordnet = WordNetLemmatizer()
    return " ".join(wordnet.lemmatize(token) for token in word_tokenize(text))

# Wrap the lemmatization function as a Spark UDF that returns a string
lemmatize_text_udf = udf(lemmatize_text, StringType())

# Apply the UDF to TEXT_C4 and store the result in the new column TEXT_C5
df = df.withColumn("TEXT_C5", lemmatize_text_udf(df["TEXT_C4"]))

# Show the original text next to every processing stage so far
df.select("TEXT", "TEXT_C1", "TEXT_C2", "TEXT_C3", "TEXT_C4", "TEXT_C5").show(3, truncate=False)

#### IMPLEMENT STEMMING

In [None]:
from nltk.stem import SnowballStemmer

# Defining the function that finds word roots as UDF 
def stem_text(text):
    """Tokenize `text`, stem each token with the English Snowball stemmer
    and re-join the stems with single spaces."""
    stemmer = SnowballStemmer('english')
    return " ".join(stemmer.stem(token) for token in word_tokenize(text))

# Wrap the stemming function as a Spark UDF that returns a string
stem_text_udf = udf(stem_text, StringType())

# Apply the UDF to TEXT_C5 and store the result in the new column TEXT_C6
df = df.withColumn("TEXT_C6", stem_text_udf(df["TEXT_C5"]))

# Show the original text next to every processing stage
df.select("TEXT", "TEXT_C1", "TEXT_C2", "TEXT_C3", "TEXT_C4", "TEXT_C5", "TEXT_C6").show(3, truncate=False)

#### IMPLEMENT TOKENIZATION

In [None]:
from pyspark.sql.types import ArrayType, StringType
from nltk.tokenize import word_tokenize

# Function that splits text into tokens using NLTK
def tokenize_text(text):
    """Return the list of tokens NLTK's `word_tokenize` produces for `text`."""
    return word_tokenize(text)

# Wrap the tokenizer as a Spark UDF that returns an array of strings
tokenize_text_udf = udf(tokenize_text, ArrayType(StringType()))

# Apply the UDF to the fully processed text (TEXT_C6) and store the result in TOKENS
df = df.withColumn("TOKENS", tokenize_text_udf(df["TEXT_C6"]))

# Show the original text, the processed text and its tokens
df.select("TEXT", "TEXT_C6", "TOKENS").show(3, truncate=False)

#### DROP THE REDUNDANT COLUMNS

In [None]:
# Preview the DataFrame with all intermediate TEXT_C* columns still present
df.show(5)

In [None]:
# Drop the intermediate processing columns; TEXT_C6 (the final processed text) is kept
df = df.drop("TEXT_C1", "TEXT_C2", "TEXT_C3", "TEXT_C4", "TEXT_C5")
df.show(5)

#### IMPLEMENT TOKENIZATION AND SPLIT WORDS TO ROWS

In [None]:
from pyspark.ml.feature import Tokenizer
from pyspark.sql.functions import explode, col

# Create a Spark ML Tokenizer that reads TEXT_C6 and writes its tokens to a "words" column
tokenizer = Tokenizer(inputCol="TEXT_C6", outputCol="words")
tokenizer_df = tokenizer.transform(df)

# Explode the "words" arrays so that every word becomes its own row in column "word"
tokenizer_df = tokenizer_df.select(explode(col("words")).alias("word"))

# Show the first 50 words
tokenizer_df.show(50, truncate=False)

#### COUNT THE TOKENIZER WORDS

In [None]:
%%time
# Count the occurrences of every word and sort by frequency, most frequent first
tokenizer_df_count = tokenizer_df.groupBy("word").count().orderBy("count", ascending=False)

# Show the 50 most frequent words
tokenizer_df_count.show(50)

## ==============
## Text Blob Method
## ==============
#### SENTIMENT SCORES AND SENTIMENT LABEL ( POSITIVE - NEGATIVE - NEUTRAL )

In [None]:
%%time

from textblob import TextBlob

# Sentimental Analysis Function
def get_sentiment(text):
    """Return the TextBlob sentiment polarity of `text`."""
    return TextBlob(text).sentiment.polarity

# Sentiment Label Function
def label_sentiment(score):
    """Map a sentiment score to a label.

    Returns 'positive' for scores above zero, 'negative' for scores below
    zero and 'neutral' for everything else.
    """
    if score > 0:
        return 'positive'
    return 'negative' if score < 0 else 'neutral'

#=========================================================================================#

from pyspark.sql.types import FloatType, StringType

# Wrap the scoring function (float result) and the labelling function (string result) as Spark UDFs
sentiment_udf = udf(get_sentiment, FloatType())
label_udf = udf(label_sentiment, StringType())

# Score the processed text (TEXT_C6), then derive a label from each score
df = df.withColumn('SentimentScores', sentiment_udf(df['TEXT_C6']))
df = df.withColumn('SentimentLabels', label_udf(df['SentimentScores']))

# Count the rows per label in the 'SentimentLabels' column and show the result
df.groupBy('SentimentLabels').count().show()

## ==================
## Vader Lexicon Method
## ==================
#### SENTIMENT SCORES

In [None]:
%%time

from nltk import word_tokenize
from nltk.sentiment import SentimentIntensityAnalyzer

# Download the NLTK resources named 'vader_lexicon' and 'punkt'
nltk.download('vader_lexicon')
nltk.download('punkt')

# Define a function for sentiment analysis that filters sentiment words
def filter_sentiment_words(text):
    """Return the tokens of `text` that carry sentiment.

    The text is split with `word_tokenize`; a token is kept when the VADER
    compound score computed for that token alone is not zero.
    """
    analyzer = SentimentIntensityAnalyzer()
    sentiment_tokens = []
    for token in word_tokenize(text):
        if analyzer.polarity_scores(token)['compound'] != 0:
            sentiment_tokens.append(token)
    return sentiment_tokens

# Wrap the word filter as a Spark UDF that returns an array of strings
filter_sentiment_udf = udf(filter_sentiment_words, ArrayType(StringType()))

# UDF returning the VADER compound score of a whole text.
# NOTE(review): a new SentimentIntensityAnalyzer is constructed on every call.
sentiment_score_udf = udf(lambda text: SentimentIntensityAnalyzer().polarity_scores(text)['compound'], FloatType())

# Apply both UDFs to the processed text column TEXT_C6
df = df.withColumn("SentimentWords", filter_sentiment_udf(col("TEXT_C6")))
df = df.withColumn("NewSentimentScores", sentiment_score_udf(col("TEXT_C6")))

# Show the date, the sentiment-bearing words and the VADER score (no column is renamed here)
df.select("DATE", "SentimentWords", "NewSentimentScores").show(truncate=False)

In [None]:
%%time

# Sentiment Label Function
def new_label_sentiment(score):
    """Translate a sentiment score into 'positive', 'negative' or 'neutral'.

    Scores above zero are positive and scores below zero are negative; any
    other score is neutral.
    """
    label = 'neutral'
    if score > 0:
        label = 'positive'
    elif score < 0:
        label = 'negative'
    return label

# Wrap the labelling function as a Spark UDF that returns a string
new_label_udf = udf(new_label_sentiment, StringType())

# Derive a label from each VADER score and store it in 'NewSentimentLabels'
df = df.withColumn('NewSentimentLabels', new_label_udf(df['NewSentimentScores']))

# Count the rows per label in the 'NewSentimentLabels' column and show the result
df.groupBy('NewSentimentLabels').count().show()

## Sentiment Score Comparison of TextBlob and Vader Methods 

In [None]:
# Show the original text beside the TextBlob score (SentimentScores) and the VADER score (NewSentimentScores)
df.select("TEXT", "SentimentScores", "NewSentimentScores").show(50, truncate=False)

#### DROP THE REDUNDANT COLUMNS

In [None]:
# Preview a single row to check the current set of columns
df.show(1)

In [None]:
# Drop the intermediate processing columns.
# NOTE(review): the same TEXT_C1..TEXT_C5 columns are already dropped by an
# earlier cell of this notebook, so this call appears to be redundant.
df = df.drop("TEXT_C1", "TEXT_C2", "TEXT_C3", "TEXT_C4", "TEXT_C5")
df.show(5)

#### POSITIVE AND NEGATIVE WORDS

In [None]:
import matplotlib.pyplot as plt
from wordcloud import WordCloud

# `df` is a Spark DataFrame, so `df[...].TEXT_C6` is a Column expression that
# cannot be iterated by str.join. Filter in Spark and collect the processed
# texts of the positive tweets to the driver instead.
# Note: collect() brings every matching text into driver memory.
positive_texts = [
    row["TEXT_C6"]
    for row in df.filter(df.NewSentimentLabels == 'positive').select("TEXT_C6").collect()
    if row["TEXT_C6"]  # skip null / empty texts
]

plt.figure(figsize = (20,20))
wc = WordCloud(max_words = 2000 , width = 1600 , height = 800).generate(" ".join(positive_texts))
plt.imshow(wc , interpolation = 'bilinear')
plt.axis("off")
plt.show()

In [None]:
import matplotlib.pyplot as plt

# `df` is a Spark DataFrame, so `df[...].TEXT_C6` is a Column expression that
# cannot be iterated by str.join. Filter in Spark and collect the processed
# texts of the negative tweets to the driver instead.
# Note: collect() brings every matching text into driver memory.
negative_texts = [
    row["TEXT_C6"]
    for row in df.filter(df.NewSentimentLabels == 'negative').select("TEXT_C6").collect()
    if row["TEXT_C6"]  # skip null / empty texts
]

plt.figure(figsize = (20,20))
wc = WordCloud(max_words = 2000 , width = 1600 , height = 800).generate(" ".join(negative_texts))
plt.imshow(wc , interpolation = 'bilinear')
plt.axis("off")
plt.show()

#### The Positive, Neutral and Negative Rate

In [None]:
%%time
# Count all rows of the DataFrame
total_count = df.count()
# Count the rows labelled "positive" by the VADER-based labels
positive_count = df.filter(df.NewSentimentLabels == "positive").count()
# Share of positive rows, as a percentage of all rows
positive_rate = (positive_count / total_count) * 100
# Print the positive count and rate
print(f"Positive count: {positive_count}")
print(f"Positive rate : {positive_rate}%")

In [None]:
%%time
# Count all rows of the DataFrame
total_count = df.count()
# Count the rows labelled "neutral" by the VADER-based labels
neutral_count = df.filter(df.NewSentimentLabels == "neutral").count()
# Share of neutral rows, as a percentage of all rows
neutral_rate = (neutral_count / total_count) * 100
# Print the neutral count and rate
print(f"Neutral count: {neutral_count}")
print(f"Neutral rate : {neutral_rate}%")

In [None]:
%%time
# Count all rows of the DataFrame
total_count = df.count()
# Count the rows labelled "negative" by the VADER-based labels
negative_count = df.filter(df.NewSentimentLabels == "negative").count()
# Share of negative rows, as a percentage of all rows
negative_rate = (negative_count / total_count) * 100
# Print the negative count and rate
print(f"Negative count: {negative_count}")
print(f"Negative rate : {negative_rate}%")

In [None]:
# Preview a single row to check the current set of columns
df.show(1)

# ===============================================
# TIMESTAMP PREPARATION FOR TIME SERIES ANALYSIS
# ================================================

In [None]:
# Show the schema of the dataframe before the timestamp columns are added
df.printSchema()

In [None]:
# Show the raw DATE strings beside the tweet text, at full width
df.select("DATE", "TEXT").show(truncate=False)

In [None]:
# Switch Spark's time parser policy to LEGACY
# (presumably needed for the "EEE ... zzz yyyy" pattern below — TODO confirm)
spark.conf.set("spark.sql.legacy.timeParserPolicy", "LEGACY")

from pyspark.sql.functions import col, to_timestamp

# Parse the DATE strings with the pattern "EEE MMM dd HH:mm:ss zzz yyyy" into a new TIMESTAMP column
df = df.withColumn("TIMESTAMP", to_timestamp(col("DATE"), "EEE MMM dd HH:mm:ss zzz yyyy"))

# Show the raw and the parsed values side by side
df.select("DATE", "TIMESTAMP").show(truncate=False)

In [None]:
# Derive a 'yyyy-MM-dd' string by taking the first 10 characters of TIMESTAMP
df = df.withColumn("YearMonthDate", col("TIMESTAMP").substr(1, 10))

# Show the selected columns
df.select("TIMESTAMP", "YearMonthDate").show(truncate=False)

In [None]:
# Sort the DataFrame by the TIMESTAMP column, oldest first
df = df.orderBy("TIMESTAMP", ascending=True)

# Show the sorted DataFrame at full column width
df.show(truncate=False)

In [None]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col
from pyspark.sql.types import DateType
from datetime import datetime, timedelta

# Cast the YearMonthDate string column to DateType.
# `date_df` is created here from `df`; it previously referenced itself before
# being defined, which raises NameError on a fresh kernel.
date_df = df.withColumn("YearMonthDateTYPE", col("YearMonthDate").cast(DateType()))

# Find the oldest and the newest date with a single aggregation job
oldest_date, newest_date = date_df.selectExpr(
    "min(YearMonthDateTYPE)",
    "max(YearMonthDateTYPE)",
).first()

print("THE OLDEST DATE:", oldest_date)
print("THE NEWEST DATE:", newest_date)

# TODO: REVISIT THIS SECTION
### TODO: work out how to find the missing days in the date range

# ================================
# MYSQL CONNECTION AND YCSB TEST
# ================================

In [None]:
# YCSB TEST FOR MYSQL
# Invokes the YCSB "run" phase with the workloada property file through the
# JDBC binding against the `tweet_mysql` database on localhost:3306.
# NOTE(review): the database password is passed in clear text on the command line.
!/home/hduser/ycsb-0.17.0/bin/ycsb.sh run jdbc -P /home/hduser/ycsb-0.17.0/workloads/workloada -p db.url=jdbc:mysql://localhost:3306/tweet_mysql -p db.user=root -p db.passwd=kalem -p db.driver=com.mysql.cj.jdbc.Driver

# ===================================
# MONGODB CONNECTION AND YCSB TEST
# ===================================

In [None]:
# YCSB TEST FOR MONGODB
# Invokes the YCSB "run" phase with the workloada property file against the
# MongoDB instance at localhost:27017, database tweet_mongo.
!/home/hduser/ycsb-0.17.0/bin/ycsb.sh run mongodb -P /home/hduser/ycsb-0.17.0/workloads/workloada -p mongodb.url=mongodb://localhost:27017 -p mongodb.database=tweet_mongo 

# ==========================
# DEEP LEARNING - RNN MODEL
# ==========================

# ======
## GRAPHS
# ======

In [None]:
import matplotlib.pyplot as plt
from pyspark.sql.functions import date_format

# 1-Week Analysis
# Name the grouping expression with alias() instead of renaming Spark's
# auto-generated column name afterwards: withColumnRenamed silently does
# nothing when that generated name differs, and the later orderBy("week")
# would then fail.
weekly_data = (
    date_df
    .groupBy(date_format("TIMESTAMP", "yyyy-ww").alias("week"))
    .count()
    .orderBy("week", ascending=True)
)
weekly_data.show()

# Bring the (small) aggregated result to the driver for plotting
weekly_data_pd = weekly_data.toPandas()

# Plot for 1-Week Time Series
plt.figure(figsize=(12, 6))
plt.bar(weekly_data_pd["week"], weekly_data_pd["count"], width=0.5)
plt.title("Weekly Tweet Count")
plt.xlabel("Week")
plt.ylabel("Tweet Count")
plt.xticks(rotation=45)
plt.show()

In [None]:
import matplotlib.pyplot as plt
from pyspark.sql.functions import date_format

# 1-Month Analysis
# The grouping expression is named with alias() rather than relying on
# Spark's auto-generated column name, and the result is sorted so the bars
# appear in chronological order.
monthly_data = (
    date_df
    .groupBy(date_format("TIMESTAMP", "yyyy-MM").alias("month"))
    .count()
    .orderBy("month", ascending=True)
)
monthly_data.show()

# Bring the (small) aggregated result to the driver for plotting
monthly_data_pd = monthly_data.toPandas()

# Plot for 1-Month Time Series
plt.figure(figsize=(12, 6))
plt.bar(monthly_data_pd["month"], monthly_data_pd["count"], width=0.5)
plt.title("Monthly Tweet Count")
plt.xlabel("Month")
plt.ylabel("Tweet Count")
plt.xticks(rotation=45)
plt.show()

In [None]:
import matplotlib.pyplot as plt
from pyspark.sql.functions import date_format

from pyspark.sql.functions import concat_ws, quarter, year

# 3-Month Analysis
# Group by calendar quarter (labels such as "2009-Q2"). The previous version
# grouped by "yyyy-MM", which produced the monthly counts again instead of
# quarterly ones.
quarter_label = concat_ws(
    "-Q",
    year("TIMESTAMP").cast("string"),
    quarter("TIMESTAMP").cast("string"),
).alias("quarter")

quarterly_data = (
    date_df
    .groupBy(quarter_label)
    .count()
    .orderBy("quarter", ascending=True)
)
quarterly_data.show()

# Bring the (small) aggregated result to the driver for plotting
quarterly_data_pd = quarterly_data.toPandas()

# Plot for 3-Month Time Series
plt.figure(figsize=(12, 6))
plt.bar(quarterly_data_pd["quarter"], quarterly_data_pd["count"], width=0.5)
plt.title("3-Month Tweet Count")
plt.xlabel("Quarter")
plt.ylabel("Tweet Count")
plt.xticks(rotation=45)
plt.show()