In [1]:
# Display the active SparkContext.
# NOTE(review): `sc` is not assigned anywhere in the visible notebook; presumably it is
# pre-created by the PySpark kernel/shell — confirm before relying on it.
sc

### CONNECTING TO MONGODB

In [1]:
# Print the version of the installed `mongo` shell client.
!mongo --version

MongoDB shell version: 3.2.10


In [None]:
# Start the `mongosh` shell.
# NOTE(review): this cell has no execution count (In [None]); an interactive shell
# presumably cannot receive input from a notebook cell — consider running it in a terminal.
!mongosh

In [1]:
from pyspark.sql import SparkSession

# Obtain (or reuse) the Spark session used for the MongoDB export.
spark = (
    SparkSession.builder
    .appName("DFToMongoDB")
    .getOrCreate()
)

# Small in-memory sample: one (name, age) tuple per person.
columns = ["Name", "Age"]
data = [
    ("John", 28),
    ("Alice", 22),
    ("Bob", 32),
]
df = spark.createDataFrame(data, schema=columns)

In [2]:
# Preview the first three rows of the sample DataFrame.
df.show(n=3)

[Stage 0:>                                                          (0 + 1) / 1]

+-----+---+
| Name|Age|
+-----+---+
| John| 28|
|Alice| 22|
|  Bob| 32|
+-----+---+





In [3]:
# Append the sample DataFrame to the `scb` collection of the `sample_db` database.
# The "mongodb" data source takes its connection string from the "connection.uri"
# option; a plain "uri" key is not one of its documented settings, so the address
# must be passed under "connection.uri" to take effect.
df.write.format("mongodb") \
    .option("connection.uri", "mongodb://127.0.0.1:27017/") \
    .option("database", "sample_db") \
    .option("collection", "scb") \
    .mode("append") \
    .save()

[Stage 1:>                                                          (0 + 1) / 1]                                                                                

### CONNECTING TO MYSQL

In [2]:
# Print the version of the installed `mysql` command-line client.
!mysql --version

mysql  Ver 8.0.30-0ubuntu0.22.04.1 for Linux on x86_64 ((Ubuntu))


In [5]:
import os
from getpass import getpass

from pyspark.sql import SparkSession

# Obtain (or reuse) the Spark session used for the MySQL export.
spark = SparkSession.builder \
    .appName("DFToMySQL") \
    .getOrCreate()

# Small in-memory sample: one (name, age) tuple per person.
data = [("John", 28), ("Alice", 22), ("Bob", 32)]
columns = ["Name", "Age"]
df = spark.createDataFrame(data, columns)

# JDBC connection settings for MySQL.
# Credentials are read from the MYSQL_USER / MYSQL_PASSWORD environment variables so
# that no secret is stored in the notebook; if MYSQL_PASSWORD is unset the password
# is requested interactively instead.
mysql_options = {
    "url": "jdbc:mysql://localhost:3306/sample",  # MySQL connection URL
    "driver": "com.mysql.cj.jdbc.Driver",  # MySQL JDBC driver class
    "dbtable": "yourtable",  # Target MySQL table name
    "user": os.environ.get("MYSQL_USER", "root"),  # MySQL user name
    "password": os.environ.get("MYSQL_PASSWORD") or getpass("MySQL password: "),  # MySQL password
}

# Write the DataFrame to the MySQL table, replacing whatever the table held before.
df.write.format("jdbc").options(**mysql_options).mode("overwrite").save()

                                                                                

In [3]:
# Print the notebook's current working directory.
!pwd

/home/hduser/Desktop


In [4]:
# Print the contents of zahid.txt (resolved relative to the current working directory).
!cat zahid.txt

pwd
cd Downloads/
nano zahid.txt
cat zahid.txt 
-----------------------------------------------
mysql -u root -p
Enter password: password
CREATE DATABASE sample;
USE sample;
SHOW TABLES;
-----------------------------------------------
Downloads$ ls mon*
Downloads$ sudo cp mon* /usr/local/spark/jars
Downloads$ sudo cp ./bson-3.12.12.jar /usr/local/spark/jars
Downloads$ ls mys*
Downloads$ sudo cp ./mysql-connector-j-8.0.33.jar /usr/local/spark/jars
--------------------------------------------------------------------------
cd /usr/local/spark/jars
/usr/local/spark/jars$ ls mon*
/usr/local/spark/jars$ ls mysql*
--------------------------------------------------------------------------
mongosh
/mongodb-linux-x86_64-ubuntu1604-3.2.10$ ./bin/mongod
--------------------------------------------------------------------------
https://repo1.maven.org/maven2/org/mongodb/mongodb-driver-sync/3.12.12/
--------------------------------------------------------------------------


### READ CSV FROM HDFS VIA SPARK

In [1]:
from pyspark.sql import SparkSession

# Obtain (or reuse) the Spark session used for loading the tweets file.
spark = SparkSession.builder.appName("HDFSToCSV").getOrCreate()

# Location of the CSV file; no scheme is given, so it is resolved against
# Spark's default filesystem (presumably HDFS here — confirm cluster config).
hdfs_file_path = "/ProjectTweets.csv"

# Load the file as a DataFrame: there is no header row, and column types are inferred.
df = spark.read.options(header=False, inferSchema=True).csv(hdfs_file_path)

                                                                                

In [2]:
# Preview the first five rows of the loaded DataFrame.
df.show(n=5)

+---+----------+--------------------+--------+---------------+--------------------+
|_c0|       _c1|                 _c2|     _c3|            _c4|                 _c5|
+---+----------+--------------------+--------+---------------+--------------------+
|  0|1467810369|Mon Apr 06 22:19:...|NO_QUERY|_TheSpecialOne_|@switchfoot http:...|
|  1|1467810672|Mon Apr 06 22:19:...|NO_QUERY|  scotthamilton|is upset that he ...|
|  2|1467810917|Mon Apr 06 22:19:...|NO_QUERY|       mattycus|@Kenichan I dived...|
|  3|1467811184|Mon Apr 06 22:19:...|NO_QUERY|        ElleCTF|my whole body fee...|
|  4|1467811193|Mon Apr 06 22:19:...|NO_QUERY|         Karoli|@nationwideclass ...|
+---+----------+--------------------+--------+---------------+--------------------+
only showing top 5 rows



In [3]:
# Print each column's name, inferred type and nullability.
df.printSchema()

root
 |-- _c0: integer (nullable = true)
 |-- _c1: long (nullable = true)
 |-- _c2: string (nullable = true)
 |-- _c3: string (nullable = true)
 |-- _c4: string (nullable = true)
 |-- _c5: string (nullable = true)



In [4]:
# First renaming approach: apply withColumnRenamed once per column,
# driven by a mapping from the auto-generated name to the new name.
rename_map = {
    "_c0": "id",
    "_c1": "timestamp",
    "_c2": "date",
    "_c3": "flag",
    "_c4": "user",
    "_c5": "text",
}
df1 = df
for old_name, new_name in rename_map.items():
    df1 = df1.withColumnRenamed(old_name, new_name)
df1.show(n=5)

+---+----------+--------------------+--------+---------------+--------------------+
| id| timestamp|                date|    flag|           user|                text|
+---+----------+--------------------+--------+---------------+--------------------+
|  0|1467810369|Mon Apr 06 22:19:...|NO_QUERY|_TheSpecialOne_|@switchfoot http:...|
|  1|1467810672|Mon Apr 06 22:19:...|NO_QUERY|  scotthamilton|is upset that he ...|
|  2|1467810917|Mon Apr 06 22:19:...|NO_QUERY|       mattycus|@Kenichan I dived...|
|  3|1467811184|Mon Apr 06 22:19:...|NO_QUERY|        ElleCTF|my whole body fee...|
|  4|1467811193|Mon Apr 06 22:19:...|NO_QUERY|         Karoli|@nationwideclass ...|
+---+----------+--------------------+--------+---------------+--------------------+
only showing top 5 rows



In [5]:
# Second renaming approach: select every column through a SQL "<old> as <NEW>"
# expression. `df` is rebound to the renamed frame, so later cells see the new names.
rename_exprs = [
    "_c0 as ID",
    "_c1 as TIMESTAMP",
    "_c2 as DATE",
    "_c3 as FLAG",
    "_c4 as USER",
    "_c5 as TEXT",
]
df = df.selectExpr(*rename_exprs)
df.show(n=5)

+---+----------+--------------------+--------+---------------+--------------------+
| ID| TIMESTAMP|                DATE|    FLAG|           USER|                TEXT|
+---+----------+--------------------+--------+---------------+--------------------+
|  0|1467810369|Mon Apr 06 22:19:...|NO_QUERY|_TheSpecialOne_|@switchfoot http:...|
|  1|1467810672|Mon Apr 06 22:19:...|NO_QUERY|  scotthamilton|is upset that he ...|
|  2|1467810917|Mon Apr 06 22:19:...|NO_QUERY|       mattycus|@Kenichan I dived...|
|  3|1467811184|Mon Apr 06 22:19:...|NO_QUERY|        ElleCTF|my whole body fee...|
|  4|1467811193|Mon Apr 06 22:19:...|NO_QUERY|         Karoli|@nationwideclass ...|
+---+----------+--------------------+--------+---------------+--------------------+
only showing top 5 rows



In [6]:
# Count the rows in the DataFrame and report the total.
row_count = df.count()
message = "DataFrame has {} rows.".format(row_count)
print(message)



DataFrame has 1600000 rows.


                                                                                

In [7]:
from pyspark.sql.functions import col

columns = ["ID", "TIMESTAMP", "DATE", "FLAG", "USER", "TEXT"]

# The loop below iterates over the DataFrame's own column list, not the literal above.
Columns = df.columns

# For every column, count its distinct values and report the total.
for column in Columns:
    unique_count = df.select(column).distinct().count()

    if unique_count <= 0:
        print(f"{column} has no unique value.")
    else:
        print(f"{column} has {unique_count} unique values:")

                                                                                

ID has 1600000 unique values:


                                                                                

TIMESTAMP has 1598315 unique values:


                                                                                

DATE has 774363 unique values:


                                                                                

FLAG has 1 unique values:


                                                                                

USER has 659775 unique values:




TEXT has 1581466 unique values:


                                                                                

In [8]:
from pyspark.sql.functions import col

columns = ["ID", "TIMESTAMP", "DATE", "FLAG", "USER", "TEXT"]

# The loop below iterates over the DataFrame's own column list, not the literal above.
Columns = df.columns

# For every column, count how many distinct values occur in more than one row.
for column in Columns:
    duplicate_values = (
        df.groupBy(column)
        .count()
        .where(col("count") > 1)
        .count()
    )

    if duplicate_values <= 0:
        print(f"{column} has no duplicate value.")
    else:
        print(f"{column} has {duplicate_values} duplicate values.")

                                                                                

ID has no duplicate value.


                                                                                

TIMESTAMP has 1685 duplicate values.


                                                                                

DATE has 373151 duplicate values.


                                                                                

FLAG has 1 duplicate values.


                                                                                

USER has 254498 duplicate values.




TEXT has 8434 duplicate values.




In [9]:
# Remove the TIMESTAMP and FLAG columns, then preview what remains.
dropped_columns = ("TIMESTAMP", "FLAG")
df = df.drop(*dropped_columns)
df.show(n=5)

+---+--------------------+---------------+--------------------+
| ID|                DATE|           USER|                TEXT|
+---+--------------------+---------------+--------------------+
|  0|Mon Apr 06 22:19:...|_TheSpecialOne_|@switchfoot http:...|
|  1|Mon Apr 06 22:19:...|  scotthamilton|is upset that he ...|
|  2|Mon Apr 06 22:19:...|       mattycus|@Kenichan I dived...|
|  3|Mon Apr 06 22:19:...|        ElleCTF|my whole body fee...|
|  4|Mon Apr 06 22:19:...|         Karoli|@nationwideclass ...|
+---+--------------------+---------------+--------------------+
only showing top 5 rows



In [10]:
# Show count, mean, stddev, min and max for every column.
df.describe().show()



+-------+------------------+--------------------+--------------------+--------------------+
|summary|                ID|                DATE|                USER|                TEXT|
+-------+------------------+--------------------+--------------------+--------------------+
|  count|           1600000|             1600000|             1600000|             1600000|
|   mean|          799999.5|                null| 4.325887521835714E9|                null|
| stddev|461880.35968924535|                null|5.162733218454889E10|                null|
|    min|                 0|Fri Apr 17 20:30:...|        000catnap000|                 ...|
|    max|           1599999|Wed May 27 07:27:...|          zzzzeus111|ï¿½ï¿½ï¿½ï¿½ï¿½ß§...|
+-------+------------------+--------------------+--------------------+--------------------+



                                                                                

In [11]:
# Show the same statistics as describe() plus the 25%, 50% and 75% percentiles.
df.summary().show()



+-------+------------------+--------------------+--------------------+--------------------+
|summary|                ID|                DATE|                USER|                TEXT|
+-------+------------------+--------------------+--------------------+--------------------+
|  count|           1600000|             1600000|             1600000|             1600000|
|   mean|          799999.5|                null| 4.325887521835714E9|                null|
| stddev|461880.35968924535|                null|5.162733218454889E10|                null|
|    min|                 0|Fri Apr 17 20:30:...|        000catnap000|                 ...|
|    25%|            399999|                null|             32508.0|                null|
|    50%|            799999|                null|            130587.0|                null|
|    75%|           1200076|                null|           1100101.0|                null|
|    max|           1599999|Wed May 27 07:27:...|          zzzzeus111|ï¿½ï¿½ï¿½ï

                                                                                

In [12]:
from pyspark.sql.functions import col

# Tally how many rows share each TEXT value.
count_df = df.groupBy("TEXT").count()

# Keep only the TEXT values that occur more than once.
duplicate_values = count_df.where(col("count") > 1)

# Report the repeated values at full width, or state that there are none.
if duplicate_values.count() <= 0:
    print("No duplicate values found.")
else:
    print("Duplicate values:")
    duplicate_values.show(truncate=False)

                                                                                

Duplicate values:




+----------------------------------------------------------------------------------------------------------------------------------------+-----+
|TEXT                                                                                                                                    |count|
+----------------------------------------------------------------------------------------------------------------------------------------+-----+
|is poorly sick                                                                                                                          |4    |
|This little tree is tiiiiired  25's (and dealing with stupid people) tomorrow and then finishing 10s monday! Raiding every nice         |2    |
|at home with a cold                                                                                                                     |2    |
|Hangover.                                                                                                                        

                                                                                

In [14]:
# Count the rows in the DataFrame again and report the total.
row_count = df.count()
message = "DataFrame has {} rows.".format(row_count)
print(message)



DataFrame has 1600000 rows.


                                                                                

In [15]:
# Display the DATE column at full width (default row limit applies).
df.select(df["DATE"]).show(truncate=False)

+----------------------------+
|DATE                        |
+----------------------------+
|Mon Apr 06 22:19:45 PDT 2009|
|Mon Apr 06 22:19:49 PDT 2009|
|Mon Apr 06 22:19:53 PDT 2009|
|Mon Apr 06 22:19:57 PDT 2009|
|Mon Apr 06 22:19:57 PDT 2009|
|Mon Apr 06 22:20:00 PDT 2009|
|Mon Apr 06 22:20:03 PDT 2009|
|Mon Apr 06 22:20:03 PDT 2009|
|Mon Apr 06 22:20:05 PDT 2009|
|Mon Apr 06 22:20:09 PDT 2009|
|Mon Apr 06 22:20:16 PDT 2009|
|Mon Apr 06 22:20:17 PDT 2009|
|Mon Apr 06 22:20:19 PDT 2009|
|Mon Apr 06 22:20:19 PDT 2009|
|Mon Apr 06 22:20:20 PDT 2009|
|Mon Apr 06 22:20:20 PDT 2009|
|Mon Apr 06 22:20:22 PDT 2009|
|Mon Apr 06 22:20:25 PDT 2009|
|Mon Apr 06 22:20:31 PDT 2009|
|Mon Apr 06 22:20:34 PDT 2009|
+----------------------------+
only showing top 20 rows



In [16]:
# Print the schema of the DataFrame after the rename and column drop.
df.printSchema()

root
 |-- ID: integer (nullable = true)
 |-- DATE: string (nullable = true)
 |-- USER: string (nullable = true)
 |-- TEXT: string (nullable = true)



## TEXT PRE-PROCESSING