Create output.txt (a 1000-line sample of englishtweets.csv):

In [138]:
# Open the raw tweet export in text mode for reading.
# The handle stays open across cells and is closed explicitly in a later cell.
f = open("englishtweets.csv")

In [139]:
# Concatenate the next 1000 lines from the open handle into one string.
# readline() returns "" once the file is exhausted, so a shorter file
# simply contributes empty strings for the remaining iterations.
content = "".join(f.readline() for _ in range(1000))

In [140]:
# Release the file handle that was opened for the manual 1000-line read.
f.close()

In [141]:
from itertools import islice

input_filename = "englishtweets.csv"  # Replace with your input file name
output_filename = "output.txt"  # Replace with your desired output file name
lines_to_read = 1000


def copy_first_lines(src_path, dst_path, max_lines):
    """Copy at most ``max_lines`` leading lines of one text file into another.

    The destination is created, or truncated if it already exists. Lines are
    physical text lines, copied verbatim in order.

    Args:
        src_path: Path of the text file to sample from.
        dst_path: Path of the file to (over)write with the sample.
        max_lines: Upper bound on the number of lines to copy; values below
            zero are treated as zero.

    Returns:
        The number of lines actually written, which is smaller than
        ``max_lines`` when the source file is shorter.

    Raises:
        FileNotFoundError: If ``src_path`` does not exist.
        OSError: For other failures while opening, reading, or writing.
    """
    written = 0
    # Write-only mode is enough: the sample is never read back through this handle.
    with open(src_path, "r") as infile, open(dst_path, "w") as outfile:
        # islice stops pulling from the file as soon as the limit is reached.
        for line in islice(infile, max(max_lines, 0)):
            outfile.write(line)
            written += 1
    return written


# NOTE(review): the sample is cut on physical lines; the CSV is later read with
# multiLine enabled, so the last record may be truncated if it spans lines — confirm.
try:
    lines_written = copy_first_lines(input_filename, output_filename, lines_to_read)
    # Report the real count so a short input file is not described as 1000 lines.
    print(f"Successfully read the first {lines_written} lines from '{input_filename}' and wrote them to '{output_filename}'.")
except FileNotFoundError:
    print(f"Error: The file '{input_filename}' was not found.")
except Exception as e:
    print(f"An error occurred: {e}")

Successfully read the first 1000 lines from 'englishtweets.csv' and wrote them to 'output.txt'.


Cleaning CSV content: 

In [142]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, regexp_replace

# Obtain the Spark session for this notebook, registered under the
# application name "TwitterCSV".
spark = (
    SparkSession.builder
    .appName("TwitterCSV")
    .getOrCreate()
)

In [143]:
# Location of the file handed to Spark's CSV reader.
# NOTE(review): absolute, user-specific path — presumably the output.txt sample
# written earlier in this notebook; confirm before running elsewhere.
path = "/storage/home/eml6069/work/DS410/DS410_Final/output.txt"  # or .csv, name doesn't matter

In [144]:
# Load the sample as a DataFrame: first row is the header, fields are
# separated by ';', and '"' serves as both quote and escape character.
# multiLine lets a quoted field span several physical lines; PERMISSIVE mode
# keeps malformed rows rather than failing the read.
df = (
    spark.read
    .options(
        header="true",
        sep=";",
        quote='"',
        escape='"',
        multiLine="true",
        mode="PERMISSIVE",
    )
    .csv(path)
)

In [145]:
# Optional deeper inspection — uncomment to print the schema or a few full rows:
# df.printSchema()
# df.show(5, truncate=False)
# Display the column names taken from the header row.
df.columns

['origen',
 'date',
 'username',
 'user_fullname',
 'user_description',
 'user_created',
 'user_verified',
 'location',
 'n_followers',
 'n_following',
 'user_favourites',
 'n_replies',
 'n_likes',
 'n_retweets',
 'hashtags',
 'url',
 'source',
 'is_retweet',
 'text',
 'tweet_language']

In [146]:
# Rewrite the "text" column so each tweet body is free of line breaks and slashes.
# Requires `col` and `regexp_replace` from pyspark.sql.functions.
clean_df = df.withColumn(
    "text",
    regexp_replace(
        # Carriage returns, newlines, backslashes and forward slashes are each
        # replaced by a single space; one character class covers all four, so
        # the column is scanned once instead of twice.
        regexp_replace(col("text"), r'[\r\n\\/]', ' '),
        r'"', '""'  # double every double-quote character
    ),
)
# NOTE(review): the CSV writer applies its own quote escaping on output, so the
# manual doubling above may leave extra quote characters in the stored text —
# confirm this is what downstream readers expect.

In [147]:
# Write every column of the cleaned frame as CSV into "output_folder",
# replacing whatever is already there.
# The former `multiLine` option was dropped: it is a read-side CSV option and
# has no effect on a writer.
# NOTE(review): a later cell writes to the same "output_folder" with
# mode("overwrite"), so this output does not survive a full run — confirm
# whether this write is still needed or should target a different folder.
(
    clean_df.write
    .mode("overwrite")
    .option("header", True)
    .option("quoteAll", True)      # wrap every field in quotes
    .option("escape", '"')         # escape embedded quotes with a quote character
    .csv("output_folder")
)

In [148]:
# Narrow the frame to the nine columns that are written out next.
clean_df = clean_df.select(
    "origen",
    "date",
    "username",
    "user_fullname",
    "n_replies",
    "n_likes",
    "n_retweets",
    "text",
    "tweet_language",
)

In [149]:
# Replace the contents of "output_folder" with the trimmed frame,
# emitting a header row and otherwise using the writer's default CSV settings.
(
    clean_df.write
    .mode("overwrite")
    .option("header", "true")
    .csv("output_folder")
)

In [150]:
# Preview the first five rows of the trimmed frame (long values are truncated in the display).
clean_df.show(5)

+------+--------------------+-------------+-------------------+---------+-------+----------+--------------------+--------------+
|origen|                date|     username|      user_fullname|n_replies|n_likes|n_retweets|                text|tweet_language|
+------+--------------------+-------------+-------------------+---------+-------+----------+--------------------+--------------+
|   df1|2019-05-27 11:49:...|    bitcointe|          Bitcointe|        0|      0|         0|Cardano: Digitize...|            en|
|   df1|2019-05-27 11:49:...|    3eyedbran|Bran - 3 Eyed Raven|        0|      2|         1|Another Test twee...|            en|
|   df1|2019-05-27 11:49:...|DetroitCrypto|        J. Scardina|        0|      0|         0|Current Crypto Pr...|            en|
|   df1|2019-05-27 11:49:...| mmursaleen72| Muhammad Mursaleen|        0|      0|         0|Spiv (Nosar Baz):...|            en|
|   df1|2019-05-27 11:49:...| evilrobotted|       evilrobotted|        0|      0|         0|@nwoo