In [1]:
import sqlite3
from datetime import datetime
import re
import string

# Root folder of the project storage; the SQLite files are addressed as `<directory>/database/...`
directory = '/mnt/processed/private/msds-pt2025a/lt6'
# Path of the SQLite database opened by the cells that connect with `db_name`
db_name = f'{directory}/database/lab1.db'

In [3]:
def remove_spaces_punctuation(text):
    """Strip ASCII punctuation from ``text`` and normalise its whitespace.

    Every character in ``string.punctuation`` is deleted, each run of
    whitespace is collapsed to a single space, and leading/trailing
    whitespace is trimmed.

    Args:
        text: The string to clean, or ``None``.

    Returns:
        The cleaned string, or ``None`` when ``text`` is ``None``.
    """
    # When this is called as a SQL function a NULL column value arrives as None;
    # pass it through unchanged instead of raising inside the UPDATE.
    if text is None:
        return None
    # Remove punctuation
    text = text.translate(str.maketrans('', '', string.punctuation))
    # Collapse multiple whitespace characters into one space
    text = re.sub(r'\s+', ' ', text)
    # Strip leading and trailing spaces
    return text.strip()

# Connect to the SQLite database and expose the Python cleaner as a one-argument SQL function
conn = sqlite3.connect(db_name)
try:
    conn.create_function("remove_spaces_punctuation", 1, remove_spaces_punctuation)
    cur = conn.cursor()
    try:
        # Remove deleted posts
        cur.execute("DELETE FROM reddit_submissions WHERE title = '[deleted by user]'")
        conn.commit()

        # Rewrite both text columns in place with the cleaned text;
        # each statement is committed on its own, as before
        cur.execute("UPDATE reddit_submissions SET selftext = remove_spaces_punctuation(selftext)")
        conn.commit()

        cur.execute("UPDATE reddit_submissions SET title = remove_spaces_punctuation(title)")
        conn.commit()
    finally:
        cur.close()
finally:
    # Always release the database handle, even when a statement fails,
    # so a failed run does not leave an open connection behind in the kernel
    conn.close()

In [12]:
def remove_punctuation(text):
    """Return ``text`` with every character of ``string.punctuation`` removed.

    Args:
        text: The string to strip punctuation from.

    Returns:
        A copy of ``text`` without ASCII punctuation; all other characters,
        including whitespace, are left untouched.
    """
    # Escape the punctuation before placing it in a character class: unescaped,
    # the "\]" inside string.punctuation is read as an escaped "]", so the
    # backslash itself would never be part of the class and would survive.
    return re.sub('[' + re.escape(string.punctuation) + ']', '', text)

In [26]:
def count_reddit_submissions_keyword(db_path, keyword, month, year):
    """Count one month's submissions whose title or selftext contains ``keyword``.

    Punctuation is stripped from ``keyword`` with ``remove_punctuation`` and the
    result is matched as a substring (``LIKE '%keyword%'``) against the
    ``title`` and ``selftext`` columns of ``reddit_submissions``.

    Args:
        db_path: Path of the SQLite database file to open.
        keyword: Search term; punctuation is removed before matching.
        month: Month number, formatted as two digits for the comparison.
        year: Year number, compared as its decimal string.

    Returns:
        The single result row as a one-element tuple, e.g. ``(119,)``.
        When no row falls in the requested month the tuple is ``(0,)``.
    """
    keyword = remove_punctuation(keyword)

    # SUM over the 0/1 match flag counts the matching rows; COALESCE turns the
    # NULL that SUM yields for a month without any rows into 0.
    # NOTE(review): strftime() is applied to created_utc without a modifier, so
    # the column must hold a value SQLite parses as a date/time by default; a raw
    # Unix epoch number would need the 'unixepoch' modifier — confirm the stored format.
    sql = """
    SELECT
        COALESCE(SUM(
            (title LIKE ? OR selftext LIKE ?)
        ), 0)
    FROM reddit_submissions
    WHERE strftime('%m', created_utc) = ?
      AND strftime('%Y', created_utc) = ?
    """

    # strftime() returns text, so compare against zero-padded month and string year
    month_str = f"{month:02d}"
    year_str = str(year)

    # Substring search pattern shared by both columns
    keyword_pattern = f'%{keyword}%'

    conn = sqlite3.connect(db_path)
    try:
        cur = conn.cursor()
        cur.execute(sql, (
            keyword_pattern, keyword_pattern,
            month_str, year_str
        ))
        # An aggregate query without GROUP BY always yields exactly one row
        total_count = cur.fetchone()
    finally:
        # Close the connection even if the query raises
        conn.close()

    return total_count

In [31]:
# Spot-check a single keyword for July 2020; the bare name on the last line displays the result
count = count_reddit_submissions_keyword(
    db_path=db_name,
    keyword='Taylor Swift',
    month=7,
    year=2020,
)
count

(119,)

In [32]:
keyword = 'discog'
counts_dict = {}

# Walk January 2020 through August 2022 in calendar order
for year in range(2020, 2023):
    # 2022 is only covered up to August; the earlier years use all twelve months
    months = range(1, 9) if year == 2022 else range(1, 13)
    for month in months:
        count = count_reddit_submissions_keyword(db_name, keyword, month, year)
        # Key each result by "YYYY-MM"
        counts_dict[f"{year}-{month:02d}"] = count

print(counts_dict)

{'2020-01': (73,), '2020-02': (45,), '2020-03': (59,), '2020-04': (86,), '2020-05': (88,), '2020-06': (70,), '2020-07': (75,), '2020-08': (84,), '2020-09': (76,), '2020-10': (72,), '2020-11': (88,), '2020-12': (68,), '2021-01': (78,), '2021-02': (82,), '2021-03': (66,), '2021-04': (57,), '2021-05': (68,), '2021-06': (47,), '2021-07': (46,), '2021-08': (71,), '2021-09': (78,), '2021-10': (51,), '2021-11': (60,), '2021-12': (73,), '2022-01': (87,), '2022-02': (72,), '2022-03': (64,), '2022-04': (84,), '2022-05': (83,), '2022-06': (66,), '2022-07': (71,), '2022-08': (84,)}


In [25]:
conn = sqlite3.connect(db_name)
cursor = conn.cursor()

# Total number of rows currently in reddit_submissions
results = cursor.execute("SELECT count(*) FROM reddit_submissions").fetchall()

conn.close()
results

[(2398316,)]

In [5]:
# Path to your SQLite database
db_path = db_name

# Connect to the database
conn = sqlite3.connect(db_path)
try:
    cursor = conn.cursor()

    # Run the whole table swap inside one explicit transaction so that a failure
    # part-way through rolls back and leaves the original table in place.
    cursor.execute('BEGIN')
    try:
        # Discard any leftover copy from an earlier, interrupted run; otherwise
        # a stale table could end up replacing the current data.
        cursor.execute('DROP TABLE IF EXISTS reddit_submissions_unique;')

        # Keep one row (the smallest id) per distinct (title, selftext) pair.
        # Note: a table built with CREATE TABLE ... AS SELECT carries over the
        # column data only, not the indexes or constraints of the source table.
        cursor.execute('''
CREATE TABLE reddit_submissions_unique AS
SELECT * FROM reddit_submissions
WHERE id IN (
    SELECT MIN(id) FROM reddit_submissions
    GROUP BY title, selftext
);
''')

        # Delete the original table
        cursor.execute('DROP TABLE reddit_submissions;')

        # Rename the unique table to the original table's name
        cursor.execute('ALTER TABLE reddit_submissions_unique RENAME TO reddit_submissions;')

        conn.commit()
    except Exception:
        conn.rollback()
        raise
finally:
    # Release the database handle whether or not the swap succeeded
    conn.close()

print('Duplicate rows removed successfully.')


Duplicate rows removed successfully.


In [9]:
conn = sqlite3.connect(db_name)
cursor = conn.cursor()

# Sample ten (title, selftext) groups together with the smallest id found in each group
results = cursor.execute("""SELECT MIN(id), title, selftext FROM reddit_submissions
    GROUP BY title, selftext LIMIT 10;""").fetchall()

conn.close()
results

[('ektbvw', '', ''),
 ('jdarss', '', 'Bruh I just wanna go see Hamilton with my soulmate'),
 ('kiwnqp',
  '',
  'Can anyone help me make my Arca wiki on fandom i really stan Arca so i just thought of making her a wiki httpsarca1000000fandomcomwikiAcraWikihttpsarca1000000fandomcomwikiAcraWiki'),
 ('qhuuop',
  '',
  'Could someone help me to find a song in which the video was about children who fell asleep and watched TV and in this TV there was a man in a black suit later the caretakers of these children came and told them to go to sleep and when the children went to sleep this man came from the TV and he began to sing the song itself is older and not the best quality in the video'),
 ('gdk2zt', '', 'Does anybody have split up'),
 ('v3wvd3', '', 'Does anyone have a linkdownload for plus minus'),
 ('kum8lx',
  '',
  'Download Link httpsplaygooglecomstoreappsdetailsidcommbucksiziplaymediaampreferrer81082 Use Refer Code For Extra Coins 81082 👨\u200d👨\u200d👦 Share with your friends and fami

In [3]:
import sqlite3

directory = '/mnt/processed/private/msds-pt2025a/lt6'
db_path = f'{directory}/database/lab1-backup7.db'

# Number of rows fetched from the source and inserted into the destination per round trip
batch_size = 10000
select_query = "SELECT main_release, year, file_year, file_month, artist_name, title, genre FROM masters"
# NOTE(review): OR REPLACE only overwrites rows that hit a uniqueness conflict;
# which columns are unique in the destination `masters` table is not visible here.
insert_query = "INSERT OR REPLACE INTO masters(main_release, year, file_year, file_month, artist_name, title, genre) VALUES (?, ?, ?, ?, ?, ?, ?)"

# Connect to the source database
source_conn = sqlite3.connect(f'{directory}/database/2022.db')
try:
    # Connect to the destination database
    dest_conn = sqlite3.connect(db_path)
    try:
        source_cursor = source_conn.cursor()
        dest_cursor = dest_conn.cursor()

        # Drop the indexes before the bulk insert and rebuild them afterwards.
        # IF EXISTS keeps the cell re-runnable after a run that failed mid-copy
        # (when the indexes are already gone).
        dest_cursor.execute("DROP INDEX IF EXISTS idx_file_year_month")
        dest_cursor.execute("DROP INDEX IF EXISTS idx_year")

        source_cursor.execute(select_query)

        # Stream the rows across in fixed-size batches, committing after each batch
        while True:
            rows = source_cursor.fetchmany(batch_size)
            if not rows:
                break
            dest_cursor.executemany(insert_query, rows)
            dest_conn.commit()

        dest_cursor.execute("CREATE INDEX IF NOT EXISTS idx_file_year_month ON masters(file_year, file_month)")
        dest_cursor.execute("CREATE INDEX IF NOT EXISTS idx_year ON masters(year)")
        dest_conn.commit()

        source_cursor.close()
        dest_cursor.close()
    finally:
        dest_conn.close()
finally:
    # Both connections are closed even when the copy fails part-way through
    source_conn.close()

In [6]:
import sqlite3

# Open the monthly masters database
directory = '/mnt/processed/private/msds-pt2025a/lt6'
conn = sqlite3.connect(f'{directory}/database/072021a.db')
cursor = conn.cursor()

# Report the total number of rows in "masters"
row_count = cursor.execute("SELECT COUNT(*) FROM masters ").fetchone()[0]
print("Total Rows in 'masters' table:", row_count)

# Pull a sample of at most five rows, restricted to file_year = 2021
cursor.execute("SELECT * FROM masters WHERE file_year = 2021 LIMIT 5")

# Show the column names first, then the sampled rows one per line
print("\nFirst 5 Rows in 'masters' table:")
columns = [column_info[0] for column_info in cursor.description]
print("Columns:", columns)
results = cursor.fetchall()
for row in results:
    print(row)

# Release the cursor and the connection
cursor.close()
conn.close()

Total Rows in 'masters' table: 18451

First 5 Rows in 'masters' table:
Columns: ['id', 'main_release', 'year', 'file_year', 'file_month', 'artist_name', 'title', 'genre']
(2651089, None, 2021, 2021, 7, 'Totalitär', 'Sin Egen Motstandare', 'Rock')
(3348683, None, 2021, 2021, 7, 'Akira Umeda', 'Passagem', 'Electronic')
(3349563, None, 2021, 2021, 7, 'Akira Umeda', 'Surto', 'Electronic')
(3349657, None, 2021, 2021, 7, 'Akira Umeda', 'Música Para O Chá', 'Electronic')
(3382850, None, 2021, 2021, 7, 'Akira Umeda', 'Mbiẽta', 'Electronic')
